#62319 - Decommission XSLF-/PowerPointExtractor

git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@1829653 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Andreas Beeker 2018-04-20 12:52:59 +00:00
parent bc436fcc3d
commit ab390ce170
27 changed files with 824 additions and 1248 deletions

View File

@ -330,8 +330,6 @@ public class TestAllFiles {
); );
private static final Set<String> IGNORED = unmodifiableHashSet( private static final Set<String> IGNORED = unmodifiableHashSet(
// need JDK8+ - https://bugs.openjdk.java.net/browse/JDK-8038081
"slideshow/42474-2.ppt",
// OPC handler works / XSSF handler fails // OPC handler works / XSSF handler fails
"spreadsheet/57181.xlsm", "spreadsheet/57181.xlsm",
"spreadsheet/61300.xls"//intentionally fuzzed -- used to cause infinite loop "spreadsheet/61300.xls"//intentionally fuzzed -- used to cause infinite loop

View File

@ -24,6 +24,7 @@ import java.io.FileInputStream;
import java.io.InputStream; import java.io.InputStream;
import org.apache.poi.extractor.ExtractorFactory; import org.apache.poi.extractor.ExtractorFactory;
import org.apache.poi.sl.extractor.SlideShowExtractor;
import org.apache.poi.xslf.extractor.XSLFPowerPointExtractor; import org.apache.poi.xslf.extractor.XSLFPowerPointExtractor;
import org.apache.poi.xslf.usermodel.XMLSlideShow; import org.apache.poi.xslf.usermodel.XMLSlideShow;
import org.apache.poi.xslf.usermodel.XSLFSlideShow; import org.apache.poi.xslf.usermodel.XSLFSlideShow;
@ -53,12 +54,19 @@ public class XSLFFileHandler extends SlideShowHandler {
// additionally try the other getText() methods // additionally try the other getText() methods
try (XSLFPowerPointExtractor extractor = (XSLFPowerPointExtractor) ExtractorFactory.createExtractor(file)) { try (SlideShowExtractor extractor = ExtractorFactory.createExtractor(file)) {
assertNotNull(extractor); assertNotNull(extractor);
extractor.setSlidesByDefault(true);
extractor.setNotesByDefault(true);
extractor.setMasterByDefault(true);
assertNotNull(extractor.getText(true, true, true)); assertNotNull(extractor.getText());
assertEquals("With all options disabled we should not get text",
"", extractor.getText(false, false, false)); extractor.setSlidesByDefault(false);
extractor.setNotesByDefault(false);
extractor.setMasterByDefault(false);
assertEquals("With all options disabled we should not get text", "", extractor.getText());
} }
} }

View File

@ -105,6 +105,7 @@ public abstract class POIOLE2TextExtractor extends POITextExtractor {
* *
* @return the underlying POIDocument * @return the underlying POIDocument
*/ */
@Override
public POIDocument getDocument() { public POIDocument getDocument() {
return document; return document;
} }

View File

@ -74,4 +74,9 @@ public abstract class POITextExtractor implements Closeable {
fsToClose.close(); fsToClose.close();
} }
} }
/**
* @return the processed document
*/
public abstract Object getDocument();
} }

View File

@ -115,26 +115,23 @@ public class OLE2ExtractorFactory {
return threadPreferEventExtractors.get(); return threadPreferEventExtractors.get();
} }
public static POIOLE2TextExtractor createExtractor(POIFSFileSystem fs) throws IOException { public static <T extends POITextExtractor> T createExtractor(POIFSFileSystem fs) throws IOException {
// Only ever an OLE2 one from the root of the FS return (T)createExtractor(fs.getRoot());
return (POIOLE2TextExtractor)createExtractor(fs.getRoot());
} }
public static POIOLE2TextExtractor createExtractor(NPOIFSFileSystem fs) throws IOException { public static <T extends POITextExtractor> T createExtractor(NPOIFSFileSystem fs) throws IOException {
// Only ever an OLE2 one from the root of the FS return (T)createExtractor(fs.getRoot());
return (POIOLE2TextExtractor)createExtractor(fs.getRoot());
} }
public static POIOLE2TextExtractor createExtractor(OPOIFSFileSystem fs) throws IOException { public static <T extends POITextExtractor> T createExtractor(OPOIFSFileSystem fs) throws IOException {
// Only ever an OLE2 one from the root of the FS return (T)createExtractor(fs.getRoot());
return (POIOLE2TextExtractor)createExtractor(fs.getRoot());
} }
public static POITextExtractor createExtractor(InputStream input) throws IOException { public static <T extends POITextExtractor> T createExtractor(InputStream input) throws IOException {
Class<?> cls = getOOXMLClass(); Class<?> cls = getOOXMLClass();
if (cls != null) { if (cls != null) {
// Use Reflection to get us the full OOXML-enabled version // Use Reflection to get us the full OOXML-enabled version
try { try {
Method m = cls.getDeclaredMethod("createExtractor", InputStream.class); Method m = cls.getDeclaredMethod("createExtractor", InputStream.class);
return (POITextExtractor)m.invoke(null, input); return (T)m.invoke(null, input);
} catch (IllegalArgumentException iae) { } catch (IllegalArgumentException iae) {
throw iae; throw iae;
} catch (Exception e) { } catch (Exception e) {

View File

@ -45,7 +45,29 @@ public class DocumentFactoryHelper {
*/ */
public static InputStream getDecryptedStream(final NPOIFSFileSystem fs, String password) public static InputStream getDecryptedStream(final NPOIFSFileSystem fs, String password)
throws IOException { throws IOException {
EncryptionInfo info = new EncryptionInfo(fs); // wrap the stream in a FilterInputStream to close the NPOIFSFileSystem
// as well when the resulting OPCPackage is closed
return new FilterInputStream(getDecryptedStream(fs.getRoot(), password)) {
@Override
public void close() throws IOException {
fs.close();
super.close();
}
};
}
/**
* Wrap the OLE2 data of the DirectoryNode into a decrypted stream by using
* the given password.
*
* @param root The OLE2 directory node for the document
* @param password The password, null if the default password should be used
* @return A stream for reading the decrypted data
* @throws IOException If an error occurs while decrypting or if the password does not match
*/
public static InputStream getDecryptedStream(final DirectoryNode root, String password)
throws IOException {
EncryptionInfo info = new EncryptionInfo(root);
Decryptor d = Decryptor.getInstance(info); Decryptor d = Decryptor.getInstance(info);
try { try {
@ -58,20 +80,10 @@ public class DocumentFactoryHelper {
} }
if (passwordCorrect) { if (passwordCorrect) {
// wrap the stream in a FilterInputStream to close the NPOIFSFileSystem return d.getDataStream(root);
// as well when the resulting OPCPackage is closed } else if (password != null) {
return new FilterInputStream(d.getDataStream(fs.getRoot())) {
@Override
public void close() throws IOException {
fs.close();
super.close();
}
};
} else {
if (password != null)
throw new EncryptedDocumentException("Password incorrect"); throw new EncryptedDocumentException("Password incorrect");
else } else {
throw new EncryptedDocumentException("The supplied spreadsheet is protected, but no password was supplied"); throw new EncryptedDocumentException("The supplied spreadsheet is protected, but no password was supplied");
} }
} catch (GeneralSecurityException e) { } catch (GeneralSecurityException e) {

View File

@ -1,3 +1,20 @@
/* ====================================================================
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==================================================================== */
package org.apache.poi.sl.extractor; package org.apache.poi.sl.extractor;
import java.util.ArrayList; import java.util.ArrayList;
@ -48,6 +65,16 @@ public class SlideShowExtractor<
this.slideshow = slideshow; this.slideshow = slideshow;
} }
/**
* Returns the opened document
*
* @return the opened document
*/
@Override
public final Object getDocument() {
return slideshow.getPersistDocument();
}
/** /**
* Should a call to getText() return slide text? Default is yes * Should a call to getText() return slide text? Default is yes
*/ */
@ -219,7 +246,6 @@ public class SlideShowExtractor<
return; return;
} }
for (final P para : paraList) { for (final P para : paraList) {
final int oldLen = sb.length();
for (final TextRun tr : para) { for (final TextRun tr : para) {
final String str = tr.getRawText().replace("\r", ""); final String str = tr.getRawText().replace("\r", "");
final String newStr; final String newStr;

View File

@ -126,4 +126,13 @@ public interface SlideShow<
* @since POI 4.0.0 * @since POI 4.0.0
*/ */
POITextExtractor getMetadataTextExtractor(); POITextExtractor getMetadataTextExtractor();
/**
* @return the instance which handles the persisting of the slideshow,
* which is either a subclass of {@link org.apache.poi.POIDocument}
* or {@link org.apache.poi.POIXMLDocument}
*
* @since POI 4.0.0
*/
Object getPersistDocument();
} }

View File

@ -60,13 +60,40 @@ public class SlideShowFactory {
* @throws IOException if an error occurs while reading the data * @throws IOException if an error occurs while reading the data
*/ */
public static SlideShow<?,?> create(final NPOIFSFileSystem fs, String password) throws IOException { public static SlideShow<?,?> create(final NPOIFSFileSystem fs, String password) throws IOException {
DirectoryNode root = fs.getRoot(); return create(fs.getRoot(), password);
}
/**
* Creates a SlideShow from the given DirectoryNode.
*
* @param root The {@link DirectoryNode} to start reading the document from
*
* @return The created SlideShow
*
* @throws IOException if an error occurs while reading the data
*/
public static SlideShow<?,?> create(final DirectoryNode root) throws IOException {
return create(root, null);
}
/**
* Creates a SlideShow from the given DirectoryNode, which may
* be password protected
*
* @param root The {@link DirectoryNode} to start reading the document from
* @param password The password that should be used or null if no password is necessary.
*
* @return The created SlideShow
*
* @throws IOException if an error occurs while reading the data
*/
public static SlideShow<?,?> create(final DirectoryNode root, String password) throws IOException {
// Encrypted OOXML files go inside OLE2 containers, is this one? // Encrypted OOXML files go inside OLE2 containers, is this one?
if (root.hasEntry(Decryptor.DEFAULT_POIFS_ENTRY)) { if (root.hasEntry(Decryptor.DEFAULT_POIFS_ENTRY)) {
InputStream stream = null; InputStream stream = null;
try { try {
stream = DocumentFactoryHelper.getDecryptedStream(fs, password); stream = DocumentFactoryHelper.getDecryptedStream(root, password);
return createXSLFSlideShow(stream); return createXSLFSlideShow(stream);
} finally { } finally {
@ -82,7 +109,7 @@ public class SlideShowFactory {
passwordSet = true; passwordSet = true;
} }
try { try {
return createHSLFSlideShow(fs); return createHSLFSlideShow(root);
} finally { } finally {
if (passwordSet) { if (passwordSet) {
Biff8EncryptionKey.setCurrentUserPassword(null); Biff8EncryptionKey.setCurrentUserPassword(null);

View File

@ -68,6 +68,7 @@ public abstract class POIXMLTextExtractor extends POITextExtractor {
* *
* @return the opened document * @return the opened document
*/ */
@Override
public final POIXMLDocument getDocument() { public final POIXMLDocument getDocument() {
return _document; return _document;
} }

View File

@ -51,6 +51,7 @@ import org.apache.poi.poifs.filesystem.NotOLE2FileException;
import org.apache.poi.poifs.filesystem.OPOIFSFileSystem; import org.apache.poi.poifs.filesystem.OPOIFSFileSystem;
import org.apache.poi.poifs.filesystem.OfficeXmlFileException; import org.apache.poi.poifs.filesystem.OfficeXmlFileException;
import org.apache.poi.poifs.filesystem.POIFSFileSystem; import org.apache.poi.poifs.filesystem.POIFSFileSystem;
import org.apache.poi.sl.extractor.SlideShowExtractor;
import org.apache.poi.util.IOUtils; import org.apache.poi.util.IOUtils;
import org.apache.poi.util.NotImplemented; import org.apache.poi.util.NotImplemented;
import org.apache.poi.util.POILogFactory; import org.apache.poi.util.POILogFactory;
@ -58,6 +59,7 @@ import org.apache.poi.util.POILogger;
import org.apache.poi.util.Removal; import org.apache.poi.util.Removal;
import org.apache.poi.xdgf.extractor.XDGFVisioExtractor; import org.apache.poi.xdgf.extractor.XDGFVisioExtractor;
import org.apache.poi.xslf.extractor.XSLFPowerPointExtractor; import org.apache.poi.xslf.extractor.XSLFPowerPointExtractor;
import org.apache.poi.xslf.usermodel.XMLSlideShow;
import org.apache.poi.xslf.usermodel.XSLFRelation; import org.apache.poi.xslf.usermodel.XSLFRelation;
import org.apache.poi.xslf.usermodel.XSLFSlideShow; import org.apache.poi.xslf.usermodel.XSLFSlideShow;
import org.apache.poi.xssf.extractor.XSSFBEventBasedExcelExtractor; import org.apache.poi.xssf.extractor.XSSFBEventBasedExcelExtractor;
@ -127,20 +129,20 @@ public class ExtractorFactory {
return OLE2ExtractorFactory.getPreferEventExtractor(); return OLE2ExtractorFactory.getPreferEventExtractor();
} }
public static POITextExtractor createExtractor(File f) throws IOException, OpenXML4JException, XmlException { public static <T extends POITextExtractor> T createExtractor(File f) throws IOException, OpenXML4JException, XmlException {
NPOIFSFileSystem fs = null; NPOIFSFileSystem fs = null;
try { try {
fs = new NPOIFSFileSystem(f); fs = new NPOIFSFileSystem(f);
if (fs.getRoot().hasEntry(Decryptor.DEFAULT_POIFS_ENTRY)) { if (fs.getRoot().hasEntry(Decryptor.DEFAULT_POIFS_ENTRY)) {
return createEncryptedOOXMLExtractor(fs); return (T)createEncryptedOOXMLExtractor(fs);
} }
POIOLE2TextExtractor extractor = createExtractor(fs); POITextExtractor extractor = createExtractor(fs);
extractor.setFilesystem(fs); extractor.setFilesystem(fs);
return extractor; return (T)extractor;
} catch (OfficeXmlFileException e) { } catch (OfficeXmlFileException e) {
// ensure file-handle release // ensure file-handle release
IOUtils.closeQuietly(fs); IOUtils.closeQuietly(fs);
return createExtractor(OPCPackage.open(f.toString(), PackageAccess.READ)); return (T)createExtractor(OPCPackage.open(f.toString(), PackageAccess.READ));
} catch (NotOLE2FileException ne) { } catch (NotOLE2FileException ne) {
// ensure file-handle release // ensure file-handle release
IOUtils.closeQuietly(fs); IOUtils.closeQuietly(fs);
@ -179,7 +181,7 @@ public class ExtractorFactory {
* @throws XmlException If an XML parsing error occurs. * @throws XmlException If an XML parsing error occurs.
* @throws IllegalArgumentException If no matching file type could be found. * @throws IllegalArgumentException If no matching file type could be found.
*/ */
public static POIXMLTextExtractor createExtractor(OPCPackage pkg) throws IOException, OpenXML4JException, XmlException { public static POITextExtractor createExtractor(OPCPackage pkg) throws IOException, OpenXML4JException, XmlException {
try { try {
// Check for the normal Office core document // Check for the normal Office core document
PackageRelationshipCollection core; PackageRelationshipCollection core;
@ -226,13 +228,13 @@ public class ExtractorFactory {
// Is it XSLF? // Is it XSLF?
for (XSLFRelation rel : XSLFPowerPointExtractor.SUPPORTED_TYPES) { for (XSLFRelation rel : XSLFPowerPointExtractor.SUPPORTED_TYPES) {
if ( rel.getContentType().equals( contentType ) ) { if ( rel.getContentType().equals( contentType ) ) {
return new XSLFPowerPointExtractor(pkg); return new SlideShowExtractor(new XMLSlideShow(pkg));
} }
} }
// special handling for SlideShow-Theme-files, // special handling for SlideShow-Theme-files,
if (XSLFRelation.THEME_MANAGER.getContentType().equals(contentType)) { if (XSLFRelation.THEME_MANAGER.getContentType().equals(contentType)) {
return new XSLFPowerPointExtractor(new XSLFSlideShow(pkg)); return new SlideShowExtractor(new XMLSlideShow(pkg));
} }
// How about xlsb? // How about xlsb?
@ -252,28 +254,28 @@ public class ExtractorFactory {
} }
} }
public static POIOLE2TextExtractor createExtractor(POIFSFileSystem fs) throws IOException, OpenXML4JException, XmlException { public static <T extends POITextExtractor> T createExtractor(POIFSFileSystem fs) throws IOException, OpenXML4JException, XmlException {
return OLE2ExtractorFactory.createExtractor(fs); return createExtractor(fs.getRoot());
} }
public static POIOLE2TextExtractor createExtractor(NPOIFSFileSystem fs) throws IOException, OpenXML4JException, XmlException { public static <T extends POITextExtractor> T createExtractor(NPOIFSFileSystem fs) throws IOException, OpenXML4JException, XmlException {
return OLE2ExtractorFactory.createExtractor(fs); return createExtractor(fs.getRoot());
} }
public static POIOLE2TextExtractor createExtractor(OPOIFSFileSystem fs) throws IOException, OpenXML4JException, XmlException { public static <T extends POITextExtractor> T createExtractor(OPOIFSFileSystem fs) throws IOException, OpenXML4JException, XmlException {
return OLE2ExtractorFactory.createExtractor(fs); return createExtractor(fs.getRoot());
} }
public static POITextExtractor createExtractor(DirectoryNode poifsDir) throws IOException, OpenXML4JException, XmlException public static <T extends POITextExtractor> T createExtractor(DirectoryNode poifsDir) throws IOException, OpenXML4JException, XmlException
{ {
// First, check for OOXML // First, check for OOXML
for (String entryName : poifsDir.getEntryNames()) { for (String entryName : poifsDir.getEntryNames()) {
if (entryName.equals("Package")) { if (entryName.equals("Package")) {
OPCPackage pkg = OPCPackage.open(poifsDir.createDocumentInputStream("Package")); OPCPackage pkg = OPCPackage.open(poifsDir.createDocumentInputStream("Package"));
return createExtractor(pkg); return (T)createExtractor(pkg);
} }
} }
// If not, ask the OLE2 code to check, with Scratchpad if possible // If not, ask the OLE2 code to check, with Scratchpad if possible
return OLE2ExtractorFactory.createExtractor(poifsDir); return (T)OLE2ExtractorFactory.createExtractor(poifsDir);
} }
/** /**
@ -403,7 +405,7 @@ public class ExtractorFactory {
throw new IllegalStateException("Not yet supported"); throw new IllegalStateException("Not yet supported");
} }
private static POIXMLTextExtractor createEncryptedOOXMLExtractor(NPOIFSFileSystem fs) private static POITextExtractor createEncryptedOOXMLExtractor(NPOIFSFileSystem fs)
throws IOException { throws IOException {
String pass = Biff8EncryptionKey.getCurrentUserPassword(); String pass = Biff8EncryptionKey.getCurrentUserPassword();
if (pass == null) { if (pass == null) {

View File

@ -37,7 +37,7 @@ import org.apache.xmlbeans.XmlException;
* @deprecated use {@link SlideShowExtractor} * @deprecated use {@link SlideShowExtractor}
*/ */
@Deprecated @Deprecated
@Removal(version="4.2.0") @Removal(version="5.0.0")
public class XSLFPowerPointExtractor extends POIXMLTextExtractor { public class XSLFPowerPointExtractor extends POIXMLTextExtractor {
public static final XSLFRelation[] SUPPORTED_TYPES = new XSLFRelation[]{ public static final XSLFRelation[] SUPPORTED_TYPES = new XSLFRelation[]{
XSLFRelation.MAIN, XSLFRelation.MACRO, XSLFRelation.MACRO_TEMPLATE, XSLFRelation.MAIN, XSLFRelation.MACRO, XSLFRelation.MACRO_TEMPLATE,

View File

@ -631,4 +631,9 @@ public class XMLSlideShow extends POIXMLDocument
public POIXMLPropertiesTextExtractor getMetadataTextExtractor() { public POIXMLPropertiesTextExtractor getMetadataTextExtractor() {
return new POIXMLPropertiesTextExtractor(this); return new POIXMLPropertiesTextExtractor(this);
} }
@Override
public Object getPersistDocument() {
return this;
}
} }

View File

@ -1,3 +1,20 @@
/* ====================================================================
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==================================================================== */
package org.apache.poi.xslf.usermodel; package org.apache.poi.xslf.usermodel;
import static org.apache.poi.xslf.usermodel.XSLFShape.PML_NS; import static org.apache.poi.xslf.usermodel.XSLFShape.PML_NS;

View File

@ -182,12 +182,20 @@ implements Slide<XSLFShape,XSLFTextParagraph> {
*/ */
public XSLFCommentAuthors getCommentAuthorsPart() { public XSLFCommentAuthors getCommentAuthorsPart() {
if(_commentAuthors == null) { if(_commentAuthors == null) {
// first scan the slide relations
for (POIXMLDocumentPart p : getRelations()) { for (POIXMLDocumentPart p : getRelations()) {
if (p instanceof XSLFCommentAuthors) { if (p instanceof XSLFCommentAuthors) {
_commentAuthors = (XSLFCommentAuthors)p; _commentAuthors = (XSLFCommentAuthors)p;
return _commentAuthors; return _commentAuthors;
} }
} }
// then scan the presentation relations
for (POIXMLDocumentPart p : getSlideShow().getRelations()) {
if (p instanceof XSLFCommentAuthors) {
_commentAuthors = (XSLFCommentAuthors)p;
return _commentAuthors;
}
}
} }
return null; return null;

View File

@ -27,16 +27,15 @@ import static org.junit.Assert.fail;
import java.io.File; import java.io.File;
import java.io.FileInputStream; import java.io.FileInputStream;
import java.io.IOException; import java.io.IOException;
import java.util.Locale;
import org.apache.poi.POIDataSamples; import org.apache.poi.POIDataSamples;
import org.apache.poi.POIOLE2TextExtractor; import org.apache.poi.POIOLE2TextExtractor;
import org.apache.poi.POITextExtractor; import org.apache.poi.POITextExtractor;
import org.apache.poi.POIXMLException;
import org.apache.poi.POIXMLTextExtractor; import org.apache.poi.POIXMLTextExtractor;
import org.apache.poi.UnsupportedFileFormatException; import org.apache.poi.UnsupportedFileFormatException;
import org.apache.poi.hdgf.extractor.VisioTextExtractor; import org.apache.poi.hdgf.extractor.VisioTextExtractor;
import org.apache.poi.hpbf.extractor.PublisherTextExtractor; import org.apache.poi.hpbf.extractor.PublisherTextExtractor;
import org.apache.poi.hslf.extractor.PowerPointExtractor;
import org.apache.poi.hsmf.extractor.OutlookTextExtactor; import org.apache.poi.hsmf.extractor.OutlookTextExtactor;
import org.apache.poi.hssf.HSSFTestDataSamples; import org.apache.poi.hssf.HSSFTestDataSamples;
import org.apache.poi.hssf.OldExcelFormatException; import org.apache.poi.hssf.OldExcelFormatException;
@ -44,18 +43,20 @@ import org.apache.poi.hssf.extractor.EventBasedExcelExtractor;
import org.apache.poi.hssf.extractor.ExcelExtractor; import org.apache.poi.hssf.extractor.ExcelExtractor;
import org.apache.poi.hwpf.extractor.Word6Extractor; import org.apache.poi.hwpf.extractor.Word6Extractor;
import org.apache.poi.hwpf.extractor.WordExtractor; import org.apache.poi.hwpf.extractor.WordExtractor;
import org.apache.poi.openxml4j.exceptions.OpenXML4JException;
import org.apache.poi.openxml4j.opc.OPCPackage; import org.apache.poi.openxml4j.opc.OPCPackage;
import org.apache.poi.openxml4j.opc.PackageAccess; import org.apache.poi.openxml4j.opc.PackageAccess;
import org.apache.poi.poifs.filesystem.OPOIFSFileSystem; import org.apache.poi.poifs.filesystem.OPOIFSFileSystem;
import org.apache.poi.poifs.filesystem.POIFSFileSystem; import org.apache.poi.poifs.filesystem.POIFSFileSystem;
import org.apache.poi.sl.extractor.SlideShowExtractor;
import org.apache.poi.util.POILogFactory; import org.apache.poi.util.POILogFactory;
import org.apache.poi.util.POILogger; import org.apache.poi.util.POILogger;
import org.apache.poi.xdgf.extractor.XDGFVisioExtractor; import org.apache.poi.xdgf.extractor.XDGFVisioExtractor;
import org.apache.poi.xslf.extractor.XSLFPowerPointExtractor; import org.apache.poi.xssf.extractor.XSSFBEventBasedExcelExtractor;
import org.apache.poi.xssf.extractor.XSSFEventBasedExcelExtractor; import org.apache.poi.xssf.extractor.XSSFEventBasedExcelExtractor;
import org.apache.poi.xssf.extractor.XSSFExcelExtractor; import org.apache.poi.xssf.extractor.XSSFExcelExtractor;
import org.apache.poi.xwpf.extractor.XWPFWordExtractor; import org.apache.poi.xwpf.extractor.XWPFWordExtractor;
import org.junit.BeforeClass; import org.apache.xmlbeans.XmlException;
import org.junit.Test; import org.junit.Test;
/** /**
@ -65,34 +66,39 @@ public class TestExtractorFactory {
private static final POILogger LOG = POILogFactory.getLogger(TestExtractorFactory.class); private static final POILogger LOG = POILogFactory.getLogger(TestExtractorFactory.class);
private static File txt; private static final POIDataSamples ssTests = POIDataSamples.getSpreadSheetInstance();
private static final File xls = getFileAndCheck(ssTests, "SampleSS.xls");
private static final File xlsx = getFileAndCheck(ssTests, "SampleSS.xlsx");
private static final File xlsxStrict = getFileAndCheck(ssTests, "SampleSS.strict.xlsx");
private static final File xltx = getFileAndCheck(ssTests, "test.xltx");
private static final File xlsEmb = getFileAndCheck(ssTests, "excel_with_embeded.xls");
private static final File xlsb = getFileAndCheck(ssTests, "testVarious.xlsb");
private static File xls; private static final POIDataSamples wpTests = POIDataSamples.getDocumentInstance();
private static File xlsx; private static final File doc = getFileAndCheck(wpTests, "SampleDoc.doc");
private static File xlsxStrict; private static final File doc6 = getFileAndCheck(wpTests, "Word6.doc");
private static File xltx; private static final File doc95 = getFileAndCheck(wpTests, "Word95.doc");
private static File xlsEmb; private static final File docx = getFileAndCheck(wpTests, "SampleDoc.docx");
private static File xlsb; private static final File dotx = getFileAndCheck(wpTests, "test.dotx");
private static final File docEmb = getFileAndCheck(wpTests, "word_with_embeded.doc");
private static final File docEmbOOXML = getFileAndCheck(wpTests, "word_with_embeded_ooxml.doc");
private static File doc; private static final POIDataSamples slTests = POIDataSamples.getSlideShowInstance();
private static File doc6; private static final File ppt = getFileAndCheck(slTests, "SampleShow.ppt");
private static File doc95; private static final File pptx = getFileAndCheck(slTests, "SampleShow.pptx");
private static File docx; private static final File txt = getFileAndCheck(slTests, "SampleShow.txt");
private static File dotx;
private static File docEmb;
private static File docEmbOOXML;
private static File ppt; private static final POIDataSamples olTests = POIDataSamples.getHSMFInstance();
private static File pptx; private static final File msg = getFileAndCheck(olTests, "quick.msg");
private static final File msgEmb = getFileAndCheck(olTests, "attachment_test_msg.msg");
private static final File msgEmbMsg = getFileAndCheck(olTests, "attachment_msg_pdf.msg");
private static File msg; private static final POIDataSamples dgTests = POIDataSamples.getDiagramInstance();
private static File msgEmb; private static final File vsd = getFileAndCheck(dgTests, "Test_Visio-Some_Random_Text.vsd");
private static File msgEmbMsg; private static final File vsdx = getFileAndCheck(dgTests, "test.vsdx");
private static File vsd; private static POIDataSamples pubTests = POIDataSamples.getPublisherInstance();
private static File vsdx; private static File pub = getFileAndCheck(pubTests, "Simple.pub");
private static File pub;
private static File getFileAndCheck(POIDataSamples samples, String name) { private static File getFileAndCheck(POIDataSamples samples, String name) {
File file = samples.getFile(name); File file = samples.getFile(name);
@ -104,595 +110,133 @@ public class TestExtractorFactory {
return file; return file;
} }
@BeforeClass private static final Object[] TEST_SET = {
public static void setUp() throws Exception { "Excel", xls, ExcelExtractor.class, 200,
"Excel - xlsx", xlsx, XSSFExcelExtractor.class, 200,
"Excel - xltx", xltx, XSSFExcelExtractor.class, -1,
"Excel - xlsb", xlsb, XSSFBEventBasedExcelExtractor.class, -1,
"Word", doc, WordExtractor.class, 120,
"Word - docx", docx, XWPFWordExtractor.class, 120,
"Word - dotx", dotx, XWPFWordExtractor.class, -1,
"Word 6", doc6, Word6Extractor.class, 20,
"Word 95", doc95, Word6Extractor.class, 120,
"PowerPoint", ppt, SlideShowExtractor.class, 120,
"PowerPoint - pptx", pptx, SlideShowExtractor.class, 120,
"Visio", vsd, VisioTextExtractor.class, 50,
"Visio - vsdx", vsdx, XDGFVisioExtractor.class, 20,
"Publisher", pub, PublisherTextExtractor.class, 50,
"Outlook msg", msg, OutlookTextExtactor.class, 50,
POIDataSamples ssTests = POIDataSamples.getSpreadSheetInstance(); // TODO Support OOXML-Strict, see bug #57699
xls = getFileAndCheck(ssTests, "SampleSS.xls"); // xlsxStrict
xlsx = getFileAndCheck(ssTests, "SampleSS.xlsx"); };
xlsxStrict = getFileAndCheck(ssTests, "SampleSS.strict.xlsx");
xltx = getFileAndCheck(ssTests, "test.xltx");
xlsEmb = getFileAndCheck(ssTests, "excel_with_embeded.xls");
xlsb = getFileAndCheck(ssTests, "testVarious.xlsb");
POIDataSamples wpTests = POIDataSamples.getDocumentInstance(); @FunctionalInterface
doc = getFileAndCheck(wpTests, "SampleDoc.doc"); interface FunctionEx<T, R> {
doc6 = getFileAndCheck(wpTests, "Word6.doc"); R apply(T t) throws IOException, OpenXML4JException, XmlException;
doc95 = getFileAndCheck(wpTests, "Word95.doc");
docx = getFileAndCheck(wpTests, "SampleDoc.docx");
dotx = getFileAndCheck(wpTests, "test.dotx");
docEmb = getFileAndCheck(wpTests, "word_with_embeded.doc");
docEmbOOXML = getFileAndCheck(wpTests, "word_with_embeded_ooxml.doc");
POIDataSamples slTests = POIDataSamples.getSlideShowInstance();
ppt = getFileAndCheck(slTests, "SampleShow.ppt");
pptx = getFileAndCheck(slTests, "SampleShow.pptx");
txt = getFileAndCheck(slTests, "SampleShow.txt");
POIDataSamples dgTests = POIDataSamples.getDiagramInstance();
vsd = getFileAndCheck(dgTests, "Test_Visio-Some_Random_Text.vsd");
vsdx = getFileAndCheck(dgTests, "test.vsdx");
POIDataSamples pubTests = POIDataSamples.getPublisherInstance();
pub = getFileAndCheck(pubTests, "Simple.pub");
POIDataSamples olTests = POIDataSamples.getHSMFInstance();
msg = getFileAndCheck(olTests, "quick.msg");
msgEmb = getFileAndCheck(olTests, "attachment_test_msg.msg");
msgEmbMsg = getFileAndCheck(olTests, "attachment_msg_pdf.msg");
} }
@Test @Test
public void testFile() throws Exception { public void testFile() throws Exception {
// Excel for (int i = 0; i < TEST_SET.length; i += 4) {
POITextExtractor xlsExtractor = ExtractorFactory.createExtractor(xls); try (POITextExtractor ext = ExtractorFactory.createExtractor((File) TEST_SET[i + 1])) {
assertNotNull("Had empty extractor for " + xls, xlsExtractor); testExtractor(ext, (String) TEST_SET[i], (Class) TEST_SET[i + 2], (Integer) TEST_SET[i + 3]);
assertTrue("Expected instanceof ExcelExtractor, but had: " + xlsExtractor.getClass(), }
xlsExtractor }
instanceof ExcelExtractor
);
assertTrue(
xlsExtractor.getText().length() > 200
);
xlsExtractor.close();
POITextExtractor extractor = ExtractorFactory.createExtractor(xlsx);
assertTrue(
extractor.getClass().getName(),
extractor
instanceof XSSFExcelExtractor
);
extractor.close();
extractor = ExtractorFactory.createExtractor(xlsx);
assertTrue(
extractor.getText().length() > 200
);
extractor.close();
extractor = ExtractorFactory.createExtractor(xltx);
assertTrue(
extractor.getClass().getName(),
extractor
instanceof XSSFExcelExtractor
);
extractor.close();
extractor = ExtractorFactory.createExtractor(xlsb);
assertContains(extractor.getText(), "test");
extractor.close();
extractor = ExtractorFactory.createExtractor(xltx);
assertContains(extractor.getText(), "test");
extractor.close();
// TODO Support OOXML-Strict, see bug #57699
try {
/*extractor =*/ ExtractorFactory.createExtractor(xlsxStrict);
fail("OOXML-Strict isn't yet supported");
} catch (POIXMLException e) {
// Expected, for now
} }
// extractor = ExtractorFactory.createExtractor(xlsxStrict);
// assertTrue(
// extractor
// instanceof XSSFExcelExtractor
// );
// extractor.close();
//
// extractor = ExtractorFactory.createExtractor(xlsxStrict);
// assertTrue(
// extractor.getText().contains("test")
// );
// extractor.close();
// Word
extractor = ExtractorFactory.createExtractor(doc);
assertTrue(
extractor
instanceof WordExtractor
);
assertTrue(
extractor.getText().length() > 120
);
extractor.close();
extractor = ExtractorFactory.createExtractor(doc6);
assertTrue(
extractor
instanceof Word6Extractor
);
assertTrue(
extractor.getText().length() > 20
);
extractor.close();
extractor = ExtractorFactory.createExtractor(doc95);
assertTrue(
extractor
instanceof Word6Extractor
);
assertTrue(
extractor.getText().length() > 120
);
extractor.close();
extractor = ExtractorFactory.createExtractor(docx);
assertTrue(
extractor instanceof XWPFWordExtractor
);
extractor.close();
extractor = ExtractorFactory.createExtractor(docx);
assertTrue(
extractor.getText().length() > 120
);
extractor.close();
extractor = ExtractorFactory.createExtractor(dotx);
assertTrue(
extractor instanceof XWPFWordExtractor
);
extractor.close();
extractor = ExtractorFactory.createExtractor(dotx);
assertContains(extractor.getText(), "Test");
extractor.close();
// PowerPoint (PPT)
extractor = ExtractorFactory.createExtractor(ppt);
assertTrue(
extractor
instanceof PowerPointExtractor
);
assertTrue(
extractor.getText().length() > 120
);
extractor.close();
// PowerPoint (PPTX)
extractor = ExtractorFactory.createExtractor(pptx);
assertTrue(
extractor
instanceof XSLFPowerPointExtractor
);
assertTrue(
extractor.getText().length() > 120
);
extractor.close();
// Visio - binary
extractor = ExtractorFactory.createExtractor(vsd);
assertTrue(
extractor
instanceof VisioTextExtractor
);
assertTrue(
extractor.getText().length() > 50
);
extractor.close();
// Visio - vsdx
extractor = ExtractorFactory.createExtractor(vsdx);
assertTrue(
extractor
instanceof XDGFVisioExtractor
);
assertTrue(
extractor.getText().length() > 20
);
extractor.close();
// Publisher
extractor = ExtractorFactory.createExtractor(pub);
assertTrue(
extractor
instanceof PublisherTextExtractor
);
assertTrue(
extractor.getText().length() > 50
);
extractor.close();
// Outlook msg
extractor = ExtractorFactory.createExtractor(msg);
assertTrue(
extractor
instanceof OutlookTextExtactor
);
assertTrue(
extractor.getText().length() > 50
);
extractor.close();
@Test(expected = IllegalArgumentException.class)
public void testFileInvalid() throws Exception {
// Text // Text
try { try (POITextExtractor te = ExtractorFactory.createExtractor(txt)) {}
ExtractorFactory.createExtractor(txt);
fail("expected IllegalArgumentException");
} catch(IllegalArgumentException e) {
// Good
}
} }
@Test @Test
public void testInputStream() throws Exception { public void testInputStream() throws Exception {
// Excel testStream((f) -> ExtractorFactory.createExtractor(f), true);
POITextExtractor extractor = ExtractorFactory.createExtractor(new FileInputStream(xls));
assertTrue(
extractor
instanceof ExcelExtractor
);
assertTrue(
extractor.getText().length() > 200
);
extractor.close();
extractor = ExtractorFactory.createExtractor(new FileInputStream(xlsx));
assertTrue(
extractor.getClass().getName(),
extractor
instanceof XSSFExcelExtractor
);
assertTrue(
extractor.getText().length() > 200
);
// TODO Support OOXML-Strict, see bug #57699
// assertTrue(
// ExtractorFactory.createExtractor(new FileInputStream(xlsxStrict))
// instanceof XSSFExcelExtractor
// );
// assertTrue(
// ExtractorFactory.createExtractor(new FileInputStream(xlsxStrict)).getText().length() > 200
// );
extractor.close();
// Word
extractor = ExtractorFactory.createExtractor(new FileInputStream(doc));
assertTrue(
extractor.getClass().getName(),
extractor
instanceof WordExtractor
);
assertTrue(
extractor.getText().length() > 120
);
extractor.close();
extractor = ExtractorFactory.createExtractor(new FileInputStream(doc6));
assertTrue(
extractor.getClass().getName(),
extractor
instanceof Word6Extractor
);
assertTrue(
extractor.getText().length() > 20
);
extractor.close();
extractor = ExtractorFactory.createExtractor(new FileInputStream(doc95));
assertTrue(
extractor.getClass().getName(),
extractor
instanceof Word6Extractor
);
assertTrue(
extractor.getText().length() > 120
);
extractor.close();
extractor = ExtractorFactory.createExtractor(new FileInputStream(docx));
assertTrue(
extractor
instanceof XWPFWordExtractor
);
assertTrue(
extractor.getText().length() > 120
);
extractor.close();
// PowerPoint
extractor = ExtractorFactory.createExtractor(new FileInputStream(ppt));
assertTrue(
extractor
instanceof PowerPointExtractor
);
assertTrue(
extractor.getText().length() > 120
);
extractor.close();
extractor = ExtractorFactory.createExtractor(new FileInputStream(pptx));
assertTrue(
extractor
instanceof XSLFPowerPointExtractor
);
assertTrue(
extractor.getText().length() > 120
);
extractor.close();
// Visio
extractor = ExtractorFactory.createExtractor(new FileInputStream(vsd));
assertTrue(
extractor
instanceof VisioTextExtractor
);
assertTrue(
extractor.getText().length() > 50
);
extractor.close();
// Visio - vsdx
extractor = ExtractorFactory.createExtractor(new FileInputStream(vsdx));
assertTrue(
extractor
instanceof XDGFVisioExtractor
);
assertTrue(
extractor.getText().length() > 20
);
extractor.close();
// Publisher
extractor = ExtractorFactory.createExtractor(new FileInputStream(pub));
assertTrue(
extractor
instanceof PublisherTextExtractor
);
assertTrue(
extractor.getText().length() > 50
);
extractor.close();
// Outlook msg
extractor = ExtractorFactory.createExtractor(new FileInputStream(msg));
assertTrue(
extractor
instanceof OutlookTextExtactor
);
assertTrue(
extractor.getText().length() > 50
);
extractor.close();
// Text
try (FileInputStream stream = new FileInputStream(txt)) {
ExtractorFactory.createExtractor(stream);
fail("expected IllegalArgumentException");
} catch(IllegalArgumentException e) {
// Good
} }
@Test(expected = IllegalArgumentException.class)
public void testInputStreamInvalid() throws Exception {
testInvalid((f) -> ExtractorFactory.createExtractor(f));
} }
@Test @Test
public void testPOIFS() throws Exception { public void testPOIFS() throws Exception {
// Excel testStream((f) -> ExtractorFactory.createExtractor(new POIFSFileSystem(f)), false);
assertTrue(
ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(xls)))
instanceof ExcelExtractor
);
assertTrue(
ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(xls))).getText().length() > 200
);
// Word
assertTrue(
ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(doc)))
instanceof WordExtractor
);
assertTrue(
ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(doc))).getText().length() > 120
);
assertTrue(
ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(doc6)))
instanceof Word6Extractor
);
assertTrue(
ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(doc6))).getText().length() > 20
);
assertTrue(
ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(doc95)))
instanceof Word6Extractor
);
assertTrue(
ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(doc95))).getText().length() > 120
);
// PowerPoint
assertTrue(
ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(ppt)))
instanceof PowerPointExtractor
);
assertTrue(
ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(ppt))).getText().length() > 120
);
// Visio
assertTrue(
ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(vsd)))
instanceof VisioTextExtractor
);
assertTrue(
ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(vsd))).getText().length() > 50
);
// Publisher
assertTrue(
ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(pub)))
instanceof PublisherTextExtractor
);
assertTrue(
ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(pub))).getText().length() > 50
);
// Outlook msg
assertTrue(
ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(msg)))
instanceof OutlookTextExtactor
);
assertTrue(
ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(msg))).getText().length() > 50
);
// Text
try {
ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(txt)));
fail("expected IllegalArgumentException");
} catch(IOException e) {
// Good
}
} }
@Test(expected = IOException.class)
public void testPOIFSInvalid() throws Exception {
testInvalid((f) -> ExtractorFactory.createExtractor(new POIFSFileSystem(f)));
}
@Test @Test
public void testOPOIFS() throws Exception { public void testOPOIFS() throws Exception {
// Excel testStream((f) -> ExtractorFactory.createExtractor(new OPOIFSFileSystem(f)), false);
assertTrue( }
ExtractorFactory.createExtractor(new OPOIFSFileSystem(new FileInputStream(xls)))
instanceof ExcelExtractor
);
assertTrue(
ExtractorFactory.createExtractor(new OPOIFSFileSystem(new FileInputStream(xls))).getText().length() > 200
);
// Word @Test(expected = IOException.class)
assertTrue( public void testOPOIFSInvalid() throws Exception {
ExtractorFactory.createExtractor(new OPOIFSFileSystem(new FileInputStream(doc))) testInvalid((f) -> ExtractorFactory.createExtractor(new OPOIFSFileSystem(f)));
instanceof WordExtractor }
);
assertTrue(
ExtractorFactory.createExtractor(new OPOIFSFileSystem(new FileInputStream(doc))).getText().length() > 120
);
assertTrue(
ExtractorFactory.createExtractor(new OPOIFSFileSystem(new FileInputStream(doc6)))
instanceof Word6Extractor
);
assertTrue(
ExtractorFactory.createExtractor(new OPOIFSFileSystem(new FileInputStream(doc6))).getText().length() > 20
);
assertTrue( private void testStream(final FunctionEx<FileInputStream, POITextExtractor> poifsIS, final boolean loadOOXML)
ExtractorFactory.createExtractor(new OPOIFSFileSystem(new FileInputStream(doc95))) throws IOException, OpenXML4JException, XmlException {
instanceof Word6Extractor for (int i = 0; i < TEST_SET.length; i += 4) {
); File testFile = (File) TEST_SET[i + 1];
assertTrue( if (!loadOOXML && (testFile.getName().endsWith("x") || testFile.getName().endsWith("xlsb"))) {
ExtractorFactory.createExtractor(new OPOIFSFileSystem(new FileInputStream(doc95))).getText().length() > 120 continue;
); }
try (FileInputStream fis = new FileInputStream(testFile);
POITextExtractor ext = poifsIS.apply(fis)) {
testExtractor(ext, (String) TEST_SET[i], (Class) TEST_SET[i + 2], (Integer) TEST_SET[i + 3]);
} catch (IllegalArgumentException e) {
fail("failed to process "+testFile);
}
}
}
// PowerPoint private void testExtractor(final POITextExtractor ext, final String testcase, final Class extrClass, final Integer minLength) {
assertTrue( assertTrue("invalid extractor for " + testcase, extrClass.isInstance(ext));
ExtractorFactory.createExtractor(new OPOIFSFileSystem(new FileInputStream(ppt))) final String actual = ext.getText();
instanceof PowerPointExtractor if (minLength == -1) {
); assertContains(actual.toLowerCase(Locale.ROOT), "test");
assertTrue( } else {
ExtractorFactory.createExtractor(new OPOIFSFileSystem(new FileInputStream(ppt))).getText().length() > 120 assertTrue("extracted content too short for " + testcase, actual.length() > minLength);
); }
}
// Visio
assertTrue(
ExtractorFactory.createExtractor(new OPOIFSFileSystem(new FileInputStream(vsd)))
instanceof VisioTextExtractor
);
assertTrue(
ExtractorFactory.createExtractor(new OPOIFSFileSystem(new FileInputStream(vsd))).getText().length() > 50
);
// Publisher
assertTrue(
ExtractorFactory.createExtractor(new OPOIFSFileSystem(new FileInputStream(pub)))
instanceof PublisherTextExtractor
);
assertTrue(
ExtractorFactory.createExtractor(new OPOIFSFileSystem(new FileInputStream(pub))).getText().length() > 50
);
// Outlook msg
assertTrue(
ExtractorFactory.createExtractor(new OPOIFSFileSystem(new FileInputStream(msg)))
instanceof OutlookTextExtactor
);
assertTrue(
ExtractorFactory.createExtractor(new OPOIFSFileSystem(new FileInputStream(msg))).getText().length() > 50
);
private void testInvalid(FunctionEx<FileInputStream, POITextExtractor> poifs) throws IOException, OpenXML4JException, XmlException {
// Text // Text
try { try (FileInputStream fis = new FileInputStream(txt);
ExtractorFactory.createExtractor(new OPOIFSFileSystem(new FileInputStream(txt))); POITextExtractor te = poifs.apply(fis)) {
fail("expected IllegalArgumentException");
} catch(IOException e) {
// Good
} }
} }
@Test @Test
public void testPackage() throws Exception { public void testPackage() throws Exception {
// Excel for (int i = 0; i < TEST_SET.length; i += 4) {
POIXMLTextExtractor extractor = ExtractorFactory.createExtractor(OPCPackage.open(xlsx.toString(), PackageAccess.READ)); final File testFile = (File) TEST_SET[i + 1];
assertTrue(extractor instanceof XSSFExcelExtractor); if (!testFile.getName().endsWith("x")) {
extractor.close(); continue;
extractor = ExtractorFactory.createExtractor(OPCPackage.open(xlsx.toString()));
assertTrue(extractor.getText().length() > 200);
extractor.close();
// Word
extractor = ExtractorFactory.createExtractor(OPCPackage.open(docx.toString()));
assertTrue(extractor instanceof XWPFWordExtractor);
extractor.close();
extractor = ExtractorFactory.createExtractor(OPCPackage.open(docx.toString()));
assertTrue(extractor.getText().length() > 120);
extractor.close();
// PowerPoint
extractor = ExtractorFactory.createExtractor(OPCPackage.open(pptx.toString()));
assertTrue(extractor instanceof XSLFPowerPointExtractor);
extractor.close();
extractor = ExtractorFactory.createExtractor(OPCPackage.open(pptx.toString()));
assertTrue(extractor.getText().length() > 120);
extractor.close();
// Visio
extractor = ExtractorFactory.createExtractor(OPCPackage.open(vsdx.toString()));
assertTrue(extractor instanceof XDGFVisioExtractor);
assertTrue(extractor.getText().length() > 20);
extractor.close();
// Text
try {
ExtractorFactory.createExtractor(OPCPackage.open(txt.toString()));
fail("TestExtractorFactory.testPackage() failed on " + txt);
} catch(UnsupportedFileFormatException e) {
// Good
} catch (Exception e) {
LOG.log(POILogger.WARN, "TestExtractorFactory.testPackage() failed on " + txt);
throw e;
} }
try (final OPCPackage pkg = OPCPackage.open(testFile, PackageAccess.READ);
final POITextExtractor ext = ExtractorFactory.createExtractor(pkg)) {
testExtractor(ext, (String) TEST_SET[i], (Class) TEST_SET[i + 2], (Integer) TEST_SET[i + 3]);
pkg.revert();
}
}
}
@Test(expected = UnsupportedFileFormatException.class)
public void testPackageInvalid() throws Exception {
// Text
try (final OPCPackage pkg = OPCPackage.open(txt, PackageAccess.READ);
final POITextExtractor te = ExtractorFactory.createExtractor(pkg)) {}
} }
@Test @Test
@ -781,142 +325,49 @@ public class TestExtractorFactory {
* does poifs embedded, but will do ooxml ones * does poifs embedded, but will do ooxml ones
* at some point. * at some point.
*/ */
@SuppressWarnings("deprecation")
@Test @Test
public void testEmbedded() throws Exception { public void testEmbedded() throws Exception {
POIOLE2TextExtractor ext; final Object[] testObj = {
POITextExtractor[] embeds; "No embeddings", xls, "0-0-0-0-0-0",
"Excel", xlsEmb, "6-2-2-2-0-0",
"Word", docEmb, "4-1-2-1-0-0",
"Word which contains an OOXML file", docEmbOOXML, "3-0-1-1-0-1",
"Outlook", msgEmb, "1-1-0-0-0-0",
"Outlook with another outlook file in it", msgEmbMsg, "1-0-0-0-1-0",
};
// No embeddings for (int i=0; i<testObj.length; i+=3) {
ext = (POIOLE2TextExtractor) try (final POIOLE2TextExtractor ext = ExtractorFactory.createExtractor((File)testObj[i+1])) {
ExtractorFactory.createExtractor(xls); final POITextExtractor[] embeds = ExtractorFactory.getEmbeddedDocsTextExtractors(ext);
embeds = ExtractorFactory.getEmbededDocsTextExtractors(ext);
assertEquals(0, embeds.length);
ext.close();
// No embeddings int numWord = 0, numXls = 0, numPpt = 0, numMsg = 0, numWordX = 0;
ext = (POIOLE2TextExtractor)
ExtractorFactory.createExtractor(xls);
embeds = ExtractorFactory.getEmbeddedDocsTextExtractors(ext);
assertEquals(0, embeds.length);
ext.close();
// Excel
ext = (POIOLE2TextExtractor)
ExtractorFactory.createExtractor(xlsEmb);
embeds = ExtractorFactory.getEmbededDocsTextExtractors(ext);
assertNotNull(embeds);
ext.close();
// Excel
ext = (POIOLE2TextExtractor)
ExtractorFactory.createExtractor(xlsEmb);
embeds = ExtractorFactory.getEmbeddedDocsTextExtractors(ext);
assertEquals(6, embeds.length);
int numWord = 0, numXls = 0, numPpt = 0, numMsg = 0, numWordX;
for (POITextExtractor embed : embeds) { for (POITextExtractor embed : embeds) {
assertTrue(embed.getText().length() > 20); assertTrue(embed.getText().length() > 20);
if (embed instanceof SlideShowExtractor) {
if (embed instanceof PowerPointExtractor) numPpt++; numPpt++;
else if (embed instanceof ExcelExtractor) numXls++; } else if (embed instanceof ExcelExtractor) {
else if (embed instanceof WordExtractor) numWord++; numXls++;
else if (embed instanceof OutlookTextExtactor) numMsg++; } else if (embed instanceof WordExtractor) {
numWord++;
} else if (embed instanceof OutlookTextExtactor) {
numMsg++;
} else if (embed instanceof XWPFWordExtractor) {
numWordX++;
} }
assertEquals(2, numPpt);
assertEquals(2, numXls);
assertEquals(2, numWord);
assertEquals(0, numMsg);
ext.close();
// Word
ext = (POIOLE2TextExtractor)
ExtractorFactory.createExtractor(docEmb);
embeds = ExtractorFactory.getEmbededDocsTextExtractors(ext);
numWord = 0; numXls = 0; numPpt = 0; numMsg = 0;
assertEquals(4, embeds.length);
for (POITextExtractor embed : embeds) {
assertTrue(embed.getText().length() > 20);
if (embed instanceof PowerPointExtractor) numPpt++;
else if (embed instanceof ExcelExtractor) numXls++;
else if (embed instanceof WordExtractor) numWord++;
else if (embed instanceof OutlookTextExtactor) numMsg++;
} }
assertEquals(1, numPpt);
assertEquals(2, numXls);
assertEquals(1, numWord);
assertEquals(0, numMsg);
ext.close();
// Word which contains an OOXML file final String actual = embeds.length+"-"+numWord+"-"+numXls+"-"+numPpt+"-"+numMsg+"-"+numWordX;
ext = (POIOLE2TextExtractor) final String expected = (String)testObj[i+2];
ExtractorFactory.createExtractor(docEmbOOXML); assertEquals("invalid number of embeddings - "+testObj[i], expected, actual);
embeds = ExtractorFactory.getEmbededDocsTextExtractors(ext);
numWord = 0; numXls = 0; numPpt = 0; numMsg = 0; numWordX = 0;
assertEquals(3, embeds.length);
for (POITextExtractor embed : embeds) {
assertTrue(embed.getText().length() > 20);
if (embed instanceof PowerPointExtractor) numPpt++;
else if (embed instanceof ExcelExtractor) numXls++;
else if (embed instanceof WordExtractor) numWord++;
else if (embed instanceof OutlookTextExtactor) numMsg++;
else if (embed instanceof XWPFWordExtractor) numWordX++;
} }
assertEquals(1, numPpt);
assertEquals(1, numXls);
assertEquals(0, numWord);
assertEquals(1, numWordX);
assertEquals(0, numMsg);
ext.close();
// Outlook
ext = (OutlookTextExtactor)
ExtractorFactory.createExtractor(msgEmb);
embeds = ExtractorFactory.getEmbededDocsTextExtractors(ext);
numWord = 0; numXls = 0; numPpt = 0; numMsg = 0;
assertEquals(1, embeds.length);
for (POITextExtractor embed : embeds) {
assertTrue(embed.getText().length() > 20);
if (embed instanceof PowerPointExtractor) numPpt++;
else if (embed instanceof ExcelExtractor) numXls++;
else if (embed instanceof WordExtractor) numWord++;
else if (embed instanceof OutlookTextExtactor) numMsg++;
} }
assertEquals(0, numPpt);
assertEquals(0, numXls);
assertEquals(1, numWord);
assertEquals(0, numMsg);
ext.close();
// Outlook with another outlook file in it
ext = (OutlookTextExtactor)
ExtractorFactory.createExtractor(msgEmbMsg);
embeds = ExtractorFactory.getEmbededDocsTextExtractors(ext);
numWord = 0; numXls = 0; numPpt = 0; numMsg = 0;
assertEquals(1, embeds.length);
for (POITextExtractor embed : embeds) {
assertTrue(embed.getText().length() > 20);
if (embed instanceof PowerPointExtractor) numPpt++;
else if (embed instanceof ExcelExtractor) numXls++;
else if (embed instanceof WordExtractor) numWord++;
else if (embed instanceof OutlookTextExtactor) numMsg++;
}
assertEquals(0, numPpt);
assertEquals(0, numXls);
assertEquals(0, numWord);
assertEquals(1, numMsg);
ext.close();
// TODO - PowerPoint // TODO - PowerPoint
// TODO - Publisher // TODO - Publisher
// TODO - Visio // TODO - Visio
} }
private static final String[] EXPECTED_FAILURES = new String[] { private static final String[] EXPECTED_FAILURES = {
// password protected files // password protected files
"spreadsheet/password.xls", "spreadsheet/password.xls",
"spreadsheet/protected_passtika.xlsx", "spreadsheet/protected_passtika.xlsx",
@ -1018,35 +469,24 @@ public class TestExtractorFactory {
* #59074 - Excel 95 files should give a helpful message, not just * #59074 - Excel 95 files should give a helpful message, not just
* "No supported documents found in the OLE2 stream" * "No supported documents found in the OLE2 stream"
*/ */
@Test @Test(expected = OldExcelFormatException.class)
public void bug59074() throws Exception { public void bug59074() throws Exception {
try {
ExtractorFactory.createExtractor( ExtractorFactory.createExtractor(
POIDataSamples.getSpreadSheetInstance().getFile("59074.xls")); POIDataSamples.getSpreadSheetInstance().getFile("59074.xls"));
fail("Old excel formats not supported via ExtractorFactory");
} catch (OldExcelFormatException e) {
// expected here
}
} }
@SuppressWarnings("deprecation") @SuppressWarnings("deprecation")
@Test @Test(expected = IllegalStateException.class)
public void testGetEmbeddedFromXMLExtractor() { public void testGetEmbedFromXMLExtractor() {
try {
// currently not implemented // currently not implemented
ExtractorFactory.getEmbededDocsTextExtractors((POIXMLTextExtractor) null); ExtractorFactory.getEmbededDocsTextExtractors((POIXMLTextExtractor) null);
fail("Unsupported currently");
} catch (IllegalStateException e) {
// expected here
} }
try { @SuppressWarnings("deprecation")
@Test(expected = IllegalStateException.class)
public void testGetEmbeddedFromXMLExtractor() {
// currently not implemented // currently not implemented
ExtractorFactory.getEmbeddedDocsTextExtractors((POIXMLTextExtractor)null); ExtractorFactory.getEmbeddedDocsTextExtractors((POIXMLTextExtractor)null);
fail("Unsupported currently");
} catch (IllegalStateException e) {
// expected here
}
} }
// This bug is currently open. This test will fail with "expected error not thrown" when the bug has been fixed. // This bug is currently open. This test will fail with "expected error not thrown" when the bug has been fixed.

View File

@ -120,10 +120,10 @@ public class TestHxxFEncryption {
public void newPassword(String newPass) throws IOException, OpenXML4JException, XmlException { public void newPassword(String newPass) throws IOException, OpenXML4JException, XmlException {
Biff8EncryptionKey.setCurrentUserPassword(password); Biff8EncryptionKey.setCurrentUserPassword(password);
File f = sampleDir.getFile(file); File f = sampleDir.getFile(file);
POIOLE2TextExtractor te1 = (POIOLE2TextExtractor)ExtractorFactory.createExtractor(f); POITextExtractor te1 = ExtractorFactory.createExtractor(f);
Biff8EncryptionKey.setCurrentUserPassword(newPass); Biff8EncryptionKey.setCurrentUserPassword(newPass);
ByteArrayOutputStream bos = new ByteArrayOutputStream(); ByteArrayOutputStream bos = new ByteArrayOutputStream();
POIDocument doc = te1.getDocument(); POIDocument doc = (POIDocument)te1.getDocument();
doc.write(bos); doc.write(bos);
doc.close(); doc.close();
te1.close(); te1.close();
@ -140,25 +140,25 @@ public class TestHxxFEncryption {
ByteArrayOutputStream bos = new ByteArrayOutputStream(); ByteArrayOutputStream bos = new ByteArrayOutputStream();
Biff8EncryptionKey.setCurrentUserPassword(password); Biff8EncryptionKey.setCurrentUserPassword(password);
File f = sampleDir.getFile(file); File f = sampleDir.getFile(file);
POIOLE2TextExtractor te1 = (POIOLE2TextExtractor)ExtractorFactory.createExtractor(f); POITextExtractor te1 = ExtractorFactory.createExtractor(f);
// first remove encryption // first remove encryption
Biff8EncryptionKey.setCurrentUserPassword(null); Biff8EncryptionKey.setCurrentUserPassword(null);
POIDocument doc = te1.getDocument(); POIDocument doc = (POIDocument)te1.getDocument();
doc.write(bos); doc.write(bos);
doc.close(); doc.close();
te1.close(); te1.close();
// then use default setting, which is cryptoapi // then use default setting, which is cryptoapi
String newPass = "newPass"; String newPass = "newPass";
POIOLE2TextExtractor te2 = (POIOLE2TextExtractor)ExtractorFactory.createExtractor(new ByteArrayInputStream(bos.toByteArray())); POITextExtractor te2 = ExtractorFactory.createExtractor(new ByteArrayInputStream(bos.toByteArray()));
Biff8EncryptionKey.setCurrentUserPassword(newPass); Biff8EncryptionKey.setCurrentUserPassword(newPass);
doc = te2.getDocument(); doc = (POIDocument)te2.getDocument();
bos.reset(); bos.reset();
doc.write(bos); doc.write(bos);
doc.close(); doc.close();
te2.close(); te2.close();
// and finally update cryptoapi setting // and finally update cryptoapi setting
POIOLE2TextExtractor te3 = (POIOLE2TextExtractor)ExtractorFactory.createExtractor(new ByteArrayInputStream(bos.toByteArray())); POITextExtractor te3 = ExtractorFactory.createExtractor(new ByteArrayInputStream(bos.toByteArray()));
doc = te3.getDocument(); doc = (POIDocument)te3.getDocument();
// need to cache data (i.e. read all data) before changing the key size // need to cache data (i.e. read all data) before changing the key size
if (doc instanceof HSLFSlideShowImpl) { if (doc instanceof HSLFSlideShowImpl) {
HSLFSlideShowImpl hss = (HSLFSlideShowImpl)doc; HSLFSlideShowImpl hss = (HSLFSlideShowImpl)doc;
@ -175,8 +175,8 @@ public class TestHxxFEncryption {
doc.close(); doc.close();
te3.close(); te3.close();
// check the setting // check the setting
POIOLE2TextExtractor te4 = (POIOLE2TextExtractor)ExtractorFactory.createExtractor(new ByteArrayInputStream(bos.toByteArray())); POITextExtractor te4 = ExtractorFactory.createExtractor(new ByteArrayInputStream(bos.toByteArray()));
doc = te4.getDocument(); doc = (POIDocument)te4.getDocument();
ei = doc.getEncryptionInfo(); ei = doc.getEncryptionInfo();
assertNotNull(ei); assertNotNull(ei);
assertTrue(ei.getHeader() instanceof CryptoAPIEncryptionHeader); assertTrue(ei.getHeader() instanceof CryptoAPIEncryptionHeader);

View File

@ -50,6 +50,7 @@ import org.apache.poi.openxml4j.opc.OPCPackage;
import org.apache.poi.openxml4j.opc.PackagePartName; import org.apache.poi.openxml4j.opc.PackagePartName;
import org.apache.poi.openxml4j.opc.PackagingURIHelper; import org.apache.poi.openxml4j.opc.PackagingURIHelper;
import org.apache.poi.sl.draw.DrawPaint; import org.apache.poi.sl.draw.DrawPaint;
import org.apache.poi.sl.extractor.SlideShowExtractor;
import org.apache.poi.sl.usermodel.PaintStyle; import org.apache.poi.sl.usermodel.PaintStyle;
import org.apache.poi.sl.usermodel.PaintStyle.SolidPaint; import org.apache.poi.sl.usermodel.PaintStyle.SolidPaint;
import org.apache.poi.sl.usermodel.PaintStyle.TexturePaint; import org.apache.poi.sl.usermodel.PaintStyle.TexturePaint;
@ -221,8 +222,8 @@ public class TestXSLFBugs {
* rID2 -> slide3.xml * rID2 -> slide3.xml
*/ */
@Test @Test
public void bug54916() throws Exception { public void bug54916() throws IOException {
XMLSlideShow ss = XSLFTestDataSamples.openSampleDocument("OverlappingRelations.pptx"); try (XMLSlideShow ss = XSLFTestDataSamples.openSampleDocument("OverlappingRelations.pptx")) {
XSLFSlide slide; XSLFSlide slide;
// Should find 4 slides // Should find 4 slides
@ -230,19 +231,18 @@ public class TestXSLFBugs {
// Check the text, to see we got them in order // Check the text, to see we got them in order
slide = ss.getSlides().get(0); slide = ss.getSlides().get(0);
assertContains(getSlideText(slide), "POI cannot read this"); assertContains(getSlideText(ss, slide), "POI cannot read this");
slide = ss.getSlides().get(1); slide = ss.getSlides().get(1);
assertContains(getSlideText(slide), "POI can read this"); assertContains(getSlideText(ss, slide), "POI can read this");
assertContains(getSlideText(slide), "Has a relationship to another slide"); assertContains(getSlideText(ss, slide), "Has a relationship to another slide");
slide = ss.getSlides().get(2); slide = ss.getSlides().get(2);
assertContains(getSlideText(slide), "POI can read this"); assertContains(getSlideText(ss, slide), "POI can read this");
slide = ss.getSlides().get(3); slide = ss.getSlides().get(3);
assertContains(getSlideText(slide), "POI can read this"); assertContains(getSlideText(ss, slide), "POI can read this");
}
ss.close();
} }
/** /**
@ -311,8 +311,15 @@ public class TestXSLFBugs {
ss.close(); ss.close();
} }
protected String getSlideText(XSLFSlide slide) { protected String getSlideText(XMLSlideShow ppt, XSLFSlide slide) throws IOException {
return XSLFPowerPointExtractor.getText(slide, true, false, false); try (SlideShowExtractor extr = new SlideShowExtractor(ppt)) {
// do not auto-close the slideshow
extr.setFilesystem(null);
extr.setSlidesByDefault(true);
extr.setNotesByDefault(false);
extr.setMasterByDefault(false);
return extr.getText(slide);
}
} }
@Test @Test
@ -458,7 +465,7 @@ public class TestXSLFBugs {
for (int i = 0; i < slideTexts.length; i++) { for (int i = 0; i < slideTexts.length; i++) {
XSLFSlide slide = ss.getSlides().get(i); XSLFSlide slide = ss.getSlides().get(i);
assertContains(getSlideText(slide), slideTexts[i]); assertContains(getSlideText(ss, slide), slideTexts[i]);
} }
} }

View File

@ -24,16 +24,17 @@ import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertNotNull; import static org.junit.Assert.assertNotNull;
import static org.junit.Assert.assertTrue; import static org.junit.Assert.assertTrue;
import java.io.File;
import java.io.IOException; import java.io.IOException;
import java.io.InputStream; import java.io.InputStream;
import org.apache.poi.POIDataSamples; import org.apache.poi.POIDataSamples;
import org.apache.poi.POITextExtractor;
import org.apache.poi.extractor.ExtractorFactory; import org.apache.poi.extractor.ExtractorFactory;
import org.apache.poi.openxml4j.exceptions.OpenXML4JException; import org.apache.poi.openxml4j.exceptions.OpenXML4JException;
import org.apache.poi.openxml4j.opc.OPCPackage; import org.apache.poi.sl.extractor.SlideShowExtractor;
import org.apache.poi.xslf.usermodel.XMLSlideShow; import org.apache.poi.xslf.usermodel.XMLSlideShow;
import org.apache.xmlbeans.XmlException; import org.apache.xmlbeans.XmlException;
import org.junit.Ignore;
import org.junit.Test; import org.junit.Test;
/** /**
@ -44,21 +45,12 @@ public class TestXSLFPowerPointExtractor {
/** /**
* Get text out of the simple file * Get text out of the simple file
* @throws XmlException
* @throws OpenXML4JException
*/ */
@Test @Test
public void testGetSimpleText() public void testGetSimpleText() throws IOException {
throws IOException, XmlException, OpenXML4JException { try (XMLSlideShow xmlA = openPPTX("sample.pptx");
XMLSlideShow xmlA = openPPTX("sample.pptx"); SlideShowExtractor extractor = new SlideShowExtractor(xmlA)) {
@SuppressWarnings("resource")
OPCPackage pkg = xmlA.getPackage();
new XSLFPowerPointExtractor(xmlA).close();
new XSLFPowerPointExtractor(pkg).close();
XSLFPowerPointExtractor extractor =
new XSLFPowerPointExtractor(xmlA);
extractor.getText(); extractor.getText();
String text = extractor.getText(); String text = extractor.getText();
@ -82,7 +74,10 @@ public class TestXSLFPowerPointExtractor {
// "Fifth level\n"; // "Fifth level\n";
// Just slides, no notes // Just slides, no notes
text = extractor.getText(true, false, false); extractor.setSlidesByDefault(true);
extractor.setNotesByDefault(false);
extractor.setMasterByDefault(false);
text = extractor.getText();
String slideText = String slideText =
"Lorem ipsum dolor sit amet\n" + "Lorem ipsum dolor sit amet\n" +
"Nunc at risus vel erat tempus posuere. Aenean non ante.\n" + "Nunc at risus vel erat tempus posuere. Aenean non ante.\n" +
@ -97,11 +92,15 @@ public class TestXSLFPowerPointExtractor {
assertEquals(slideText, text); assertEquals(slideText, text);
// Just notes, no slides // Just notes, no slides
text = extractor.getText(false, true); extractor.setSlidesByDefault(false);
extractor.setNotesByDefault(true);
text = extractor.getText();
assertEquals("\n\n1\n\n\n2\n", text); assertEquals("\n\n1\n\n\n2\n", text);
// Both // Both
text = extractor.getText(true, true, false); extractor.setSlidesByDefault(true);
extractor.setNotesByDefault(true);
text = extractor.getText();
String bothText = String bothText =
"Lorem ipsum dolor sit amet\n" + "Lorem ipsum dolor sit amet\n" +
"Nunc at risus vel erat tempus posuere. Aenean non ante.\n" + "Nunc at risus vel erat tempus posuere. Aenean non ante.\n" +
@ -116,7 +115,10 @@ public class TestXSLFPowerPointExtractor {
assertEquals(bothText, text); assertEquals(bothText, text);
// With Slides and Master Text // With Slides and Master Text
text = extractor.getText(true, false, true); extractor.setSlidesByDefault(true);
extractor.setNotesByDefault(false);
extractor.setMasterByDefault(true);
text = extractor.getText();
String smText = String smText =
"Lorem ipsum dolor sit amet\n" + "Lorem ipsum dolor sit amet\n" +
"Nunc at risus vel erat tempus posuere. Aenean non ante.\n" + "Nunc at risus vel erat tempus posuere. Aenean non ante.\n" +
@ -131,7 +133,10 @@ public class TestXSLFPowerPointExtractor {
assertEquals(smText, text); assertEquals(smText, text);
// With Slides, Notes and Master Text // With Slides, Notes and Master Text
text = extractor.getText(true, true, true); extractor.setSlidesByDefault(true);
extractor.setNotesByDefault(true);
extractor.setMasterByDefault(true);
text = extractor.getText();
String snmText = String snmText =
"Lorem ipsum dolor sit amet\n" + "Lorem ipsum dolor sit amet\n" +
"Nunc at risus vel erat tempus posuere. Aenean non ante.\n" + "Nunc at risus vel erat tempus posuere. Aenean non ante.\n" +
@ -150,14 +155,14 @@ public class TestXSLFPowerPointExtractor {
extractor.setNotesByDefault(true); extractor.setNotesByDefault(true);
text = extractor.getText(); text = extractor.getText();
assertEquals("\n\n1\n\n\n2\n", text); assertEquals("\n\n1\n\n\n2\n", text);
}
extractor.close();
xmlA.close();
} }
@Test
public void testGetComments() throws IOException { public void testGetComments() throws IOException {
XMLSlideShow xml = openPPTX("45545_Comment.pptx"); try (XMLSlideShow xml = openPPTX("45545_Comment.pptx");
XSLFPowerPointExtractor extractor = new XSLFPowerPointExtractor(xml); SlideShowExtractor extractor = new SlideShowExtractor(xml)) {
extractor.setCommentsByDefault(true);
String text = extractor.getText(); String text = extractor.getText();
assertTrue(text.length() > 0); assertTrue(text.length() > 0);
@ -168,18 +173,19 @@ public class TestXSLFPowerPointExtractor {
// Check the authors came through too // Check the authors came through too
assertContains(text, "XPVMWARE01"); assertContains(text, "XPVMWARE01");
}
extractor.close();
xml.close();
} }
@Test
@Ignore("currently slidelayouts aren't yet supported")
public void testGetMasterText() throws Exception { public void testGetMasterText() throws Exception {
XMLSlideShow xml = openPPTX("WithMaster.pptx"); try (XMLSlideShow xml = openPPTX("WithMaster.pptx");
XSLFPowerPointExtractor extractor = new XSLFPowerPointExtractor(xml); SlideShowExtractor extractor = new SlideShowExtractor(xml)) {
extractor.setSlidesByDefault(true); extractor.setSlidesByDefault(true);
extractor.setNotesByDefault(false); extractor.setNotesByDefault(false);
extractor.setMasterByDefault(true); extractor.setMasterByDefault(true);
String text = extractor.getText(); String text = extractor.getText();
assertTrue(text.length() > 0); assertTrue(text.length() > 0);
@ -208,24 +214,20 @@ public class TestXSLFPowerPointExtractor {
"This is the Master Title\n" + "This is the Master Title\n" +
"This text comes from the Master Slide\n"; "This text comes from the Master Slide\n";
assertEquals(wholeText, text); assertEquals(wholeText, text);
}
extractor.close();
xml.close();
} }
@Test @Test
public void testTable() throws Exception { public void testTable() throws Exception {
XMLSlideShow xml = openPPTX("present1.pptx"); try (XMLSlideShow xml = openPPTX("present1.pptx");
XSLFPowerPointExtractor extractor = new XSLFPowerPointExtractor(xml); SlideShowExtractor extractor = new SlideShowExtractor(xml)) {
String text = extractor.getText(); String text = extractor.getText();
assertTrue(text.length() > 0); assertTrue(text.length() > 0);
// Check comments are there // Check comments are there
assertContains(text, "TEST"); assertContains(text, "TEST");
}
extractor.close();
xml.close();
} }
/** /**
@ -241,8 +243,9 @@ public class TestXSLFPowerPointExtractor {
}; };
for(String extension : extensions) { for(String extension : extensions) {
String filename = "testPPT." + extension; String filename = "testPPT." + extension;
XMLSlideShow xml = openPPTX(filename);
XSLFPowerPointExtractor extractor = new XSLFPowerPointExtractor(xml); try (XMLSlideShow xml = openPPTX(filename);
SlideShowExtractor extractor = new SlideShowExtractor(xml)) {
String text = extractor.getText(); String text = extractor.getText();
if (extension.equals("thmx")) { if (extension.equals("thmx")) {
@ -257,58 +260,59 @@ public class TestXSLFPowerPointExtractor {
assertContains(filename, text, "content parsing"); assertContains(filename, text, "content parsing");
assertContains(filename, text, "Different words to test against"); assertContains(filename, text, "Different words to test against");
assertContains(filename, text, "Mystery"); assertContains(filename, text, "Mystery");
}
extractor.close();
xml.close();
} }
} }
@Test @Test
public void test45541() throws Exception { public void test45541() throws IOException, OpenXML4JException, XmlException {
// extract text from a powerpoint that has a header in the notes-element // extract text from a powerpoint that has a header in the notes-element
POITextExtractor extr = ExtractorFactory.createExtractor( final File headerFile = slTests.getFile("45541_Header.pptx");
slTests.getFile("45541_Header.pptx")); try (final SlideShowExtractor extr = ExtractorFactory.createExtractor(headerFile)) {
String text = extr.getText(); String text = extr.getText();
assertNotNull(text); assertNotNull(text);
assertFalse("Had: " + text, text.contains("testdoc")); assertFalse("Had: " + text, text.contains("testdoc"));
text = ((XSLFPowerPointExtractor)extr).getText(false, true); extr.setSlidesByDefault(false);
extr.setNotesByDefault(true);
text = extr.getText();
assertContains(text, "testdoc"); assertContains(text, "testdoc");
extr.close();
assertNotNull(text); assertNotNull(text);
}
// extract text from a powerpoint that has a footer in the master-slide // extract text from a powerpoint that has a footer in the master-slide
extr = ExtractorFactory.createExtractor( final File footerFile = slTests.getFile("45541_Footer.pptx");
slTests.getFile("45541_Footer.pptx")); try (SlideShowExtractor extr = ExtractorFactory.createExtractor(footerFile)) {
String text = extr.getText();
assertNotContained(text, "testdoc");
extr.setSlidesByDefault(false);
extr.setNotesByDefault(true);
text = extr.getText(); text = extr.getText();
assertNotContained(text, "testdoc"); assertNotContained(text, "testdoc");
text = ((XSLFPowerPointExtractor)extr).getText(false, true); extr.setSlidesByDefault(false);
extr.setNotesByDefault(false);
extr.setMasterByDefault(true);
text = extr.getText();
assertNotContained(text, "testdoc"); assertNotContained(text, "testdoc");
}
text = ((XSLFPowerPointExtractor)extr).getText(false, false, true);
assertNotContained(text, "testdoc");
extr.close();
} }
@Test @Test
public void bug54570() throws IOException { public void bug54570() throws IOException {
XMLSlideShow xml = openPPTX("bug54570.pptx"); try (XMLSlideShow xml = openPPTX("bug54570.pptx");
XSLFPowerPointExtractor extractor = new XSLFPowerPointExtractor(xml); SlideShowExtractor extractor = new SlideShowExtractor(xml)) {
String text = extractor.getText(); String text = extractor.getText();
assertNotNull(text); assertNotNull(text);
extractor.close(); }
xml.close();
} }
private XMLSlideShow openPPTX(String file) throws IOException { private XMLSlideShow openPPTX(String file) throws IOException {
InputStream is = slTests.openResourceAsStream(file); try (InputStream is = slTests.openResourceAsStream(file)) {
try {
return new XMLSlideShow(is); return new XMLSlideShow(is);
} finally {
is.close();
} }
} }
} }

View File

@ -38,6 +38,8 @@ import org.apache.poi.hwpf.extractor.WordExtractor;
import org.apache.poi.poifs.filesystem.DirectoryEntry; import org.apache.poi.poifs.filesystem.DirectoryEntry;
import org.apache.poi.poifs.filesystem.DirectoryNode; import org.apache.poi.poifs.filesystem.DirectoryNode;
import org.apache.poi.poifs.filesystem.Entry; import org.apache.poi.poifs.filesystem.Entry;
import org.apache.poi.sl.extractor.SlideShowExtractor;
import org.apache.poi.sl.usermodel.SlideShowFactory;
/** /**
* Scratchpad-specific logic for {@link OLE2ExtractorFactory} and * Scratchpad-specific logic for {@link OLE2ExtractorFactory} and
@ -65,7 +67,7 @@ public class OLE2ScratchpadExtractorFactory {
} }
if (poifsDir.hasEntry(HSLFSlideShow.POWERPOINT_DOCUMENT)) { if (poifsDir.hasEntry(HSLFSlideShow.POWERPOINT_DOCUMENT)) {
return new PowerPointExtractor(poifsDir); return new SlideShowExtractor(SlideShowFactory.create(poifsDir));
} }
if (poifsDir.hasEntry("VisioDocument")) { if (poifsDir.hasEntry("VisioDocument")) {

View File

@ -34,6 +34,7 @@ import org.apache.poi.poifs.filesystem.NPOIFSFileSystem;
import org.apache.poi.poifs.filesystem.POIFSFileSystem; import org.apache.poi.poifs.filesystem.POIFSFileSystem;
import org.apache.poi.sl.extractor.SlideShowExtractor; import org.apache.poi.sl.extractor.SlideShowExtractor;
import org.apache.poi.sl.usermodel.SlideShowFactory; import org.apache.poi.sl.usermodel.SlideShowFactory;
import org.apache.poi.util.Removal;
/** /**
* This class can be used to extract text from a PowerPoint file. Can optionally * This class can be used to extract text from a PowerPoint file. Can optionally
@ -43,6 +44,7 @@ import org.apache.poi.sl.usermodel.SlideShowFactory;
*/ */
@SuppressWarnings("WeakerAccess") @SuppressWarnings("WeakerAccess")
@Deprecated @Deprecated
@Removal(version="5.0.0")
public final class PowerPointExtractor extends POIOLE2TextExtractor { public final class PowerPointExtractor extends POIOLE2TextExtractor {
private final SlideShowExtractor<HSLFShape,HSLFTextParagraph> delegate; private final SlideShowExtractor<HSLFShape,HSLFTextParagraph> delegate;

View File

@ -1139,4 +1139,9 @@ public final class HSLFSlideShow implements SlideShow<HSLFShape,HSLFTextParagrap
public void close() throws IOException { public void close() throws IOException {
_hslfSlideShow.close(); _hslfSlideShow.close();
} }
@Override
public Object getPersistDocument() {
return getSlideShowImpl();
}
} }

View File

@ -19,8 +19,8 @@ package org.apache.poi.hslf.usermodel;
import java.io.IOException; import java.io.IOException;
import org.apache.poi.poifs.filesystem.DirectoryNode;
import org.apache.poi.poifs.filesystem.NPOIFSFileSystem; import org.apache.poi.poifs.filesystem.NPOIFSFileSystem;
import org.apache.poi.sl.usermodel.SlideShow;
import org.apache.poi.sl.usermodel.SlideShowFactory; import org.apache.poi.sl.usermodel.SlideShowFactory;
import org.apache.poi.util.Internal; import org.apache.poi.util.Internal;
@ -31,12 +31,20 @@ import org.apache.poi.util.Internal;
@Internal @Internal
public class HSLFSlideShowFactory extends SlideShowFactory { public class HSLFSlideShowFactory extends SlideShowFactory {
/** /**
* Creates a HSLFSlideShow from the given NPOIFSFileSystem * Creates a HSLFSlideShow from the given NPOIFSFileSystem<p>
* <p>Note that in order to properly release resources the * Note that in order to properly release resources the
* SlideShow should be closed after use. * SlideShow should be closed after use.
*/ */
public static SlideShow<?,?> createSlideShow(NPOIFSFileSystem fs) throws IOException { public static HSLFSlideShow createSlideShow(final NPOIFSFileSystem fs) throws IOException {
return new HSLFSlideShow(fs); return new HSLFSlideShow(fs);
} }
/**
* Creates a HSLFSlideShow from the given DirectoryNode<p>
* Note that in order to properly release resources the
* SlideShow should be closed after use.
*/
public static HSLFSlideShow createSlideShow(final DirectoryNode root) throws IOException {
return new HSLFSlideShow(root);
}
} }

View File

@ -846,11 +846,15 @@ public final class HSLFSlideShowImpl extends POIDocument implements Closeable {
@Override @Override
public void close() throws IOException { public void close() throws IOException {
// only close the filesystem, if we are based on the root node.
// embedded documents/slideshows shouldn't close the parent container
if (getDirectory().getParent() == null) {
NPOIFSFileSystem fs = getDirectory().getFileSystem(); NPOIFSFileSystem fs = getDirectory().getFileSystem();
if (fs != null) { if (fs != null) {
fs.close(); fs.close();
} }
} }
}
@Override @Override
protected String getEncryptedPropertyStreamName() { protected String getEncryptedPropertyStreamName() {

View File

@ -42,6 +42,10 @@ import org.apache.poi.poifs.filesystem.DirectoryNode;
import org.apache.poi.poifs.filesystem.NPOIFSFileSystem; import org.apache.poi.poifs.filesystem.NPOIFSFileSystem;
import org.apache.poi.poifs.filesystem.OPOIFSFileSystem; import org.apache.poi.poifs.filesystem.OPOIFSFileSystem;
import org.apache.poi.poifs.filesystem.POIFSFileSystem; import org.apache.poi.poifs.filesystem.POIFSFileSystem;
import org.apache.poi.sl.extractor.SlideShowExtractor;
import org.apache.poi.sl.usermodel.ObjectShape;
import org.apache.poi.sl.usermodel.SlideShow;
import org.apache.poi.sl.usermodel.SlideShowFactory;
import org.apache.poi.util.IOUtils; import org.apache.poi.util.IOUtils;
import org.junit.Test; import org.junit.Test;
@ -76,43 +80,46 @@ public final class TestExtractor {
// ppe.close(); // ppe.close();
// } // }
private PowerPointExtractor openExtractor(String fileName) throws IOException { private SlideShowExtractor<?,?> openExtractor(String fileName) throws IOException {
InputStream is = slTests.openResourceAsStream(fileName); try (InputStream is = slTests.openResourceAsStream(fileName)) {
try { return new SlideShowExtractor(SlideShowFactory.create(is));
return new PowerPointExtractor(is);
} finally {
is.close();
} }
} }
@Test @Test
public void testReadSheetText() throws IOException { public void testReadSheetText() throws IOException {
// Basic 2 page example // Basic 2 page example
PowerPointExtractor ppe = openExtractor("basic_test_ppt_file.ppt"); try (SlideShowExtractor ppe = openExtractor("basic_test_ppt_file.ppt")) {
assertEquals(expectText, ppe.getText()); assertEquals(expectText, ppe.getText());
ppe.close(); }
// 1 page example with text boxes // 1 page example with text boxes
PowerPointExtractor ppe2 = openExtractor("with_textbox.ppt"); try (SlideShowExtractor ppe = openExtractor("with_textbox.ppt")) {
assertEquals(expectText2, ppe2.getText()); assertEquals(expectText2, ppe.getText());
ppe2.close(); }
} }
@Test @Test
public void testReadNoteText() throws IOException { public void testReadNoteText() throws IOException {
// Basic 2 page example // Basic 2 page example
PowerPointExtractor ppe = openExtractor("basic_test_ppt_file.ppt"); try (SlideShowExtractor ppe = openExtractor("basic_test_ppt_file.ppt")) {
String notesText = ppe.getNotes(); ppe.setNotesByDefault(true);
ppe.setSlidesByDefault(false);
ppe.setMasterByDefault(false);
String notesText = ppe.getText();
String expText = "\nThese are the notes for page 1\n\nThese are the notes on page two, again lacking formatting\n"; String expText = "\nThese are the notes for page 1\n\nThese are the notes on page two, again lacking formatting\n";
assertEquals(expText, notesText); assertEquals(expText, notesText);
ppe.close(); }
// Other one doesn't have notes // Other one doesn't have notes
PowerPointExtractor ppe2 = openExtractor("with_textbox.ppt"); try (SlideShowExtractor ppe = openExtractor("with_textbox.ppt")) {
notesText = ppe2.getNotes(); ppe.setNotesByDefault(true);
expText = ""; ppe.setSlidesByDefault(false);
ppe.setMasterByDefault(false);
String notesText = ppe.getText();
String expText = "";
assertEquals(expText, notesText); assertEquals(expText, notesText);
ppe2.close(); }
} }
@Test @Test
@ -126,7 +133,7 @@ public final class TestExtractor {
"\nThese are the notes on page two, again lacking formatting\n" "\nThese are the notes on page two, again lacking formatting\n"
}; };
PowerPointExtractor ppe = openExtractor("basic_test_ppt_file.ppt"); try (SlideShowExtractor ppe = openExtractor("basic_test_ppt_file.ppt")) {
ppe.setSlidesByDefault(true); ppe.setSlidesByDefault(true);
ppe.setNotesByDefault(false); ppe.setNotesByDefault(false);
assertEquals(slText[0] + slText[1], ppe.getText()); assertEquals(slText[0] + slText[1], ppe.getText());
@ -138,7 +145,7 @@ public final class TestExtractor {
ppe.setSlidesByDefault(true); ppe.setSlidesByDefault(true);
ppe.setNotesByDefault(true); ppe.setNotesByDefault(true);
assertEquals(slText[0] + ntText[0] + slText[1] + ntText[1], ppe.getText()); assertEquals(slText[0] + ntText[0] + slText[1] + ntText[1], ppe.getText());
ppe.close(); }
} }
/** /**
@ -149,10 +156,13 @@ public final class TestExtractor {
*/ */
@Test @Test
public void testMissingCoreRecords() throws IOException { public void testMissingCoreRecords() throws IOException {
PowerPointExtractor ppe = openExtractor("missing_core_records.ppt"); try (SlideShowExtractor<?,?> ppe = openExtractor("missing_core_records.ppt")) {
ppe.setSlidesByDefault(true);
String text = ppe.getText(true, false); ppe.setNotesByDefault(false);
String nText = ppe.getNotes(); String text = ppe.getText();
ppe.setSlidesByDefault(false);
ppe.setNotesByDefault(true);
String nText = ppe.getText();
assertNotNull(text); assertNotNull(text);
assertNotNull(nText); assertNotNull(nText);
@ -162,32 +172,30 @@ public final class TestExtractor {
// Slide records were fine // Slide records were fine
assertContains(text, "Using Disease Surveillance and Response"); assertContains(text, "Using Disease Surveillance and Response");
}
ppe.close();
} }
@Test @Test
public void testExtractFromEmbeded() throws IOException { public void testExtractFromEmbeded() throws IOException {
InputStream is = POIDataSamples.getSpreadSheetInstance().openResourceAsStream("excel_with_embeded.xls"); try (final InputStream is = POIDataSamples.getSpreadSheetInstance().openResourceAsStream("excel_with_embeded.xls");
POIFSFileSystem fs = new POIFSFileSystem(is); final POIFSFileSystem fs = new POIFSFileSystem(is)) {
DirectoryNode root = fs.getRoot(); final DirectoryNode root = fs.getRoot();
PowerPointExtractor ppe1 = assertExtractFromEmbedded(root, "MBD0000A3B6", "Sample PowerPoint file\nThis is the 1st file\nNot much too it\n");
PowerPointExtractor ppe2 = assertExtractFromEmbedded(root, "MBD0000A3B3", "Sample PowerPoint file\nThis is the 2nd file\nNot much too it either\n");
ppe2.close();
ppe1.close();
fs.close();
}
private PowerPointExtractor assertExtractFromEmbedded(DirectoryNode root, String entryName, String expected) final String[] TEST_SET = {
throws IOException { "MBD0000A3B6", "Sample PowerPoint file\nThis is the 1st file\nNot much too it\n",
DirectoryNode dir = (DirectoryNode)root.getEntry(entryName); "MBD0000A3B3", "Sample PowerPoint file\nThis is the 2nd file\nNot much too it either\n"
};
for (int i=0; i<TEST_SET.length; i+=2) {
DirectoryNode dir = (DirectoryNode)root.getEntry(TEST_SET[i]);
assertTrue(dir.hasEntry(HSLFSlideShow.POWERPOINT_DOCUMENT)); assertTrue(dir.hasEntry(HSLFSlideShow.POWERPOINT_DOCUMENT));
// Check the first file try (final SlideShow<?,?> ppt = SlideShowFactory.create(dir);
HSLFSlideShowImpl ppt = new HSLFSlideShowImpl(dir); final SlideShowExtractor<?,?> ppe = new SlideShowExtractor(ppt)) {
PowerPointExtractor ppe = new PowerPointExtractor(ppt); assertEquals(TEST_SET[i+1], ppe.getText());
assertEquals(expected, ppe.getText(true, false)); }
return ppe; }
}
} }
/** /**
@ -195,12 +203,12 @@ public final class TestExtractor {
*/ */
@Test @Test
public void testExtractFromOwnEmbeded() throws IOException { public void testExtractFromOwnEmbeded() throws IOException {
PowerPointExtractor ppe = openExtractor("ppt_with_embeded.ppt"); try (SlideShowExtractor<?,?> ppe = openExtractor("ppt_with_embeded.ppt")) {
List<HSLFObjectShape> shapes = ppe.getOLEShapes(); List<? extends ObjectShape> shapes = ppe.getOLEShapes();
assertEquals("Expected 6 ole shapes", 6, shapes.size()); assertEquals("Expected 6 ole shapes", 6, shapes.size());
int num_ppt = 0, num_doc = 0, num_xls = 0; int num_ppt = 0, num_doc = 0, num_xls = 0;
for (HSLFObjectShape ole : shapes) { for (ObjectShape ole : shapes) {
String name = ole.getInstanceName(); String name = ((HSLFObjectShape)ole).getInstanceName();
InputStream data = ole.getObjectData().getInputStream(); InputStream data = ole.getObjectData().getInputStream();
if ("Worksheet".equals(name)) { if ("Worksheet".equals(name)) {
HSSFWorkbook wb = new HSSFWorkbook(data); HSSFWorkbook wb = new HSSFWorkbook(data);
@ -220,7 +228,7 @@ public final class TestExtractor {
assertEquals("Expected 2 embedded Word Documents", 2, num_doc); assertEquals("Expected 2 embedded Word Documents", 2, num_doc);
assertEquals("Expected 2 embedded Excel Spreadsheets", 2, num_xls); assertEquals("Expected 2 embedded Excel Spreadsheets", 2, num_xls);
assertEquals("Expected 2 embedded PowerPoint Presentations", 2, num_ppt); assertEquals("Expected 2 embedded PowerPoint Presentations", 2, num_ppt);
ppe.close(); }
} }
/** /**
@ -228,11 +236,11 @@ public final class TestExtractor {
*/ */
@Test @Test
public void test52991() throws IOException { public void test52991() throws IOException {
PowerPointExtractor ppe = openExtractor("badzip.ppt"); try (SlideShowExtractor<?,?> ppe = openExtractor("badzip.ppt")) {
for (HSLFObjectShape shape : ppe.getOLEShapes()) { for (ObjectShape shape : ppe.getOLEShapes()) {
IOUtils.copy(shape.getObjectData().getInputStream(), new ByteArrayOutputStream()); IOUtils.copy(shape.getObjectData().getInputStream(), new ByteArrayOutputStream());
} }
ppe.close(); }
} }
/** /**
@ -240,27 +248,27 @@ public final class TestExtractor {
*/ */
@Test @Test
public void testWithComments() throws IOException { public void testWithComments() throws IOException {
PowerPointExtractor ppe1 = openExtractor("WithComments.ppt"); try (final SlideShowExtractor ppe = openExtractor("WithComments.ppt")) {
String text = ppe1.getText(); String text = ppe.getText();
assertFalse("Comments not in by default", text.contains("This is a test comment")); assertFalse("Comments not in by default", text.contains("This is a test comment"));
ppe1.setCommentsByDefault(true); ppe.setCommentsByDefault(true);
text = ppe1.getText(); text = ppe.getText();
assertContains(text, "This is a test comment"); assertContains(text, "This is a test comment");
ppe1.close(); }
// And another file // And another file
PowerPointExtractor ppe2 = openExtractor("45543.ppt"); try (SlideShowExtractor ppe = openExtractor("45543.ppt")) {
text = ppe2.getText(); String text = ppe.getText();
assertFalse("Comments not in by default", text.contains("testdoc")); assertFalse("Comments not in by default", text.contains("testdoc"));
ppe2.setCommentsByDefault(true); ppe.setCommentsByDefault(true);
text = ppe2.getText(); text = ppe.getText();
assertContains(text, "testdoc"); assertContains(text, "testdoc");
ppe2.close(); }
} }
/** /**
@ -268,48 +276,37 @@ public final class TestExtractor {
*/ */
@Test @Test
public void testHeaderFooter() throws IOException { public void testHeaderFooter() throws IOException {
String text;
// With a header on the notes // With a header on the notes
InputStream is1 = slTests.openResourceAsStream("45537_Header.ppt"); try (InputStream is = slTests.openResourceAsStream("45537_Header.ppt");
HSLFSlideShow ppt1 = new HSLFSlideShow(is1); HSLFSlideShow ppt = new HSLFSlideShow(is)) {
is1.close();
assertNotNull(ppt1.getNotesHeadersFooters());
assertEquals("testdoc test phrase", ppt1.getNotesHeadersFooters().getHeaderText());
PowerPointExtractor ppe1 = new PowerPointExtractor(ppt1.getSlideShowImpl()); assertNotNull(ppt.getNotesHeadersFooters());
assertEquals("testdoc test phrase", ppt.getNotesHeadersFooters().getHeaderText());
text = ppe1.getText(); testHeaderFooterInner(ppt);
assertFalse("Header shouldn't be there by default\n" + text, text.contains("testdoc")); }
assertFalse("Header shouldn't be there by default\n" + text, text.contains("test phrase"));
ppe1.setNotesByDefault(true);
text = ppe1.getText();
assertContains(text, "testdoc");
assertContains(text, "test phrase");
ppe1.close();
ppt1.close();
// And with a footer, also on notes // And with a footer, also on notes
InputStream is2 = slTests.openResourceAsStream("45537_Footer.ppt"); try (final InputStream is = slTests.openResourceAsStream("45537_Footer.ppt");
HSLFSlideShow ppt2 = new HSLFSlideShow(is2); final HSLFSlideShow ppt = new HSLFSlideShow(is)) {
is2.close(); assertNotNull(ppt.getNotesHeadersFooters());
assertEquals("testdoc test phrase", ppt.getNotesHeadersFooters().getFooterText());
assertNotNull(ppt2.getNotesHeadersFooters()); testHeaderFooterInner(ppt);
assertEquals("testdoc test phrase", ppt2.getNotesHeadersFooters().getFooterText()); }
ppt2.close(); }
PowerPointExtractor ppe2 = openExtractor("45537_Footer.ppt"); private void testHeaderFooterInner(final HSLFSlideShow ppt) throws IOException {
try (final SlideShowExtractor<?,?> ppe = new SlideShowExtractor(ppt)) {
text = ppe2.getText(); String text = ppe.getText();
assertFalse("Header shouldn't be there by default\n" + text, text.contains("testdoc")); assertFalse("Header shouldn't be there by default\n" + text, text.contains("testdoc"));
assertFalse("Header shouldn't be there by default\n" + text, text.contains("test phrase")); assertFalse("Header shouldn't be there by default\n" + text, text.contains("test phrase"));
ppe2.setNotesByDefault(true); ppe.setNotesByDefault(true);
text = ppe2.getText(); text = ppe.getText();
assertContains(text, "testdoc"); assertContains(text, "testdoc");
assertContains(text, "test phrase"); assertContains(text, "test phrase");
ppe2.close(); }
} }
@SuppressWarnings("unused") @SuppressWarnings("unused")
@ -318,41 +315,40 @@ public final class TestExtractor {
String masterTitleText = "This is the Master Title"; String masterTitleText = "This is the Master Title";
String masterRandomText = "This text comes from the Master Slide"; String masterRandomText = "This text comes from the Master Slide";
String masterFooterText = "Footer from the master slide"; String masterFooterText = "Footer from the master slide";
PowerPointExtractor ppe = openExtractor("WithMaster.ppt"); try (final SlideShowExtractor ppe = openExtractor("WithMaster.ppt")) {
ppe.setMasterByDefault(true); ppe.setMasterByDefault(true);
String text = ppe.getText(); String text = ppe.getText();
assertContains(text, masterRandomText); assertContains(text, masterRandomText);
assertContains(text, masterFooterText); assertContains(text, masterFooterText);
ppe.close(); }
} }
@Test @Test
public void testMasterText() throws IOException { public void testMasterText() throws IOException {
PowerPointExtractor ppe1 = openExtractor("master_text.ppt"); try (final SlideShowExtractor ppe = openExtractor("master_text.ppt")) {
// Initially not there // Initially not there
String text = ppe1.getText(); String text = ppe.getText();
assertFalse(text.contains("Text that I added to the master slide")); assertFalse(text.contains("Text that I added to the master slide"));
// Enable, shows up // Enable, shows up
ppe1.setMasterByDefault(true); ppe.setMasterByDefault(true);
text = ppe1.getText(); text = ppe.getText();
assertContains(text, "Text that I added to the master slide"); assertContains(text, "Text that I added to the master slide");
// Make sure placeholder text does not come out // Make sure placeholder text does not come out
assertNotContained(text, "Click to edit Master"); assertNotContained(text, "Click to edit Master");
ppe1.close(); }
// Now with another file only containing master text // Now with another file only containing master text
// Will always show up // Will always show up
PowerPointExtractor ppe2 = openExtractor("WithMaster.ppt"); try (final SlideShowExtractor ppe = openExtractor("WithMaster.ppt")) {
String masterText = "Footer from the master slide"; String masterText = "Footer from the master slide";
text = ppe2.getText(); String text = ppe.getText();
assertContainsIgnoreCase(text, "master"); assertContainsIgnoreCase(text, "master");
assertContains(text, masterText); assertContains(text, masterText);
ppe2.close(); }
} }
/** /**
@ -360,8 +356,7 @@ public final class TestExtractor {
*/ */
@Test @Test
public void testChineseText() throws IOException { public void testChineseText() throws IOException {
PowerPointExtractor ppe = openExtractor("54880_chinese.ppt"); try (final SlideShowExtractor ppe = openExtractor("54880_chinese.ppt")) {
String text = ppe.getText(); String text = ppe.getText();
// Check for the english text line // Check for the english text line
@ -375,7 +370,7 @@ public final class TestExtractor {
// Check for the chinese only text line // Check for the chinese only text line
assertContains(text, "\uff8a\uff9d\uff76\uff78"); assertContains(text, "\uff8a\uff9d\uff76\uff78");
ppe.close(); }
} }
/** /**
@ -387,67 +382,59 @@ public final class TestExtractor {
public void testDifferentPOIFS() throws IOException { public void testDifferentPOIFS() throws IOException {
// Open the two filesystems // Open the two filesystems
File pptFile = slTests.getFile("basic_test_ppt_file.ppt"); File pptFile = slTests.getFile("basic_test_ppt_file.ppt");
InputStream is1 = new FileInputStream(pptFile); try (final InputStream is1 = new FileInputStream(pptFile);
OPOIFSFileSystem opoifs = new OPOIFSFileSystem(is1); final NPOIFSFileSystem npoifs = new NPOIFSFileSystem(pptFile)) {
is1.close();
NPOIFSFileSystem npoifs = new NPOIFSFileSystem(pptFile); final OPOIFSFileSystem opoifs = new OPOIFSFileSystem(is1);
DirectoryNode[] files = {opoifs.getRoot(), npoifs.getRoot()}; DirectoryNode[] files = {opoifs.getRoot(), npoifs.getRoot()};
// Open directly // Open directly
for (DirectoryNode dir : files) { for (DirectoryNode dir : files) {
PowerPointExtractor extractor = new PowerPointExtractor(dir); try (SlideShow<?,?> ppt = SlideShowFactory.create(dir);
SlideShowExtractor<?,?> extractor = new SlideShowExtractor(ppt)) {
assertEquals(expectText, extractor.getText()); assertEquals(expectText, extractor.getText());
} }
// Open via a HSLFSlideShow
for (DirectoryNode dir : files) {
HSLFSlideShowImpl slideshow = new HSLFSlideShowImpl(dir);
PowerPointExtractor extractor = new PowerPointExtractor(slideshow);
assertEquals(expectText, extractor.getText());
extractor.close();
slideshow.close();
} }
}
npoifs.close();
} }
@Test @Test
public void testTable() throws Exception { public void testTable() throws Exception {
PowerPointExtractor ppe1 = openExtractor("54111.ppt"); try (SlideShowExtractor ppe = openExtractor("54111.ppt")) {
String text1 = ppe1.getText(); String text = ppe.getText();
String target1 = "TH Cell 1\tTH Cell 2\tTH Cell 3\tTH Cell 4\n"+ String target = "TH Cell 1\tTH Cell 2\tTH Cell 3\tTH Cell 4\n" +
"Row 1, Cell 1\tRow 1, Cell 2\tRow 1, Cell 3\tRow 1, Cell 4\n" + "Row 1, Cell 1\tRow 1, Cell 2\tRow 1, Cell 3\tRow 1, Cell 4\n" +
"Row 2, Cell 1\tRow 2, Cell 2\tRow 2, Cell 3\tRow 2, Cell 4\n" + "Row 2, Cell 1\tRow 2, Cell 2\tRow 2, Cell 3\tRow 2, Cell 4\n" +
"Row 3, Cell 1\tRow 3, Cell 2\tRow 3, Cell 3\tRow 3, Cell 4\n" + "Row 3, Cell 1\tRow 3, Cell 2\tRow 3, Cell 3\tRow 3, Cell 4\n" +
"Row 4, Cell 1\tRow 4, Cell 2\tRow 4, Cell 3\tRow 4, Cell 4\n" + "Row 4, Cell 1\tRow 4, Cell 2\tRow 4, Cell 3\tRow 4, Cell 4\n" +
"Row 5, Cell 1\tRow 5, Cell 2\tRow 5, Cell 3\tRow 5, Cell 4\n"; "Row 5, Cell 1\tRow 5, Cell 2\tRow 5, Cell 3\tRow 5, Cell 4\n";
assertContains(text1, target1); assertContains(text, target);
ppe1.close(); }
PowerPointExtractor ppe2 = openExtractor("54722.ppt"); try (SlideShowExtractor ppe = openExtractor("54722.ppt")) {
String text2 = ppe2.getText(); String text = ppe.getText();
String target2 = "this\tText\tis\twithin\ta\n" + String target = "this\tText\tis\twithin\ta\n" +
"table\t1\t2\t3\t4"; "table\t1\t2\t3\t4";
assertContains(text2, target2); assertContains(text, target);
ppe2.close(); }
} }
// bug 60003 // bug 60003
@Test @Test
public void testExtractMasterSlideFooterText() throws Exception { public void testExtractMasterSlideFooterText() throws Exception {
PowerPointExtractor ppe = openExtractor("60003.ppt"); try (SlideShowExtractor ppe = openExtractor("60003.ppt")) {
ppe.setMasterByDefault(true); ppe.setMasterByDefault(true);
String text = ppe.getText(); String text = ppe.getText();
assertContains(text, "Prague"); assertContains(text, "Prague");
ppe.close(); }
} }
@Test @Test
public void testExtractGroupedShapeText() throws Exception { public void testExtractGroupedShapeText() throws Exception {
try (final PowerPointExtractor ppe = openExtractor("bug62092.ppt")) { try (final SlideShowExtractor ppe = openExtractor("bug62092.ppt")) {
final String text = ppe.getText(); final String text = ppe.getText();
//this tests that we're ignoring text shapes at depth=0 //this tests that we're ignoring text shapes at depth=0

View File

@ -73,6 +73,7 @@ import org.apache.poi.poifs.macros.VBAMacroReader;
import org.apache.poi.sl.draw.DrawFactory; import org.apache.poi.sl.draw.DrawFactory;
import org.apache.poi.sl.draw.DrawPaint; import org.apache.poi.sl.draw.DrawPaint;
import org.apache.poi.sl.draw.DrawTextParagraph; import org.apache.poi.sl.draw.DrawTextParagraph;
import org.apache.poi.sl.extractor.SlideShowExtractor;
import org.apache.poi.sl.usermodel.ColorStyle; import org.apache.poi.sl.usermodel.ColorStyle;
import org.apache.poi.sl.usermodel.PaintStyle; import org.apache.poi.sl.usermodel.PaintStyle;
import org.apache.poi.sl.usermodel.PaintStyle.SolidPaint; import org.apache.poi.sl.usermodel.PaintStyle.SolidPaint;
@ -800,18 +801,18 @@ public final class TestBugs {
String files[] = { "bug58718_008524.ppt","bug58718_008558.ppt","bug58718_349008.ppt","bug58718_008495.ppt", }; String files[] = { "bug58718_008524.ppt","bug58718_008558.ppt","bug58718_349008.ppt","bug58718_008495.ppt", };
for (String f : files) { for (String f : files) {
File sample = HSLFTestDataSamples.getSampleFile(f); File sample = HSLFTestDataSamples.getSampleFile(f);
PowerPointExtractor ex = new PowerPointExtractor(sample.getAbsolutePath()); try (SlideShowExtractor ex = new SlideShowExtractor(SlideShowFactory.create(sample))) {
assertNotNull(ex.getText()); assertNotNull(ex.getText());
ex.close(); }
} }
} }
@Test @Test
public void bug58733() throws IOException { public void bug58733() throws IOException {
File sample = HSLFTestDataSamples.getSampleFile("bug58733_671884.ppt"); File sample = HSLFTestDataSamples.getSampleFile("bug58733_671884.ppt");
PowerPointExtractor ex = new PowerPointExtractor(sample.getAbsolutePath()); try (SlideShowExtractor ex = new SlideShowExtractor(SlideShowFactory.create(sample))) {
assertNotNull(ex.getText()); assertNotNull(ex.getText());
ex.close(); }
} }
@Test @Test

Binary file not shown.