#62319 - Decommission XSLF-/PowerPointExtractor
git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@1829653 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
bc436fcc3d
commit
ab390ce170
@ -330,8 +330,6 @@ public class TestAllFiles {
|
||||
);
|
||||
|
||||
private static final Set<String> IGNORED = unmodifiableHashSet(
|
||||
// need JDK8+ - https://bugs.openjdk.java.net/browse/JDK-8038081
|
||||
"slideshow/42474-2.ppt",
|
||||
// OPC handler works / XSSF handler fails
|
||||
"spreadsheet/57181.xlsm",
|
||||
"spreadsheet/61300.xls"//intentionally fuzzed -- used to cause infinite loop
|
||||
|
@ -24,6 +24,7 @@ import java.io.FileInputStream;
|
||||
import java.io.InputStream;
|
||||
|
||||
import org.apache.poi.extractor.ExtractorFactory;
|
||||
import org.apache.poi.sl.extractor.SlideShowExtractor;
|
||||
import org.apache.poi.xslf.extractor.XSLFPowerPointExtractor;
|
||||
import org.apache.poi.xslf.usermodel.XMLSlideShow;
|
||||
import org.apache.poi.xslf.usermodel.XSLFSlideShow;
|
||||
@ -53,12 +54,19 @@ public class XSLFFileHandler extends SlideShowHandler {
|
||||
|
||||
// additionally try the other getText() methods
|
||||
|
||||
try (XSLFPowerPointExtractor extractor = (XSLFPowerPointExtractor) ExtractorFactory.createExtractor(file)) {
|
||||
try (SlideShowExtractor extractor = ExtractorFactory.createExtractor(file)) {
|
||||
assertNotNull(extractor);
|
||||
extractor.setSlidesByDefault(true);
|
||||
extractor.setNotesByDefault(true);
|
||||
extractor.setMasterByDefault(true);
|
||||
|
||||
assertNotNull(extractor.getText(true, true, true));
|
||||
assertEquals("With all options disabled we should not get text",
|
||||
"", extractor.getText(false, false, false));
|
||||
assertNotNull(extractor.getText());
|
||||
|
||||
extractor.setSlidesByDefault(false);
|
||||
extractor.setNotesByDefault(false);
|
||||
extractor.setMasterByDefault(false);
|
||||
|
||||
assertEquals("With all options disabled we should not get text", "", extractor.getText());
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -105,6 +105,7 @@ public abstract class POIOLE2TextExtractor extends POITextExtractor {
|
||||
*
|
||||
* @return the underlying POIDocument
|
||||
*/
|
||||
@Override
|
||||
public POIDocument getDocument() {
|
||||
return document;
|
||||
}
|
||||
|
@ -74,4 +74,9 @@ public abstract class POITextExtractor implements Closeable {
|
||||
fsToClose.close();
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* @return the processed document
|
||||
*/
|
||||
public abstract Object getDocument();
|
||||
}
|
||||
|
@ -115,26 +115,23 @@ public class OLE2ExtractorFactory {
|
||||
return threadPreferEventExtractors.get();
|
||||
}
|
||||
|
||||
public static POIOLE2TextExtractor createExtractor(POIFSFileSystem fs) throws IOException {
|
||||
// Only ever an OLE2 one from the root of the FS
|
||||
return (POIOLE2TextExtractor)createExtractor(fs.getRoot());
|
||||
public static <T extends POITextExtractor> T createExtractor(POIFSFileSystem fs) throws IOException {
|
||||
return (T)createExtractor(fs.getRoot());
|
||||
}
|
||||
public static POIOLE2TextExtractor createExtractor(NPOIFSFileSystem fs) throws IOException {
|
||||
// Only ever an OLE2 one from the root of the FS
|
||||
return (POIOLE2TextExtractor)createExtractor(fs.getRoot());
|
||||
public static <T extends POITextExtractor> T createExtractor(NPOIFSFileSystem fs) throws IOException {
|
||||
return (T)createExtractor(fs.getRoot());
|
||||
}
|
||||
public static POIOLE2TextExtractor createExtractor(OPOIFSFileSystem fs) throws IOException {
|
||||
// Only ever an OLE2 one from the root of the FS
|
||||
return (POIOLE2TextExtractor)createExtractor(fs.getRoot());
|
||||
public static <T extends POITextExtractor> T createExtractor(OPOIFSFileSystem fs) throws IOException {
|
||||
return (T)createExtractor(fs.getRoot());
|
||||
}
|
||||
|
||||
public static POITextExtractor createExtractor(InputStream input) throws IOException {
|
||||
public static <T extends POITextExtractor> T createExtractor(InputStream input) throws IOException {
|
||||
Class<?> cls = getOOXMLClass();
|
||||
if (cls != null) {
|
||||
// Use Reflection to get us the full OOXML-enabled version
|
||||
try {
|
||||
Method m = cls.getDeclaredMethod("createExtractor", InputStream.class);
|
||||
return (POITextExtractor)m.invoke(null, input);
|
||||
return (T)m.invoke(null, input);
|
||||
} catch (IllegalArgumentException iae) {
|
||||
throw iae;
|
||||
} catch (Exception e) {
|
||||
|
@ -45,7 +45,29 @@ public class DocumentFactoryHelper {
|
||||
*/
|
||||
public static InputStream getDecryptedStream(final NPOIFSFileSystem fs, String password)
|
||||
throws IOException {
|
||||
EncryptionInfo info = new EncryptionInfo(fs);
|
||||
// wrap the stream in a FilterInputStream to close the NPOIFSFileSystem
|
||||
// as well when the resulting OPCPackage is closed
|
||||
return new FilterInputStream(getDecryptedStream(fs.getRoot(), password)) {
|
||||
@Override
|
||||
public void close() throws IOException {
|
||||
fs.close();
|
||||
super.close();
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
/**
|
||||
* Wrap the OLE2 data of the DirectoryNode into a decrypted stream by using
|
||||
* the given password.
|
||||
*
|
||||
* @param root The OLE2 directory node for the document
|
||||
* @param password The password, null if the default password should be used
|
||||
* @return A stream for reading the decrypted data
|
||||
* @throws IOException If an error occurs while decrypting or if the password does not match
|
||||
*/
|
||||
public static InputStream getDecryptedStream(final DirectoryNode root, String password)
|
||||
throws IOException {
|
||||
EncryptionInfo info = new EncryptionInfo(root);
|
||||
Decryptor d = Decryptor.getInstance(info);
|
||||
|
||||
try {
|
||||
@ -58,20 +80,10 @@ public class DocumentFactoryHelper {
|
||||
}
|
||||
|
||||
if (passwordCorrect) {
|
||||
// wrap the stream in a FilterInputStream to close the NPOIFSFileSystem
|
||||
// as well when the resulting OPCPackage is closed
|
||||
return new FilterInputStream(d.getDataStream(fs.getRoot())) {
|
||||
@Override
|
||||
public void close() throws IOException {
|
||||
fs.close();
|
||||
|
||||
super.close();
|
||||
}
|
||||
};
|
||||
} else {
|
||||
if (password != null)
|
||||
return d.getDataStream(root);
|
||||
} else if (password != null) {
|
||||
throw new EncryptedDocumentException("Password incorrect");
|
||||
else
|
||||
} else {
|
||||
throw new EncryptedDocumentException("The supplied spreadsheet is protected, but no password was supplied");
|
||||
}
|
||||
} catch (GeneralSecurityException e) {
|
||||
|
@ -1,3 +1,20 @@
|
||||
/* ====================================================================
|
||||
Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
contributor license agreements. See the NOTICE file distributed with
|
||||
this work for additional information regarding copyright ownership.
|
||||
The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
(the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
==================================================================== */
|
||||
|
||||
package org.apache.poi.sl.extractor;
|
||||
|
||||
import java.util.ArrayList;
|
||||
@ -48,6 +65,16 @@ public class SlideShowExtractor<
|
||||
this.slideshow = slideshow;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the opened document
|
||||
*
|
||||
* @return the opened document
|
||||
*/
|
||||
@Override
|
||||
public final Object getDocument() {
|
||||
return slideshow.getPersistDocument();
|
||||
}
|
||||
|
||||
/**
|
||||
* Should a call to getText() return slide text? Default is yes
|
||||
*/
|
||||
@ -219,7 +246,6 @@ public class SlideShowExtractor<
|
||||
return;
|
||||
}
|
||||
for (final P para : paraList) {
|
||||
final int oldLen = sb.length();
|
||||
for (final TextRun tr : para) {
|
||||
final String str = tr.getRawText().replace("\r", "");
|
||||
final String newStr;
|
||||
|
@ -126,4 +126,13 @@ public interface SlideShow<
|
||||
* @since POI 4.0.0
|
||||
*/
|
||||
POITextExtractor getMetadataTextExtractor();
|
||||
|
||||
/**
|
||||
* @return the instance which handles the persisting of the slideshow,
|
||||
* which is either a subclass of {@link org.apache.poi.POIDocument}
|
||||
* or {@link org.apache.poi.POIXMLDocument}
|
||||
*
|
||||
* @since POI 4.0.0
|
||||
*/
|
||||
Object getPersistDocument();
|
||||
}
|
||||
|
@ -60,13 +60,40 @@ public class SlideShowFactory {
|
||||
* @throws IOException if an error occurs while reading the data
|
||||
*/
|
||||
public static SlideShow<?,?> create(final NPOIFSFileSystem fs, String password) throws IOException {
|
||||
DirectoryNode root = fs.getRoot();
|
||||
return create(fs.getRoot(), password);
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates a SlideShow from the given DirectoryNode.
|
||||
*
|
||||
* @param root The {@link DirectoryNode} to start reading the document from
|
||||
*
|
||||
* @return The created SlideShow
|
||||
*
|
||||
* @throws IOException if an error occurs while reading the data
|
||||
*/
|
||||
public static SlideShow<?,?> create(final DirectoryNode root) throws IOException {
|
||||
return create(root, null);
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Creates a SlideShow from the given DirectoryNode, which may
|
||||
* be password protected
|
||||
*
|
||||
* @param root The {@link DirectoryNode} to start reading the document from
|
||||
* @param password The password that should be used or null if no password is necessary.
|
||||
*
|
||||
* @return The created SlideShow
|
||||
*
|
||||
* @throws IOException if an error occurs while reading the data
|
||||
*/
|
||||
public static SlideShow<?,?> create(final DirectoryNode root, String password) throws IOException {
|
||||
// Encrypted OOXML files go inside OLE2 containers, is this one?
|
||||
if (root.hasEntry(Decryptor.DEFAULT_POIFS_ENTRY)) {
|
||||
InputStream stream = null;
|
||||
try {
|
||||
stream = DocumentFactoryHelper.getDecryptedStream(fs, password);
|
||||
stream = DocumentFactoryHelper.getDecryptedStream(root, password);
|
||||
|
||||
return createXSLFSlideShow(stream);
|
||||
} finally {
|
||||
@ -82,7 +109,7 @@ public class SlideShowFactory {
|
||||
passwordSet = true;
|
||||
}
|
||||
try {
|
||||
return createHSLFSlideShow(fs);
|
||||
return createHSLFSlideShow(root);
|
||||
} finally {
|
||||
if (passwordSet) {
|
||||
Biff8EncryptionKey.setCurrentUserPassword(null);
|
||||
|
@ -68,6 +68,7 @@ public abstract class POIXMLTextExtractor extends POITextExtractor {
|
||||
*
|
||||
* @return the opened document
|
||||
*/
|
||||
@Override
|
||||
public final POIXMLDocument getDocument() {
|
||||
return _document;
|
||||
}
|
||||
|
@ -51,6 +51,7 @@ import org.apache.poi.poifs.filesystem.NotOLE2FileException;
|
||||
import org.apache.poi.poifs.filesystem.OPOIFSFileSystem;
|
||||
import org.apache.poi.poifs.filesystem.OfficeXmlFileException;
|
||||
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
|
||||
import org.apache.poi.sl.extractor.SlideShowExtractor;
|
||||
import org.apache.poi.util.IOUtils;
|
||||
import org.apache.poi.util.NotImplemented;
|
||||
import org.apache.poi.util.POILogFactory;
|
||||
@ -58,6 +59,7 @@ import org.apache.poi.util.POILogger;
|
||||
import org.apache.poi.util.Removal;
|
||||
import org.apache.poi.xdgf.extractor.XDGFVisioExtractor;
|
||||
import org.apache.poi.xslf.extractor.XSLFPowerPointExtractor;
|
||||
import org.apache.poi.xslf.usermodel.XMLSlideShow;
|
||||
import org.apache.poi.xslf.usermodel.XSLFRelation;
|
||||
import org.apache.poi.xslf.usermodel.XSLFSlideShow;
|
||||
import org.apache.poi.xssf.extractor.XSSFBEventBasedExcelExtractor;
|
||||
@ -127,20 +129,20 @@ public class ExtractorFactory {
|
||||
return OLE2ExtractorFactory.getPreferEventExtractor();
|
||||
}
|
||||
|
||||
public static POITextExtractor createExtractor(File f) throws IOException, OpenXML4JException, XmlException {
|
||||
public static <T extends POITextExtractor> T createExtractor(File f) throws IOException, OpenXML4JException, XmlException {
|
||||
NPOIFSFileSystem fs = null;
|
||||
try {
|
||||
fs = new NPOIFSFileSystem(f);
|
||||
if (fs.getRoot().hasEntry(Decryptor.DEFAULT_POIFS_ENTRY)) {
|
||||
return createEncryptedOOXMLExtractor(fs);
|
||||
return (T)createEncryptedOOXMLExtractor(fs);
|
||||
}
|
||||
POIOLE2TextExtractor extractor = createExtractor(fs);
|
||||
POITextExtractor extractor = createExtractor(fs);
|
||||
extractor.setFilesystem(fs);
|
||||
return extractor;
|
||||
return (T)extractor;
|
||||
} catch (OfficeXmlFileException e) {
|
||||
// ensure file-handle release
|
||||
IOUtils.closeQuietly(fs);
|
||||
return createExtractor(OPCPackage.open(f.toString(), PackageAccess.READ));
|
||||
return (T)createExtractor(OPCPackage.open(f.toString(), PackageAccess.READ));
|
||||
} catch (NotOLE2FileException ne) {
|
||||
// ensure file-handle release
|
||||
IOUtils.closeQuietly(fs);
|
||||
@ -179,7 +181,7 @@ public class ExtractorFactory {
|
||||
* @throws XmlException If an XML parsing error occurs.
|
||||
* @throws IllegalArgumentException If no matching file type could be found.
|
||||
*/
|
||||
public static POIXMLTextExtractor createExtractor(OPCPackage pkg) throws IOException, OpenXML4JException, XmlException {
|
||||
public static POITextExtractor createExtractor(OPCPackage pkg) throws IOException, OpenXML4JException, XmlException {
|
||||
try {
|
||||
// Check for the normal Office core document
|
||||
PackageRelationshipCollection core;
|
||||
@ -226,13 +228,13 @@ public class ExtractorFactory {
|
||||
// Is it XSLF?
|
||||
for (XSLFRelation rel : XSLFPowerPointExtractor.SUPPORTED_TYPES) {
|
||||
if ( rel.getContentType().equals( contentType ) ) {
|
||||
return new XSLFPowerPointExtractor(pkg);
|
||||
return new SlideShowExtractor(new XMLSlideShow(pkg));
|
||||
}
|
||||
}
|
||||
|
||||
// special handling for SlideShow-Theme-files,
|
||||
if (XSLFRelation.THEME_MANAGER.getContentType().equals(contentType)) {
|
||||
return new XSLFPowerPointExtractor(new XSLFSlideShow(pkg));
|
||||
return new SlideShowExtractor(new XMLSlideShow(pkg));
|
||||
}
|
||||
|
||||
// How about xlsb?
|
||||
@ -252,28 +254,28 @@ public class ExtractorFactory {
|
||||
}
|
||||
}
|
||||
|
||||
public static POIOLE2TextExtractor createExtractor(POIFSFileSystem fs) throws IOException, OpenXML4JException, XmlException {
|
||||
return OLE2ExtractorFactory.createExtractor(fs);
|
||||
public static <T extends POITextExtractor> T createExtractor(POIFSFileSystem fs) throws IOException, OpenXML4JException, XmlException {
|
||||
return createExtractor(fs.getRoot());
|
||||
}
|
||||
public static POIOLE2TextExtractor createExtractor(NPOIFSFileSystem fs) throws IOException, OpenXML4JException, XmlException {
|
||||
return OLE2ExtractorFactory.createExtractor(fs);
|
||||
public static <T extends POITextExtractor> T createExtractor(NPOIFSFileSystem fs) throws IOException, OpenXML4JException, XmlException {
|
||||
return createExtractor(fs.getRoot());
|
||||
}
|
||||
public static POIOLE2TextExtractor createExtractor(OPOIFSFileSystem fs) throws IOException, OpenXML4JException, XmlException {
|
||||
return OLE2ExtractorFactory.createExtractor(fs);
|
||||
public static <T extends POITextExtractor> T createExtractor(OPOIFSFileSystem fs) throws IOException, OpenXML4JException, XmlException {
|
||||
return createExtractor(fs.getRoot());
|
||||
}
|
||||
|
||||
public static POITextExtractor createExtractor(DirectoryNode poifsDir) throws IOException, OpenXML4JException, XmlException
|
||||
public static <T extends POITextExtractor> T createExtractor(DirectoryNode poifsDir) throws IOException, OpenXML4JException, XmlException
|
||||
{
|
||||
// First, check for OOXML
|
||||
for (String entryName : poifsDir.getEntryNames()) {
|
||||
if (entryName.equals("Package")) {
|
||||
OPCPackage pkg = OPCPackage.open(poifsDir.createDocumentInputStream("Package"));
|
||||
return createExtractor(pkg);
|
||||
return (T)createExtractor(pkg);
|
||||
}
|
||||
}
|
||||
|
||||
// If not, ask the OLE2 code to check, with Scratchpad if possible
|
||||
return OLE2ExtractorFactory.createExtractor(poifsDir);
|
||||
return (T)OLE2ExtractorFactory.createExtractor(poifsDir);
|
||||
}
|
||||
|
||||
/**
|
||||
@ -403,7 +405,7 @@ public class ExtractorFactory {
|
||||
throw new IllegalStateException("Not yet supported");
|
||||
}
|
||||
|
||||
private static POIXMLTextExtractor createEncryptedOOXMLExtractor(NPOIFSFileSystem fs)
|
||||
private static POITextExtractor createEncryptedOOXMLExtractor(NPOIFSFileSystem fs)
|
||||
throws IOException {
|
||||
String pass = Biff8EncryptionKey.getCurrentUserPassword();
|
||||
if (pass == null) {
|
||||
|
@ -37,7 +37,7 @@ import org.apache.xmlbeans.XmlException;
|
||||
* @deprecated use {@link SlideShowExtractor}
|
||||
*/
|
||||
@Deprecated
|
||||
@Removal(version="4.2.0")
|
||||
@Removal(version="5.0.0")
|
||||
public class XSLFPowerPointExtractor extends POIXMLTextExtractor {
|
||||
public static final XSLFRelation[] SUPPORTED_TYPES = new XSLFRelation[]{
|
||||
XSLFRelation.MAIN, XSLFRelation.MACRO, XSLFRelation.MACRO_TEMPLATE,
|
||||
|
@ -631,4 +631,9 @@ public class XMLSlideShow extends POIXMLDocument
|
||||
public POIXMLPropertiesTextExtractor getMetadataTextExtractor() {
|
||||
return new POIXMLPropertiesTextExtractor(this);
|
||||
}
|
||||
|
||||
@Override
|
||||
public Object getPersistDocument() {
|
||||
return this;
|
||||
}
|
||||
}
|
||||
|
@ -1,3 +1,20 @@
|
||||
/* ====================================================================
|
||||
Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
contributor license agreements. See the NOTICE file distributed with
|
||||
this work for additional information regarding copyright ownership.
|
||||
The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
(the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
==================================================================== */
|
||||
|
||||
package org.apache.poi.xslf.usermodel;
|
||||
|
||||
import static org.apache.poi.xslf.usermodel.XSLFShape.PML_NS;
|
||||
|
@ -182,12 +182,20 @@ implements Slide<XSLFShape,XSLFTextParagraph> {
|
||||
*/
|
||||
public XSLFCommentAuthors getCommentAuthorsPart() {
|
||||
if(_commentAuthors == null) {
|
||||
// first scan the slide relations
|
||||
for (POIXMLDocumentPart p : getRelations()) {
|
||||
if (p instanceof XSLFCommentAuthors) {
|
||||
_commentAuthors = (XSLFCommentAuthors)p;
|
||||
return _commentAuthors;
|
||||
}
|
||||
}
|
||||
// then scan the presentation relations
|
||||
for (POIXMLDocumentPart p : getSlideShow().getRelations()) {
|
||||
if (p instanceof XSLFCommentAuthors) {
|
||||
_commentAuthors = (XSLFCommentAuthors)p;
|
||||
return _commentAuthors;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return null;
|
||||
|
@ -27,16 +27,15 @@ import static org.junit.Assert.fail;
|
||||
import java.io.File;
|
||||
import java.io.FileInputStream;
|
||||
import java.io.IOException;
|
||||
import java.util.Locale;
|
||||
|
||||
import org.apache.poi.POIDataSamples;
|
||||
import org.apache.poi.POIOLE2TextExtractor;
|
||||
import org.apache.poi.POITextExtractor;
|
||||
import org.apache.poi.POIXMLException;
|
||||
import org.apache.poi.POIXMLTextExtractor;
|
||||
import org.apache.poi.UnsupportedFileFormatException;
|
||||
import org.apache.poi.hdgf.extractor.VisioTextExtractor;
|
||||
import org.apache.poi.hpbf.extractor.PublisherTextExtractor;
|
||||
import org.apache.poi.hslf.extractor.PowerPointExtractor;
|
||||
import org.apache.poi.hsmf.extractor.OutlookTextExtactor;
|
||||
import org.apache.poi.hssf.HSSFTestDataSamples;
|
||||
import org.apache.poi.hssf.OldExcelFormatException;
|
||||
@ -44,18 +43,20 @@ import org.apache.poi.hssf.extractor.EventBasedExcelExtractor;
|
||||
import org.apache.poi.hssf.extractor.ExcelExtractor;
|
||||
import org.apache.poi.hwpf.extractor.Word6Extractor;
|
||||
import org.apache.poi.hwpf.extractor.WordExtractor;
|
||||
import org.apache.poi.openxml4j.exceptions.OpenXML4JException;
|
||||
import org.apache.poi.openxml4j.opc.OPCPackage;
|
||||
import org.apache.poi.openxml4j.opc.PackageAccess;
|
||||
import org.apache.poi.poifs.filesystem.OPOIFSFileSystem;
|
||||
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
|
||||
import org.apache.poi.sl.extractor.SlideShowExtractor;
|
||||
import org.apache.poi.util.POILogFactory;
|
||||
import org.apache.poi.util.POILogger;
|
||||
import org.apache.poi.xdgf.extractor.XDGFVisioExtractor;
|
||||
import org.apache.poi.xslf.extractor.XSLFPowerPointExtractor;
|
||||
import org.apache.poi.xssf.extractor.XSSFBEventBasedExcelExtractor;
|
||||
import org.apache.poi.xssf.extractor.XSSFEventBasedExcelExtractor;
|
||||
import org.apache.poi.xssf.extractor.XSSFExcelExtractor;
|
||||
import org.apache.poi.xwpf.extractor.XWPFWordExtractor;
|
||||
import org.junit.BeforeClass;
|
||||
import org.apache.xmlbeans.XmlException;
|
||||
import org.junit.Test;
|
||||
|
||||
/**
|
||||
@ -65,34 +66,39 @@ public class TestExtractorFactory {
|
||||
|
||||
private static final POILogger LOG = POILogFactory.getLogger(TestExtractorFactory.class);
|
||||
|
||||
private static File txt;
|
||||
private static final POIDataSamples ssTests = POIDataSamples.getSpreadSheetInstance();
|
||||
private static final File xls = getFileAndCheck(ssTests, "SampleSS.xls");
|
||||
private static final File xlsx = getFileAndCheck(ssTests, "SampleSS.xlsx");
|
||||
private static final File xlsxStrict = getFileAndCheck(ssTests, "SampleSS.strict.xlsx");
|
||||
private static final File xltx = getFileAndCheck(ssTests, "test.xltx");
|
||||
private static final File xlsEmb = getFileAndCheck(ssTests, "excel_with_embeded.xls");
|
||||
private static final File xlsb = getFileAndCheck(ssTests, "testVarious.xlsb");
|
||||
|
||||
private static File xls;
|
||||
private static File xlsx;
|
||||
private static File xlsxStrict;
|
||||
private static File xltx;
|
||||
private static File xlsEmb;
|
||||
private static File xlsb;
|
||||
private static final POIDataSamples wpTests = POIDataSamples.getDocumentInstance();
|
||||
private static final File doc = getFileAndCheck(wpTests, "SampleDoc.doc");
|
||||
private static final File doc6 = getFileAndCheck(wpTests, "Word6.doc");
|
||||
private static final File doc95 = getFileAndCheck(wpTests, "Word95.doc");
|
||||
private static final File docx = getFileAndCheck(wpTests, "SampleDoc.docx");
|
||||
private static final File dotx = getFileAndCheck(wpTests, "test.dotx");
|
||||
private static final File docEmb = getFileAndCheck(wpTests, "word_with_embeded.doc");
|
||||
private static final File docEmbOOXML = getFileAndCheck(wpTests, "word_with_embeded_ooxml.doc");
|
||||
|
||||
private static File doc;
|
||||
private static File doc6;
|
||||
private static File doc95;
|
||||
private static File docx;
|
||||
private static File dotx;
|
||||
private static File docEmb;
|
||||
private static File docEmbOOXML;
|
||||
private static final POIDataSamples slTests = POIDataSamples.getSlideShowInstance();
|
||||
private static final File ppt = getFileAndCheck(slTests, "SampleShow.ppt");
|
||||
private static final File pptx = getFileAndCheck(slTests, "SampleShow.pptx");
|
||||
private static final File txt = getFileAndCheck(slTests, "SampleShow.txt");
|
||||
|
||||
private static File ppt;
|
||||
private static File pptx;
|
||||
private static final POIDataSamples olTests = POIDataSamples.getHSMFInstance();
|
||||
private static final File msg = getFileAndCheck(olTests, "quick.msg");
|
||||
private static final File msgEmb = getFileAndCheck(olTests, "attachment_test_msg.msg");
|
||||
private static final File msgEmbMsg = getFileAndCheck(olTests, "attachment_msg_pdf.msg");
|
||||
|
||||
private static File msg;
|
||||
private static File msgEmb;
|
||||
private static File msgEmbMsg;
|
||||
private static final POIDataSamples dgTests = POIDataSamples.getDiagramInstance();
|
||||
private static final File vsd = getFileAndCheck(dgTests, "Test_Visio-Some_Random_Text.vsd");
|
||||
private static final File vsdx = getFileAndCheck(dgTests, "test.vsdx");
|
||||
|
||||
private static File vsd;
|
||||
private static File vsdx;
|
||||
|
||||
private static File pub;
|
||||
private static POIDataSamples pubTests = POIDataSamples.getPublisherInstance();
|
||||
private static File pub = getFileAndCheck(pubTests, "Simple.pub");
|
||||
|
||||
private static File getFileAndCheck(POIDataSamples samples, String name) {
|
||||
File file = samples.getFile(name);
|
||||
@ -104,595 +110,133 @@ public class TestExtractorFactory {
|
||||
return file;
|
||||
}
|
||||
|
||||
@BeforeClass
|
||||
public static void setUp() throws Exception {
|
||||
private static final Object[] TEST_SET = {
|
||||
"Excel", xls, ExcelExtractor.class, 200,
|
||||
"Excel - xlsx", xlsx, XSSFExcelExtractor.class, 200,
|
||||
"Excel - xltx", xltx, XSSFExcelExtractor.class, -1,
|
||||
"Excel - xlsb", xlsb, XSSFBEventBasedExcelExtractor.class, -1,
|
||||
"Word", doc, WordExtractor.class, 120,
|
||||
"Word - docx", docx, XWPFWordExtractor.class, 120,
|
||||
"Word - dotx", dotx, XWPFWordExtractor.class, -1,
|
||||
"Word 6", doc6, Word6Extractor.class, 20,
|
||||
"Word 95", doc95, Word6Extractor.class, 120,
|
||||
"PowerPoint", ppt, SlideShowExtractor.class, 120,
|
||||
"PowerPoint - pptx", pptx, SlideShowExtractor.class, 120,
|
||||
"Visio", vsd, VisioTextExtractor.class, 50,
|
||||
"Visio - vsdx", vsdx, XDGFVisioExtractor.class, 20,
|
||||
"Publisher", pub, PublisherTextExtractor.class, 50,
|
||||
"Outlook msg", msg, OutlookTextExtactor.class, 50,
|
||||
|
||||
POIDataSamples ssTests = POIDataSamples.getSpreadSheetInstance();
|
||||
xls = getFileAndCheck(ssTests, "SampleSS.xls");
|
||||
xlsx = getFileAndCheck(ssTests, "SampleSS.xlsx");
|
||||
xlsxStrict = getFileAndCheck(ssTests, "SampleSS.strict.xlsx");
|
||||
xltx = getFileAndCheck(ssTests, "test.xltx");
|
||||
xlsEmb = getFileAndCheck(ssTests, "excel_with_embeded.xls");
|
||||
xlsb = getFileAndCheck(ssTests, "testVarious.xlsb");
|
||||
// TODO Support OOXML-Strict, see bug #57699
|
||||
// xlsxStrict
|
||||
};
|
||||
|
||||
POIDataSamples wpTests = POIDataSamples.getDocumentInstance();
|
||||
doc = getFileAndCheck(wpTests, "SampleDoc.doc");
|
||||
doc6 = getFileAndCheck(wpTests, "Word6.doc");
|
||||
doc95 = getFileAndCheck(wpTests, "Word95.doc");
|
||||
docx = getFileAndCheck(wpTests, "SampleDoc.docx");
|
||||
dotx = getFileAndCheck(wpTests, "test.dotx");
|
||||
docEmb = getFileAndCheck(wpTests, "word_with_embeded.doc");
|
||||
docEmbOOXML = getFileAndCheck(wpTests, "word_with_embeded_ooxml.doc");
|
||||
|
||||
POIDataSamples slTests = POIDataSamples.getSlideShowInstance();
|
||||
ppt = getFileAndCheck(slTests, "SampleShow.ppt");
|
||||
pptx = getFileAndCheck(slTests, "SampleShow.pptx");
|
||||
txt = getFileAndCheck(slTests, "SampleShow.txt");
|
||||
|
||||
POIDataSamples dgTests = POIDataSamples.getDiagramInstance();
|
||||
vsd = getFileAndCheck(dgTests, "Test_Visio-Some_Random_Text.vsd");
|
||||
vsdx = getFileAndCheck(dgTests, "test.vsdx");
|
||||
|
||||
POIDataSamples pubTests = POIDataSamples.getPublisherInstance();
|
||||
pub = getFileAndCheck(pubTests, "Simple.pub");
|
||||
|
||||
POIDataSamples olTests = POIDataSamples.getHSMFInstance();
|
||||
msg = getFileAndCheck(olTests, "quick.msg");
|
||||
msgEmb = getFileAndCheck(olTests, "attachment_test_msg.msg");
|
||||
msgEmbMsg = getFileAndCheck(olTests, "attachment_msg_pdf.msg");
|
||||
@FunctionalInterface
|
||||
interface FunctionEx<T, R> {
|
||||
R apply(T t) throws IOException, OpenXML4JException, XmlException;
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
public void testFile() throws Exception {
|
||||
// Excel
|
||||
POITextExtractor xlsExtractor = ExtractorFactory.createExtractor(xls);
|
||||
assertNotNull("Had empty extractor for " + xls, xlsExtractor);
|
||||
assertTrue("Expected instanceof ExcelExtractor, but had: " + xlsExtractor.getClass(),
|
||||
xlsExtractor
|
||||
instanceof ExcelExtractor
|
||||
);
|
||||
assertTrue(
|
||||
xlsExtractor.getText().length() > 200
|
||||
);
|
||||
xlsExtractor.close();
|
||||
|
||||
POITextExtractor extractor = ExtractorFactory.createExtractor(xlsx);
|
||||
assertTrue(
|
||||
extractor.getClass().getName(),
|
||||
extractor
|
||||
instanceof XSSFExcelExtractor
|
||||
);
|
||||
extractor.close();
|
||||
|
||||
extractor = ExtractorFactory.createExtractor(xlsx);
|
||||
assertTrue(
|
||||
extractor.getText().length() > 200
|
||||
);
|
||||
extractor.close();
|
||||
|
||||
extractor = ExtractorFactory.createExtractor(xltx);
|
||||
assertTrue(
|
||||
extractor.getClass().getName(),
|
||||
extractor
|
||||
instanceof XSSFExcelExtractor
|
||||
);
|
||||
extractor.close();
|
||||
|
||||
extractor = ExtractorFactory.createExtractor(xlsb);
|
||||
assertContains(extractor.getText(), "test");
|
||||
extractor.close();
|
||||
|
||||
|
||||
extractor = ExtractorFactory.createExtractor(xltx);
|
||||
assertContains(extractor.getText(), "test");
|
||||
extractor.close();
|
||||
|
||||
// TODO Support OOXML-Strict, see bug #57699
|
||||
try {
|
||||
/*extractor =*/ ExtractorFactory.createExtractor(xlsxStrict);
|
||||
fail("OOXML-Strict isn't yet supported");
|
||||
} catch (POIXMLException e) {
|
||||
// Expected, for now
|
||||
for (int i = 0; i < TEST_SET.length; i += 4) {
|
||||
try (POITextExtractor ext = ExtractorFactory.createExtractor((File) TEST_SET[i + 1])) {
|
||||
testExtractor(ext, (String) TEST_SET[i], (Class) TEST_SET[i + 2], (Integer) TEST_SET[i + 3]);
|
||||
}
|
||||
}
|
||||
}
|
||||
// extractor = ExtractorFactory.createExtractor(xlsxStrict);
|
||||
// assertTrue(
|
||||
// extractor
|
||||
// instanceof XSSFExcelExtractor
|
||||
// );
|
||||
// extractor.close();
|
||||
//
|
||||
// extractor = ExtractorFactory.createExtractor(xlsxStrict);
|
||||
// assertTrue(
|
||||
// extractor.getText().contains("test")
|
||||
// );
|
||||
// extractor.close();
|
||||
|
||||
|
||||
// Word
|
||||
extractor = ExtractorFactory.createExtractor(doc);
|
||||
assertTrue(
|
||||
extractor
|
||||
instanceof WordExtractor
|
||||
);
|
||||
assertTrue(
|
||||
extractor.getText().length() > 120
|
||||
);
|
||||
extractor.close();
|
||||
|
||||
extractor = ExtractorFactory.createExtractor(doc6);
|
||||
assertTrue(
|
||||
extractor
|
||||
instanceof Word6Extractor
|
||||
);
|
||||
assertTrue(
|
||||
extractor.getText().length() > 20
|
||||
);
|
||||
extractor.close();
|
||||
|
||||
extractor = ExtractorFactory.createExtractor(doc95);
|
||||
assertTrue(
|
||||
extractor
|
||||
instanceof Word6Extractor
|
||||
);
|
||||
assertTrue(
|
||||
extractor.getText().length() > 120
|
||||
);
|
||||
extractor.close();
|
||||
|
||||
extractor = ExtractorFactory.createExtractor(docx);
|
||||
assertTrue(
|
||||
extractor instanceof XWPFWordExtractor
|
||||
);
|
||||
extractor.close();
|
||||
|
||||
extractor = ExtractorFactory.createExtractor(docx);
|
||||
assertTrue(
|
||||
extractor.getText().length() > 120
|
||||
);
|
||||
extractor.close();
|
||||
|
||||
extractor = ExtractorFactory.createExtractor(dotx);
|
||||
assertTrue(
|
||||
extractor instanceof XWPFWordExtractor
|
||||
);
|
||||
extractor.close();
|
||||
|
||||
extractor = ExtractorFactory.createExtractor(dotx);
|
||||
assertContains(extractor.getText(), "Test");
|
||||
extractor.close();
|
||||
|
||||
// PowerPoint (PPT)
|
||||
extractor = ExtractorFactory.createExtractor(ppt);
|
||||
assertTrue(
|
||||
extractor
|
||||
instanceof PowerPointExtractor
|
||||
);
|
||||
assertTrue(
|
||||
extractor.getText().length() > 120
|
||||
);
|
||||
extractor.close();
|
||||
|
||||
// PowerPoint (PPTX)
|
||||
extractor = ExtractorFactory.createExtractor(pptx);
|
||||
assertTrue(
|
||||
extractor
|
||||
instanceof XSLFPowerPointExtractor
|
||||
);
|
||||
assertTrue(
|
||||
extractor.getText().length() > 120
|
||||
);
|
||||
extractor.close();
|
||||
|
||||
// Visio - binary
|
||||
extractor = ExtractorFactory.createExtractor(vsd);
|
||||
assertTrue(
|
||||
extractor
|
||||
instanceof VisioTextExtractor
|
||||
);
|
||||
assertTrue(
|
||||
extractor.getText().length() > 50
|
||||
);
|
||||
extractor.close();
|
||||
|
||||
// Visio - vsdx
|
||||
extractor = ExtractorFactory.createExtractor(vsdx);
|
||||
assertTrue(
|
||||
extractor
|
||||
instanceof XDGFVisioExtractor
|
||||
);
|
||||
assertTrue(
|
||||
extractor.getText().length() > 20
|
||||
);
|
||||
extractor.close();
|
||||
|
||||
// Publisher
|
||||
extractor = ExtractorFactory.createExtractor(pub);
|
||||
assertTrue(
|
||||
extractor
|
||||
instanceof PublisherTextExtractor
|
||||
);
|
||||
assertTrue(
|
||||
extractor.getText().length() > 50
|
||||
);
|
||||
extractor.close();
|
||||
|
||||
// Outlook msg
|
||||
extractor = ExtractorFactory.createExtractor(msg);
|
||||
assertTrue(
|
||||
extractor
|
||||
instanceof OutlookTextExtactor
|
||||
);
|
||||
assertTrue(
|
||||
extractor.getText().length() > 50
|
||||
);
|
||||
extractor.close();
|
||||
|
||||
@Test(expected = IllegalArgumentException.class)
|
||||
public void testFileInvalid() throws Exception {
|
||||
// Text
|
||||
try {
|
||||
ExtractorFactory.createExtractor(txt);
|
||||
fail("expected IllegalArgumentException");
|
||||
} catch(IllegalArgumentException e) {
|
||||
// Good
|
||||
}
|
||||
try (POITextExtractor te = ExtractorFactory.createExtractor(txt)) {}
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testInputStream() throws Exception {
|
||||
// Excel
|
||||
POITextExtractor extractor = ExtractorFactory.createExtractor(new FileInputStream(xls));
|
||||
assertTrue(
|
||||
extractor
|
||||
instanceof ExcelExtractor
|
||||
);
|
||||
assertTrue(
|
||||
extractor.getText().length() > 200
|
||||
);
|
||||
extractor.close();
|
||||
|
||||
extractor = ExtractorFactory.createExtractor(new FileInputStream(xlsx));
|
||||
assertTrue(
|
||||
extractor.getClass().getName(),
|
||||
extractor
|
||||
instanceof XSSFExcelExtractor
|
||||
);
|
||||
assertTrue(
|
||||
extractor.getText().length() > 200
|
||||
);
|
||||
// TODO Support OOXML-Strict, see bug #57699
|
||||
// assertTrue(
|
||||
// ExtractorFactory.createExtractor(new FileInputStream(xlsxStrict))
|
||||
// instanceof XSSFExcelExtractor
|
||||
// );
|
||||
// assertTrue(
|
||||
// ExtractorFactory.createExtractor(new FileInputStream(xlsxStrict)).getText().length() > 200
|
||||
// );
|
||||
extractor.close();
|
||||
|
||||
// Word
|
||||
extractor = ExtractorFactory.createExtractor(new FileInputStream(doc));
|
||||
assertTrue(
|
||||
extractor.getClass().getName(),
|
||||
extractor
|
||||
instanceof WordExtractor
|
||||
);
|
||||
assertTrue(
|
||||
extractor.getText().length() > 120
|
||||
);
|
||||
extractor.close();
|
||||
|
||||
extractor = ExtractorFactory.createExtractor(new FileInputStream(doc6));
|
||||
assertTrue(
|
||||
extractor.getClass().getName(),
|
||||
extractor
|
||||
instanceof Word6Extractor
|
||||
);
|
||||
assertTrue(
|
||||
extractor.getText().length() > 20
|
||||
);
|
||||
extractor.close();
|
||||
|
||||
extractor = ExtractorFactory.createExtractor(new FileInputStream(doc95));
|
||||
assertTrue(
|
||||
extractor.getClass().getName(),
|
||||
extractor
|
||||
instanceof Word6Extractor
|
||||
);
|
||||
assertTrue(
|
||||
extractor.getText().length() > 120
|
||||
);
|
||||
extractor.close();
|
||||
|
||||
extractor = ExtractorFactory.createExtractor(new FileInputStream(docx));
|
||||
assertTrue(
|
||||
extractor
|
||||
instanceof XWPFWordExtractor
|
||||
);
|
||||
assertTrue(
|
||||
extractor.getText().length() > 120
|
||||
);
|
||||
extractor.close();
|
||||
|
||||
// PowerPoint
|
||||
extractor = ExtractorFactory.createExtractor(new FileInputStream(ppt));
|
||||
assertTrue(
|
||||
extractor
|
||||
instanceof PowerPointExtractor
|
||||
);
|
||||
assertTrue(
|
||||
extractor.getText().length() > 120
|
||||
);
|
||||
extractor.close();
|
||||
|
||||
extractor = ExtractorFactory.createExtractor(new FileInputStream(pptx));
|
||||
assertTrue(
|
||||
extractor
|
||||
instanceof XSLFPowerPointExtractor
|
||||
);
|
||||
assertTrue(
|
||||
extractor.getText().length() > 120
|
||||
);
|
||||
extractor.close();
|
||||
|
||||
// Visio
|
||||
extractor = ExtractorFactory.createExtractor(new FileInputStream(vsd));
|
||||
assertTrue(
|
||||
extractor
|
||||
instanceof VisioTextExtractor
|
||||
);
|
||||
assertTrue(
|
||||
extractor.getText().length() > 50
|
||||
);
|
||||
extractor.close();
|
||||
|
||||
// Visio - vsdx
|
||||
extractor = ExtractorFactory.createExtractor(new FileInputStream(vsdx));
|
||||
assertTrue(
|
||||
extractor
|
||||
instanceof XDGFVisioExtractor
|
||||
);
|
||||
assertTrue(
|
||||
extractor.getText().length() > 20
|
||||
);
|
||||
extractor.close();
|
||||
|
||||
// Publisher
|
||||
extractor = ExtractorFactory.createExtractor(new FileInputStream(pub));
|
||||
assertTrue(
|
||||
extractor
|
||||
instanceof PublisherTextExtractor
|
||||
);
|
||||
assertTrue(
|
||||
extractor.getText().length() > 50
|
||||
);
|
||||
extractor.close();
|
||||
|
||||
// Outlook msg
|
||||
extractor = ExtractorFactory.createExtractor(new FileInputStream(msg));
|
||||
assertTrue(
|
||||
extractor
|
||||
instanceof OutlookTextExtactor
|
||||
);
|
||||
assertTrue(
|
||||
extractor.getText().length() > 50
|
||||
);
|
||||
extractor.close();
|
||||
|
||||
// Text
|
||||
try (FileInputStream stream = new FileInputStream(txt)) {
|
||||
ExtractorFactory.createExtractor(stream);
|
||||
fail("expected IllegalArgumentException");
|
||||
} catch(IllegalArgumentException e) {
|
||||
// Good
|
||||
testStream((f) -> ExtractorFactory.createExtractor(f), true);
|
||||
}
|
||||
|
||||
@Test(expected = IllegalArgumentException.class)
|
||||
public void testInputStreamInvalid() throws Exception {
|
||||
testInvalid((f) -> ExtractorFactory.createExtractor(f));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testPOIFS() throws Exception {
|
||||
// Excel
|
||||
assertTrue(
|
||||
ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(xls)))
|
||||
instanceof ExcelExtractor
|
||||
);
|
||||
assertTrue(
|
||||
ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(xls))).getText().length() > 200
|
||||
);
|
||||
|
||||
// Word
|
||||
assertTrue(
|
||||
ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(doc)))
|
||||
instanceof WordExtractor
|
||||
);
|
||||
assertTrue(
|
||||
ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(doc))).getText().length() > 120
|
||||
);
|
||||
|
||||
assertTrue(
|
||||
ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(doc6)))
|
||||
instanceof Word6Extractor
|
||||
);
|
||||
assertTrue(
|
||||
ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(doc6))).getText().length() > 20
|
||||
);
|
||||
|
||||
assertTrue(
|
||||
ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(doc95)))
|
||||
instanceof Word6Extractor
|
||||
);
|
||||
assertTrue(
|
||||
ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(doc95))).getText().length() > 120
|
||||
);
|
||||
|
||||
// PowerPoint
|
||||
assertTrue(
|
||||
ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(ppt)))
|
||||
instanceof PowerPointExtractor
|
||||
);
|
||||
assertTrue(
|
||||
ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(ppt))).getText().length() > 120
|
||||
);
|
||||
|
||||
// Visio
|
||||
assertTrue(
|
||||
ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(vsd)))
|
||||
instanceof VisioTextExtractor
|
||||
);
|
||||
assertTrue(
|
||||
ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(vsd))).getText().length() > 50
|
||||
);
|
||||
|
||||
// Publisher
|
||||
assertTrue(
|
||||
ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(pub)))
|
||||
instanceof PublisherTextExtractor
|
||||
);
|
||||
assertTrue(
|
||||
ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(pub))).getText().length() > 50
|
||||
);
|
||||
|
||||
// Outlook msg
|
||||
assertTrue(
|
||||
ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(msg)))
|
||||
instanceof OutlookTextExtactor
|
||||
);
|
||||
assertTrue(
|
||||
ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(msg))).getText().length() > 50
|
||||
);
|
||||
|
||||
// Text
|
||||
try {
|
||||
ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(txt)));
|
||||
fail("expected IllegalArgumentException");
|
||||
} catch(IOException e) {
|
||||
// Good
|
||||
}
|
||||
testStream((f) -> ExtractorFactory.createExtractor(new POIFSFileSystem(f)), false);
|
||||
}
|
||||
|
||||
@Test(expected = IOException.class)
|
||||
public void testPOIFSInvalid() throws Exception {
|
||||
testInvalid((f) -> ExtractorFactory.createExtractor(new POIFSFileSystem(f)));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testOPOIFS() throws Exception {
|
||||
// Excel
|
||||
assertTrue(
|
||||
ExtractorFactory.createExtractor(new OPOIFSFileSystem(new FileInputStream(xls)))
|
||||
instanceof ExcelExtractor
|
||||
);
|
||||
assertTrue(
|
||||
ExtractorFactory.createExtractor(new OPOIFSFileSystem(new FileInputStream(xls))).getText().length() > 200
|
||||
);
|
||||
testStream((f) -> ExtractorFactory.createExtractor(new OPOIFSFileSystem(f)), false);
|
||||
}
|
||||
|
||||
// Word
|
||||
assertTrue(
|
||||
ExtractorFactory.createExtractor(new OPOIFSFileSystem(new FileInputStream(doc)))
|
||||
instanceof WordExtractor
|
||||
);
|
||||
assertTrue(
|
||||
ExtractorFactory.createExtractor(new OPOIFSFileSystem(new FileInputStream(doc))).getText().length() > 120
|
||||
);
|
||||
@Test(expected = IOException.class)
|
||||
public void testOPOIFSInvalid() throws Exception {
|
||||
testInvalid((f) -> ExtractorFactory.createExtractor(new OPOIFSFileSystem(f)));
|
||||
}
|
||||
|
||||
assertTrue(
|
||||
ExtractorFactory.createExtractor(new OPOIFSFileSystem(new FileInputStream(doc6)))
|
||||
instanceof Word6Extractor
|
||||
);
|
||||
assertTrue(
|
||||
ExtractorFactory.createExtractor(new OPOIFSFileSystem(new FileInputStream(doc6))).getText().length() > 20
|
||||
);
|
||||
|
||||
assertTrue(
|
||||
ExtractorFactory.createExtractor(new OPOIFSFileSystem(new FileInputStream(doc95)))
|
||||
instanceof Word6Extractor
|
||||
);
|
||||
assertTrue(
|
||||
ExtractorFactory.createExtractor(new OPOIFSFileSystem(new FileInputStream(doc95))).getText().length() > 120
|
||||
);
|
||||
private void testStream(final FunctionEx<FileInputStream, POITextExtractor> poifsIS, final boolean loadOOXML)
|
||||
throws IOException, OpenXML4JException, XmlException {
|
||||
for (int i = 0; i < TEST_SET.length; i += 4) {
|
||||
File testFile = (File) TEST_SET[i + 1];
|
||||
if (!loadOOXML && (testFile.getName().endsWith("x") || testFile.getName().endsWith("xlsb"))) {
|
||||
continue;
|
||||
}
|
||||
try (FileInputStream fis = new FileInputStream(testFile);
|
||||
POITextExtractor ext = poifsIS.apply(fis)) {
|
||||
testExtractor(ext, (String) TEST_SET[i], (Class) TEST_SET[i + 2], (Integer) TEST_SET[i + 3]);
|
||||
} catch (IllegalArgumentException e) {
|
||||
fail("failed to process "+testFile);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// PowerPoint
|
||||
assertTrue(
|
||||
ExtractorFactory.createExtractor(new OPOIFSFileSystem(new FileInputStream(ppt)))
|
||||
instanceof PowerPointExtractor
|
||||
);
|
||||
assertTrue(
|
||||
ExtractorFactory.createExtractor(new OPOIFSFileSystem(new FileInputStream(ppt))).getText().length() > 120
|
||||
);
|
||||
|
||||
// Visio
|
||||
assertTrue(
|
||||
ExtractorFactory.createExtractor(new OPOIFSFileSystem(new FileInputStream(vsd)))
|
||||
instanceof VisioTextExtractor
|
||||
);
|
||||
assertTrue(
|
||||
ExtractorFactory.createExtractor(new OPOIFSFileSystem(new FileInputStream(vsd))).getText().length() > 50
|
||||
);
|
||||
|
||||
// Publisher
|
||||
assertTrue(
|
||||
ExtractorFactory.createExtractor(new OPOIFSFileSystem(new FileInputStream(pub)))
|
||||
instanceof PublisherTextExtractor
|
||||
);
|
||||
assertTrue(
|
||||
ExtractorFactory.createExtractor(new OPOIFSFileSystem(new FileInputStream(pub))).getText().length() > 50
|
||||
);
|
||||
|
||||
// Outlook msg
|
||||
assertTrue(
|
||||
ExtractorFactory.createExtractor(new OPOIFSFileSystem(new FileInputStream(msg)))
|
||||
instanceof OutlookTextExtactor
|
||||
);
|
||||
assertTrue(
|
||||
ExtractorFactory.createExtractor(new OPOIFSFileSystem(new FileInputStream(msg))).getText().length() > 50
|
||||
);
|
||||
private void testExtractor(final POITextExtractor ext, final String testcase, final Class extrClass, final Integer minLength) {
|
||||
assertTrue("invalid extractor for " + testcase, extrClass.isInstance(ext));
|
||||
final String actual = ext.getText();
|
||||
if (minLength == -1) {
|
||||
assertContains(actual.toLowerCase(Locale.ROOT), "test");
|
||||
} else {
|
||||
assertTrue("extracted content too short for " + testcase, actual.length() > minLength);
|
||||
}
|
||||
}
|
||||
|
||||
private void testInvalid(FunctionEx<FileInputStream, POITextExtractor> poifs) throws IOException, OpenXML4JException, XmlException {
|
||||
// Text
|
||||
try {
|
||||
ExtractorFactory.createExtractor(new OPOIFSFileSystem(new FileInputStream(txt)));
|
||||
fail("expected IllegalArgumentException");
|
||||
} catch(IOException e) {
|
||||
// Good
|
||||
try (FileInputStream fis = new FileInputStream(txt);
|
||||
POITextExtractor te = poifs.apply(fis)) {
|
||||
}
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testPackage() throws Exception {
|
||||
// Excel
|
||||
POIXMLTextExtractor extractor = ExtractorFactory.createExtractor(OPCPackage.open(xlsx.toString(), PackageAccess.READ));
|
||||
assertTrue(extractor instanceof XSSFExcelExtractor);
|
||||
extractor.close();
|
||||
extractor = ExtractorFactory.createExtractor(OPCPackage.open(xlsx.toString()));
|
||||
assertTrue(extractor.getText().length() > 200);
|
||||
extractor.close();
|
||||
|
||||
// Word
|
||||
extractor = ExtractorFactory.createExtractor(OPCPackage.open(docx.toString()));
|
||||
assertTrue(extractor instanceof XWPFWordExtractor);
|
||||
extractor.close();
|
||||
|
||||
extractor = ExtractorFactory.createExtractor(OPCPackage.open(docx.toString()));
|
||||
assertTrue(extractor.getText().length() > 120);
|
||||
extractor.close();
|
||||
|
||||
// PowerPoint
|
||||
extractor = ExtractorFactory.createExtractor(OPCPackage.open(pptx.toString()));
|
||||
assertTrue(extractor instanceof XSLFPowerPointExtractor);
|
||||
extractor.close();
|
||||
|
||||
extractor = ExtractorFactory.createExtractor(OPCPackage.open(pptx.toString()));
|
||||
assertTrue(extractor.getText().length() > 120);
|
||||
extractor.close();
|
||||
|
||||
// Visio
|
||||
extractor = ExtractorFactory.createExtractor(OPCPackage.open(vsdx.toString()));
|
||||
assertTrue(extractor instanceof XDGFVisioExtractor);
|
||||
assertTrue(extractor.getText().length() > 20);
|
||||
extractor.close();
|
||||
|
||||
// Text
|
||||
try {
|
||||
ExtractorFactory.createExtractor(OPCPackage.open(txt.toString()));
|
||||
fail("TestExtractorFactory.testPackage() failed on " + txt);
|
||||
} catch(UnsupportedFileFormatException e) {
|
||||
// Good
|
||||
} catch (Exception e) {
|
||||
LOG.log(POILogger.WARN, "TestExtractorFactory.testPackage() failed on " + txt);
|
||||
throw e;
|
||||
for (int i = 0; i < TEST_SET.length; i += 4) {
|
||||
final File testFile = (File) TEST_SET[i + 1];
|
||||
if (!testFile.getName().endsWith("x")) {
|
||||
continue;
|
||||
}
|
||||
|
||||
try (final OPCPackage pkg = OPCPackage.open(testFile, PackageAccess.READ);
|
||||
final POITextExtractor ext = ExtractorFactory.createExtractor(pkg)) {
|
||||
testExtractor(ext, (String) TEST_SET[i], (Class) TEST_SET[i + 2], (Integer) TEST_SET[i + 3]);
|
||||
pkg.revert();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@Test(expected = UnsupportedFileFormatException.class)
|
||||
public void testPackageInvalid() throws Exception {
|
||||
// Text
|
||||
try (final OPCPackage pkg = OPCPackage.open(txt, PackageAccess.READ);
|
||||
final POITextExtractor te = ExtractorFactory.createExtractor(pkg)) {}
|
||||
}
|
||||
|
||||
@Test
|
||||
@ -781,142 +325,49 @@ public class TestExtractorFactory {
|
||||
* does poifs embedded, but will do ooxml ones
|
||||
* at some point.
|
||||
*/
|
||||
@SuppressWarnings("deprecation")
|
||||
@Test
|
||||
public void testEmbedded() throws Exception {
|
||||
POIOLE2TextExtractor ext;
|
||||
POITextExtractor[] embeds;
|
||||
final Object[] testObj = {
|
||||
"No embeddings", xls, "0-0-0-0-0-0",
|
||||
"Excel", xlsEmb, "6-2-2-2-0-0",
|
||||
"Word", docEmb, "4-1-2-1-0-0",
|
||||
"Word which contains an OOXML file", docEmbOOXML, "3-0-1-1-0-1",
|
||||
"Outlook", msgEmb, "1-1-0-0-0-0",
|
||||
"Outlook with another outlook file in it", msgEmbMsg, "1-0-0-0-1-0",
|
||||
};
|
||||
|
||||
// No embeddings
|
||||
ext = (POIOLE2TextExtractor)
|
||||
ExtractorFactory.createExtractor(xls);
|
||||
embeds = ExtractorFactory.getEmbededDocsTextExtractors(ext);
|
||||
assertEquals(0, embeds.length);
|
||||
ext.close();
|
||||
for (int i=0; i<testObj.length; i+=3) {
|
||||
try (final POIOLE2TextExtractor ext = ExtractorFactory.createExtractor((File)testObj[i+1])) {
|
||||
final POITextExtractor[] embeds = ExtractorFactory.getEmbeddedDocsTextExtractors(ext);
|
||||
|
||||
// No embeddings
|
||||
ext = (POIOLE2TextExtractor)
|
||||
ExtractorFactory.createExtractor(xls);
|
||||
embeds = ExtractorFactory.getEmbeddedDocsTextExtractors(ext);
|
||||
assertEquals(0, embeds.length);
|
||||
ext.close();
|
||||
|
||||
// Excel
|
||||
ext = (POIOLE2TextExtractor)
|
||||
ExtractorFactory.createExtractor(xlsEmb);
|
||||
embeds = ExtractorFactory.getEmbededDocsTextExtractors(ext);
|
||||
assertNotNull(embeds);
|
||||
ext.close();
|
||||
|
||||
// Excel
|
||||
ext = (POIOLE2TextExtractor)
|
||||
ExtractorFactory.createExtractor(xlsEmb);
|
||||
embeds = ExtractorFactory.getEmbeddedDocsTextExtractors(ext);
|
||||
|
||||
assertEquals(6, embeds.length);
|
||||
int numWord = 0, numXls = 0, numPpt = 0, numMsg = 0, numWordX;
|
||||
int numWord = 0, numXls = 0, numPpt = 0, numMsg = 0, numWordX = 0;
|
||||
for (POITextExtractor embed : embeds) {
|
||||
assertTrue(embed.getText().length() > 20);
|
||||
|
||||
if (embed instanceof PowerPointExtractor) numPpt++;
|
||||
else if (embed instanceof ExcelExtractor) numXls++;
|
||||
else if (embed instanceof WordExtractor) numWord++;
|
||||
else if (embed instanceof OutlookTextExtactor) numMsg++;
|
||||
if (embed instanceof SlideShowExtractor) {
|
||||
numPpt++;
|
||||
} else if (embed instanceof ExcelExtractor) {
|
||||
numXls++;
|
||||
} else if (embed instanceof WordExtractor) {
|
||||
numWord++;
|
||||
} else if (embed instanceof OutlookTextExtactor) {
|
||||
numMsg++;
|
||||
} else if (embed instanceof XWPFWordExtractor) {
|
||||
numWordX++;
|
||||
}
|
||||
assertEquals(2, numPpt);
|
||||
assertEquals(2, numXls);
|
||||
assertEquals(2, numWord);
|
||||
assertEquals(0, numMsg);
|
||||
ext.close();
|
||||
|
||||
// Word
|
||||
ext = (POIOLE2TextExtractor)
|
||||
ExtractorFactory.createExtractor(docEmb);
|
||||
embeds = ExtractorFactory.getEmbededDocsTextExtractors(ext);
|
||||
|
||||
numWord = 0; numXls = 0; numPpt = 0; numMsg = 0;
|
||||
assertEquals(4, embeds.length);
|
||||
for (POITextExtractor embed : embeds) {
|
||||
assertTrue(embed.getText().length() > 20);
|
||||
if (embed instanceof PowerPointExtractor) numPpt++;
|
||||
else if (embed instanceof ExcelExtractor) numXls++;
|
||||
else if (embed instanceof WordExtractor) numWord++;
|
||||
else if (embed instanceof OutlookTextExtactor) numMsg++;
|
||||
}
|
||||
assertEquals(1, numPpt);
|
||||
assertEquals(2, numXls);
|
||||
assertEquals(1, numWord);
|
||||
assertEquals(0, numMsg);
|
||||
ext.close();
|
||||
|
||||
// Word which contains an OOXML file
|
||||
ext = (POIOLE2TextExtractor)
|
||||
ExtractorFactory.createExtractor(docEmbOOXML);
|
||||
embeds = ExtractorFactory.getEmbededDocsTextExtractors(ext);
|
||||
|
||||
numWord = 0; numXls = 0; numPpt = 0; numMsg = 0; numWordX = 0;
|
||||
assertEquals(3, embeds.length);
|
||||
for (POITextExtractor embed : embeds) {
|
||||
assertTrue(embed.getText().length() > 20);
|
||||
if (embed instanceof PowerPointExtractor) numPpt++;
|
||||
else if (embed instanceof ExcelExtractor) numXls++;
|
||||
else if (embed instanceof WordExtractor) numWord++;
|
||||
else if (embed instanceof OutlookTextExtactor) numMsg++;
|
||||
else if (embed instanceof XWPFWordExtractor) numWordX++;
|
||||
final String actual = embeds.length+"-"+numWord+"-"+numXls+"-"+numPpt+"-"+numMsg+"-"+numWordX;
|
||||
final String expected = (String)testObj[i+2];
|
||||
assertEquals("invalid number of embeddings - "+testObj[i], expected, actual);
|
||||
}
|
||||
assertEquals(1, numPpt);
|
||||
assertEquals(1, numXls);
|
||||
assertEquals(0, numWord);
|
||||
assertEquals(1, numWordX);
|
||||
assertEquals(0, numMsg);
|
||||
ext.close();
|
||||
|
||||
// Outlook
|
||||
ext = (OutlookTextExtactor)
|
||||
ExtractorFactory.createExtractor(msgEmb);
|
||||
embeds = ExtractorFactory.getEmbededDocsTextExtractors(ext);
|
||||
|
||||
numWord = 0; numXls = 0; numPpt = 0; numMsg = 0;
|
||||
assertEquals(1, embeds.length);
|
||||
for (POITextExtractor embed : embeds) {
|
||||
assertTrue(embed.getText().length() > 20);
|
||||
if (embed instanceof PowerPointExtractor) numPpt++;
|
||||
else if (embed instanceof ExcelExtractor) numXls++;
|
||||
else if (embed instanceof WordExtractor) numWord++;
|
||||
else if (embed instanceof OutlookTextExtactor) numMsg++;
|
||||
}
|
||||
assertEquals(0, numPpt);
|
||||
assertEquals(0, numXls);
|
||||
assertEquals(1, numWord);
|
||||
assertEquals(0, numMsg);
|
||||
ext.close();
|
||||
|
||||
// Outlook with another outlook file in it
|
||||
ext = (OutlookTextExtactor)
|
||||
ExtractorFactory.createExtractor(msgEmbMsg);
|
||||
embeds = ExtractorFactory.getEmbededDocsTextExtractors(ext);
|
||||
|
||||
numWord = 0; numXls = 0; numPpt = 0; numMsg = 0;
|
||||
assertEquals(1, embeds.length);
|
||||
for (POITextExtractor embed : embeds) {
|
||||
assertTrue(embed.getText().length() > 20);
|
||||
if (embed instanceof PowerPointExtractor) numPpt++;
|
||||
else if (embed instanceof ExcelExtractor) numXls++;
|
||||
else if (embed instanceof WordExtractor) numWord++;
|
||||
else if (embed instanceof OutlookTextExtactor) numMsg++;
|
||||
}
|
||||
assertEquals(0, numPpt);
|
||||
assertEquals(0, numXls);
|
||||
assertEquals(0, numWord);
|
||||
assertEquals(1, numMsg);
|
||||
ext.close();
|
||||
|
||||
// TODO - PowerPoint
|
||||
// TODO - Publisher
|
||||
// TODO - Visio
|
||||
}
|
||||
|
||||
private static final String[] EXPECTED_FAILURES = new String[] {
|
||||
private static final String[] EXPECTED_FAILURES = {
|
||||
// password protected files
|
||||
"spreadsheet/password.xls",
|
||||
"spreadsheet/protected_passtika.xlsx",
|
||||
@ -1018,35 +469,24 @@ public class TestExtractorFactory {
|
||||
* #59074 - Excel 95 files should give a helpful message, not just
|
||||
* "No supported documents found in the OLE2 stream"
|
||||
*/
|
||||
@Test
|
||||
@Test(expected = OldExcelFormatException.class)
|
||||
public void bug59074() throws Exception {
|
||||
try {
|
||||
ExtractorFactory.createExtractor(
|
||||
POIDataSamples.getSpreadSheetInstance().getFile("59074.xls"));
|
||||
fail("Old excel formats not supported via ExtractorFactory");
|
||||
} catch (OldExcelFormatException e) {
|
||||
// expected here
|
||||
}
|
||||
}
|
||||
|
||||
@SuppressWarnings("deprecation")
|
||||
@Test
|
||||
public void testGetEmbeddedFromXMLExtractor() {
|
||||
try {
|
||||
@Test(expected = IllegalStateException.class)
|
||||
public void testGetEmbedFromXMLExtractor() {
|
||||
// currently not implemented
|
||||
ExtractorFactory.getEmbededDocsTextExtractors((POIXMLTextExtractor)null);
|
||||
fail("Unsupported currently");
|
||||
} catch (IllegalStateException e) {
|
||||
// expected here
|
||||
ExtractorFactory.getEmbededDocsTextExtractors((POIXMLTextExtractor) null);
|
||||
}
|
||||
|
||||
try {
|
||||
@SuppressWarnings("deprecation")
|
||||
@Test(expected = IllegalStateException.class)
|
||||
public void testGetEmbeddedFromXMLExtractor() {
|
||||
// currently not implemented
|
||||
ExtractorFactory.getEmbeddedDocsTextExtractors((POIXMLTextExtractor)null);
|
||||
fail("Unsupported currently");
|
||||
} catch (IllegalStateException e) {
|
||||
// expected here
|
||||
}
|
||||
}
|
||||
|
||||
// This bug is currently open. This test will fail with "expected error not thrown" when the bug has been fixed.
|
||||
|
@ -120,10 +120,10 @@ public class TestHxxFEncryption {
|
||||
public void newPassword(String newPass) throws IOException, OpenXML4JException, XmlException {
|
||||
Biff8EncryptionKey.setCurrentUserPassword(password);
|
||||
File f = sampleDir.getFile(file);
|
||||
POIOLE2TextExtractor te1 = (POIOLE2TextExtractor)ExtractorFactory.createExtractor(f);
|
||||
POITextExtractor te1 = ExtractorFactory.createExtractor(f);
|
||||
Biff8EncryptionKey.setCurrentUserPassword(newPass);
|
||||
ByteArrayOutputStream bos = new ByteArrayOutputStream();
|
||||
POIDocument doc = te1.getDocument();
|
||||
POIDocument doc = (POIDocument)te1.getDocument();
|
||||
doc.write(bos);
|
||||
doc.close();
|
||||
te1.close();
|
||||
@ -140,25 +140,25 @@ public class TestHxxFEncryption {
|
||||
ByteArrayOutputStream bos = new ByteArrayOutputStream();
|
||||
Biff8EncryptionKey.setCurrentUserPassword(password);
|
||||
File f = sampleDir.getFile(file);
|
||||
POIOLE2TextExtractor te1 = (POIOLE2TextExtractor)ExtractorFactory.createExtractor(f);
|
||||
POITextExtractor te1 = ExtractorFactory.createExtractor(f);
|
||||
// first remove encryption
|
||||
Biff8EncryptionKey.setCurrentUserPassword(null);
|
||||
POIDocument doc = te1.getDocument();
|
||||
POIDocument doc = (POIDocument)te1.getDocument();
|
||||
doc.write(bos);
|
||||
doc.close();
|
||||
te1.close();
|
||||
// then use default setting, which is cryptoapi
|
||||
String newPass = "newPass";
|
||||
POIOLE2TextExtractor te2 = (POIOLE2TextExtractor)ExtractorFactory.createExtractor(new ByteArrayInputStream(bos.toByteArray()));
|
||||
POITextExtractor te2 = ExtractorFactory.createExtractor(new ByteArrayInputStream(bos.toByteArray()));
|
||||
Biff8EncryptionKey.setCurrentUserPassword(newPass);
|
||||
doc = te2.getDocument();
|
||||
doc = (POIDocument)te2.getDocument();
|
||||
bos.reset();
|
||||
doc.write(bos);
|
||||
doc.close();
|
||||
te2.close();
|
||||
// and finally update cryptoapi setting
|
||||
POIOLE2TextExtractor te3 = (POIOLE2TextExtractor)ExtractorFactory.createExtractor(new ByteArrayInputStream(bos.toByteArray()));
|
||||
doc = te3.getDocument();
|
||||
POITextExtractor te3 = ExtractorFactory.createExtractor(new ByteArrayInputStream(bos.toByteArray()));
|
||||
doc = (POIDocument)te3.getDocument();
|
||||
// need to cache data (i.e. read all data) before changing the key size
|
||||
if (doc instanceof HSLFSlideShowImpl) {
|
||||
HSLFSlideShowImpl hss = (HSLFSlideShowImpl)doc;
|
||||
@ -175,8 +175,8 @@ public class TestHxxFEncryption {
|
||||
doc.close();
|
||||
te3.close();
|
||||
// check the setting
|
||||
POIOLE2TextExtractor te4 = (POIOLE2TextExtractor)ExtractorFactory.createExtractor(new ByteArrayInputStream(bos.toByteArray()));
|
||||
doc = te4.getDocument();
|
||||
POITextExtractor te4 = ExtractorFactory.createExtractor(new ByteArrayInputStream(bos.toByteArray()));
|
||||
doc = (POIDocument)te4.getDocument();
|
||||
ei = doc.getEncryptionInfo();
|
||||
assertNotNull(ei);
|
||||
assertTrue(ei.getHeader() instanceof CryptoAPIEncryptionHeader);
|
||||
|
@ -50,6 +50,7 @@ import org.apache.poi.openxml4j.opc.OPCPackage;
|
||||
import org.apache.poi.openxml4j.opc.PackagePartName;
|
||||
import org.apache.poi.openxml4j.opc.PackagingURIHelper;
|
||||
import org.apache.poi.sl.draw.DrawPaint;
|
||||
import org.apache.poi.sl.extractor.SlideShowExtractor;
|
||||
import org.apache.poi.sl.usermodel.PaintStyle;
|
||||
import org.apache.poi.sl.usermodel.PaintStyle.SolidPaint;
|
||||
import org.apache.poi.sl.usermodel.PaintStyle.TexturePaint;
|
||||
@ -221,8 +222,8 @@ public class TestXSLFBugs {
|
||||
* rID2 -> slide3.xml
|
||||
*/
|
||||
@Test
|
||||
public void bug54916() throws Exception {
|
||||
XMLSlideShow ss = XSLFTestDataSamples.openSampleDocument("OverlappingRelations.pptx");
|
||||
public void bug54916() throws IOException {
|
||||
try (XMLSlideShow ss = XSLFTestDataSamples.openSampleDocument("OverlappingRelations.pptx")) {
|
||||
XSLFSlide slide;
|
||||
|
||||
// Should find 4 slides
|
||||
@ -230,19 +231,18 @@ public class TestXSLFBugs {
|
||||
|
||||
// Check the text, to see we got them in order
|
||||
slide = ss.getSlides().get(0);
|
||||
assertContains(getSlideText(slide), "POI cannot read this");
|
||||
assertContains(getSlideText(ss, slide), "POI cannot read this");
|
||||
|
||||
slide = ss.getSlides().get(1);
|
||||
assertContains(getSlideText(slide), "POI can read this");
|
||||
assertContains(getSlideText(slide), "Has a relationship to another slide");
|
||||
assertContains(getSlideText(ss, slide), "POI can read this");
|
||||
assertContains(getSlideText(ss, slide), "Has a relationship to another slide");
|
||||
|
||||
slide = ss.getSlides().get(2);
|
||||
assertContains(getSlideText(slide), "POI can read this");
|
||||
assertContains(getSlideText(ss, slide), "POI can read this");
|
||||
|
||||
slide = ss.getSlides().get(3);
|
||||
assertContains(getSlideText(slide), "POI can read this");
|
||||
|
||||
ss.close();
|
||||
assertContains(getSlideText(ss, slide), "POI can read this");
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
@ -311,8 +311,15 @@ public class TestXSLFBugs {
|
||||
ss.close();
|
||||
}
|
||||
|
||||
protected String getSlideText(XSLFSlide slide) {
|
||||
return XSLFPowerPointExtractor.getText(slide, true, false, false);
|
||||
protected String getSlideText(XMLSlideShow ppt, XSLFSlide slide) throws IOException {
|
||||
try (SlideShowExtractor extr = new SlideShowExtractor(ppt)) {
|
||||
// do not auto-close the slideshow
|
||||
extr.setFilesystem(null);
|
||||
extr.setSlidesByDefault(true);
|
||||
extr.setNotesByDefault(false);
|
||||
extr.setMasterByDefault(false);
|
||||
return extr.getText(slide);
|
||||
}
|
||||
}
|
||||
|
||||
@Test
|
||||
@ -458,7 +465,7 @@ public class TestXSLFBugs {
|
||||
|
||||
for (int i = 0; i < slideTexts.length; i++) {
|
||||
XSLFSlide slide = ss.getSlides().get(i);
|
||||
assertContains(getSlideText(slide), slideTexts[i]);
|
||||
assertContains(getSlideText(ss, slide), slideTexts[i]);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -24,16 +24,17 @@ import static org.junit.Assert.assertFalse;
|
||||
import static org.junit.Assert.assertNotNull;
|
||||
import static org.junit.Assert.assertTrue;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
|
||||
import org.apache.poi.POIDataSamples;
|
||||
import org.apache.poi.POITextExtractor;
|
||||
import org.apache.poi.extractor.ExtractorFactory;
|
||||
import org.apache.poi.openxml4j.exceptions.OpenXML4JException;
|
||||
import org.apache.poi.openxml4j.opc.OPCPackage;
|
||||
import org.apache.poi.sl.extractor.SlideShowExtractor;
|
||||
import org.apache.poi.xslf.usermodel.XMLSlideShow;
|
||||
import org.apache.xmlbeans.XmlException;
|
||||
import org.junit.Ignore;
|
||||
import org.junit.Test;
|
||||
|
||||
/**
|
||||
@ -44,21 +45,12 @@ public class TestXSLFPowerPointExtractor {
|
||||
|
||||
/**
|
||||
* Get text out of the simple file
|
||||
* @throws XmlException
|
||||
* @throws OpenXML4JException
|
||||
*/
|
||||
@Test
|
||||
public void testGetSimpleText()
|
||||
throws IOException, XmlException, OpenXML4JException {
|
||||
XMLSlideShow xmlA = openPPTX("sample.pptx");
|
||||
@SuppressWarnings("resource")
|
||||
OPCPackage pkg = xmlA.getPackage();
|
||||
public void testGetSimpleText() throws IOException {
|
||||
try (XMLSlideShow xmlA = openPPTX("sample.pptx");
|
||||
SlideShowExtractor extractor = new SlideShowExtractor(xmlA)) {
|
||||
|
||||
new XSLFPowerPointExtractor(xmlA).close();
|
||||
new XSLFPowerPointExtractor(pkg).close();
|
||||
|
||||
XSLFPowerPointExtractor extractor =
|
||||
new XSLFPowerPointExtractor(xmlA);
|
||||
extractor.getText();
|
||||
|
||||
String text = extractor.getText();
|
||||
@ -82,7 +74,10 @@ public class TestXSLFPowerPointExtractor {
|
||||
// "Fifth level\n";
|
||||
|
||||
// Just slides, no notes
|
||||
text = extractor.getText(true, false, false);
|
||||
extractor.setSlidesByDefault(true);
|
||||
extractor.setNotesByDefault(false);
|
||||
extractor.setMasterByDefault(false);
|
||||
text = extractor.getText();
|
||||
String slideText =
|
||||
"Lorem ipsum dolor sit amet\n" +
|
||||
"Nunc at risus vel erat tempus posuere. Aenean non ante.\n" +
|
||||
@ -97,11 +92,15 @@ public class TestXSLFPowerPointExtractor {
|
||||
assertEquals(slideText, text);
|
||||
|
||||
// Just notes, no slides
|
||||
text = extractor.getText(false, true);
|
||||
extractor.setSlidesByDefault(false);
|
||||
extractor.setNotesByDefault(true);
|
||||
text = extractor.getText();
|
||||
assertEquals("\n\n1\n\n\n2\n", text);
|
||||
|
||||
// Both
|
||||
text = extractor.getText(true, true, false);
|
||||
extractor.setSlidesByDefault(true);
|
||||
extractor.setNotesByDefault(true);
|
||||
text = extractor.getText();
|
||||
String bothText =
|
||||
"Lorem ipsum dolor sit amet\n" +
|
||||
"Nunc at risus vel erat tempus posuere. Aenean non ante.\n" +
|
||||
@ -116,7 +115,10 @@ public class TestXSLFPowerPointExtractor {
|
||||
assertEquals(bothText, text);
|
||||
|
||||
// With Slides and Master Text
|
||||
text = extractor.getText(true, false, true);
|
||||
extractor.setSlidesByDefault(true);
|
||||
extractor.setNotesByDefault(false);
|
||||
extractor.setMasterByDefault(true);
|
||||
text = extractor.getText();
|
||||
String smText =
|
||||
"Lorem ipsum dolor sit amet\n" +
|
||||
"Nunc at risus vel erat tempus posuere. Aenean non ante.\n" +
|
||||
@ -131,7 +133,10 @@ public class TestXSLFPowerPointExtractor {
|
||||
assertEquals(smText, text);
|
||||
|
||||
// With Slides, Notes and Master Text
|
||||
text = extractor.getText(true, true, true);
|
||||
extractor.setSlidesByDefault(true);
|
||||
extractor.setNotesByDefault(true);
|
||||
extractor.setMasterByDefault(true);
|
||||
text = extractor.getText();
|
||||
String snmText =
|
||||
"Lorem ipsum dolor sit amet\n" +
|
||||
"Nunc at risus vel erat tempus posuere. Aenean non ante.\n" +
|
||||
@ -150,14 +155,14 @@ public class TestXSLFPowerPointExtractor {
|
||||
extractor.setNotesByDefault(true);
|
||||
text = extractor.getText();
|
||||
assertEquals("\n\n1\n\n\n2\n", text);
|
||||
|
||||
extractor.close();
|
||||
xmlA.close();
|
||||
}
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testGetComments() throws IOException {
|
||||
XMLSlideShow xml = openPPTX("45545_Comment.pptx");
|
||||
XSLFPowerPointExtractor extractor = new XSLFPowerPointExtractor(xml);
|
||||
try (XMLSlideShow xml = openPPTX("45545_Comment.pptx");
|
||||
SlideShowExtractor extractor = new SlideShowExtractor(xml)) {
|
||||
extractor.setCommentsByDefault(true);
|
||||
|
||||
String text = extractor.getText();
|
||||
assertTrue(text.length() > 0);
|
||||
@ -168,18 +173,19 @@ public class TestXSLFPowerPointExtractor {
|
||||
|
||||
// Check the authors came through too
|
||||
assertContains(text, "XPVMWARE01");
|
||||
|
||||
extractor.close();
|
||||
xml.close();
|
||||
}
|
||||
}
|
||||
|
||||
@Test
|
||||
@Ignore("currently slidelayouts aren't yet supported")
|
||||
public void testGetMasterText() throws Exception {
|
||||
XMLSlideShow xml = openPPTX("WithMaster.pptx");
|
||||
XSLFPowerPointExtractor extractor = new XSLFPowerPointExtractor(xml);
|
||||
try (XMLSlideShow xml = openPPTX("WithMaster.pptx");
|
||||
SlideShowExtractor extractor = new SlideShowExtractor(xml)) {
|
||||
extractor.setSlidesByDefault(true);
|
||||
extractor.setNotesByDefault(false);
|
||||
extractor.setMasterByDefault(true);
|
||||
|
||||
|
||||
String text = extractor.getText();
|
||||
assertTrue(text.length() > 0);
|
||||
|
||||
@ -208,24 +214,20 @@ public class TestXSLFPowerPointExtractor {
|
||||
"This is the Master Title\n" +
|
||||
"This text comes from the Master Slide\n";
|
||||
assertEquals(wholeText, text);
|
||||
|
||||
extractor.close();
|
||||
xml.close();
|
||||
}
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testTable() throws Exception {
|
||||
XMLSlideShow xml = openPPTX("present1.pptx");
|
||||
XSLFPowerPointExtractor extractor = new XSLFPowerPointExtractor(xml);
|
||||
try (XMLSlideShow xml = openPPTX("present1.pptx");
|
||||
SlideShowExtractor extractor = new SlideShowExtractor(xml)) {
|
||||
|
||||
String text = extractor.getText();
|
||||
assertTrue(text.length() > 0);
|
||||
|
||||
// Check comments are there
|
||||
assertContains(text, "TEST");
|
||||
|
||||
extractor.close();
|
||||
xml.close();
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
@ -241,8 +243,9 @@ public class TestXSLFPowerPointExtractor {
|
||||
};
|
||||
for(String extension : extensions) {
|
||||
String filename = "testPPT." + extension;
|
||||
XMLSlideShow xml = openPPTX(filename);
|
||||
XSLFPowerPointExtractor extractor = new XSLFPowerPointExtractor(xml);
|
||||
|
||||
try (XMLSlideShow xml = openPPTX(filename);
|
||||
SlideShowExtractor extractor = new SlideShowExtractor(xml)) {
|
||||
|
||||
String text = extractor.getText();
|
||||
if (extension.equals("thmx")) {
|
||||
@ -257,58 +260,59 @@ public class TestXSLFPowerPointExtractor {
|
||||
assertContains(filename, text, "content parsing");
|
||||
assertContains(filename, text, "Different words to test against");
|
||||
assertContains(filename, text, "Mystery");
|
||||
|
||||
extractor.close();
|
||||
xml.close();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@Test
|
||||
public void test45541() throws Exception {
|
||||
public void test45541() throws IOException, OpenXML4JException, XmlException {
|
||||
// extract text from a powerpoint that has a header in the notes-element
|
||||
POITextExtractor extr = ExtractorFactory.createExtractor(
|
||||
slTests.getFile("45541_Header.pptx"));
|
||||
final File headerFile = slTests.getFile("45541_Header.pptx");
|
||||
try (final SlideShowExtractor extr = ExtractorFactory.createExtractor(headerFile)) {
|
||||
String text = extr.getText();
|
||||
assertNotNull(text);
|
||||
assertFalse("Had: " + text, text.contains("testdoc"));
|
||||
|
||||
text = ((XSLFPowerPointExtractor)extr).getText(false, true);
|
||||
extr.setSlidesByDefault(false);
|
||||
extr.setNotesByDefault(true);
|
||||
|
||||
text = extr.getText();
|
||||
assertContains(text, "testdoc");
|
||||
extr.close();
|
||||
assertNotNull(text);
|
||||
}
|
||||
|
||||
// extract text from a powerpoint that has a footer in the master-slide
|
||||
extr = ExtractorFactory.createExtractor(
|
||||
slTests.getFile("45541_Footer.pptx"));
|
||||
final File footerFile = slTests.getFile("45541_Footer.pptx");
|
||||
try (SlideShowExtractor extr = ExtractorFactory.createExtractor(footerFile)) {
|
||||
String text = extr.getText();
|
||||
assertNotContained(text, "testdoc");
|
||||
|
||||
extr.setSlidesByDefault(false);
|
||||
extr.setNotesByDefault(true);
|
||||
text = extr.getText();
|
||||
assertNotContained(text, "testdoc");
|
||||
|
||||
text = ((XSLFPowerPointExtractor)extr).getText(false, true);
|
||||
extr.setSlidesByDefault(false);
|
||||
extr.setNotesByDefault(false);
|
||||
extr.setMasterByDefault(true);
|
||||
text = extr.getText();
|
||||
assertNotContained(text, "testdoc");
|
||||
|
||||
text = ((XSLFPowerPointExtractor)extr).getText(false, false, true);
|
||||
assertNotContained(text, "testdoc");
|
||||
|
||||
extr.close();
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
public void bug54570() throws IOException {
|
||||
XMLSlideShow xml = openPPTX("bug54570.pptx");
|
||||
XSLFPowerPointExtractor extractor = new XSLFPowerPointExtractor(xml);
|
||||
try (XMLSlideShow xml = openPPTX("bug54570.pptx");
|
||||
SlideShowExtractor extractor = new SlideShowExtractor(xml)) {
|
||||
String text = extractor.getText();
|
||||
assertNotNull(text);
|
||||
extractor.close();
|
||||
xml.close();
|
||||
}
|
||||
}
|
||||
|
||||
private XMLSlideShow openPPTX(String file) throws IOException {
|
||||
InputStream is = slTests.openResourceAsStream(file);
|
||||
try {
|
||||
try (InputStream is = slTests.openResourceAsStream(file)) {
|
||||
return new XMLSlideShow(is);
|
||||
} finally {
|
||||
is.close();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -38,6 +38,8 @@ import org.apache.poi.hwpf.extractor.WordExtractor;
|
||||
import org.apache.poi.poifs.filesystem.DirectoryEntry;
|
||||
import org.apache.poi.poifs.filesystem.DirectoryNode;
|
||||
import org.apache.poi.poifs.filesystem.Entry;
|
||||
import org.apache.poi.sl.extractor.SlideShowExtractor;
|
||||
import org.apache.poi.sl.usermodel.SlideShowFactory;
|
||||
|
||||
/**
|
||||
* Scratchpad-specific logic for {@link OLE2ExtractorFactory} and
|
||||
@ -65,7 +67,7 @@ public class OLE2ScratchpadExtractorFactory {
|
||||
}
|
||||
|
||||
if (poifsDir.hasEntry(HSLFSlideShow.POWERPOINT_DOCUMENT)) {
|
||||
return new PowerPointExtractor(poifsDir);
|
||||
return new SlideShowExtractor(SlideShowFactory.create(poifsDir));
|
||||
}
|
||||
|
||||
if (poifsDir.hasEntry("VisioDocument")) {
|
||||
|
@ -34,6 +34,7 @@ import org.apache.poi.poifs.filesystem.NPOIFSFileSystem;
|
||||
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
|
||||
import org.apache.poi.sl.extractor.SlideShowExtractor;
|
||||
import org.apache.poi.sl.usermodel.SlideShowFactory;
|
||||
import org.apache.poi.util.Removal;
|
||||
|
||||
/**
|
||||
* This class can be used to extract text from a PowerPoint file. Can optionally
|
||||
@ -43,6 +44,7 @@ import org.apache.poi.sl.usermodel.SlideShowFactory;
|
||||
*/
|
||||
@SuppressWarnings("WeakerAccess")
|
||||
@Deprecated
|
||||
@Removal(version="5.0.0")
|
||||
public final class PowerPointExtractor extends POIOLE2TextExtractor {
|
||||
private final SlideShowExtractor<HSLFShape,HSLFTextParagraph> delegate;
|
||||
|
||||
|
@ -1139,4 +1139,9 @@ public final class HSLFSlideShow implements SlideShow<HSLFShape,HSLFTextParagrap
|
||||
public void close() throws IOException {
|
||||
_hslfSlideShow.close();
|
||||
}
|
||||
|
||||
@Override
|
||||
public Object getPersistDocument() {
|
||||
return getSlideShowImpl();
|
||||
}
|
||||
}
|
||||
|
@ -19,8 +19,8 @@ package org.apache.poi.hslf.usermodel;
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
import org.apache.poi.poifs.filesystem.DirectoryNode;
|
||||
import org.apache.poi.poifs.filesystem.NPOIFSFileSystem;
|
||||
import org.apache.poi.sl.usermodel.SlideShow;
|
||||
import org.apache.poi.sl.usermodel.SlideShowFactory;
|
||||
import org.apache.poi.util.Internal;
|
||||
|
||||
@ -31,12 +31,20 @@ import org.apache.poi.util.Internal;
|
||||
@Internal
|
||||
public class HSLFSlideShowFactory extends SlideShowFactory {
|
||||
/**
|
||||
* Creates a HSLFSlideShow from the given NPOIFSFileSystem
|
||||
* <p>Note that in order to properly release resources the
|
||||
* Creates a HSLFSlideShow from the given NPOIFSFileSystem<p>
|
||||
* Note that in order to properly release resources the
|
||||
* SlideShow should be closed after use.
|
||||
*/
|
||||
public static SlideShow<?,?> createSlideShow(NPOIFSFileSystem fs) throws IOException {
|
||||
public static HSLFSlideShow createSlideShow(final NPOIFSFileSystem fs) throws IOException {
|
||||
return new HSLFSlideShow(fs);
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates a HSLFSlideShow from the given DirectoryNode<p>
|
||||
* Note that in order to properly release resources the
|
||||
* SlideShow should be closed after use.
|
||||
*/
|
||||
public static HSLFSlideShow createSlideShow(final DirectoryNode root) throws IOException {
|
||||
return new HSLFSlideShow(root);
|
||||
}
|
||||
}
|
||||
|
@ -846,11 +846,15 @@ public final class HSLFSlideShowImpl extends POIDocument implements Closeable {
|
||||
|
||||
@Override
|
||||
public void close() throws IOException {
|
||||
// only close the filesystem, if we are based on the root node.
|
||||
// embedded documents/slideshows shouldn't close the parent container
|
||||
if (getDirectory().getParent() == null) {
|
||||
NPOIFSFileSystem fs = getDirectory().getFileSystem();
|
||||
if (fs != null) {
|
||||
fs.close();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
protected String getEncryptedPropertyStreamName() {
|
||||
|
@ -42,6 +42,10 @@ import org.apache.poi.poifs.filesystem.DirectoryNode;
|
||||
import org.apache.poi.poifs.filesystem.NPOIFSFileSystem;
|
||||
import org.apache.poi.poifs.filesystem.OPOIFSFileSystem;
|
||||
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
|
||||
import org.apache.poi.sl.extractor.SlideShowExtractor;
|
||||
import org.apache.poi.sl.usermodel.ObjectShape;
|
||||
import org.apache.poi.sl.usermodel.SlideShow;
|
||||
import org.apache.poi.sl.usermodel.SlideShowFactory;
|
||||
import org.apache.poi.util.IOUtils;
|
||||
import org.junit.Test;
|
||||
|
||||
@ -76,43 +80,46 @@ public final class TestExtractor {
|
||||
// ppe.close();
|
||||
// }
|
||||
|
||||
private PowerPointExtractor openExtractor(String fileName) throws IOException {
|
||||
InputStream is = slTests.openResourceAsStream(fileName);
|
||||
try {
|
||||
return new PowerPointExtractor(is);
|
||||
} finally {
|
||||
is.close();
|
||||
private SlideShowExtractor<?,?> openExtractor(String fileName) throws IOException {
|
||||
try (InputStream is = slTests.openResourceAsStream(fileName)) {
|
||||
return new SlideShowExtractor(SlideShowFactory.create(is));
|
||||
}
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testReadSheetText() throws IOException {
|
||||
// Basic 2 page example
|
||||
PowerPointExtractor ppe = openExtractor("basic_test_ppt_file.ppt");
|
||||
try (SlideShowExtractor ppe = openExtractor("basic_test_ppt_file.ppt")) {
|
||||
assertEquals(expectText, ppe.getText());
|
||||
ppe.close();
|
||||
}
|
||||
|
||||
// 1 page example with text boxes
|
||||
PowerPointExtractor ppe2 = openExtractor("with_textbox.ppt");
|
||||
assertEquals(expectText2, ppe2.getText());
|
||||
ppe2.close();
|
||||
try (SlideShowExtractor ppe = openExtractor("with_textbox.ppt")) {
|
||||
assertEquals(expectText2, ppe.getText());
|
||||
}
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testReadNoteText() throws IOException {
|
||||
// Basic 2 page example
|
||||
PowerPointExtractor ppe = openExtractor("basic_test_ppt_file.ppt");
|
||||
String notesText = ppe.getNotes();
|
||||
try (SlideShowExtractor ppe = openExtractor("basic_test_ppt_file.ppt")) {
|
||||
ppe.setNotesByDefault(true);
|
||||
ppe.setSlidesByDefault(false);
|
||||
ppe.setMasterByDefault(false);
|
||||
String notesText = ppe.getText();
|
||||
String expText = "\nThese are the notes for page 1\n\nThese are the notes on page two, again lacking formatting\n";
|
||||
assertEquals(expText, notesText);
|
||||
ppe.close();
|
||||
}
|
||||
|
||||
// Other one doesn't have notes
|
||||
PowerPointExtractor ppe2 = openExtractor("with_textbox.ppt");
|
||||
notesText = ppe2.getNotes();
|
||||
expText = "";
|
||||
try (SlideShowExtractor ppe = openExtractor("with_textbox.ppt")) {
|
||||
ppe.setNotesByDefault(true);
|
||||
ppe.setSlidesByDefault(false);
|
||||
ppe.setMasterByDefault(false);
|
||||
String notesText = ppe.getText();
|
||||
String expText = "";
|
||||
assertEquals(expText, notesText);
|
||||
ppe2.close();
|
||||
}
|
||||
}
|
||||
|
||||
@Test
|
||||
@ -126,7 +133,7 @@ public final class TestExtractor {
|
||||
"\nThese are the notes on page two, again lacking formatting\n"
|
||||
};
|
||||
|
||||
PowerPointExtractor ppe = openExtractor("basic_test_ppt_file.ppt");
|
||||
try (SlideShowExtractor ppe = openExtractor("basic_test_ppt_file.ppt")) {
|
||||
ppe.setSlidesByDefault(true);
|
||||
ppe.setNotesByDefault(false);
|
||||
assertEquals(slText[0] + slText[1], ppe.getText());
|
||||
@ -138,7 +145,7 @@ public final class TestExtractor {
|
||||
ppe.setSlidesByDefault(true);
|
||||
ppe.setNotesByDefault(true);
|
||||
assertEquals(slText[0] + ntText[0] + slText[1] + ntText[1], ppe.getText());
|
||||
ppe.close();
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
@ -149,10 +156,13 @@ public final class TestExtractor {
|
||||
*/
|
||||
@Test
|
||||
public void testMissingCoreRecords() throws IOException {
|
||||
PowerPointExtractor ppe = openExtractor("missing_core_records.ppt");
|
||||
|
||||
String text = ppe.getText(true, false);
|
||||
String nText = ppe.getNotes();
|
||||
try (SlideShowExtractor<?,?> ppe = openExtractor("missing_core_records.ppt")) {
|
||||
ppe.setSlidesByDefault(true);
|
||||
ppe.setNotesByDefault(false);
|
||||
String text = ppe.getText();
|
||||
ppe.setSlidesByDefault(false);
|
||||
ppe.setNotesByDefault(true);
|
||||
String nText = ppe.getText();
|
||||
|
||||
assertNotNull(text);
|
||||
assertNotNull(nText);
|
||||
@ -162,32 +172,30 @@ public final class TestExtractor {
|
||||
|
||||
// Slide records were fine
|
||||
assertContains(text, "Using Disease Surveillance and Response");
|
||||
|
||||
ppe.close();
|
||||
}
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testExtractFromEmbeded() throws IOException {
|
||||
InputStream is = POIDataSamples.getSpreadSheetInstance().openResourceAsStream("excel_with_embeded.xls");
|
||||
POIFSFileSystem fs = new POIFSFileSystem(is);
|
||||
DirectoryNode root = fs.getRoot();
|
||||
PowerPointExtractor ppe1 = assertExtractFromEmbedded(root, "MBD0000A3B6", "Sample PowerPoint file\nThis is the 1st file\nNot much too it\n");
|
||||
PowerPointExtractor ppe2 = assertExtractFromEmbedded(root, "MBD0000A3B3", "Sample PowerPoint file\nThis is the 2nd file\nNot much too it either\n");
|
||||
ppe2.close();
|
||||
ppe1.close();
|
||||
fs.close();
|
||||
}
|
||||
try (final InputStream is = POIDataSamples.getSpreadSheetInstance().openResourceAsStream("excel_with_embeded.xls");
|
||||
final POIFSFileSystem fs = new POIFSFileSystem(is)) {
|
||||
final DirectoryNode root = fs.getRoot();
|
||||
|
||||
private PowerPointExtractor assertExtractFromEmbedded(DirectoryNode root, String entryName, String expected)
|
||||
throws IOException {
|
||||
DirectoryNode dir = (DirectoryNode)root.getEntry(entryName);
|
||||
final String[] TEST_SET = {
|
||||
"MBD0000A3B6", "Sample PowerPoint file\nThis is the 1st file\nNot much too it\n",
|
||||
"MBD0000A3B3", "Sample PowerPoint file\nThis is the 2nd file\nNot much too it either\n"
|
||||
};
|
||||
|
||||
for (int i=0; i<TEST_SET.length; i+=2) {
|
||||
DirectoryNode dir = (DirectoryNode)root.getEntry(TEST_SET[i]);
|
||||
assertTrue(dir.hasEntry(HSLFSlideShow.POWERPOINT_DOCUMENT));
|
||||
|
||||
// Check the first file
|
||||
HSLFSlideShowImpl ppt = new HSLFSlideShowImpl(dir);
|
||||
PowerPointExtractor ppe = new PowerPointExtractor(ppt);
|
||||
assertEquals(expected, ppe.getText(true, false));
|
||||
return ppe;
|
||||
try (final SlideShow<?,?> ppt = SlideShowFactory.create(dir);
|
||||
final SlideShowExtractor<?,?> ppe = new SlideShowExtractor(ppt)) {
|
||||
assertEquals(TEST_SET[i+1], ppe.getText());
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
@ -195,12 +203,12 @@ public final class TestExtractor {
|
||||
*/
|
||||
@Test
|
||||
public void testExtractFromOwnEmbeded() throws IOException {
|
||||
PowerPointExtractor ppe = openExtractor("ppt_with_embeded.ppt");
|
||||
List<HSLFObjectShape> shapes = ppe.getOLEShapes();
|
||||
try (SlideShowExtractor<?,?> ppe = openExtractor("ppt_with_embeded.ppt")) {
|
||||
List<? extends ObjectShape> shapes = ppe.getOLEShapes();
|
||||
assertEquals("Expected 6 ole shapes", 6, shapes.size());
|
||||
int num_ppt = 0, num_doc = 0, num_xls = 0;
|
||||
for (HSLFObjectShape ole : shapes) {
|
||||
String name = ole.getInstanceName();
|
||||
for (ObjectShape ole : shapes) {
|
||||
String name = ((HSLFObjectShape)ole).getInstanceName();
|
||||
InputStream data = ole.getObjectData().getInputStream();
|
||||
if ("Worksheet".equals(name)) {
|
||||
HSSFWorkbook wb = new HSSFWorkbook(data);
|
||||
@ -220,7 +228,7 @@ public final class TestExtractor {
|
||||
assertEquals("Expected 2 embedded Word Documents", 2, num_doc);
|
||||
assertEquals("Expected 2 embedded Excel Spreadsheets", 2, num_xls);
|
||||
assertEquals("Expected 2 embedded PowerPoint Presentations", 2, num_ppt);
|
||||
ppe.close();
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
@ -228,11 +236,11 @@ public final class TestExtractor {
|
||||
*/
|
||||
@Test
|
||||
public void test52991() throws IOException {
|
||||
PowerPointExtractor ppe = openExtractor("badzip.ppt");
|
||||
for (HSLFObjectShape shape : ppe.getOLEShapes()) {
|
||||
try (SlideShowExtractor<?,?> ppe = openExtractor("badzip.ppt")) {
|
||||
for (ObjectShape shape : ppe.getOLEShapes()) {
|
||||
IOUtils.copy(shape.getObjectData().getInputStream(), new ByteArrayOutputStream());
|
||||
}
|
||||
ppe.close();
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
@ -240,27 +248,27 @@ public final class TestExtractor {
|
||||
*/
|
||||
@Test
|
||||
public void testWithComments() throws IOException {
|
||||
PowerPointExtractor ppe1 = openExtractor("WithComments.ppt");
|
||||
String text = ppe1.getText();
|
||||
try (final SlideShowExtractor ppe = openExtractor("WithComments.ppt")) {
|
||||
String text = ppe.getText();
|
||||
assertFalse("Comments not in by default", text.contains("This is a test comment"));
|
||||
|
||||
ppe1.setCommentsByDefault(true);
|
||||
ppe.setCommentsByDefault(true);
|
||||
|
||||
text = ppe1.getText();
|
||||
text = ppe.getText();
|
||||
assertContains(text, "This is a test comment");
|
||||
ppe1.close();
|
||||
}
|
||||
|
||||
|
||||
// And another file
|
||||
PowerPointExtractor ppe2 = openExtractor("45543.ppt");
|
||||
text = ppe2.getText();
|
||||
try (SlideShowExtractor ppe = openExtractor("45543.ppt")) {
|
||||
String text = ppe.getText();
|
||||
assertFalse("Comments not in by default", text.contains("testdoc"));
|
||||
|
||||
ppe2.setCommentsByDefault(true);
|
||||
ppe.setCommentsByDefault(true);
|
||||
|
||||
text = ppe2.getText();
|
||||
text = ppe.getText();
|
||||
assertContains(text, "testdoc");
|
||||
ppe2.close();
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
@ -268,48 +276,37 @@ public final class TestExtractor {
|
||||
*/
|
||||
@Test
|
||||
public void testHeaderFooter() throws IOException {
|
||||
String text;
|
||||
|
||||
// With a header on the notes
|
||||
InputStream is1 = slTests.openResourceAsStream("45537_Header.ppt");
|
||||
HSLFSlideShow ppt1 = new HSLFSlideShow(is1);
|
||||
is1.close();
|
||||
assertNotNull(ppt1.getNotesHeadersFooters());
|
||||
assertEquals("testdoc test phrase", ppt1.getNotesHeadersFooters().getHeaderText());
|
||||
try (InputStream is = slTests.openResourceAsStream("45537_Header.ppt");
|
||||
HSLFSlideShow ppt = new HSLFSlideShow(is)) {
|
||||
|
||||
PowerPointExtractor ppe1 = new PowerPointExtractor(ppt1.getSlideShowImpl());
|
||||
assertNotNull(ppt.getNotesHeadersFooters());
|
||||
assertEquals("testdoc test phrase", ppt.getNotesHeadersFooters().getHeaderText());
|
||||
|
||||
text = ppe1.getText();
|
||||
assertFalse("Header shouldn't be there by default\n" + text, text.contains("testdoc"));
|
||||
assertFalse("Header shouldn't be there by default\n" + text, text.contains("test phrase"));
|
||||
|
||||
ppe1.setNotesByDefault(true);
|
||||
text = ppe1.getText();
|
||||
assertContains(text, "testdoc");
|
||||
assertContains(text, "test phrase");
|
||||
ppe1.close();
|
||||
ppt1.close();
|
||||
testHeaderFooterInner(ppt);
|
||||
}
|
||||
|
||||
// And with a footer, also on notes
|
||||
InputStream is2 = slTests.openResourceAsStream("45537_Footer.ppt");
|
||||
HSLFSlideShow ppt2 = new HSLFSlideShow(is2);
|
||||
is2.close();
|
||||
try (final InputStream is = slTests.openResourceAsStream("45537_Footer.ppt");
|
||||
final HSLFSlideShow ppt = new HSLFSlideShow(is)) {
|
||||
assertNotNull(ppt.getNotesHeadersFooters());
|
||||
assertEquals("testdoc test phrase", ppt.getNotesHeadersFooters().getFooterText());
|
||||
|
||||
assertNotNull(ppt2.getNotesHeadersFooters());
|
||||
assertEquals("testdoc test phrase", ppt2.getNotesHeadersFooters().getFooterText());
|
||||
ppt2.close();
|
||||
testHeaderFooterInner(ppt);
|
||||
}
|
||||
}
|
||||
|
||||
PowerPointExtractor ppe2 = openExtractor("45537_Footer.ppt");
|
||||
|
||||
text = ppe2.getText();
|
||||
private void testHeaderFooterInner(final HSLFSlideShow ppt) throws IOException {
|
||||
try (final SlideShowExtractor<?,?> ppe = new SlideShowExtractor(ppt)) {
|
||||
String text = ppe.getText();
|
||||
assertFalse("Header shouldn't be there by default\n" + text, text.contains("testdoc"));
|
||||
assertFalse("Header shouldn't be there by default\n" + text, text.contains("test phrase"));
|
||||
|
||||
ppe2.setNotesByDefault(true);
|
||||
text = ppe2.getText();
|
||||
ppe.setNotesByDefault(true);
|
||||
text = ppe.getText();
|
||||
assertContains(text, "testdoc");
|
||||
assertContains(text, "test phrase");
|
||||
ppe2.close();
|
||||
}
|
||||
}
|
||||
|
||||
@SuppressWarnings("unused")
|
||||
@ -318,41 +315,40 @@ public final class TestExtractor {
|
||||
String masterTitleText = "This is the Master Title";
|
||||
String masterRandomText = "This text comes from the Master Slide";
|
||||
String masterFooterText = "Footer from the master slide";
|
||||
PowerPointExtractor ppe = openExtractor("WithMaster.ppt");
|
||||
try (final SlideShowExtractor ppe = openExtractor("WithMaster.ppt")) {
|
||||
ppe.setMasterByDefault(true);
|
||||
|
||||
String text = ppe.getText();
|
||||
assertContains(text, masterRandomText);
|
||||
assertContains(text, masterFooterText);
|
||||
ppe.close();
|
||||
}
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testMasterText() throws IOException {
|
||||
PowerPointExtractor ppe1 = openExtractor("master_text.ppt");
|
||||
|
||||
try (final SlideShowExtractor ppe = openExtractor("master_text.ppt")) {
|
||||
// Initially not there
|
||||
String text = ppe1.getText();
|
||||
String text = ppe.getText();
|
||||
assertFalse(text.contains("Text that I added to the master slide"));
|
||||
|
||||
// Enable, shows up
|
||||
ppe1.setMasterByDefault(true);
|
||||
text = ppe1.getText();
|
||||
ppe.setMasterByDefault(true);
|
||||
text = ppe.getText();
|
||||
assertContains(text, "Text that I added to the master slide");
|
||||
|
||||
// Make sure placeholder text does not come out
|
||||
assertNotContained(text, "Click to edit Master");
|
||||
ppe1.close();
|
||||
}
|
||||
|
||||
// Now with another file only containing master text
|
||||
// Will always show up
|
||||
PowerPointExtractor ppe2 = openExtractor("WithMaster.ppt");
|
||||
try (final SlideShowExtractor ppe = openExtractor("WithMaster.ppt")) {
|
||||
String masterText = "Footer from the master slide";
|
||||
|
||||
text = ppe2.getText();
|
||||
String text = ppe.getText();
|
||||
assertContainsIgnoreCase(text, "master");
|
||||
assertContains(text, masterText);
|
||||
ppe2.close();
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
@ -360,8 +356,7 @@ public final class TestExtractor {
|
||||
*/
|
||||
@Test
|
||||
public void testChineseText() throws IOException {
|
||||
PowerPointExtractor ppe = openExtractor("54880_chinese.ppt");
|
||||
|
||||
try (final SlideShowExtractor ppe = openExtractor("54880_chinese.ppt")) {
|
||||
String text = ppe.getText();
|
||||
|
||||
// Check for the english text line
|
||||
@ -375,7 +370,7 @@ public final class TestExtractor {
|
||||
|
||||
// Check for the chinese only text line
|
||||
assertContains(text, "\uff8a\uff9d\uff76\uff78");
|
||||
ppe.close();
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
@ -387,67 +382,59 @@ public final class TestExtractor {
|
||||
public void testDifferentPOIFS() throws IOException {
|
||||
// Open the two filesystems
|
||||
File pptFile = slTests.getFile("basic_test_ppt_file.ppt");
|
||||
InputStream is1 = new FileInputStream(pptFile);
|
||||
OPOIFSFileSystem opoifs = new OPOIFSFileSystem(is1);
|
||||
is1.close();
|
||||
NPOIFSFileSystem npoifs = new NPOIFSFileSystem(pptFile);
|
||||
try (final InputStream is1 = new FileInputStream(pptFile);
|
||||
final NPOIFSFileSystem npoifs = new NPOIFSFileSystem(pptFile)) {
|
||||
|
||||
DirectoryNode[] files = { opoifs.getRoot(), npoifs.getRoot() };
|
||||
final OPOIFSFileSystem opoifs = new OPOIFSFileSystem(is1);
|
||||
|
||||
DirectoryNode[] files = {opoifs.getRoot(), npoifs.getRoot()};
|
||||
|
||||
// Open directly
|
||||
for (DirectoryNode dir : files) {
|
||||
PowerPointExtractor extractor = new PowerPointExtractor(dir);
|
||||
try (SlideShow<?,?> ppt = SlideShowFactory.create(dir);
|
||||
SlideShowExtractor<?,?> extractor = new SlideShowExtractor(ppt)) {
|
||||
assertEquals(expectText, extractor.getText());
|
||||
}
|
||||
|
||||
// Open via a HSLFSlideShow
|
||||
for (DirectoryNode dir : files) {
|
||||
HSLFSlideShowImpl slideshow = new HSLFSlideShowImpl(dir);
|
||||
PowerPointExtractor extractor = new PowerPointExtractor(slideshow);
|
||||
assertEquals(expectText, extractor.getText());
|
||||
extractor.close();
|
||||
slideshow.close();
|
||||
}
|
||||
|
||||
npoifs.close();
|
||||
}
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testTable() throws Exception {
|
||||
PowerPointExtractor ppe1 = openExtractor("54111.ppt");
|
||||
String text1 = ppe1.getText();
|
||||
String target1 = "TH Cell 1\tTH Cell 2\tTH Cell 3\tTH Cell 4\n"+
|
||||
"Row 1, Cell 1\tRow 1, Cell 2\tRow 1, Cell 3\tRow 1, Cell 4\n"+
|
||||
"Row 2, Cell 1\tRow 2, Cell 2\tRow 2, Cell 3\tRow 2, Cell 4\n"+
|
||||
"Row 3, Cell 1\tRow 3, Cell 2\tRow 3, Cell 3\tRow 3, Cell 4\n"+
|
||||
"Row 4, Cell 1\tRow 4, Cell 2\tRow 4, Cell 3\tRow 4, Cell 4\n"+
|
||||
try (SlideShowExtractor ppe = openExtractor("54111.ppt")) {
|
||||
String text = ppe.getText();
|
||||
String target = "TH Cell 1\tTH Cell 2\tTH Cell 3\tTH Cell 4\n" +
|
||||
"Row 1, Cell 1\tRow 1, Cell 2\tRow 1, Cell 3\tRow 1, Cell 4\n" +
|
||||
"Row 2, Cell 1\tRow 2, Cell 2\tRow 2, Cell 3\tRow 2, Cell 4\n" +
|
||||
"Row 3, Cell 1\tRow 3, Cell 2\tRow 3, Cell 3\tRow 3, Cell 4\n" +
|
||||
"Row 4, Cell 1\tRow 4, Cell 2\tRow 4, Cell 3\tRow 4, Cell 4\n" +
|
||||
"Row 5, Cell 1\tRow 5, Cell 2\tRow 5, Cell 3\tRow 5, Cell 4\n";
|
||||
assertContains(text1, target1);
|
||||
ppe1.close();
|
||||
assertContains(text, target);
|
||||
}
|
||||
|
||||
PowerPointExtractor ppe2 = openExtractor("54722.ppt");
|
||||
String text2 = ppe2.getText();
|
||||
try (SlideShowExtractor ppe = openExtractor("54722.ppt")) {
|
||||
String text = ppe.getText();
|
||||
|
||||
String target2 = "this\tText\tis\twithin\ta\n" +
|
||||
String target = "this\tText\tis\twithin\ta\n" +
|
||||
"table\t1\t2\t3\t4";
|
||||
assertContains(text2, target2);
|
||||
ppe2.close();
|
||||
assertContains(text, target);
|
||||
}
|
||||
}
|
||||
|
||||
// bug 60003
|
||||
@Test
|
||||
public void testExtractMasterSlideFooterText() throws Exception {
|
||||
PowerPointExtractor ppe = openExtractor("60003.ppt");
|
||||
try (SlideShowExtractor ppe = openExtractor("60003.ppt")) {
|
||||
ppe.setMasterByDefault(true);
|
||||
|
||||
String text = ppe.getText();
|
||||
assertContains(text, "Prague");
|
||||
ppe.close();
|
||||
}
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testExtractGroupedShapeText() throws Exception {
|
||||
try (final PowerPointExtractor ppe = openExtractor("bug62092.ppt")) {
|
||||
try (final SlideShowExtractor ppe = openExtractor("bug62092.ppt")) {
|
||||
final String text = ppe.getText();
|
||||
|
||||
//this tests that we're ignoring text shapes at depth=0
|
||||
|
@ -73,6 +73,7 @@ import org.apache.poi.poifs.macros.VBAMacroReader;
|
||||
import org.apache.poi.sl.draw.DrawFactory;
|
||||
import org.apache.poi.sl.draw.DrawPaint;
|
||||
import org.apache.poi.sl.draw.DrawTextParagraph;
|
||||
import org.apache.poi.sl.extractor.SlideShowExtractor;
|
||||
import org.apache.poi.sl.usermodel.ColorStyle;
|
||||
import org.apache.poi.sl.usermodel.PaintStyle;
|
||||
import org.apache.poi.sl.usermodel.PaintStyle.SolidPaint;
|
||||
@ -800,18 +801,18 @@ public final class TestBugs {
|
||||
String files[] = { "bug58718_008524.ppt","bug58718_008558.ppt","bug58718_349008.ppt","bug58718_008495.ppt", };
|
||||
for (String f : files) {
|
||||
File sample = HSLFTestDataSamples.getSampleFile(f);
|
||||
PowerPointExtractor ex = new PowerPointExtractor(sample.getAbsolutePath());
|
||||
try (SlideShowExtractor ex = new SlideShowExtractor(SlideShowFactory.create(sample))) {
|
||||
assertNotNull(ex.getText());
|
||||
ex.close();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@Test
|
||||
public void bug58733() throws IOException {
|
||||
File sample = HSLFTestDataSamples.getSampleFile("bug58733_671884.ppt");
|
||||
PowerPointExtractor ex = new PowerPointExtractor(sample.getAbsolutePath());
|
||||
try (SlideShowExtractor ex = new SlideShowExtractor(SlideShowFactory.create(sample))) {
|
||||
assertNotNull(ex.getText());
|
||||
ex.close();
|
||||
}
|
||||
}
|
||||
|
||||
@Test
|
||||
|
Binary file not shown.
Loading…
Reference in New Issue
Block a user