#62319 - Decommission XSLF-/PowerPointExtractor
git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@1829653 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
bc436fcc3d
commit
ab390ce170
@ -330,8 +330,6 @@ public class TestAllFiles {
|
|||||||
);
|
);
|
||||||
|
|
||||||
private static final Set<String> IGNORED = unmodifiableHashSet(
|
private static final Set<String> IGNORED = unmodifiableHashSet(
|
||||||
// need JDK8+ - https://bugs.openjdk.java.net/browse/JDK-8038081
|
|
||||||
"slideshow/42474-2.ppt",
|
|
||||||
// OPC handler works / XSSF handler fails
|
// OPC handler works / XSSF handler fails
|
||||||
"spreadsheet/57181.xlsm",
|
"spreadsheet/57181.xlsm",
|
||||||
"spreadsheet/61300.xls"//intentionally fuzzed -- used to cause infinite loop
|
"spreadsheet/61300.xls"//intentionally fuzzed -- used to cause infinite loop
|
||||||
|
@ -24,6 +24,7 @@ import java.io.FileInputStream;
|
|||||||
import java.io.InputStream;
|
import java.io.InputStream;
|
||||||
|
|
||||||
import org.apache.poi.extractor.ExtractorFactory;
|
import org.apache.poi.extractor.ExtractorFactory;
|
||||||
|
import org.apache.poi.sl.extractor.SlideShowExtractor;
|
||||||
import org.apache.poi.xslf.extractor.XSLFPowerPointExtractor;
|
import org.apache.poi.xslf.extractor.XSLFPowerPointExtractor;
|
||||||
import org.apache.poi.xslf.usermodel.XMLSlideShow;
|
import org.apache.poi.xslf.usermodel.XMLSlideShow;
|
||||||
import org.apache.poi.xslf.usermodel.XSLFSlideShow;
|
import org.apache.poi.xslf.usermodel.XSLFSlideShow;
|
||||||
@ -53,12 +54,19 @@ public class XSLFFileHandler extends SlideShowHandler {
|
|||||||
|
|
||||||
// additionally try the other getText() methods
|
// additionally try the other getText() methods
|
||||||
|
|
||||||
try (XSLFPowerPointExtractor extractor = (XSLFPowerPointExtractor) ExtractorFactory.createExtractor(file)) {
|
try (SlideShowExtractor extractor = ExtractorFactory.createExtractor(file)) {
|
||||||
assertNotNull(extractor);
|
assertNotNull(extractor);
|
||||||
|
extractor.setSlidesByDefault(true);
|
||||||
|
extractor.setNotesByDefault(true);
|
||||||
|
extractor.setMasterByDefault(true);
|
||||||
|
|
||||||
assertNotNull(extractor.getText(true, true, true));
|
assertNotNull(extractor.getText());
|
||||||
assertEquals("With all options disabled we should not get text",
|
|
||||||
"", extractor.getText(false, false, false));
|
extractor.setSlidesByDefault(false);
|
||||||
|
extractor.setNotesByDefault(false);
|
||||||
|
extractor.setMasterByDefault(false);
|
||||||
|
|
||||||
|
assertEquals("With all options disabled we should not get text", "", extractor.getText());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -105,6 +105,7 @@ public abstract class POIOLE2TextExtractor extends POITextExtractor {
|
|||||||
*
|
*
|
||||||
* @return the underlying POIDocument
|
* @return the underlying POIDocument
|
||||||
*/
|
*/
|
||||||
|
@Override
|
||||||
public POIDocument getDocument() {
|
public POIDocument getDocument() {
|
||||||
return document;
|
return document;
|
||||||
}
|
}
|
||||||
|
@ -74,4 +74,9 @@ public abstract class POITextExtractor implements Closeable {
|
|||||||
fsToClose.close();
|
fsToClose.close();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @return the processed document
|
||||||
|
*/
|
||||||
|
public abstract Object getDocument();
|
||||||
}
|
}
|
||||||
|
@ -115,26 +115,23 @@ public class OLE2ExtractorFactory {
|
|||||||
return threadPreferEventExtractors.get();
|
return threadPreferEventExtractors.get();
|
||||||
}
|
}
|
||||||
|
|
||||||
public static POIOLE2TextExtractor createExtractor(POIFSFileSystem fs) throws IOException {
|
public static <T extends POITextExtractor> T createExtractor(POIFSFileSystem fs) throws IOException {
|
||||||
// Only ever an OLE2 one from the root of the FS
|
return (T)createExtractor(fs.getRoot());
|
||||||
return (POIOLE2TextExtractor)createExtractor(fs.getRoot());
|
|
||||||
}
|
}
|
||||||
public static POIOLE2TextExtractor createExtractor(NPOIFSFileSystem fs) throws IOException {
|
public static <T extends POITextExtractor> T createExtractor(NPOIFSFileSystem fs) throws IOException {
|
||||||
// Only ever an OLE2 one from the root of the FS
|
return (T)createExtractor(fs.getRoot());
|
||||||
return (POIOLE2TextExtractor)createExtractor(fs.getRoot());
|
|
||||||
}
|
}
|
||||||
public static POIOLE2TextExtractor createExtractor(OPOIFSFileSystem fs) throws IOException {
|
public static <T extends POITextExtractor> T createExtractor(OPOIFSFileSystem fs) throws IOException {
|
||||||
// Only ever an OLE2 one from the root of the FS
|
return (T)createExtractor(fs.getRoot());
|
||||||
return (POIOLE2TextExtractor)createExtractor(fs.getRoot());
|
|
||||||
}
|
}
|
||||||
|
|
||||||
public static POITextExtractor createExtractor(InputStream input) throws IOException {
|
public static <T extends POITextExtractor> T createExtractor(InputStream input) throws IOException {
|
||||||
Class<?> cls = getOOXMLClass();
|
Class<?> cls = getOOXMLClass();
|
||||||
if (cls != null) {
|
if (cls != null) {
|
||||||
// Use Reflection to get us the full OOXML-enabled version
|
// Use Reflection to get us the full OOXML-enabled version
|
||||||
try {
|
try {
|
||||||
Method m = cls.getDeclaredMethod("createExtractor", InputStream.class);
|
Method m = cls.getDeclaredMethod("createExtractor", InputStream.class);
|
||||||
return (POITextExtractor)m.invoke(null, input);
|
return (T)m.invoke(null, input);
|
||||||
} catch (IllegalArgumentException iae) {
|
} catch (IllegalArgumentException iae) {
|
||||||
throw iae;
|
throw iae;
|
||||||
} catch (Exception e) {
|
} catch (Exception e) {
|
||||||
|
@ -45,7 +45,29 @@ public class DocumentFactoryHelper {
|
|||||||
*/
|
*/
|
||||||
public static InputStream getDecryptedStream(final NPOIFSFileSystem fs, String password)
|
public static InputStream getDecryptedStream(final NPOIFSFileSystem fs, String password)
|
||||||
throws IOException {
|
throws IOException {
|
||||||
EncryptionInfo info = new EncryptionInfo(fs);
|
// wrap the stream in a FilterInputStream to close the NPOIFSFileSystem
|
||||||
|
// as well when the resulting OPCPackage is closed
|
||||||
|
return new FilterInputStream(getDecryptedStream(fs.getRoot(), password)) {
|
||||||
|
@Override
|
||||||
|
public void close() throws IOException {
|
||||||
|
fs.close();
|
||||||
|
super.close();
|
||||||
|
}
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Wrap the OLE2 data of the DirectoryNode into a decrypted stream by using
|
||||||
|
* the given password.
|
||||||
|
*
|
||||||
|
* @param root The OLE2 directory node for the document
|
||||||
|
* @param password The password, null if the default password should be used
|
||||||
|
* @return A stream for reading the decrypted data
|
||||||
|
* @throws IOException If an error occurs while decrypting or if the password does not match
|
||||||
|
*/
|
||||||
|
public static InputStream getDecryptedStream(final DirectoryNode root, String password)
|
||||||
|
throws IOException {
|
||||||
|
EncryptionInfo info = new EncryptionInfo(root);
|
||||||
Decryptor d = Decryptor.getInstance(info);
|
Decryptor d = Decryptor.getInstance(info);
|
||||||
|
|
||||||
try {
|
try {
|
||||||
@ -58,20 +80,10 @@ public class DocumentFactoryHelper {
|
|||||||
}
|
}
|
||||||
|
|
||||||
if (passwordCorrect) {
|
if (passwordCorrect) {
|
||||||
// wrap the stream in a FilterInputStream to close the NPOIFSFileSystem
|
return d.getDataStream(root);
|
||||||
// as well when the resulting OPCPackage is closed
|
} else if (password != null) {
|
||||||
return new FilterInputStream(d.getDataStream(fs.getRoot())) {
|
|
||||||
@Override
|
|
||||||
public void close() throws IOException {
|
|
||||||
fs.close();
|
|
||||||
|
|
||||||
super.close();
|
|
||||||
}
|
|
||||||
};
|
|
||||||
} else {
|
|
||||||
if (password != null)
|
|
||||||
throw new EncryptedDocumentException("Password incorrect");
|
throw new EncryptedDocumentException("Password incorrect");
|
||||||
else
|
} else {
|
||||||
throw new EncryptedDocumentException("The supplied spreadsheet is protected, but no password was supplied");
|
throw new EncryptedDocumentException("The supplied spreadsheet is protected, but no password was supplied");
|
||||||
}
|
}
|
||||||
} catch (GeneralSecurityException e) {
|
} catch (GeneralSecurityException e) {
|
||||||
|
@ -1,3 +1,20 @@
|
|||||||
|
/* ====================================================================
|
||||||
|
Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
contributor license agreements. See the NOTICE file distributed with
|
||||||
|
this work for additional information regarding copyright ownership.
|
||||||
|
The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
(the "License"); you may not use this file except in compliance with
|
||||||
|
the License. You may obtain a copy of the License at
|
||||||
|
|
||||||
|
http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
|
||||||
|
Unless required by applicable law or agreed to in writing, software
|
||||||
|
distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
See the License for the specific language governing permissions and
|
||||||
|
limitations under the License.
|
||||||
|
==================================================================== */
|
||||||
|
|
||||||
package org.apache.poi.sl.extractor;
|
package org.apache.poi.sl.extractor;
|
||||||
|
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
@ -48,6 +65,16 @@ public class SlideShowExtractor<
|
|||||||
this.slideshow = slideshow;
|
this.slideshow = slideshow;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Returns the opened document
|
||||||
|
*
|
||||||
|
* @return the opened document
|
||||||
|
*/
|
||||||
|
@Override
|
||||||
|
public final Object getDocument() {
|
||||||
|
return slideshow.getPersistDocument();
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Should a call to getText() return slide text? Default is yes
|
* Should a call to getText() return slide text? Default is yes
|
||||||
*/
|
*/
|
||||||
@ -219,7 +246,6 @@ public class SlideShowExtractor<
|
|||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
for (final P para : paraList) {
|
for (final P para : paraList) {
|
||||||
final int oldLen = sb.length();
|
|
||||||
for (final TextRun tr : para) {
|
for (final TextRun tr : para) {
|
||||||
final String str = tr.getRawText().replace("\r", "");
|
final String str = tr.getRawText().replace("\r", "");
|
||||||
final String newStr;
|
final String newStr;
|
||||||
|
@ -126,4 +126,13 @@ public interface SlideShow<
|
|||||||
* @since POI 4.0.0
|
* @since POI 4.0.0
|
||||||
*/
|
*/
|
||||||
POITextExtractor getMetadataTextExtractor();
|
POITextExtractor getMetadataTextExtractor();
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @return the instance which handles the persisting of the slideshow,
|
||||||
|
* which is either a subclass of {@link org.apache.poi.POIDocument}
|
||||||
|
* or {@link org.apache.poi.POIXMLDocument}
|
||||||
|
*
|
||||||
|
* @since POI 4.0.0
|
||||||
|
*/
|
||||||
|
Object getPersistDocument();
|
||||||
}
|
}
|
||||||
|
@ -60,13 +60,40 @@ public class SlideShowFactory {
|
|||||||
* @throws IOException if an error occurs while reading the data
|
* @throws IOException if an error occurs while reading the data
|
||||||
*/
|
*/
|
||||||
public static SlideShow<?,?> create(final NPOIFSFileSystem fs, String password) throws IOException {
|
public static SlideShow<?,?> create(final NPOIFSFileSystem fs, String password) throws IOException {
|
||||||
DirectoryNode root = fs.getRoot();
|
return create(fs.getRoot(), password);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Creates a SlideShow from the given DirectoryNode.
|
||||||
|
*
|
||||||
|
* @param root The {@link DirectoryNode} to start reading the document from
|
||||||
|
*
|
||||||
|
* @return The created SlideShow
|
||||||
|
*
|
||||||
|
* @throws IOException if an error occurs while reading the data
|
||||||
|
*/
|
||||||
|
public static SlideShow<?,?> create(final DirectoryNode root) throws IOException {
|
||||||
|
return create(root, null);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Creates a SlideShow from the given DirectoryNode, which may
|
||||||
|
* be password protected
|
||||||
|
*
|
||||||
|
* @param root The {@link DirectoryNode} to start reading the document from
|
||||||
|
* @param password The password that should be used or null if no password is necessary.
|
||||||
|
*
|
||||||
|
* @return The created SlideShow
|
||||||
|
*
|
||||||
|
* @throws IOException if an error occurs while reading the data
|
||||||
|
*/
|
||||||
|
public static SlideShow<?,?> create(final DirectoryNode root, String password) throws IOException {
|
||||||
// Encrypted OOXML files go inside OLE2 containers, is this one?
|
// Encrypted OOXML files go inside OLE2 containers, is this one?
|
||||||
if (root.hasEntry(Decryptor.DEFAULT_POIFS_ENTRY)) {
|
if (root.hasEntry(Decryptor.DEFAULT_POIFS_ENTRY)) {
|
||||||
InputStream stream = null;
|
InputStream stream = null;
|
||||||
try {
|
try {
|
||||||
stream = DocumentFactoryHelper.getDecryptedStream(fs, password);
|
stream = DocumentFactoryHelper.getDecryptedStream(root, password);
|
||||||
|
|
||||||
return createXSLFSlideShow(stream);
|
return createXSLFSlideShow(stream);
|
||||||
} finally {
|
} finally {
|
||||||
@ -82,7 +109,7 @@ public class SlideShowFactory {
|
|||||||
passwordSet = true;
|
passwordSet = true;
|
||||||
}
|
}
|
||||||
try {
|
try {
|
||||||
return createHSLFSlideShow(fs);
|
return createHSLFSlideShow(root);
|
||||||
} finally {
|
} finally {
|
||||||
if (passwordSet) {
|
if (passwordSet) {
|
||||||
Biff8EncryptionKey.setCurrentUserPassword(null);
|
Biff8EncryptionKey.setCurrentUserPassword(null);
|
||||||
|
@ -68,6 +68,7 @@ public abstract class POIXMLTextExtractor extends POITextExtractor {
|
|||||||
*
|
*
|
||||||
* @return the opened document
|
* @return the opened document
|
||||||
*/
|
*/
|
||||||
|
@Override
|
||||||
public final POIXMLDocument getDocument() {
|
public final POIXMLDocument getDocument() {
|
||||||
return _document;
|
return _document;
|
||||||
}
|
}
|
||||||
|
@ -51,6 +51,7 @@ import org.apache.poi.poifs.filesystem.NotOLE2FileException;
|
|||||||
import org.apache.poi.poifs.filesystem.OPOIFSFileSystem;
|
import org.apache.poi.poifs.filesystem.OPOIFSFileSystem;
|
||||||
import org.apache.poi.poifs.filesystem.OfficeXmlFileException;
|
import org.apache.poi.poifs.filesystem.OfficeXmlFileException;
|
||||||
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
|
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
|
||||||
|
import org.apache.poi.sl.extractor.SlideShowExtractor;
|
||||||
import org.apache.poi.util.IOUtils;
|
import org.apache.poi.util.IOUtils;
|
||||||
import org.apache.poi.util.NotImplemented;
|
import org.apache.poi.util.NotImplemented;
|
||||||
import org.apache.poi.util.POILogFactory;
|
import org.apache.poi.util.POILogFactory;
|
||||||
@ -58,6 +59,7 @@ import org.apache.poi.util.POILogger;
|
|||||||
import org.apache.poi.util.Removal;
|
import org.apache.poi.util.Removal;
|
||||||
import org.apache.poi.xdgf.extractor.XDGFVisioExtractor;
|
import org.apache.poi.xdgf.extractor.XDGFVisioExtractor;
|
||||||
import org.apache.poi.xslf.extractor.XSLFPowerPointExtractor;
|
import org.apache.poi.xslf.extractor.XSLFPowerPointExtractor;
|
||||||
|
import org.apache.poi.xslf.usermodel.XMLSlideShow;
|
||||||
import org.apache.poi.xslf.usermodel.XSLFRelation;
|
import org.apache.poi.xslf.usermodel.XSLFRelation;
|
||||||
import org.apache.poi.xslf.usermodel.XSLFSlideShow;
|
import org.apache.poi.xslf.usermodel.XSLFSlideShow;
|
||||||
import org.apache.poi.xssf.extractor.XSSFBEventBasedExcelExtractor;
|
import org.apache.poi.xssf.extractor.XSSFBEventBasedExcelExtractor;
|
||||||
@ -127,20 +129,20 @@ public class ExtractorFactory {
|
|||||||
return OLE2ExtractorFactory.getPreferEventExtractor();
|
return OLE2ExtractorFactory.getPreferEventExtractor();
|
||||||
}
|
}
|
||||||
|
|
||||||
public static POITextExtractor createExtractor(File f) throws IOException, OpenXML4JException, XmlException {
|
public static <T extends POITextExtractor> T createExtractor(File f) throws IOException, OpenXML4JException, XmlException {
|
||||||
NPOIFSFileSystem fs = null;
|
NPOIFSFileSystem fs = null;
|
||||||
try {
|
try {
|
||||||
fs = new NPOIFSFileSystem(f);
|
fs = new NPOIFSFileSystem(f);
|
||||||
if (fs.getRoot().hasEntry(Decryptor.DEFAULT_POIFS_ENTRY)) {
|
if (fs.getRoot().hasEntry(Decryptor.DEFAULT_POIFS_ENTRY)) {
|
||||||
return createEncryptedOOXMLExtractor(fs);
|
return (T)createEncryptedOOXMLExtractor(fs);
|
||||||
}
|
}
|
||||||
POIOLE2TextExtractor extractor = createExtractor(fs);
|
POITextExtractor extractor = createExtractor(fs);
|
||||||
extractor.setFilesystem(fs);
|
extractor.setFilesystem(fs);
|
||||||
return extractor;
|
return (T)extractor;
|
||||||
} catch (OfficeXmlFileException e) {
|
} catch (OfficeXmlFileException e) {
|
||||||
// ensure file-handle release
|
// ensure file-handle release
|
||||||
IOUtils.closeQuietly(fs);
|
IOUtils.closeQuietly(fs);
|
||||||
return createExtractor(OPCPackage.open(f.toString(), PackageAccess.READ));
|
return (T)createExtractor(OPCPackage.open(f.toString(), PackageAccess.READ));
|
||||||
} catch (NotOLE2FileException ne) {
|
} catch (NotOLE2FileException ne) {
|
||||||
// ensure file-handle release
|
// ensure file-handle release
|
||||||
IOUtils.closeQuietly(fs);
|
IOUtils.closeQuietly(fs);
|
||||||
@ -179,7 +181,7 @@ public class ExtractorFactory {
|
|||||||
* @throws XmlException If an XML parsing error occurs.
|
* @throws XmlException If an XML parsing error occurs.
|
||||||
* @throws IllegalArgumentException If no matching file type could be found.
|
* @throws IllegalArgumentException If no matching file type could be found.
|
||||||
*/
|
*/
|
||||||
public static POIXMLTextExtractor createExtractor(OPCPackage pkg) throws IOException, OpenXML4JException, XmlException {
|
public static POITextExtractor createExtractor(OPCPackage pkg) throws IOException, OpenXML4JException, XmlException {
|
||||||
try {
|
try {
|
||||||
// Check for the normal Office core document
|
// Check for the normal Office core document
|
||||||
PackageRelationshipCollection core;
|
PackageRelationshipCollection core;
|
||||||
@ -226,13 +228,13 @@ public class ExtractorFactory {
|
|||||||
// Is it XSLF?
|
// Is it XSLF?
|
||||||
for (XSLFRelation rel : XSLFPowerPointExtractor.SUPPORTED_TYPES) {
|
for (XSLFRelation rel : XSLFPowerPointExtractor.SUPPORTED_TYPES) {
|
||||||
if ( rel.getContentType().equals( contentType ) ) {
|
if ( rel.getContentType().equals( contentType ) ) {
|
||||||
return new XSLFPowerPointExtractor(pkg);
|
return new SlideShowExtractor(new XMLSlideShow(pkg));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// special handling for SlideShow-Theme-files,
|
// special handling for SlideShow-Theme-files,
|
||||||
if (XSLFRelation.THEME_MANAGER.getContentType().equals(contentType)) {
|
if (XSLFRelation.THEME_MANAGER.getContentType().equals(contentType)) {
|
||||||
return new XSLFPowerPointExtractor(new XSLFSlideShow(pkg));
|
return new SlideShowExtractor(new XMLSlideShow(pkg));
|
||||||
}
|
}
|
||||||
|
|
||||||
// How about xlsb?
|
// How about xlsb?
|
||||||
@ -252,28 +254,28 @@ public class ExtractorFactory {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
public static POIOLE2TextExtractor createExtractor(POIFSFileSystem fs) throws IOException, OpenXML4JException, XmlException {
|
public static <T extends POITextExtractor> T createExtractor(POIFSFileSystem fs) throws IOException, OpenXML4JException, XmlException {
|
||||||
return OLE2ExtractorFactory.createExtractor(fs);
|
return createExtractor(fs.getRoot());
|
||||||
}
|
}
|
||||||
public static POIOLE2TextExtractor createExtractor(NPOIFSFileSystem fs) throws IOException, OpenXML4JException, XmlException {
|
public static <T extends POITextExtractor> T createExtractor(NPOIFSFileSystem fs) throws IOException, OpenXML4JException, XmlException {
|
||||||
return OLE2ExtractorFactory.createExtractor(fs);
|
return createExtractor(fs.getRoot());
|
||||||
}
|
}
|
||||||
public static POIOLE2TextExtractor createExtractor(OPOIFSFileSystem fs) throws IOException, OpenXML4JException, XmlException {
|
public static <T extends POITextExtractor> T createExtractor(OPOIFSFileSystem fs) throws IOException, OpenXML4JException, XmlException {
|
||||||
return OLE2ExtractorFactory.createExtractor(fs);
|
return createExtractor(fs.getRoot());
|
||||||
}
|
}
|
||||||
|
|
||||||
public static POITextExtractor createExtractor(DirectoryNode poifsDir) throws IOException, OpenXML4JException, XmlException
|
public static <T extends POITextExtractor> T createExtractor(DirectoryNode poifsDir) throws IOException, OpenXML4JException, XmlException
|
||||||
{
|
{
|
||||||
// First, check for OOXML
|
// First, check for OOXML
|
||||||
for (String entryName : poifsDir.getEntryNames()) {
|
for (String entryName : poifsDir.getEntryNames()) {
|
||||||
if (entryName.equals("Package")) {
|
if (entryName.equals("Package")) {
|
||||||
OPCPackage pkg = OPCPackage.open(poifsDir.createDocumentInputStream("Package"));
|
OPCPackage pkg = OPCPackage.open(poifsDir.createDocumentInputStream("Package"));
|
||||||
return createExtractor(pkg);
|
return (T)createExtractor(pkg);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// If not, ask the OLE2 code to check, with Scratchpad if possible
|
// If not, ask the OLE2 code to check, with Scratchpad if possible
|
||||||
return OLE2ExtractorFactory.createExtractor(poifsDir);
|
return (T)OLE2ExtractorFactory.createExtractor(poifsDir);
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@ -403,7 +405,7 @@ public class ExtractorFactory {
|
|||||||
throw new IllegalStateException("Not yet supported");
|
throw new IllegalStateException("Not yet supported");
|
||||||
}
|
}
|
||||||
|
|
||||||
private static POIXMLTextExtractor createEncryptedOOXMLExtractor(NPOIFSFileSystem fs)
|
private static POITextExtractor createEncryptedOOXMLExtractor(NPOIFSFileSystem fs)
|
||||||
throws IOException {
|
throws IOException {
|
||||||
String pass = Biff8EncryptionKey.getCurrentUserPassword();
|
String pass = Biff8EncryptionKey.getCurrentUserPassword();
|
||||||
if (pass == null) {
|
if (pass == null) {
|
||||||
|
@ -37,7 +37,7 @@ import org.apache.xmlbeans.XmlException;
|
|||||||
* @deprecated use {@link SlideShowExtractor}
|
* @deprecated use {@link SlideShowExtractor}
|
||||||
*/
|
*/
|
||||||
@Deprecated
|
@Deprecated
|
||||||
@Removal(version="4.2.0")
|
@Removal(version="5.0.0")
|
||||||
public class XSLFPowerPointExtractor extends POIXMLTextExtractor {
|
public class XSLFPowerPointExtractor extends POIXMLTextExtractor {
|
||||||
public static final XSLFRelation[] SUPPORTED_TYPES = new XSLFRelation[]{
|
public static final XSLFRelation[] SUPPORTED_TYPES = new XSLFRelation[]{
|
||||||
XSLFRelation.MAIN, XSLFRelation.MACRO, XSLFRelation.MACRO_TEMPLATE,
|
XSLFRelation.MAIN, XSLFRelation.MACRO, XSLFRelation.MACRO_TEMPLATE,
|
||||||
|
@ -631,4 +631,9 @@ public class XMLSlideShow extends POIXMLDocument
|
|||||||
public POIXMLPropertiesTextExtractor getMetadataTextExtractor() {
|
public POIXMLPropertiesTextExtractor getMetadataTextExtractor() {
|
||||||
return new POIXMLPropertiesTextExtractor(this);
|
return new POIXMLPropertiesTextExtractor(this);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public Object getPersistDocument() {
|
||||||
|
return this;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
@ -1,3 +1,20 @@
|
|||||||
|
/* ====================================================================
|
||||||
|
Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
contributor license agreements. See the NOTICE file distributed with
|
||||||
|
this work for additional information regarding copyright ownership.
|
||||||
|
The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
(the "License"); you may not use this file except in compliance with
|
||||||
|
the License. You may obtain a copy of the License at
|
||||||
|
|
||||||
|
http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
|
||||||
|
Unless required by applicable law or agreed to in writing, software
|
||||||
|
distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
See the License for the specific language governing permissions and
|
||||||
|
limitations under the License.
|
||||||
|
==================================================================== */
|
||||||
|
|
||||||
package org.apache.poi.xslf.usermodel;
|
package org.apache.poi.xslf.usermodel;
|
||||||
|
|
||||||
import static org.apache.poi.xslf.usermodel.XSLFShape.PML_NS;
|
import static org.apache.poi.xslf.usermodel.XSLFShape.PML_NS;
|
||||||
|
@ -182,12 +182,20 @@ implements Slide<XSLFShape,XSLFTextParagraph> {
|
|||||||
*/
|
*/
|
||||||
public XSLFCommentAuthors getCommentAuthorsPart() {
|
public XSLFCommentAuthors getCommentAuthorsPart() {
|
||||||
if(_commentAuthors == null) {
|
if(_commentAuthors == null) {
|
||||||
|
// first scan the slide relations
|
||||||
for (POIXMLDocumentPart p : getRelations()) {
|
for (POIXMLDocumentPart p : getRelations()) {
|
||||||
if (p instanceof XSLFCommentAuthors) {
|
if (p instanceof XSLFCommentAuthors) {
|
||||||
_commentAuthors = (XSLFCommentAuthors)p;
|
_commentAuthors = (XSLFCommentAuthors)p;
|
||||||
return _commentAuthors;
|
return _commentAuthors;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
// then scan the presentation relations
|
||||||
|
for (POIXMLDocumentPart p : getSlideShow().getRelations()) {
|
||||||
|
if (p instanceof XSLFCommentAuthors) {
|
||||||
|
_commentAuthors = (XSLFCommentAuthors)p;
|
||||||
|
return _commentAuthors;
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
return null;
|
return null;
|
||||||
|
@ -27,16 +27,15 @@ import static org.junit.Assert.fail;
|
|||||||
import java.io.File;
|
import java.io.File;
|
||||||
import java.io.FileInputStream;
|
import java.io.FileInputStream;
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
|
import java.util.Locale;
|
||||||
|
|
||||||
import org.apache.poi.POIDataSamples;
|
import org.apache.poi.POIDataSamples;
|
||||||
import org.apache.poi.POIOLE2TextExtractor;
|
import org.apache.poi.POIOLE2TextExtractor;
|
||||||
import org.apache.poi.POITextExtractor;
|
import org.apache.poi.POITextExtractor;
|
||||||
import org.apache.poi.POIXMLException;
|
|
||||||
import org.apache.poi.POIXMLTextExtractor;
|
import org.apache.poi.POIXMLTextExtractor;
|
||||||
import org.apache.poi.UnsupportedFileFormatException;
|
import org.apache.poi.UnsupportedFileFormatException;
|
||||||
import org.apache.poi.hdgf.extractor.VisioTextExtractor;
|
import org.apache.poi.hdgf.extractor.VisioTextExtractor;
|
||||||
import org.apache.poi.hpbf.extractor.PublisherTextExtractor;
|
import org.apache.poi.hpbf.extractor.PublisherTextExtractor;
|
||||||
import org.apache.poi.hslf.extractor.PowerPointExtractor;
|
|
||||||
import org.apache.poi.hsmf.extractor.OutlookTextExtactor;
|
import org.apache.poi.hsmf.extractor.OutlookTextExtactor;
|
||||||
import org.apache.poi.hssf.HSSFTestDataSamples;
|
import org.apache.poi.hssf.HSSFTestDataSamples;
|
||||||
import org.apache.poi.hssf.OldExcelFormatException;
|
import org.apache.poi.hssf.OldExcelFormatException;
|
||||||
@ -44,18 +43,20 @@ import org.apache.poi.hssf.extractor.EventBasedExcelExtractor;
|
|||||||
import org.apache.poi.hssf.extractor.ExcelExtractor;
|
import org.apache.poi.hssf.extractor.ExcelExtractor;
|
||||||
import org.apache.poi.hwpf.extractor.Word6Extractor;
|
import org.apache.poi.hwpf.extractor.Word6Extractor;
|
||||||
import org.apache.poi.hwpf.extractor.WordExtractor;
|
import org.apache.poi.hwpf.extractor.WordExtractor;
|
||||||
|
import org.apache.poi.openxml4j.exceptions.OpenXML4JException;
|
||||||
import org.apache.poi.openxml4j.opc.OPCPackage;
|
import org.apache.poi.openxml4j.opc.OPCPackage;
|
||||||
import org.apache.poi.openxml4j.opc.PackageAccess;
|
import org.apache.poi.openxml4j.opc.PackageAccess;
|
||||||
import org.apache.poi.poifs.filesystem.OPOIFSFileSystem;
|
import org.apache.poi.poifs.filesystem.OPOIFSFileSystem;
|
||||||
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
|
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
|
||||||
|
import org.apache.poi.sl.extractor.SlideShowExtractor;
|
||||||
import org.apache.poi.util.POILogFactory;
|
import org.apache.poi.util.POILogFactory;
|
||||||
import org.apache.poi.util.POILogger;
|
import org.apache.poi.util.POILogger;
|
||||||
import org.apache.poi.xdgf.extractor.XDGFVisioExtractor;
|
import org.apache.poi.xdgf.extractor.XDGFVisioExtractor;
|
||||||
import org.apache.poi.xslf.extractor.XSLFPowerPointExtractor;
|
import org.apache.poi.xssf.extractor.XSSFBEventBasedExcelExtractor;
|
||||||
import org.apache.poi.xssf.extractor.XSSFEventBasedExcelExtractor;
|
import org.apache.poi.xssf.extractor.XSSFEventBasedExcelExtractor;
|
||||||
import org.apache.poi.xssf.extractor.XSSFExcelExtractor;
|
import org.apache.poi.xssf.extractor.XSSFExcelExtractor;
|
||||||
import org.apache.poi.xwpf.extractor.XWPFWordExtractor;
|
import org.apache.poi.xwpf.extractor.XWPFWordExtractor;
|
||||||
import org.junit.BeforeClass;
|
import org.apache.xmlbeans.XmlException;
|
||||||
import org.junit.Test;
|
import org.junit.Test;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@ -65,34 +66,39 @@ public class TestExtractorFactory {
|
|||||||
|
|
||||||
private static final POILogger LOG = POILogFactory.getLogger(TestExtractorFactory.class);
|
private static final POILogger LOG = POILogFactory.getLogger(TestExtractorFactory.class);
|
||||||
|
|
||||||
private static File txt;
|
private static final POIDataSamples ssTests = POIDataSamples.getSpreadSheetInstance();
|
||||||
|
private static final File xls = getFileAndCheck(ssTests, "SampleSS.xls");
|
||||||
|
private static final File xlsx = getFileAndCheck(ssTests, "SampleSS.xlsx");
|
||||||
|
private static final File xlsxStrict = getFileAndCheck(ssTests, "SampleSS.strict.xlsx");
|
||||||
|
private static final File xltx = getFileAndCheck(ssTests, "test.xltx");
|
||||||
|
private static final File xlsEmb = getFileAndCheck(ssTests, "excel_with_embeded.xls");
|
||||||
|
private static final File xlsb = getFileAndCheck(ssTests, "testVarious.xlsb");
|
||||||
|
|
||||||
private static File xls;
|
private static final POIDataSamples wpTests = POIDataSamples.getDocumentInstance();
|
||||||
private static File xlsx;
|
private static final File doc = getFileAndCheck(wpTests, "SampleDoc.doc");
|
||||||
private static File xlsxStrict;
|
private static final File doc6 = getFileAndCheck(wpTests, "Word6.doc");
|
||||||
private static File xltx;
|
private static final File doc95 = getFileAndCheck(wpTests, "Word95.doc");
|
||||||
private static File xlsEmb;
|
private static final File docx = getFileAndCheck(wpTests, "SampleDoc.docx");
|
||||||
private static File xlsb;
|
private static final File dotx = getFileAndCheck(wpTests, "test.dotx");
|
||||||
|
private static final File docEmb = getFileAndCheck(wpTests, "word_with_embeded.doc");
|
||||||
|
private static final File docEmbOOXML = getFileAndCheck(wpTests, "word_with_embeded_ooxml.doc");
|
||||||
|
|
||||||
private static File doc;
|
private static final POIDataSamples slTests = POIDataSamples.getSlideShowInstance();
|
||||||
private static File doc6;
|
private static final File ppt = getFileAndCheck(slTests, "SampleShow.ppt");
|
||||||
private static File doc95;
|
private static final File pptx = getFileAndCheck(slTests, "SampleShow.pptx");
|
||||||
private static File docx;
|
private static final File txt = getFileAndCheck(slTests, "SampleShow.txt");
|
||||||
private static File dotx;
|
|
||||||
private static File docEmb;
|
|
||||||
private static File docEmbOOXML;
|
|
||||||
|
|
||||||
private static File ppt;
|
private static final POIDataSamples olTests = POIDataSamples.getHSMFInstance();
|
||||||
private static File pptx;
|
private static final File msg = getFileAndCheck(olTests, "quick.msg");
|
||||||
|
private static final File msgEmb = getFileAndCheck(olTests, "attachment_test_msg.msg");
|
||||||
|
private static final File msgEmbMsg = getFileAndCheck(olTests, "attachment_msg_pdf.msg");
|
||||||
|
|
||||||
private static File msg;
|
private static final POIDataSamples dgTests = POIDataSamples.getDiagramInstance();
|
||||||
private static File msgEmb;
|
private static final File vsd = getFileAndCheck(dgTests, "Test_Visio-Some_Random_Text.vsd");
|
||||||
private static File msgEmbMsg;
|
private static final File vsdx = getFileAndCheck(dgTests, "test.vsdx");
|
||||||
|
|
||||||
private static File vsd;
|
private static POIDataSamples pubTests = POIDataSamples.getPublisherInstance();
|
||||||
private static File vsdx;
|
private static File pub = getFileAndCheck(pubTests, "Simple.pub");
|
||||||
|
|
||||||
private static File pub;
|
|
||||||
|
|
||||||
private static File getFileAndCheck(POIDataSamples samples, String name) {
|
private static File getFileAndCheck(POIDataSamples samples, String name) {
|
||||||
File file = samples.getFile(name);
|
File file = samples.getFile(name);
|
||||||
@ -104,595 +110,133 @@ public class TestExtractorFactory {
|
|||||||
return file;
|
return file;
|
||||||
}
|
}
|
||||||
|
|
||||||
@BeforeClass
|
private static final Object[] TEST_SET = {
|
||||||
public static void setUp() throws Exception {
|
"Excel", xls, ExcelExtractor.class, 200,
|
||||||
|
"Excel - xlsx", xlsx, XSSFExcelExtractor.class, 200,
|
||||||
|
"Excel - xltx", xltx, XSSFExcelExtractor.class, -1,
|
||||||
|
"Excel - xlsb", xlsb, XSSFBEventBasedExcelExtractor.class, -1,
|
||||||
|
"Word", doc, WordExtractor.class, 120,
|
||||||
|
"Word - docx", docx, XWPFWordExtractor.class, 120,
|
||||||
|
"Word - dotx", dotx, XWPFWordExtractor.class, -1,
|
||||||
|
"Word 6", doc6, Word6Extractor.class, 20,
|
||||||
|
"Word 95", doc95, Word6Extractor.class, 120,
|
||||||
|
"PowerPoint", ppt, SlideShowExtractor.class, 120,
|
||||||
|
"PowerPoint - pptx", pptx, SlideShowExtractor.class, 120,
|
||||||
|
"Visio", vsd, VisioTextExtractor.class, 50,
|
||||||
|
"Visio - vsdx", vsdx, XDGFVisioExtractor.class, 20,
|
||||||
|
"Publisher", pub, PublisherTextExtractor.class, 50,
|
||||||
|
"Outlook msg", msg, OutlookTextExtactor.class, 50,
|
||||||
|
|
||||||
POIDataSamples ssTests = POIDataSamples.getSpreadSheetInstance();
|
// TODO Support OOXML-Strict, see bug #57699
|
||||||
xls = getFileAndCheck(ssTests, "SampleSS.xls");
|
// xlsxStrict
|
||||||
xlsx = getFileAndCheck(ssTests, "SampleSS.xlsx");
|
};
|
||||||
xlsxStrict = getFileAndCheck(ssTests, "SampleSS.strict.xlsx");
|
|
||||||
xltx = getFileAndCheck(ssTests, "test.xltx");
|
|
||||||
xlsEmb = getFileAndCheck(ssTests, "excel_with_embeded.xls");
|
|
||||||
xlsb = getFileAndCheck(ssTests, "testVarious.xlsb");
|
|
||||||
|
|
||||||
POIDataSamples wpTests = POIDataSamples.getDocumentInstance();
|
@FunctionalInterface
|
||||||
doc = getFileAndCheck(wpTests, "SampleDoc.doc");
|
interface FunctionEx<T, R> {
|
||||||
doc6 = getFileAndCheck(wpTests, "Word6.doc");
|
R apply(T t) throws IOException, OpenXML4JException, XmlException;
|
||||||
doc95 = getFileAndCheck(wpTests, "Word95.doc");
|
|
||||||
docx = getFileAndCheck(wpTests, "SampleDoc.docx");
|
|
||||||
dotx = getFileAndCheck(wpTests, "test.dotx");
|
|
||||||
docEmb = getFileAndCheck(wpTests, "word_with_embeded.doc");
|
|
||||||
docEmbOOXML = getFileAndCheck(wpTests, "word_with_embeded_ooxml.doc");
|
|
||||||
|
|
||||||
POIDataSamples slTests = POIDataSamples.getSlideShowInstance();
|
|
||||||
ppt = getFileAndCheck(slTests, "SampleShow.ppt");
|
|
||||||
pptx = getFileAndCheck(slTests, "SampleShow.pptx");
|
|
||||||
txt = getFileAndCheck(slTests, "SampleShow.txt");
|
|
||||||
|
|
||||||
POIDataSamples dgTests = POIDataSamples.getDiagramInstance();
|
|
||||||
vsd = getFileAndCheck(dgTests, "Test_Visio-Some_Random_Text.vsd");
|
|
||||||
vsdx = getFileAndCheck(dgTests, "test.vsdx");
|
|
||||||
|
|
||||||
POIDataSamples pubTests = POIDataSamples.getPublisherInstance();
|
|
||||||
pub = getFileAndCheck(pubTests, "Simple.pub");
|
|
||||||
|
|
||||||
POIDataSamples olTests = POIDataSamples.getHSMFInstance();
|
|
||||||
msg = getFileAndCheck(olTests, "quick.msg");
|
|
||||||
msgEmb = getFileAndCheck(olTests, "attachment_test_msg.msg");
|
|
||||||
msgEmbMsg = getFileAndCheck(olTests, "attachment_msg_pdf.msg");
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void testFile() throws Exception {
|
public void testFile() throws Exception {
|
||||||
// Excel
|
for (int i = 0; i < TEST_SET.length; i += 4) {
|
||||||
POITextExtractor xlsExtractor = ExtractorFactory.createExtractor(xls);
|
try (POITextExtractor ext = ExtractorFactory.createExtractor((File) TEST_SET[i + 1])) {
|
||||||
assertNotNull("Had empty extractor for " + xls, xlsExtractor);
|
testExtractor(ext, (String) TEST_SET[i], (Class) TEST_SET[i + 2], (Integer) TEST_SET[i + 3]);
|
||||||
assertTrue("Expected instanceof ExcelExtractor, but had: " + xlsExtractor.getClass(),
|
}
|
||||||
xlsExtractor
|
}
|
||||||
instanceof ExcelExtractor
|
|
||||||
);
|
|
||||||
assertTrue(
|
|
||||||
xlsExtractor.getText().length() > 200
|
|
||||||
);
|
|
||||||
xlsExtractor.close();
|
|
||||||
|
|
||||||
POITextExtractor extractor = ExtractorFactory.createExtractor(xlsx);
|
|
||||||
assertTrue(
|
|
||||||
extractor.getClass().getName(),
|
|
||||||
extractor
|
|
||||||
instanceof XSSFExcelExtractor
|
|
||||||
);
|
|
||||||
extractor.close();
|
|
||||||
|
|
||||||
extractor = ExtractorFactory.createExtractor(xlsx);
|
|
||||||
assertTrue(
|
|
||||||
extractor.getText().length() > 200
|
|
||||||
);
|
|
||||||
extractor.close();
|
|
||||||
|
|
||||||
extractor = ExtractorFactory.createExtractor(xltx);
|
|
||||||
assertTrue(
|
|
||||||
extractor.getClass().getName(),
|
|
||||||
extractor
|
|
||||||
instanceof XSSFExcelExtractor
|
|
||||||
);
|
|
||||||
extractor.close();
|
|
||||||
|
|
||||||
extractor = ExtractorFactory.createExtractor(xlsb);
|
|
||||||
assertContains(extractor.getText(), "test");
|
|
||||||
extractor.close();
|
|
||||||
|
|
||||||
|
|
||||||
extractor = ExtractorFactory.createExtractor(xltx);
|
|
||||||
assertContains(extractor.getText(), "test");
|
|
||||||
extractor.close();
|
|
||||||
|
|
||||||
// TODO Support OOXML-Strict, see bug #57699
|
|
||||||
try {
|
|
||||||
/*extractor =*/ ExtractorFactory.createExtractor(xlsxStrict);
|
|
||||||
fail("OOXML-Strict isn't yet supported");
|
|
||||||
} catch (POIXMLException e) {
|
|
||||||
// Expected, for now
|
|
||||||
}
|
}
|
||||||
// extractor = ExtractorFactory.createExtractor(xlsxStrict);
|
|
||||||
// assertTrue(
|
|
||||||
// extractor
|
|
||||||
// instanceof XSSFExcelExtractor
|
|
||||||
// );
|
|
||||||
// extractor.close();
|
|
||||||
//
|
|
||||||
// extractor = ExtractorFactory.createExtractor(xlsxStrict);
|
|
||||||
// assertTrue(
|
|
||||||
// extractor.getText().contains("test")
|
|
||||||
// );
|
|
||||||
// extractor.close();
|
|
||||||
|
|
||||||
|
|
||||||
// Word
|
|
||||||
extractor = ExtractorFactory.createExtractor(doc);
|
|
||||||
assertTrue(
|
|
||||||
extractor
|
|
||||||
instanceof WordExtractor
|
|
||||||
);
|
|
||||||
assertTrue(
|
|
||||||
extractor.getText().length() > 120
|
|
||||||
);
|
|
||||||
extractor.close();
|
|
||||||
|
|
||||||
extractor = ExtractorFactory.createExtractor(doc6);
|
|
||||||
assertTrue(
|
|
||||||
extractor
|
|
||||||
instanceof Word6Extractor
|
|
||||||
);
|
|
||||||
assertTrue(
|
|
||||||
extractor.getText().length() > 20
|
|
||||||
);
|
|
||||||
extractor.close();
|
|
||||||
|
|
||||||
extractor = ExtractorFactory.createExtractor(doc95);
|
|
||||||
assertTrue(
|
|
||||||
extractor
|
|
||||||
instanceof Word6Extractor
|
|
||||||
);
|
|
||||||
assertTrue(
|
|
||||||
extractor.getText().length() > 120
|
|
||||||
);
|
|
||||||
extractor.close();
|
|
||||||
|
|
||||||
extractor = ExtractorFactory.createExtractor(docx);
|
|
||||||
assertTrue(
|
|
||||||
extractor instanceof XWPFWordExtractor
|
|
||||||
);
|
|
||||||
extractor.close();
|
|
||||||
|
|
||||||
extractor = ExtractorFactory.createExtractor(docx);
|
|
||||||
assertTrue(
|
|
||||||
extractor.getText().length() > 120
|
|
||||||
);
|
|
||||||
extractor.close();
|
|
||||||
|
|
||||||
extractor = ExtractorFactory.createExtractor(dotx);
|
|
||||||
assertTrue(
|
|
||||||
extractor instanceof XWPFWordExtractor
|
|
||||||
);
|
|
||||||
extractor.close();
|
|
||||||
|
|
||||||
extractor = ExtractorFactory.createExtractor(dotx);
|
|
||||||
assertContains(extractor.getText(), "Test");
|
|
||||||
extractor.close();
|
|
||||||
|
|
||||||
// PowerPoint (PPT)
|
|
||||||
extractor = ExtractorFactory.createExtractor(ppt);
|
|
||||||
assertTrue(
|
|
||||||
extractor
|
|
||||||
instanceof PowerPointExtractor
|
|
||||||
);
|
|
||||||
assertTrue(
|
|
||||||
extractor.getText().length() > 120
|
|
||||||
);
|
|
||||||
extractor.close();
|
|
||||||
|
|
||||||
// PowerPoint (PPTX)
|
|
||||||
extractor = ExtractorFactory.createExtractor(pptx);
|
|
||||||
assertTrue(
|
|
||||||
extractor
|
|
||||||
instanceof XSLFPowerPointExtractor
|
|
||||||
);
|
|
||||||
assertTrue(
|
|
||||||
extractor.getText().length() > 120
|
|
||||||
);
|
|
||||||
extractor.close();
|
|
||||||
|
|
||||||
// Visio - binary
|
|
||||||
extractor = ExtractorFactory.createExtractor(vsd);
|
|
||||||
assertTrue(
|
|
||||||
extractor
|
|
||||||
instanceof VisioTextExtractor
|
|
||||||
);
|
|
||||||
assertTrue(
|
|
||||||
extractor.getText().length() > 50
|
|
||||||
);
|
|
||||||
extractor.close();
|
|
||||||
|
|
||||||
// Visio - vsdx
|
|
||||||
extractor = ExtractorFactory.createExtractor(vsdx);
|
|
||||||
assertTrue(
|
|
||||||
extractor
|
|
||||||
instanceof XDGFVisioExtractor
|
|
||||||
);
|
|
||||||
assertTrue(
|
|
||||||
extractor.getText().length() > 20
|
|
||||||
);
|
|
||||||
extractor.close();
|
|
||||||
|
|
||||||
// Publisher
|
|
||||||
extractor = ExtractorFactory.createExtractor(pub);
|
|
||||||
assertTrue(
|
|
||||||
extractor
|
|
||||||
instanceof PublisherTextExtractor
|
|
||||||
);
|
|
||||||
assertTrue(
|
|
||||||
extractor.getText().length() > 50
|
|
||||||
);
|
|
||||||
extractor.close();
|
|
||||||
|
|
||||||
// Outlook msg
|
|
||||||
extractor = ExtractorFactory.createExtractor(msg);
|
|
||||||
assertTrue(
|
|
||||||
extractor
|
|
||||||
instanceof OutlookTextExtactor
|
|
||||||
);
|
|
||||||
assertTrue(
|
|
||||||
extractor.getText().length() > 50
|
|
||||||
);
|
|
||||||
extractor.close();
|
|
||||||
|
|
||||||
|
@Test(expected = IllegalArgumentException.class)
|
||||||
|
public void testFileInvalid() throws Exception {
|
||||||
// Text
|
// Text
|
||||||
try {
|
try (POITextExtractor te = ExtractorFactory.createExtractor(txt)) {}
|
||||||
ExtractorFactory.createExtractor(txt);
|
|
||||||
fail("expected IllegalArgumentException");
|
|
||||||
} catch(IllegalArgumentException e) {
|
|
||||||
// Good
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void testInputStream() throws Exception {
|
public void testInputStream() throws Exception {
|
||||||
// Excel
|
testStream((f) -> ExtractorFactory.createExtractor(f), true);
|
||||||
POITextExtractor extractor = ExtractorFactory.createExtractor(new FileInputStream(xls));
|
|
||||||
assertTrue(
|
|
||||||
extractor
|
|
||||||
instanceof ExcelExtractor
|
|
||||||
);
|
|
||||||
assertTrue(
|
|
||||||
extractor.getText().length() > 200
|
|
||||||
);
|
|
||||||
extractor.close();
|
|
||||||
|
|
||||||
extractor = ExtractorFactory.createExtractor(new FileInputStream(xlsx));
|
|
||||||
assertTrue(
|
|
||||||
extractor.getClass().getName(),
|
|
||||||
extractor
|
|
||||||
instanceof XSSFExcelExtractor
|
|
||||||
);
|
|
||||||
assertTrue(
|
|
||||||
extractor.getText().length() > 200
|
|
||||||
);
|
|
||||||
// TODO Support OOXML-Strict, see bug #57699
|
|
||||||
// assertTrue(
|
|
||||||
// ExtractorFactory.createExtractor(new FileInputStream(xlsxStrict))
|
|
||||||
// instanceof XSSFExcelExtractor
|
|
||||||
// );
|
|
||||||
// assertTrue(
|
|
||||||
// ExtractorFactory.createExtractor(new FileInputStream(xlsxStrict)).getText().length() > 200
|
|
||||||
// );
|
|
||||||
extractor.close();
|
|
||||||
|
|
||||||
// Word
|
|
||||||
extractor = ExtractorFactory.createExtractor(new FileInputStream(doc));
|
|
||||||
assertTrue(
|
|
||||||
extractor.getClass().getName(),
|
|
||||||
extractor
|
|
||||||
instanceof WordExtractor
|
|
||||||
);
|
|
||||||
assertTrue(
|
|
||||||
extractor.getText().length() > 120
|
|
||||||
);
|
|
||||||
extractor.close();
|
|
||||||
|
|
||||||
extractor = ExtractorFactory.createExtractor(new FileInputStream(doc6));
|
|
||||||
assertTrue(
|
|
||||||
extractor.getClass().getName(),
|
|
||||||
extractor
|
|
||||||
instanceof Word6Extractor
|
|
||||||
);
|
|
||||||
assertTrue(
|
|
||||||
extractor.getText().length() > 20
|
|
||||||
);
|
|
||||||
extractor.close();
|
|
||||||
|
|
||||||
extractor = ExtractorFactory.createExtractor(new FileInputStream(doc95));
|
|
||||||
assertTrue(
|
|
||||||
extractor.getClass().getName(),
|
|
||||||
extractor
|
|
||||||
instanceof Word6Extractor
|
|
||||||
);
|
|
||||||
assertTrue(
|
|
||||||
extractor.getText().length() > 120
|
|
||||||
);
|
|
||||||
extractor.close();
|
|
||||||
|
|
||||||
extractor = ExtractorFactory.createExtractor(new FileInputStream(docx));
|
|
||||||
assertTrue(
|
|
||||||
extractor
|
|
||||||
instanceof XWPFWordExtractor
|
|
||||||
);
|
|
||||||
assertTrue(
|
|
||||||
extractor.getText().length() > 120
|
|
||||||
);
|
|
||||||
extractor.close();
|
|
||||||
|
|
||||||
// PowerPoint
|
|
||||||
extractor = ExtractorFactory.createExtractor(new FileInputStream(ppt));
|
|
||||||
assertTrue(
|
|
||||||
extractor
|
|
||||||
instanceof PowerPointExtractor
|
|
||||||
);
|
|
||||||
assertTrue(
|
|
||||||
extractor.getText().length() > 120
|
|
||||||
);
|
|
||||||
extractor.close();
|
|
||||||
|
|
||||||
extractor = ExtractorFactory.createExtractor(new FileInputStream(pptx));
|
|
||||||
assertTrue(
|
|
||||||
extractor
|
|
||||||
instanceof XSLFPowerPointExtractor
|
|
||||||
);
|
|
||||||
assertTrue(
|
|
||||||
extractor.getText().length() > 120
|
|
||||||
);
|
|
||||||
extractor.close();
|
|
||||||
|
|
||||||
// Visio
|
|
||||||
extractor = ExtractorFactory.createExtractor(new FileInputStream(vsd));
|
|
||||||
assertTrue(
|
|
||||||
extractor
|
|
||||||
instanceof VisioTextExtractor
|
|
||||||
);
|
|
||||||
assertTrue(
|
|
||||||
extractor.getText().length() > 50
|
|
||||||
);
|
|
||||||
extractor.close();
|
|
||||||
|
|
||||||
// Visio - vsdx
|
|
||||||
extractor = ExtractorFactory.createExtractor(new FileInputStream(vsdx));
|
|
||||||
assertTrue(
|
|
||||||
extractor
|
|
||||||
instanceof XDGFVisioExtractor
|
|
||||||
);
|
|
||||||
assertTrue(
|
|
||||||
extractor.getText().length() > 20
|
|
||||||
);
|
|
||||||
extractor.close();
|
|
||||||
|
|
||||||
// Publisher
|
|
||||||
extractor = ExtractorFactory.createExtractor(new FileInputStream(pub));
|
|
||||||
assertTrue(
|
|
||||||
extractor
|
|
||||||
instanceof PublisherTextExtractor
|
|
||||||
);
|
|
||||||
assertTrue(
|
|
||||||
extractor.getText().length() > 50
|
|
||||||
);
|
|
||||||
extractor.close();
|
|
||||||
|
|
||||||
// Outlook msg
|
|
||||||
extractor = ExtractorFactory.createExtractor(new FileInputStream(msg));
|
|
||||||
assertTrue(
|
|
||||||
extractor
|
|
||||||
instanceof OutlookTextExtactor
|
|
||||||
);
|
|
||||||
assertTrue(
|
|
||||||
extractor.getText().length() > 50
|
|
||||||
);
|
|
||||||
extractor.close();
|
|
||||||
|
|
||||||
// Text
|
|
||||||
try (FileInputStream stream = new FileInputStream(txt)) {
|
|
||||||
ExtractorFactory.createExtractor(stream);
|
|
||||||
fail("expected IllegalArgumentException");
|
|
||||||
} catch(IllegalArgumentException e) {
|
|
||||||
// Good
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Test(expected = IllegalArgumentException.class)
|
||||||
|
public void testInputStreamInvalid() throws Exception {
|
||||||
|
testInvalid((f) -> ExtractorFactory.createExtractor(f));
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void testPOIFS() throws Exception {
|
public void testPOIFS() throws Exception {
|
||||||
// Excel
|
testStream((f) -> ExtractorFactory.createExtractor(new POIFSFileSystem(f)), false);
|
||||||
assertTrue(
|
|
||||||
ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(xls)))
|
|
||||||
instanceof ExcelExtractor
|
|
||||||
);
|
|
||||||
assertTrue(
|
|
||||||
ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(xls))).getText().length() > 200
|
|
||||||
);
|
|
||||||
|
|
||||||
// Word
|
|
||||||
assertTrue(
|
|
||||||
ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(doc)))
|
|
||||||
instanceof WordExtractor
|
|
||||||
);
|
|
||||||
assertTrue(
|
|
||||||
ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(doc))).getText().length() > 120
|
|
||||||
);
|
|
||||||
|
|
||||||
assertTrue(
|
|
||||||
ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(doc6)))
|
|
||||||
instanceof Word6Extractor
|
|
||||||
);
|
|
||||||
assertTrue(
|
|
||||||
ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(doc6))).getText().length() > 20
|
|
||||||
);
|
|
||||||
|
|
||||||
assertTrue(
|
|
||||||
ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(doc95)))
|
|
||||||
instanceof Word6Extractor
|
|
||||||
);
|
|
||||||
assertTrue(
|
|
||||||
ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(doc95))).getText().length() > 120
|
|
||||||
);
|
|
||||||
|
|
||||||
// PowerPoint
|
|
||||||
assertTrue(
|
|
||||||
ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(ppt)))
|
|
||||||
instanceof PowerPointExtractor
|
|
||||||
);
|
|
||||||
assertTrue(
|
|
||||||
ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(ppt))).getText().length() > 120
|
|
||||||
);
|
|
||||||
|
|
||||||
// Visio
|
|
||||||
assertTrue(
|
|
||||||
ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(vsd)))
|
|
||||||
instanceof VisioTextExtractor
|
|
||||||
);
|
|
||||||
assertTrue(
|
|
||||||
ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(vsd))).getText().length() > 50
|
|
||||||
);
|
|
||||||
|
|
||||||
// Publisher
|
|
||||||
assertTrue(
|
|
||||||
ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(pub)))
|
|
||||||
instanceof PublisherTextExtractor
|
|
||||||
);
|
|
||||||
assertTrue(
|
|
||||||
ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(pub))).getText().length() > 50
|
|
||||||
);
|
|
||||||
|
|
||||||
// Outlook msg
|
|
||||||
assertTrue(
|
|
||||||
ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(msg)))
|
|
||||||
instanceof OutlookTextExtactor
|
|
||||||
);
|
|
||||||
assertTrue(
|
|
||||||
ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(msg))).getText().length() > 50
|
|
||||||
);
|
|
||||||
|
|
||||||
// Text
|
|
||||||
try {
|
|
||||||
ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(txt)));
|
|
||||||
fail("expected IllegalArgumentException");
|
|
||||||
} catch(IOException e) {
|
|
||||||
// Good
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Test(expected = IOException.class)
|
||||||
|
public void testPOIFSInvalid() throws Exception {
|
||||||
|
testInvalid((f) -> ExtractorFactory.createExtractor(new POIFSFileSystem(f)));
|
||||||
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void testOPOIFS() throws Exception {
|
public void testOPOIFS() throws Exception {
|
||||||
// Excel
|
testStream((f) -> ExtractorFactory.createExtractor(new OPOIFSFileSystem(f)), false);
|
||||||
assertTrue(
|
}
|
||||||
ExtractorFactory.createExtractor(new OPOIFSFileSystem(new FileInputStream(xls)))
|
|
||||||
instanceof ExcelExtractor
|
|
||||||
);
|
|
||||||
assertTrue(
|
|
||||||
ExtractorFactory.createExtractor(new OPOIFSFileSystem(new FileInputStream(xls))).getText().length() > 200
|
|
||||||
);
|
|
||||||
|
|
||||||
// Word
|
@Test(expected = IOException.class)
|
||||||
assertTrue(
|
public void testOPOIFSInvalid() throws Exception {
|
||||||
ExtractorFactory.createExtractor(new OPOIFSFileSystem(new FileInputStream(doc)))
|
testInvalid((f) -> ExtractorFactory.createExtractor(new OPOIFSFileSystem(f)));
|
||||||
instanceof WordExtractor
|
}
|
||||||
);
|
|
||||||
assertTrue(
|
|
||||||
ExtractorFactory.createExtractor(new OPOIFSFileSystem(new FileInputStream(doc))).getText().length() > 120
|
|
||||||
);
|
|
||||||
|
|
||||||
assertTrue(
|
|
||||||
ExtractorFactory.createExtractor(new OPOIFSFileSystem(new FileInputStream(doc6)))
|
|
||||||
instanceof Word6Extractor
|
|
||||||
);
|
|
||||||
assertTrue(
|
|
||||||
ExtractorFactory.createExtractor(new OPOIFSFileSystem(new FileInputStream(doc6))).getText().length() > 20
|
|
||||||
);
|
|
||||||
|
|
||||||
assertTrue(
|
private void testStream(final FunctionEx<FileInputStream, POITextExtractor> poifsIS, final boolean loadOOXML)
|
||||||
ExtractorFactory.createExtractor(new OPOIFSFileSystem(new FileInputStream(doc95)))
|
throws IOException, OpenXML4JException, XmlException {
|
||||||
instanceof Word6Extractor
|
for (int i = 0; i < TEST_SET.length; i += 4) {
|
||||||
);
|
File testFile = (File) TEST_SET[i + 1];
|
||||||
assertTrue(
|
if (!loadOOXML && (testFile.getName().endsWith("x") || testFile.getName().endsWith("xlsb"))) {
|
||||||
ExtractorFactory.createExtractor(new OPOIFSFileSystem(new FileInputStream(doc95))).getText().length() > 120
|
continue;
|
||||||
);
|
}
|
||||||
|
try (FileInputStream fis = new FileInputStream(testFile);
|
||||||
|
POITextExtractor ext = poifsIS.apply(fis)) {
|
||||||
|
testExtractor(ext, (String) TEST_SET[i], (Class) TEST_SET[i + 2], (Integer) TEST_SET[i + 3]);
|
||||||
|
} catch (IllegalArgumentException e) {
|
||||||
|
fail("failed to process "+testFile);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// PowerPoint
|
private void testExtractor(final POITextExtractor ext, final String testcase, final Class extrClass, final Integer minLength) {
|
||||||
assertTrue(
|
assertTrue("invalid extractor for " + testcase, extrClass.isInstance(ext));
|
||||||
ExtractorFactory.createExtractor(new OPOIFSFileSystem(new FileInputStream(ppt)))
|
final String actual = ext.getText();
|
||||||
instanceof PowerPointExtractor
|
if (minLength == -1) {
|
||||||
);
|
assertContains(actual.toLowerCase(Locale.ROOT), "test");
|
||||||
assertTrue(
|
} else {
|
||||||
ExtractorFactory.createExtractor(new OPOIFSFileSystem(new FileInputStream(ppt))).getText().length() > 120
|
assertTrue("extracted content too short for " + testcase, actual.length() > minLength);
|
||||||
);
|
}
|
||||||
|
}
|
||||||
// Visio
|
|
||||||
assertTrue(
|
|
||||||
ExtractorFactory.createExtractor(new OPOIFSFileSystem(new FileInputStream(vsd)))
|
|
||||||
instanceof VisioTextExtractor
|
|
||||||
);
|
|
||||||
assertTrue(
|
|
||||||
ExtractorFactory.createExtractor(new OPOIFSFileSystem(new FileInputStream(vsd))).getText().length() > 50
|
|
||||||
);
|
|
||||||
|
|
||||||
// Publisher
|
|
||||||
assertTrue(
|
|
||||||
ExtractorFactory.createExtractor(new OPOIFSFileSystem(new FileInputStream(pub)))
|
|
||||||
instanceof PublisherTextExtractor
|
|
||||||
);
|
|
||||||
assertTrue(
|
|
||||||
ExtractorFactory.createExtractor(new OPOIFSFileSystem(new FileInputStream(pub))).getText().length() > 50
|
|
||||||
);
|
|
||||||
|
|
||||||
// Outlook msg
|
|
||||||
assertTrue(
|
|
||||||
ExtractorFactory.createExtractor(new OPOIFSFileSystem(new FileInputStream(msg)))
|
|
||||||
instanceof OutlookTextExtactor
|
|
||||||
);
|
|
||||||
assertTrue(
|
|
||||||
ExtractorFactory.createExtractor(new OPOIFSFileSystem(new FileInputStream(msg))).getText().length() > 50
|
|
||||||
);
|
|
||||||
|
|
||||||
|
private void testInvalid(FunctionEx<FileInputStream, POITextExtractor> poifs) throws IOException, OpenXML4JException, XmlException {
|
||||||
// Text
|
// Text
|
||||||
try {
|
try (FileInputStream fis = new FileInputStream(txt);
|
||||||
ExtractorFactory.createExtractor(new OPOIFSFileSystem(new FileInputStream(txt)));
|
POITextExtractor te = poifs.apply(fis)) {
|
||||||
fail("expected IllegalArgumentException");
|
|
||||||
} catch(IOException e) {
|
|
||||||
// Good
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void testPackage() throws Exception {
|
public void testPackage() throws Exception {
|
||||||
// Excel
|
for (int i = 0; i < TEST_SET.length; i += 4) {
|
||||||
POIXMLTextExtractor extractor = ExtractorFactory.createExtractor(OPCPackage.open(xlsx.toString(), PackageAccess.READ));
|
final File testFile = (File) TEST_SET[i + 1];
|
||||||
assertTrue(extractor instanceof XSSFExcelExtractor);
|
if (!testFile.getName().endsWith("x")) {
|
||||||
extractor.close();
|
continue;
|
||||||
extractor = ExtractorFactory.createExtractor(OPCPackage.open(xlsx.toString()));
|
|
||||||
assertTrue(extractor.getText().length() > 200);
|
|
||||||
extractor.close();
|
|
||||||
|
|
||||||
// Word
|
|
||||||
extractor = ExtractorFactory.createExtractor(OPCPackage.open(docx.toString()));
|
|
||||||
assertTrue(extractor instanceof XWPFWordExtractor);
|
|
||||||
extractor.close();
|
|
||||||
|
|
||||||
extractor = ExtractorFactory.createExtractor(OPCPackage.open(docx.toString()));
|
|
||||||
assertTrue(extractor.getText().length() > 120);
|
|
||||||
extractor.close();
|
|
||||||
|
|
||||||
// PowerPoint
|
|
||||||
extractor = ExtractorFactory.createExtractor(OPCPackage.open(pptx.toString()));
|
|
||||||
assertTrue(extractor instanceof XSLFPowerPointExtractor);
|
|
||||||
extractor.close();
|
|
||||||
|
|
||||||
extractor = ExtractorFactory.createExtractor(OPCPackage.open(pptx.toString()));
|
|
||||||
assertTrue(extractor.getText().length() > 120);
|
|
||||||
extractor.close();
|
|
||||||
|
|
||||||
// Visio
|
|
||||||
extractor = ExtractorFactory.createExtractor(OPCPackage.open(vsdx.toString()));
|
|
||||||
assertTrue(extractor instanceof XDGFVisioExtractor);
|
|
||||||
assertTrue(extractor.getText().length() > 20);
|
|
||||||
extractor.close();
|
|
||||||
|
|
||||||
// Text
|
|
||||||
try {
|
|
||||||
ExtractorFactory.createExtractor(OPCPackage.open(txt.toString()));
|
|
||||||
fail("TestExtractorFactory.testPackage() failed on " + txt);
|
|
||||||
} catch(UnsupportedFileFormatException e) {
|
|
||||||
// Good
|
|
||||||
} catch (Exception e) {
|
|
||||||
LOG.log(POILogger.WARN, "TestExtractorFactory.testPackage() failed on " + txt);
|
|
||||||
throw e;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
try (final OPCPackage pkg = OPCPackage.open(testFile, PackageAccess.READ);
|
||||||
|
final POITextExtractor ext = ExtractorFactory.createExtractor(pkg)) {
|
||||||
|
testExtractor(ext, (String) TEST_SET[i], (Class) TEST_SET[i + 2], (Integer) TEST_SET[i + 3]);
|
||||||
|
pkg.revert();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test(expected = UnsupportedFileFormatException.class)
|
||||||
|
public void testPackageInvalid() throws Exception {
|
||||||
|
// Text
|
||||||
|
try (final OPCPackage pkg = OPCPackage.open(txt, PackageAccess.READ);
|
||||||
|
final POITextExtractor te = ExtractorFactory.createExtractor(pkg)) {}
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
@ -781,142 +325,49 @@ public class TestExtractorFactory {
|
|||||||
* does poifs embedded, but will do ooxml ones
|
* does poifs embedded, but will do ooxml ones
|
||||||
* at some point.
|
* at some point.
|
||||||
*/
|
*/
|
||||||
@SuppressWarnings("deprecation")
|
|
||||||
@Test
|
@Test
|
||||||
public void testEmbedded() throws Exception {
|
public void testEmbedded() throws Exception {
|
||||||
POIOLE2TextExtractor ext;
|
final Object[] testObj = {
|
||||||
POITextExtractor[] embeds;
|
"No embeddings", xls, "0-0-0-0-0-0",
|
||||||
|
"Excel", xlsEmb, "6-2-2-2-0-0",
|
||||||
|
"Word", docEmb, "4-1-2-1-0-0",
|
||||||
|
"Word which contains an OOXML file", docEmbOOXML, "3-0-1-1-0-1",
|
||||||
|
"Outlook", msgEmb, "1-1-0-0-0-0",
|
||||||
|
"Outlook with another outlook file in it", msgEmbMsg, "1-0-0-0-1-0",
|
||||||
|
};
|
||||||
|
|
||||||
// No embeddings
|
for (int i=0; i<testObj.length; i+=3) {
|
||||||
ext = (POIOLE2TextExtractor)
|
try (final POIOLE2TextExtractor ext = ExtractorFactory.createExtractor((File)testObj[i+1])) {
|
||||||
ExtractorFactory.createExtractor(xls);
|
final POITextExtractor[] embeds = ExtractorFactory.getEmbeddedDocsTextExtractors(ext);
|
||||||
embeds = ExtractorFactory.getEmbededDocsTextExtractors(ext);
|
|
||||||
assertEquals(0, embeds.length);
|
|
||||||
ext.close();
|
|
||||||
|
|
||||||
// No embeddings
|
int numWord = 0, numXls = 0, numPpt = 0, numMsg = 0, numWordX = 0;
|
||||||
ext = (POIOLE2TextExtractor)
|
|
||||||
ExtractorFactory.createExtractor(xls);
|
|
||||||
embeds = ExtractorFactory.getEmbeddedDocsTextExtractors(ext);
|
|
||||||
assertEquals(0, embeds.length);
|
|
||||||
ext.close();
|
|
||||||
|
|
||||||
// Excel
|
|
||||||
ext = (POIOLE2TextExtractor)
|
|
||||||
ExtractorFactory.createExtractor(xlsEmb);
|
|
||||||
embeds = ExtractorFactory.getEmbededDocsTextExtractors(ext);
|
|
||||||
assertNotNull(embeds);
|
|
||||||
ext.close();
|
|
||||||
|
|
||||||
// Excel
|
|
||||||
ext = (POIOLE2TextExtractor)
|
|
||||||
ExtractorFactory.createExtractor(xlsEmb);
|
|
||||||
embeds = ExtractorFactory.getEmbeddedDocsTextExtractors(ext);
|
|
||||||
|
|
||||||
assertEquals(6, embeds.length);
|
|
||||||
int numWord = 0, numXls = 0, numPpt = 0, numMsg = 0, numWordX;
|
|
||||||
for (POITextExtractor embed : embeds) {
|
for (POITextExtractor embed : embeds) {
|
||||||
assertTrue(embed.getText().length() > 20);
|
assertTrue(embed.getText().length() > 20);
|
||||||
|
if (embed instanceof SlideShowExtractor) {
|
||||||
if (embed instanceof PowerPointExtractor) numPpt++;
|
numPpt++;
|
||||||
else if (embed instanceof ExcelExtractor) numXls++;
|
} else if (embed instanceof ExcelExtractor) {
|
||||||
else if (embed instanceof WordExtractor) numWord++;
|
numXls++;
|
||||||
else if (embed instanceof OutlookTextExtactor) numMsg++;
|
} else if (embed instanceof WordExtractor) {
|
||||||
|
numWord++;
|
||||||
|
} else if (embed instanceof OutlookTextExtactor) {
|
||||||
|
numMsg++;
|
||||||
|
} else if (embed instanceof XWPFWordExtractor) {
|
||||||
|
numWordX++;
|
||||||
}
|
}
|
||||||
assertEquals(2, numPpt);
|
|
||||||
assertEquals(2, numXls);
|
|
||||||
assertEquals(2, numWord);
|
|
||||||
assertEquals(0, numMsg);
|
|
||||||
ext.close();
|
|
||||||
|
|
||||||
// Word
|
|
||||||
ext = (POIOLE2TextExtractor)
|
|
||||||
ExtractorFactory.createExtractor(docEmb);
|
|
||||||
embeds = ExtractorFactory.getEmbededDocsTextExtractors(ext);
|
|
||||||
|
|
||||||
numWord = 0; numXls = 0; numPpt = 0; numMsg = 0;
|
|
||||||
assertEquals(4, embeds.length);
|
|
||||||
for (POITextExtractor embed : embeds) {
|
|
||||||
assertTrue(embed.getText().length() > 20);
|
|
||||||
if (embed instanceof PowerPointExtractor) numPpt++;
|
|
||||||
else if (embed instanceof ExcelExtractor) numXls++;
|
|
||||||
else if (embed instanceof WordExtractor) numWord++;
|
|
||||||
else if (embed instanceof OutlookTextExtactor) numMsg++;
|
|
||||||
}
|
}
|
||||||
assertEquals(1, numPpt);
|
|
||||||
assertEquals(2, numXls);
|
|
||||||
assertEquals(1, numWord);
|
|
||||||
assertEquals(0, numMsg);
|
|
||||||
ext.close();
|
|
||||||
|
|
||||||
// Word which contains an OOXML file
|
final String actual = embeds.length+"-"+numWord+"-"+numXls+"-"+numPpt+"-"+numMsg+"-"+numWordX;
|
||||||
ext = (POIOLE2TextExtractor)
|
final String expected = (String)testObj[i+2];
|
||||||
ExtractorFactory.createExtractor(docEmbOOXML);
|
assertEquals("invalid number of embeddings - "+testObj[i], expected, actual);
|
||||||
embeds = ExtractorFactory.getEmbededDocsTextExtractors(ext);
|
|
||||||
|
|
||||||
numWord = 0; numXls = 0; numPpt = 0; numMsg = 0; numWordX = 0;
|
|
||||||
assertEquals(3, embeds.length);
|
|
||||||
for (POITextExtractor embed : embeds) {
|
|
||||||
assertTrue(embed.getText().length() > 20);
|
|
||||||
if (embed instanceof PowerPointExtractor) numPpt++;
|
|
||||||
else if (embed instanceof ExcelExtractor) numXls++;
|
|
||||||
else if (embed instanceof WordExtractor) numWord++;
|
|
||||||
else if (embed instanceof OutlookTextExtactor) numMsg++;
|
|
||||||
else if (embed instanceof XWPFWordExtractor) numWordX++;
|
|
||||||
}
|
}
|
||||||
assertEquals(1, numPpt);
|
|
||||||
assertEquals(1, numXls);
|
|
||||||
assertEquals(0, numWord);
|
|
||||||
assertEquals(1, numWordX);
|
|
||||||
assertEquals(0, numMsg);
|
|
||||||
ext.close();
|
|
||||||
|
|
||||||
// Outlook
|
|
||||||
ext = (OutlookTextExtactor)
|
|
||||||
ExtractorFactory.createExtractor(msgEmb);
|
|
||||||
embeds = ExtractorFactory.getEmbededDocsTextExtractors(ext);
|
|
||||||
|
|
||||||
numWord = 0; numXls = 0; numPpt = 0; numMsg = 0;
|
|
||||||
assertEquals(1, embeds.length);
|
|
||||||
for (POITextExtractor embed : embeds) {
|
|
||||||
assertTrue(embed.getText().length() > 20);
|
|
||||||
if (embed instanceof PowerPointExtractor) numPpt++;
|
|
||||||
else if (embed instanceof ExcelExtractor) numXls++;
|
|
||||||
else if (embed instanceof WordExtractor) numWord++;
|
|
||||||
else if (embed instanceof OutlookTextExtactor) numMsg++;
|
|
||||||
}
|
}
|
||||||
assertEquals(0, numPpt);
|
|
||||||
assertEquals(0, numXls);
|
|
||||||
assertEquals(1, numWord);
|
|
||||||
assertEquals(0, numMsg);
|
|
||||||
ext.close();
|
|
||||||
|
|
||||||
// Outlook with another outlook file in it
|
|
||||||
ext = (OutlookTextExtactor)
|
|
||||||
ExtractorFactory.createExtractor(msgEmbMsg);
|
|
||||||
embeds = ExtractorFactory.getEmbededDocsTextExtractors(ext);
|
|
||||||
|
|
||||||
numWord = 0; numXls = 0; numPpt = 0; numMsg = 0;
|
|
||||||
assertEquals(1, embeds.length);
|
|
||||||
for (POITextExtractor embed : embeds) {
|
|
||||||
assertTrue(embed.getText().length() > 20);
|
|
||||||
if (embed instanceof PowerPointExtractor) numPpt++;
|
|
||||||
else if (embed instanceof ExcelExtractor) numXls++;
|
|
||||||
else if (embed instanceof WordExtractor) numWord++;
|
|
||||||
else if (embed instanceof OutlookTextExtactor) numMsg++;
|
|
||||||
}
|
|
||||||
assertEquals(0, numPpt);
|
|
||||||
assertEquals(0, numXls);
|
|
||||||
assertEquals(0, numWord);
|
|
||||||
assertEquals(1, numMsg);
|
|
||||||
ext.close();
|
|
||||||
|
|
||||||
// TODO - PowerPoint
|
// TODO - PowerPoint
|
||||||
// TODO - Publisher
|
// TODO - Publisher
|
||||||
// TODO - Visio
|
// TODO - Visio
|
||||||
}
|
}
|
||||||
|
|
||||||
private static final String[] EXPECTED_FAILURES = new String[] {
|
private static final String[] EXPECTED_FAILURES = {
|
||||||
// password protected files
|
// password protected files
|
||||||
"spreadsheet/password.xls",
|
"spreadsheet/password.xls",
|
||||||
"spreadsheet/protected_passtika.xlsx",
|
"spreadsheet/protected_passtika.xlsx",
|
||||||
@ -1018,35 +469,24 @@ public class TestExtractorFactory {
|
|||||||
* #59074 - Excel 95 files should give a helpful message, not just
|
* #59074 - Excel 95 files should give a helpful message, not just
|
||||||
* "No supported documents found in the OLE2 stream"
|
* "No supported documents found in the OLE2 stream"
|
||||||
*/
|
*/
|
||||||
@Test
|
@Test(expected = OldExcelFormatException.class)
|
||||||
public void bug59074() throws Exception {
|
public void bug59074() throws Exception {
|
||||||
try {
|
|
||||||
ExtractorFactory.createExtractor(
|
ExtractorFactory.createExtractor(
|
||||||
POIDataSamples.getSpreadSheetInstance().getFile("59074.xls"));
|
POIDataSamples.getSpreadSheetInstance().getFile("59074.xls"));
|
||||||
fail("Old excel formats not supported via ExtractorFactory");
|
|
||||||
} catch (OldExcelFormatException e) {
|
|
||||||
// expected here
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
@SuppressWarnings("deprecation")
|
@SuppressWarnings("deprecation")
|
||||||
@Test
|
@Test(expected = IllegalStateException.class)
|
||||||
public void testGetEmbeddedFromXMLExtractor() {
|
public void testGetEmbedFromXMLExtractor() {
|
||||||
try {
|
|
||||||
// currently not implemented
|
// currently not implemented
|
||||||
ExtractorFactory.getEmbededDocsTextExtractors((POIXMLTextExtractor) null);
|
ExtractorFactory.getEmbededDocsTextExtractors((POIXMLTextExtractor) null);
|
||||||
fail("Unsupported currently");
|
|
||||||
} catch (IllegalStateException e) {
|
|
||||||
// expected here
|
|
||||||
}
|
}
|
||||||
|
|
||||||
try {
|
@SuppressWarnings("deprecation")
|
||||||
|
@Test(expected = IllegalStateException.class)
|
||||||
|
public void testGetEmbeddedFromXMLExtractor() {
|
||||||
// currently not implemented
|
// currently not implemented
|
||||||
ExtractorFactory.getEmbeddedDocsTextExtractors((POIXMLTextExtractor)null);
|
ExtractorFactory.getEmbeddedDocsTextExtractors((POIXMLTextExtractor)null);
|
||||||
fail("Unsupported currently");
|
|
||||||
} catch (IllegalStateException e) {
|
|
||||||
// expected here
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// This bug is currently open. This test will fail with "expected error not thrown" when the bug has been fixed.
|
// This bug is currently open. This test will fail with "expected error not thrown" when the bug has been fixed.
|
||||||
|
@ -120,10 +120,10 @@ public class TestHxxFEncryption {
|
|||||||
public void newPassword(String newPass) throws IOException, OpenXML4JException, XmlException {
|
public void newPassword(String newPass) throws IOException, OpenXML4JException, XmlException {
|
||||||
Biff8EncryptionKey.setCurrentUserPassword(password);
|
Biff8EncryptionKey.setCurrentUserPassword(password);
|
||||||
File f = sampleDir.getFile(file);
|
File f = sampleDir.getFile(file);
|
||||||
POIOLE2TextExtractor te1 = (POIOLE2TextExtractor)ExtractorFactory.createExtractor(f);
|
POITextExtractor te1 = ExtractorFactory.createExtractor(f);
|
||||||
Biff8EncryptionKey.setCurrentUserPassword(newPass);
|
Biff8EncryptionKey.setCurrentUserPassword(newPass);
|
||||||
ByteArrayOutputStream bos = new ByteArrayOutputStream();
|
ByteArrayOutputStream bos = new ByteArrayOutputStream();
|
||||||
POIDocument doc = te1.getDocument();
|
POIDocument doc = (POIDocument)te1.getDocument();
|
||||||
doc.write(bos);
|
doc.write(bos);
|
||||||
doc.close();
|
doc.close();
|
||||||
te1.close();
|
te1.close();
|
||||||
@ -140,25 +140,25 @@ public class TestHxxFEncryption {
|
|||||||
ByteArrayOutputStream bos = new ByteArrayOutputStream();
|
ByteArrayOutputStream bos = new ByteArrayOutputStream();
|
||||||
Biff8EncryptionKey.setCurrentUserPassword(password);
|
Biff8EncryptionKey.setCurrentUserPassword(password);
|
||||||
File f = sampleDir.getFile(file);
|
File f = sampleDir.getFile(file);
|
||||||
POIOLE2TextExtractor te1 = (POIOLE2TextExtractor)ExtractorFactory.createExtractor(f);
|
POITextExtractor te1 = ExtractorFactory.createExtractor(f);
|
||||||
// first remove encryption
|
// first remove encryption
|
||||||
Biff8EncryptionKey.setCurrentUserPassword(null);
|
Biff8EncryptionKey.setCurrentUserPassword(null);
|
||||||
POIDocument doc = te1.getDocument();
|
POIDocument doc = (POIDocument)te1.getDocument();
|
||||||
doc.write(bos);
|
doc.write(bos);
|
||||||
doc.close();
|
doc.close();
|
||||||
te1.close();
|
te1.close();
|
||||||
// then use default setting, which is cryptoapi
|
// then use default setting, which is cryptoapi
|
||||||
String newPass = "newPass";
|
String newPass = "newPass";
|
||||||
POIOLE2TextExtractor te2 = (POIOLE2TextExtractor)ExtractorFactory.createExtractor(new ByteArrayInputStream(bos.toByteArray()));
|
POITextExtractor te2 = ExtractorFactory.createExtractor(new ByteArrayInputStream(bos.toByteArray()));
|
||||||
Biff8EncryptionKey.setCurrentUserPassword(newPass);
|
Biff8EncryptionKey.setCurrentUserPassword(newPass);
|
||||||
doc = te2.getDocument();
|
doc = (POIDocument)te2.getDocument();
|
||||||
bos.reset();
|
bos.reset();
|
||||||
doc.write(bos);
|
doc.write(bos);
|
||||||
doc.close();
|
doc.close();
|
||||||
te2.close();
|
te2.close();
|
||||||
// and finally update cryptoapi setting
|
// and finally update cryptoapi setting
|
||||||
POIOLE2TextExtractor te3 = (POIOLE2TextExtractor)ExtractorFactory.createExtractor(new ByteArrayInputStream(bos.toByteArray()));
|
POITextExtractor te3 = ExtractorFactory.createExtractor(new ByteArrayInputStream(bos.toByteArray()));
|
||||||
doc = te3.getDocument();
|
doc = (POIDocument)te3.getDocument();
|
||||||
// need to cache data (i.e. read all data) before changing the key size
|
// need to cache data (i.e. read all data) before changing the key size
|
||||||
if (doc instanceof HSLFSlideShowImpl) {
|
if (doc instanceof HSLFSlideShowImpl) {
|
||||||
HSLFSlideShowImpl hss = (HSLFSlideShowImpl)doc;
|
HSLFSlideShowImpl hss = (HSLFSlideShowImpl)doc;
|
||||||
@ -175,8 +175,8 @@ public class TestHxxFEncryption {
|
|||||||
doc.close();
|
doc.close();
|
||||||
te3.close();
|
te3.close();
|
||||||
// check the setting
|
// check the setting
|
||||||
POIOLE2TextExtractor te4 = (POIOLE2TextExtractor)ExtractorFactory.createExtractor(new ByteArrayInputStream(bos.toByteArray()));
|
POITextExtractor te4 = ExtractorFactory.createExtractor(new ByteArrayInputStream(bos.toByteArray()));
|
||||||
doc = te4.getDocument();
|
doc = (POIDocument)te4.getDocument();
|
||||||
ei = doc.getEncryptionInfo();
|
ei = doc.getEncryptionInfo();
|
||||||
assertNotNull(ei);
|
assertNotNull(ei);
|
||||||
assertTrue(ei.getHeader() instanceof CryptoAPIEncryptionHeader);
|
assertTrue(ei.getHeader() instanceof CryptoAPIEncryptionHeader);
|
||||||
|
@ -50,6 +50,7 @@ import org.apache.poi.openxml4j.opc.OPCPackage;
|
|||||||
import org.apache.poi.openxml4j.opc.PackagePartName;
|
import org.apache.poi.openxml4j.opc.PackagePartName;
|
||||||
import org.apache.poi.openxml4j.opc.PackagingURIHelper;
|
import org.apache.poi.openxml4j.opc.PackagingURIHelper;
|
||||||
import org.apache.poi.sl.draw.DrawPaint;
|
import org.apache.poi.sl.draw.DrawPaint;
|
||||||
|
import org.apache.poi.sl.extractor.SlideShowExtractor;
|
||||||
import org.apache.poi.sl.usermodel.PaintStyle;
|
import org.apache.poi.sl.usermodel.PaintStyle;
|
||||||
import org.apache.poi.sl.usermodel.PaintStyle.SolidPaint;
|
import org.apache.poi.sl.usermodel.PaintStyle.SolidPaint;
|
||||||
import org.apache.poi.sl.usermodel.PaintStyle.TexturePaint;
|
import org.apache.poi.sl.usermodel.PaintStyle.TexturePaint;
|
||||||
@ -221,8 +222,8 @@ public class TestXSLFBugs {
|
|||||||
* rID2 -> slide3.xml
|
* rID2 -> slide3.xml
|
||||||
*/
|
*/
|
||||||
@Test
|
@Test
|
||||||
public void bug54916() throws Exception {
|
public void bug54916() throws IOException {
|
||||||
XMLSlideShow ss = XSLFTestDataSamples.openSampleDocument("OverlappingRelations.pptx");
|
try (XMLSlideShow ss = XSLFTestDataSamples.openSampleDocument("OverlappingRelations.pptx")) {
|
||||||
XSLFSlide slide;
|
XSLFSlide slide;
|
||||||
|
|
||||||
// Should find 4 slides
|
// Should find 4 slides
|
||||||
@ -230,19 +231,18 @@ public class TestXSLFBugs {
|
|||||||
|
|
||||||
// Check the text, to see we got them in order
|
// Check the text, to see we got them in order
|
||||||
slide = ss.getSlides().get(0);
|
slide = ss.getSlides().get(0);
|
||||||
assertContains(getSlideText(slide), "POI cannot read this");
|
assertContains(getSlideText(ss, slide), "POI cannot read this");
|
||||||
|
|
||||||
slide = ss.getSlides().get(1);
|
slide = ss.getSlides().get(1);
|
||||||
assertContains(getSlideText(slide), "POI can read this");
|
assertContains(getSlideText(ss, slide), "POI can read this");
|
||||||
assertContains(getSlideText(slide), "Has a relationship to another slide");
|
assertContains(getSlideText(ss, slide), "Has a relationship to another slide");
|
||||||
|
|
||||||
slide = ss.getSlides().get(2);
|
slide = ss.getSlides().get(2);
|
||||||
assertContains(getSlideText(slide), "POI can read this");
|
assertContains(getSlideText(ss, slide), "POI can read this");
|
||||||
|
|
||||||
slide = ss.getSlides().get(3);
|
slide = ss.getSlides().get(3);
|
||||||
assertContains(getSlideText(slide), "POI can read this");
|
assertContains(getSlideText(ss, slide), "POI can read this");
|
||||||
|
}
|
||||||
ss.close();
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@ -311,8 +311,15 @@ public class TestXSLFBugs {
|
|||||||
ss.close();
|
ss.close();
|
||||||
}
|
}
|
||||||
|
|
||||||
protected String getSlideText(XSLFSlide slide) {
|
protected String getSlideText(XMLSlideShow ppt, XSLFSlide slide) throws IOException {
|
||||||
return XSLFPowerPointExtractor.getText(slide, true, false, false);
|
try (SlideShowExtractor extr = new SlideShowExtractor(ppt)) {
|
||||||
|
// do not auto-close the slideshow
|
||||||
|
extr.setFilesystem(null);
|
||||||
|
extr.setSlidesByDefault(true);
|
||||||
|
extr.setNotesByDefault(false);
|
||||||
|
extr.setMasterByDefault(false);
|
||||||
|
return extr.getText(slide);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
@ -458,7 +465,7 @@ public class TestXSLFBugs {
|
|||||||
|
|
||||||
for (int i = 0; i < slideTexts.length; i++) {
|
for (int i = 0; i < slideTexts.length; i++) {
|
||||||
XSLFSlide slide = ss.getSlides().get(i);
|
XSLFSlide slide = ss.getSlides().get(i);
|
||||||
assertContains(getSlideText(slide), slideTexts[i]);
|
assertContains(getSlideText(ss, slide), slideTexts[i]);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -24,16 +24,17 @@ import static org.junit.Assert.assertFalse;
|
|||||||
import static org.junit.Assert.assertNotNull;
|
import static org.junit.Assert.assertNotNull;
|
||||||
import static org.junit.Assert.assertTrue;
|
import static org.junit.Assert.assertTrue;
|
||||||
|
|
||||||
|
import java.io.File;
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.io.InputStream;
|
import java.io.InputStream;
|
||||||
|
|
||||||
import org.apache.poi.POIDataSamples;
|
import org.apache.poi.POIDataSamples;
|
||||||
import org.apache.poi.POITextExtractor;
|
|
||||||
import org.apache.poi.extractor.ExtractorFactory;
|
import org.apache.poi.extractor.ExtractorFactory;
|
||||||
import org.apache.poi.openxml4j.exceptions.OpenXML4JException;
|
import org.apache.poi.openxml4j.exceptions.OpenXML4JException;
|
||||||
import org.apache.poi.openxml4j.opc.OPCPackage;
|
import org.apache.poi.sl.extractor.SlideShowExtractor;
|
||||||
import org.apache.poi.xslf.usermodel.XMLSlideShow;
|
import org.apache.poi.xslf.usermodel.XMLSlideShow;
|
||||||
import org.apache.xmlbeans.XmlException;
|
import org.apache.xmlbeans.XmlException;
|
||||||
|
import org.junit.Ignore;
|
||||||
import org.junit.Test;
|
import org.junit.Test;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@ -44,21 +45,12 @@ public class TestXSLFPowerPointExtractor {
|
|||||||
|
|
||||||
/**
|
/**
|
||||||
* Get text out of the simple file
|
* Get text out of the simple file
|
||||||
* @throws XmlException
|
|
||||||
* @throws OpenXML4JException
|
|
||||||
*/
|
*/
|
||||||
@Test
|
@Test
|
||||||
public void testGetSimpleText()
|
public void testGetSimpleText() throws IOException {
|
||||||
throws IOException, XmlException, OpenXML4JException {
|
try (XMLSlideShow xmlA = openPPTX("sample.pptx");
|
||||||
XMLSlideShow xmlA = openPPTX("sample.pptx");
|
SlideShowExtractor extractor = new SlideShowExtractor(xmlA)) {
|
||||||
@SuppressWarnings("resource")
|
|
||||||
OPCPackage pkg = xmlA.getPackage();
|
|
||||||
|
|
||||||
new XSLFPowerPointExtractor(xmlA).close();
|
|
||||||
new XSLFPowerPointExtractor(pkg).close();
|
|
||||||
|
|
||||||
XSLFPowerPointExtractor extractor =
|
|
||||||
new XSLFPowerPointExtractor(xmlA);
|
|
||||||
extractor.getText();
|
extractor.getText();
|
||||||
|
|
||||||
String text = extractor.getText();
|
String text = extractor.getText();
|
||||||
@ -82,7 +74,10 @@ public class TestXSLFPowerPointExtractor {
|
|||||||
// "Fifth level\n";
|
// "Fifth level\n";
|
||||||
|
|
||||||
// Just slides, no notes
|
// Just slides, no notes
|
||||||
text = extractor.getText(true, false, false);
|
extractor.setSlidesByDefault(true);
|
||||||
|
extractor.setNotesByDefault(false);
|
||||||
|
extractor.setMasterByDefault(false);
|
||||||
|
text = extractor.getText();
|
||||||
String slideText =
|
String slideText =
|
||||||
"Lorem ipsum dolor sit amet\n" +
|
"Lorem ipsum dolor sit amet\n" +
|
||||||
"Nunc at risus vel erat tempus posuere. Aenean non ante.\n" +
|
"Nunc at risus vel erat tempus posuere. Aenean non ante.\n" +
|
||||||
@ -97,11 +92,15 @@ public class TestXSLFPowerPointExtractor {
|
|||||||
assertEquals(slideText, text);
|
assertEquals(slideText, text);
|
||||||
|
|
||||||
// Just notes, no slides
|
// Just notes, no slides
|
||||||
text = extractor.getText(false, true);
|
extractor.setSlidesByDefault(false);
|
||||||
|
extractor.setNotesByDefault(true);
|
||||||
|
text = extractor.getText();
|
||||||
assertEquals("\n\n1\n\n\n2\n", text);
|
assertEquals("\n\n1\n\n\n2\n", text);
|
||||||
|
|
||||||
// Both
|
// Both
|
||||||
text = extractor.getText(true, true, false);
|
extractor.setSlidesByDefault(true);
|
||||||
|
extractor.setNotesByDefault(true);
|
||||||
|
text = extractor.getText();
|
||||||
String bothText =
|
String bothText =
|
||||||
"Lorem ipsum dolor sit amet\n" +
|
"Lorem ipsum dolor sit amet\n" +
|
||||||
"Nunc at risus vel erat tempus posuere. Aenean non ante.\n" +
|
"Nunc at risus vel erat tempus posuere. Aenean non ante.\n" +
|
||||||
@ -116,7 +115,10 @@ public class TestXSLFPowerPointExtractor {
|
|||||||
assertEquals(bothText, text);
|
assertEquals(bothText, text);
|
||||||
|
|
||||||
// With Slides and Master Text
|
// With Slides and Master Text
|
||||||
text = extractor.getText(true, false, true);
|
extractor.setSlidesByDefault(true);
|
||||||
|
extractor.setNotesByDefault(false);
|
||||||
|
extractor.setMasterByDefault(true);
|
||||||
|
text = extractor.getText();
|
||||||
String smText =
|
String smText =
|
||||||
"Lorem ipsum dolor sit amet\n" +
|
"Lorem ipsum dolor sit amet\n" +
|
||||||
"Nunc at risus vel erat tempus posuere. Aenean non ante.\n" +
|
"Nunc at risus vel erat tempus posuere. Aenean non ante.\n" +
|
||||||
@ -131,7 +133,10 @@ public class TestXSLFPowerPointExtractor {
|
|||||||
assertEquals(smText, text);
|
assertEquals(smText, text);
|
||||||
|
|
||||||
// With Slides, Notes and Master Text
|
// With Slides, Notes and Master Text
|
||||||
text = extractor.getText(true, true, true);
|
extractor.setSlidesByDefault(true);
|
||||||
|
extractor.setNotesByDefault(true);
|
||||||
|
extractor.setMasterByDefault(true);
|
||||||
|
text = extractor.getText();
|
||||||
String snmText =
|
String snmText =
|
||||||
"Lorem ipsum dolor sit amet\n" +
|
"Lorem ipsum dolor sit amet\n" +
|
||||||
"Nunc at risus vel erat tempus posuere. Aenean non ante.\n" +
|
"Nunc at risus vel erat tempus posuere. Aenean non ante.\n" +
|
||||||
@ -150,14 +155,14 @@ public class TestXSLFPowerPointExtractor {
|
|||||||
extractor.setNotesByDefault(true);
|
extractor.setNotesByDefault(true);
|
||||||
text = extractor.getText();
|
text = extractor.getText();
|
||||||
assertEquals("\n\n1\n\n\n2\n", text);
|
assertEquals("\n\n1\n\n\n2\n", text);
|
||||||
|
}
|
||||||
extractor.close();
|
|
||||||
xmlA.close();
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
public void testGetComments() throws IOException {
|
public void testGetComments() throws IOException {
|
||||||
XMLSlideShow xml = openPPTX("45545_Comment.pptx");
|
try (XMLSlideShow xml = openPPTX("45545_Comment.pptx");
|
||||||
XSLFPowerPointExtractor extractor = new XSLFPowerPointExtractor(xml);
|
SlideShowExtractor extractor = new SlideShowExtractor(xml)) {
|
||||||
|
extractor.setCommentsByDefault(true);
|
||||||
|
|
||||||
String text = extractor.getText();
|
String text = extractor.getText();
|
||||||
assertTrue(text.length() > 0);
|
assertTrue(text.length() > 0);
|
||||||
@ -168,18 +173,19 @@ public class TestXSLFPowerPointExtractor {
|
|||||||
|
|
||||||
// Check the authors came through too
|
// Check the authors came through too
|
||||||
assertContains(text, "XPVMWARE01");
|
assertContains(text, "XPVMWARE01");
|
||||||
|
}
|
||||||
extractor.close();
|
|
||||||
xml.close();
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
@Ignore("currently slidelayouts aren't yet supported")
|
||||||
public void testGetMasterText() throws Exception {
|
public void testGetMasterText() throws Exception {
|
||||||
XMLSlideShow xml = openPPTX("WithMaster.pptx");
|
try (XMLSlideShow xml = openPPTX("WithMaster.pptx");
|
||||||
XSLFPowerPointExtractor extractor = new XSLFPowerPointExtractor(xml);
|
SlideShowExtractor extractor = new SlideShowExtractor(xml)) {
|
||||||
extractor.setSlidesByDefault(true);
|
extractor.setSlidesByDefault(true);
|
||||||
extractor.setNotesByDefault(false);
|
extractor.setNotesByDefault(false);
|
||||||
extractor.setMasterByDefault(true);
|
extractor.setMasterByDefault(true);
|
||||||
|
|
||||||
|
|
||||||
String text = extractor.getText();
|
String text = extractor.getText();
|
||||||
assertTrue(text.length() > 0);
|
assertTrue(text.length() > 0);
|
||||||
|
|
||||||
@ -208,24 +214,20 @@ public class TestXSLFPowerPointExtractor {
|
|||||||
"This is the Master Title\n" +
|
"This is the Master Title\n" +
|
||||||
"This text comes from the Master Slide\n";
|
"This text comes from the Master Slide\n";
|
||||||
assertEquals(wholeText, text);
|
assertEquals(wholeText, text);
|
||||||
|
}
|
||||||
extractor.close();
|
|
||||||
xml.close();
|
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void testTable() throws Exception {
|
public void testTable() throws Exception {
|
||||||
XMLSlideShow xml = openPPTX("present1.pptx");
|
try (XMLSlideShow xml = openPPTX("present1.pptx");
|
||||||
XSLFPowerPointExtractor extractor = new XSLFPowerPointExtractor(xml);
|
SlideShowExtractor extractor = new SlideShowExtractor(xml)) {
|
||||||
|
|
||||||
String text = extractor.getText();
|
String text = extractor.getText();
|
||||||
assertTrue(text.length() > 0);
|
assertTrue(text.length() > 0);
|
||||||
|
|
||||||
// Check comments are there
|
// Check comments are there
|
||||||
assertContains(text, "TEST");
|
assertContains(text, "TEST");
|
||||||
|
}
|
||||||
extractor.close();
|
|
||||||
xml.close();
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@ -241,8 +243,9 @@ public class TestXSLFPowerPointExtractor {
|
|||||||
};
|
};
|
||||||
for(String extension : extensions) {
|
for(String extension : extensions) {
|
||||||
String filename = "testPPT." + extension;
|
String filename = "testPPT." + extension;
|
||||||
XMLSlideShow xml = openPPTX(filename);
|
|
||||||
XSLFPowerPointExtractor extractor = new XSLFPowerPointExtractor(xml);
|
try (XMLSlideShow xml = openPPTX(filename);
|
||||||
|
SlideShowExtractor extractor = new SlideShowExtractor(xml)) {
|
||||||
|
|
||||||
String text = extractor.getText();
|
String text = extractor.getText();
|
||||||
if (extension.equals("thmx")) {
|
if (extension.equals("thmx")) {
|
||||||
@ -257,58 +260,59 @@ public class TestXSLFPowerPointExtractor {
|
|||||||
assertContains(filename, text, "content parsing");
|
assertContains(filename, text, "content parsing");
|
||||||
assertContains(filename, text, "Different words to test against");
|
assertContains(filename, text, "Different words to test against");
|
||||||
assertContains(filename, text, "Mystery");
|
assertContains(filename, text, "Mystery");
|
||||||
|
}
|
||||||
extractor.close();
|
|
||||||
xml.close();
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void test45541() throws Exception {
|
public void test45541() throws IOException, OpenXML4JException, XmlException {
|
||||||
// extract text from a powerpoint that has a header in the notes-element
|
// extract text from a powerpoint that has a header in the notes-element
|
||||||
POITextExtractor extr = ExtractorFactory.createExtractor(
|
final File headerFile = slTests.getFile("45541_Header.pptx");
|
||||||
slTests.getFile("45541_Header.pptx"));
|
try (final SlideShowExtractor extr = ExtractorFactory.createExtractor(headerFile)) {
|
||||||
String text = extr.getText();
|
String text = extr.getText();
|
||||||
assertNotNull(text);
|
assertNotNull(text);
|
||||||
assertFalse("Had: " + text, text.contains("testdoc"));
|
assertFalse("Had: " + text, text.contains("testdoc"));
|
||||||
|
|
||||||
text = ((XSLFPowerPointExtractor)extr).getText(false, true);
|
extr.setSlidesByDefault(false);
|
||||||
|
extr.setNotesByDefault(true);
|
||||||
|
|
||||||
|
text = extr.getText();
|
||||||
assertContains(text, "testdoc");
|
assertContains(text, "testdoc");
|
||||||
extr.close();
|
|
||||||
assertNotNull(text);
|
assertNotNull(text);
|
||||||
|
}
|
||||||
|
|
||||||
// extract text from a powerpoint that has a footer in the master-slide
|
// extract text from a powerpoint that has a footer in the master-slide
|
||||||
extr = ExtractorFactory.createExtractor(
|
final File footerFile = slTests.getFile("45541_Footer.pptx");
|
||||||
slTests.getFile("45541_Footer.pptx"));
|
try (SlideShowExtractor extr = ExtractorFactory.createExtractor(footerFile)) {
|
||||||
|
String text = extr.getText();
|
||||||
|
assertNotContained(text, "testdoc");
|
||||||
|
|
||||||
|
extr.setSlidesByDefault(false);
|
||||||
|
extr.setNotesByDefault(true);
|
||||||
text = extr.getText();
|
text = extr.getText();
|
||||||
assertNotContained(text, "testdoc");
|
assertNotContained(text, "testdoc");
|
||||||
|
|
||||||
text = ((XSLFPowerPointExtractor)extr).getText(false, true);
|
extr.setSlidesByDefault(false);
|
||||||
|
extr.setNotesByDefault(false);
|
||||||
|
extr.setMasterByDefault(true);
|
||||||
|
text = extr.getText();
|
||||||
assertNotContained(text, "testdoc");
|
assertNotContained(text, "testdoc");
|
||||||
|
}
|
||||||
text = ((XSLFPowerPointExtractor)extr).getText(false, false, true);
|
|
||||||
assertNotContained(text, "testdoc");
|
|
||||||
|
|
||||||
extr.close();
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void bug54570() throws IOException {
|
public void bug54570() throws IOException {
|
||||||
XMLSlideShow xml = openPPTX("bug54570.pptx");
|
try (XMLSlideShow xml = openPPTX("bug54570.pptx");
|
||||||
XSLFPowerPointExtractor extractor = new XSLFPowerPointExtractor(xml);
|
SlideShowExtractor extractor = new SlideShowExtractor(xml)) {
|
||||||
String text = extractor.getText();
|
String text = extractor.getText();
|
||||||
assertNotNull(text);
|
assertNotNull(text);
|
||||||
extractor.close();
|
}
|
||||||
xml.close();
|
|
||||||
}
|
}
|
||||||
|
|
||||||
private XMLSlideShow openPPTX(String file) throws IOException {
|
private XMLSlideShow openPPTX(String file) throws IOException {
|
||||||
InputStream is = slTests.openResourceAsStream(file);
|
try (InputStream is = slTests.openResourceAsStream(file)) {
|
||||||
try {
|
|
||||||
return new XMLSlideShow(is);
|
return new XMLSlideShow(is);
|
||||||
} finally {
|
|
||||||
is.close();
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -38,6 +38,8 @@ import org.apache.poi.hwpf.extractor.WordExtractor;
|
|||||||
import org.apache.poi.poifs.filesystem.DirectoryEntry;
|
import org.apache.poi.poifs.filesystem.DirectoryEntry;
|
||||||
import org.apache.poi.poifs.filesystem.DirectoryNode;
|
import org.apache.poi.poifs.filesystem.DirectoryNode;
|
||||||
import org.apache.poi.poifs.filesystem.Entry;
|
import org.apache.poi.poifs.filesystem.Entry;
|
||||||
|
import org.apache.poi.sl.extractor.SlideShowExtractor;
|
||||||
|
import org.apache.poi.sl.usermodel.SlideShowFactory;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Scratchpad-specific logic for {@link OLE2ExtractorFactory} and
|
* Scratchpad-specific logic for {@link OLE2ExtractorFactory} and
|
||||||
@ -65,7 +67,7 @@ public class OLE2ScratchpadExtractorFactory {
|
|||||||
}
|
}
|
||||||
|
|
||||||
if (poifsDir.hasEntry(HSLFSlideShow.POWERPOINT_DOCUMENT)) {
|
if (poifsDir.hasEntry(HSLFSlideShow.POWERPOINT_DOCUMENT)) {
|
||||||
return new PowerPointExtractor(poifsDir);
|
return new SlideShowExtractor(SlideShowFactory.create(poifsDir));
|
||||||
}
|
}
|
||||||
|
|
||||||
if (poifsDir.hasEntry("VisioDocument")) {
|
if (poifsDir.hasEntry("VisioDocument")) {
|
||||||
|
@ -34,6 +34,7 @@ import org.apache.poi.poifs.filesystem.NPOIFSFileSystem;
|
|||||||
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
|
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
|
||||||
import org.apache.poi.sl.extractor.SlideShowExtractor;
|
import org.apache.poi.sl.extractor.SlideShowExtractor;
|
||||||
import org.apache.poi.sl.usermodel.SlideShowFactory;
|
import org.apache.poi.sl.usermodel.SlideShowFactory;
|
||||||
|
import org.apache.poi.util.Removal;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* This class can be used to extract text from a PowerPoint file. Can optionally
|
* This class can be used to extract text from a PowerPoint file. Can optionally
|
||||||
@ -43,6 +44,7 @@ import org.apache.poi.sl.usermodel.SlideShowFactory;
|
|||||||
*/
|
*/
|
||||||
@SuppressWarnings("WeakerAccess")
|
@SuppressWarnings("WeakerAccess")
|
||||||
@Deprecated
|
@Deprecated
|
||||||
|
@Removal(version="5.0.0")
|
||||||
public final class PowerPointExtractor extends POIOLE2TextExtractor {
|
public final class PowerPointExtractor extends POIOLE2TextExtractor {
|
||||||
private final SlideShowExtractor<HSLFShape,HSLFTextParagraph> delegate;
|
private final SlideShowExtractor<HSLFShape,HSLFTextParagraph> delegate;
|
||||||
|
|
||||||
|
@ -1139,4 +1139,9 @@ public final class HSLFSlideShow implements SlideShow<HSLFShape,HSLFTextParagrap
|
|||||||
public void close() throws IOException {
|
public void close() throws IOException {
|
||||||
_hslfSlideShow.close();
|
_hslfSlideShow.close();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public Object getPersistDocument() {
|
||||||
|
return getSlideShowImpl();
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
@ -19,8 +19,8 @@ package org.apache.poi.hslf.usermodel;
|
|||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
|
|
||||||
|
import org.apache.poi.poifs.filesystem.DirectoryNode;
|
||||||
import org.apache.poi.poifs.filesystem.NPOIFSFileSystem;
|
import org.apache.poi.poifs.filesystem.NPOIFSFileSystem;
|
||||||
import org.apache.poi.sl.usermodel.SlideShow;
|
|
||||||
import org.apache.poi.sl.usermodel.SlideShowFactory;
|
import org.apache.poi.sl.usermodel.SlideShowFactory;
|
||||||
import org.apache.poi.util.Internal;
|
import org.apache.poi.util.Internal;
|
||||||
|
|
||||||
@ -31,12 +31,20 @@ import org.apache.poi.util.Internal;
|
|||||||
@Internal
|
@Internal
|
||||||
public class HSLFSlideShowFactory extends SlideShowFactory {
|
public class HSLFSlideShowFactory extends SlideShowFactory {
|
||||||
/**
|
/**
|
||||||
* Creates a HSLFSlideShow from the given NPOIFSFileSystem
|
* Creates a HSLFSlideShow from the given NPOIFSFileSystem<p>
|
||||||
* <p>Note that in order to properly release resources the
|
* Note that in order to properly release resources the
|
||||||
* SlideShow should be closed after use.
|
* SlideShow should be closed after use.
|
||||||
*/
|
*/
|
||||||
public static SlideShow<?,?> createSlideShow(NPOIFSFileSystem fs) throws IOException {
|
public static HSLFSlideShow createSlideShow(final NPOIFSFileSystem fs) throws IOException {
|
||||||
return new HSLFSlideShow(fs);
|
return new HSLFSlideShow(fs);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Creates a HSLFSlideShow from the given DirectoryNode<p>
|
||||||
|
* Note that in order to properly release resources the
|
||||||
|
* SlideShow should be closed after use.
|
||||||
|
*/
|
||||||
|
public static HSLFSlideShow createSlideShow(final DirectoryNode root) throws IOException {
|
||||||
|
return new HSLFSlideShow(root);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
@ -846,11 +846,15 @@ public final class HSLFSlideShowImpl extends POIDocument implements Closeable {
|
|||||||
|
|
||||||
@Override
|
@Override
|
||||||
public void close() throws IOException {
|
public void close() throws IOException {
|
||||||
|
// only close the filesystem, if we are based on the root node.
|
||||||
|
// embedded documents/slideshows shouldn't close the parent container
|
||||||
|
if (getDirectory().getParent() == null) {
|
||||||
NPOIFSFileSystem fs = getDirectory().getFileSystem();
|
NPOIFSFileSystem fs = getDirectory().getFileSystem();
|
||||||
if (fs != null) {
|
if (fs != null) {
|
||||||
fs.close();
|
fs.close();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
protected String getEncryptedPropertyStreamName() {
|
protected String getEncryptedPropertyStreamName() {
|
||||||
|
@ -42,6 +42,10 @@ import org.apache.poi.poifs.filesystem.DirectoryNode;
|
|||||||
import org.apache.poi.poifs.filesystem.NPOIFSFileSystem;
|
import org.apache.poi.poifs.filesystem.NPOIFSFileSystem;
|
||||||
import org.apache.poi.poifs.filesystem.OPOIFSFileSystem;
|
import org.apache.poi.poifs.filesystem.OPOIFSFileSystem;
|
||||||
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
|
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
|
||||||
|
import org.apache.poi.sl.extractor.SlideShowExtractor;
|
||||||
|
import org.apache.poi.sl.usermodel.ObjectShape;
|
||||||
|
import org.apache.poi.sl.usermodel.SlideShow;
|
||||||
|
import org.apache.poi.sl.usermodel.SlideShowFactory;
|
||||||
import org.apache.poi.util.IOUtils;
|
import org.apache.poi.util.IOUtils;
|
||||||
import org.junit.Test;
|
import org.junit.Test;
|
||||||
|
|
||||||
@ -76,43 +80,46 @@ public final class TestExtractor {
|
|||||||
// ppe.close();
|
// ppe.close();
|
||||||
// }
|
// }
|
||||||
|
|
||||||
private PowerPointExtractor openExtractor(String fileName) throws IOException {
|
private SlideShowExtractor<?,?> openExtractor(String fileName) throws IOException {
|
||||||
InputStream is = slTests.openResourceAsStream(fileName);
|
try (InputStream is = slTests.openResourceAsStream(fileName)) {
|
||||||
try {
|
return new SlideShowExtractor(SlideShowFactory.create(is));
|
||||||
return new PowerPointExtractor(is);
|
|
||||||
} finally {
|
|
||||||
is.close();
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void testReadSheetText() throws IOException {
|
public void testReadSheetText() throws IOException {
|
||||||
// Basic 2 page example
|
// Basic 2 page example
|
||||||
PowerPointExtractor ppe = openExtractor("basic_test_ppt_file.ppt");
|
try (SlideShowExtractor ppe = openExtractor("basic_test_ppt_file.ppt")) {
|
||||||
assertEquals(expectText, ppe.getText());
|
assertEquals(expectText, ppe.getText());
|
||||||
ppe.close();
|
}
|
||||||
|
|
||||||
// 1 page example with text boxes
|
// 1 page example with text boxes
|
||||||
PowerPointExtractor ppe2 = openExtractor("with_textbox.ppt");
|
try (SlideShowExtractor ppe = openExtractor("with_textbox.ppt")) {
|
||||||
assertEquals(expectText2, ppe2.getText());
|
assertEquals(expectText2, ppe.getText());
|
||||||
ppe2.close();
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void testReadNoteText() throws IOException {
|
public void testReadNoteText() throws IOException {
|
||||||
// Basic 2 page example
|
// Basic 2 page example
|
||||||
PowerPointExtractor ppe = openExtractor("basic_test_ppt_file.ppt");
|
try (SlideShowExtractor ppe = openExtractor("basic_test_ppt_file.ppt")) {
|
||||||
String notesText = ppe.getNotes();
|
ppe.setNotesByDefault(true);
|
||||||
|
ppe.setSlidesByDefault(false);
|
||||||
|
ppe.setMasterByDefault(false);
|
||||||
|
String notesText = ppe.getText();
|
||||||
String expText = "\nThese are the notes for page 1\n\nThese are the notes on page two, again lacking formatting\n";
|
String expText = "\nThese are the notes for page 1\n\nThese are the notes on page two, again lacking formatting\n";
|
||||||
assertEquals(expText, notesText);
|
assertEquals(expText, notesText);
|
||||||
ppe.close();
|
}
|
||||||
|
|
||||||
// Other one doesn't have notes
|
// Other one doesn't have notes
|
||||||
PowerPointExtractor ppe2 = openExtractor("with_textbox.ppt");
|
try (SlideShowExtractor ppe = openExtractor("with_textbox.ppt")) {
|
||||||
notesText = ppe2.getNotes();
|
ppe.setNotesByDefault(true);
|
||||||
expText = "";
|
ppe.setSlidesByDefault(false);
|
||||||
|
ppe.setMasterByDefault(false);
|
||||||
|
String notesText = ppe.getText();
|
||||||
|
String expText = "";
|
||||||
assertEquals(expText, notesText);
|
assertEquals(expText, notesText);
|
||||||
ppe2.close();
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
@ -126,7 +133,7 @@ public final class TestExtractor {
|
|||||||
"\nThese are the notes on page two, again lacking formatting\n"
|
"\nThese are the notes on page two, again lacking formatting\n"
|
||||||
};
|
};
|
||||||
|
|
||||||
PowerPointExtractor ppe = openExtractor("basic_test_ppt_file.ppt");
|
try (SlideShowExtractor ppe = openExtractor("basic_test_ppt_file.ppt")) {
|
||||||
ppe.setSlidesByDefault(true);
|
ppe.setSlidesByDefault(true);
|
||||||
ppe.setNotesByDefault(false);
|
ppe.setNotesByDefault(false);
|
||||||
assertEquals(slText[0] + slText[1], ppe.getText());
|
assertEquals(slText[0] + slText[1], ppe.getText());
|
||||||
@ -138,7 +145,7 @@ public final class TestExtractor {
|
|||||||
ppe.setSlidesByDefault(true);
|
ppe.setSlidesByDefault(true);
|
||||||
ppe.setNotesByDefault(true);
|
ppe.setNotesByDefault(true);
|
||||||
assertEquals(slText[0] + ntText[0] + slText[1] + ntText[1], ppe.getText());
|
assertEquals(slText[0] + ntText[0] + slText[1] + ntText[1], ppe.getText());
|
||||||
ppe.close();
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@ -149,10 +156,13 @@ public final class TestExtractor {
|
|||||||
*/
|
*/
|
||||||
@Test
|
@Test
|
||||||
public void testMissingCoreRecords() throws IOException {
|
public void testMissingCoreRecords() throws IOException {
|
||||||
PowerPointExtractor ppe = openExtractor("missing_core_records.ppt");
|
try (SlideShowExtractor<?,?> ppe = openExtractor("missing_core_records.ppt")) {
|
||||||
|
ppe.setSlidesByDefault(true);
|
||||||
String text = ppe.getText(true, false);
|
ppe.setNotesByDefault(false);
|
||||||
String nText = ppe.getNotes();
|
String text = ppe.getText();
|
||||||
|
ppe.setSlidesByDefault(false);
|
||||||
|
ppe.setNotesByDefault(true);
|
||||||
|
String nText = ppe.getText();
|
||||||
|
|
||||||
assertNotNull(text);
|
assertNotNull(text);
|
||||||
assertNotNull(nText);
|
assertNotNull(nText);
|
||||||
@ -162,32 +172,30 @@ public final class TestExtractor {
|
|||||||
|
|
||||||
// Slide records were fine
|
// Slide records were fine
|
||||||
assertContains(text, "Using Disease Surveillance and Response");
|
assertContains(text, "Using Disease Surveillance and Response");
|
||||||
|
}
|
||||||
ppe.close();
|
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void testExtractFromEmbeded() throws IOException {
|
public void testExtractFromEmbeded() throws IOException {
|
||||||
InputStream is = POIDataSamples.getSpreadSheetInstance().openResourceAsStream("excel_with_embeded.xls");
|
try (final InputStream is = POIDataSamples.getSpreadSheetInstance().openResourceAsStream("excel_with_embeded.xls");
|
||||||
POIFSFileSystem fs = new POIFSFileSystem(is);
|
final POIFSFileSystem fs = new POIFSFileSystem(is)) {
|
||||||
DirectoryNode root = fs.getRoot();
|
final DirectoryNode root = fs.getRoot();
|
||||||
PowerPointExtractor ppe1 = assertExtractFromEmbedded(root, "MBD0000A3B6", "Sample PowerPoint file\nThis is the 1st file\nNot much too it\n");
|
|
||||||
PowerPointExtractor ppe2 = assertExtractFromEmbedded(root, "MBD0000A3B3", "Sample PowerPoint file\nThis is the 2nd file\nNot much too it either\n");
|
|
||||||
ppe2.close();
|
|
||||||
ppe1.close();
|
|
||||||
fs.close();
|
|
||||||
}
|
|
||||||
|
|
||||||
private PowerPointExtractor assertExtractFromEmbedded(DirectoryNode root, String entryName, String expected)
|
final String[] TEST_SET = {
|
||||||
throws IOException {
|
"MBD0000A3B6", "Sample PowerPoint file\nThis is the 1st file\nNot much too it\n",
|
||||||
DirectoryNode dir = (DirectoryNode)root.getEntry(entryName);
|
"MBD0000A3B3", "Sample PowerPoint file\nThis is the 2nd file\nNot much too it either\n"
|
||||||
|
};
|
||||||
|
|
||||||
|
for (int i=0; i<TEST_SET.length; i+=2) {
|
||||||
|
DirectoryNode dir = (DirectoryNode)root.getEntry(TEST_SET[i]);
|
||||||
assertTrue(dir.hasEntry(HSLFSlideShow.POWERPOINT_DOCUMENT));
|
assertTrue(dir.hasEntry(HSLFSlideShow.POWERPOINT_DOCUMENT));
|
||||||
|
|
||||||
// Check the first file
|
try (final SlideShow<?,?> ppt = SlideShowFactory.create(dir);
|
||||||
HSLFSlideShowImpl ppt = new HSLFSlideShowImpl(dir);
|
final SlideShowExtractor<?,?> ppe = new SlideShowExtractor(ppt)) {
|
||||||
PowerPointExtractor ppe = new PowerPointExtractor(ppt);
|
assertEquals(TEST_SET[i+1], ppe.getText());
|
||||||
assertEquals(expected, ppe.getText(true, false));
|
}
|
||||||
return ppe;
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@ -195,12 +203,12 @@ public final class TestExtractor {
|
|||||||
*/
|
*/
|
||||||
@Test
|
@Test
|
||||||
public void testExtractFromOwnEmbeded() throws IOException {
|
public void testExtractFromOwnEmbeded() throws IOException {
|
||||||
PowerPointExtractor ppe = openExtractor("ppt_with_embeded.ppt");
|
try (SlideShowExtractor<?,?> ppe = openExtractor("ppt_with_embeded.ppt")) {
|
||||||
List<HSLFObjectShape> shapes = ppe.getOLEShapes();
|
List<? extends ObjectShape> shapes = ppe.getOLEShapes();
|
||||||
assertEquals("Expected 6 ole shapes", 6, shapes.size());
|
assertEquals("Expected 6 ole shapes", 6, shapes.size());
|
||||||
int num_ppt = 0, num_doc = 0, num_xls = 0;
|
int num_ppt = 0, num_doc = 0, num_xls = 0;
|
||||||
for (HSLFObjectShape ole : shapes) {
|
for (ObjectShape ole : shapes) {
|
||||||
String name = ole.getInstanceName();
|
String name = ((HSLFObjectShape)ole).getInstanceName();
|
||||||
InputStream data = ole.getObjectData().getInputStream();
|
InputStream data = ole.getObjectData().getInputStream();
|
||||||
if ("Worksheet".equals(name)) {
|
if ("Worksheet".equals(name)) {
|
||||||
HSSFWorkbook wb = new HSSFWorkbook(data);
|
HSSFWorkbook wb = new HSSFWorkbook(data);
|
||||||
@ -220,7 +228,7 @@ public final class TestExtractor {
|
|||||||
assertEquals("Expected 2 embedded Word Documents", 2, num_doc);
|
assertEquals("Expected 2 embedded Word Documents", 2, num_doc);
|
||||||
assertEquals("Expected 2 embedded Excel Spreadsheets", 2, num_xls);
|
assertEquals("Expected 2 embedded Excel Spreadsheets", 2, num_xls);
|
||||||
assertEquals("Expected 2 embedded PowerPoint Presentations", 2, num_ppt);
|
assertEquals("Expected 2 embedded PowerPoint Presentations", 2, num_ppt);
|
||||||
ppe.close();
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@ -228,11 +236,11 @@ public final class TestExtractor {
|
|||||||
*/
|
*/
|
||||||
@Test
|
@Test
|
||||||
public void test52991() throws IOException {
|
public void test52991() throws IOException {
|
||||||
PowerPointExtractor ppe = openExtractor("badzip.ppt");
|
try (SlideShowExtractor<?,?> ppe = openExtractor("badzip.ppt")) {
|
||||||
for (HSLFObjectShape shape : ppe.getOLEShapes()) {
|
for (ObjectShape shape : ppe.getOLEShapes()) {
|
||||||
IOUtils.copy(shape.getObjectData().getInputStream(), new ByteArrayOutputStream());
|
IOUtils.copy(shape.getObjectData().getInputStream(), new ByteArrayOutputStream());
|
||||||
}
|
}
|
||||||
ppe.close();
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@ -240,27 +248,27 @@ public final class TestExtractor {
|
|||||||
*/
|
*/
|
||||||
@Test
|
@Test
|
||||||
public void testWithComments() throws IOException {
|
public void testWithComments() throws IOException {
|
||||||
PowerPointExtractor ppe1 = openExtractor("WithComments.ppt");
|
try (final SlideShowExtractor ppe = openExtractor("WithComments.ppt")) {
|
||||||
String text = ppe1.getText();
|
String text = ppe.getText();
|
||||||
assertFalse("Comments not in by default", text.contains("This is a test comment"));
|
assertFalse("Comments not in by default", text.contains("This is a test comment"));
|
||||||
|
|
||||||
ppe1.setCommentsByDefault(true);
|
ppe.setCommentsByDefault(true);
|
||||||
|
|
||||||
text = ppe1.getText();
|
text = ppe.getText();
|
||||||
assertContains(text, "This is a test comment");
|
assertContains(text, "This is a test comment");
|
||||||
ppe1.close();
|
}
|
||||||
|
|
||||||
|
|
||||||
// And another file
|
// And another file
|
||||||
PowerPointExtractor ppe2 = openExtractor("45543.ppt");
|
try (SlideShowExtractor ppe = openExtractor("45543.ppt")) {
|
||||||
text = ppe2.getText();
|
String text = ppe.getText();
|
||||||
assertFalse("Comments not in by default", text.contains("testdoc"));
|
assertFalse("Comments not in by default", text.contains("testdoc"));
|
||||||
|
|
||||||
ppe2.setCommentsByDefault(true);
|
ppe.setCommentsByDefault(true);
|
||||||
|
|
||||||
text = ppe2.getText();
|
text = ppe.getText();
|
||||||
assertContains(text, "testdoc");
|
assertContains(text, "testdoc");
|
||||||
ppe2.close();
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@ -268,48 +276,37 @@ public final class TestExtractor {
|
|||||||
*/
|
*/
|
||||||
@Test
|
@Test
|
||||||
public void testHeaderFooter() throws IOException {
|
public void testHeaderFooter() throws IOException {
|
||||||
String text;
|
|
||||||
|
|
||||||
// With a header on the notes
|
// With a header on the notes
|
||||||
InputStream is1 = slTests.openResourceAsStream("45537_Header.ppt");
|
try (InputStream is = slTests.openResourceAsStream("45537_Header.ppt");
|
||||||
HSLFSlideShow ppt1 = new HSLFSlideShow(is1);
|
HSLFSlideShow ppt = new HSLFSlideShow(is)) {
|
||||||
is1.close();
|
|
||||||
assertNotNull(ppt1.getNotesHeadersFooters());
|
|
||||||
assertEquals("testdoc test phrase", ppt1.getNotesHeadersFooters().getHeaderText());
|
|
||||||
|
|
||||||
PowerPointExtractor ppe1 = new PowerPointExtractor(ppt1.getSlideShowImpl());
|
assertNotNull(ppt.getNotesHeadersFooters());
|
||||||
|
assertEquals("testdoc test phrase", ppt.getNotesHeadersFooters().getHeaderText());
|
||||||
|
|
||||||
text = ppe1.getText();
|
testHeaderFooterInner(ppt);
|
||||||
assertFalse("Header shouldn't be there by default\n" + text, text.contains("testdoc"));
|
}
|
||||||
assertFalse("Header shouldn't be there by default\n" + text, text.contains("test phrase"));
|
|
||||||
|
|
||||||
ppe1.setNotesByDefault(true);
|
|
||||||
text = ppe1.getText();
|
|
||||||
assertContains(text, "testdoc");
|
|
||||||
assertContains(text, "test phrase");
|
|
||||||
ppe1.close();
|
|
||||||
ppt1.close();
|
|
||||||
|
|
||||||
// And with a footer, also on notes
|
// And with a footer, also on notes
|
||||||
InputStream is2 = slTests.openResourceAsStream("45537_Footer.ppt");
|
try (final InputStream is = slTests.openResourceAsStream("45537_Footer.ppt");
|
||||||
HSLFSlideShow ppt2 = new HSLFSlideShow(is2);
|
final HSLFSlideShow ppt = new HSLFSlideShow(is)) {
|
||||||
is2.close();
|
assertNotNull(ppt.getNotesHeadersFooters());
|
||||||
|
assertEquals("testdoc test phrase", ppt.getNotesHeadersFooters().getFooterText());
|
||||||
|
|
||||||
assertNotNull(ppt2.getNotesHeadersFooters());
|
testHeaderFooterInner(ppt);
|
||||||
assertEquals("testdoc test phrase", ppt2.getNotesHeadersFooters().getFooterText());
|
}
|
||||||
ppt2.close();
|
}
|
||||||
|
|
||||||
PowerPointExtractor ppe2 = openExtractor("45537_Footer.ppt");
|
private void testHeaderFooterInner(final HSLFSlideShow ppt) throws IOException {
|
||||||
|
try (final SlideShowExtractor<?,?> ppe = new SlideShowExtractor(ppt)) {
|
||||||
text = ppe2.getText();
|
String text = ppe.getText();
|
||||||
assertFalse("Header shouldn't be there by default\n" + text, text.contains("testdoc"));
|
assertFalse("Header shouldn't be there by default\n" + text, text.contains("testdoc"));
|
||||||
assertFalse("Header shouldn't be there by default\n" + text, text.contains("test phrase"));
|
assertFalse("Header shouldn't be there by default\n" + text, text.contains("test phrase"));
|
||||||
|
|
||||||
ppe2.setNotesByDefault(true);
|
ppe.setNotesByDefault(true);
|
||||||
text = ppe2.getText();
|
text = ppe.getText();
|
||||||
assertContains(text, "testdoc");
|
assertContains(text, "testdoc");
|
||||||
assertContains(text, "test phrase");
|
assertContains(text, "test phrase");
|
||||||
ppe2.close();
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@SuppressWarnings("unused")
|
@SuppressWarnings("unused")
|
||||||
@ -318,41 +315,40 @@ public final class TestExtractor {
|
|||||||
String masterTitleText = "This is the Master Title";
|
String masterTitleText = "This is the Master Title";
|
||||||
String masterRandomText = "This text comes from the Master Slide";
|
String masterRandomText = "This text comes from the Master Slide";
|
||||||
String masterFooterText = "Footer from the master slide";
|
String masterFooterText = "Footer from the master slide";
|
||||||
PowerPointExtractor ppe = openExtractor("WithMaster.ppt");
|
try (final SlideShowExtractor ppe = openExtractor("WithMaster.ppt")) {
|
||||||
ppe.setMasterByDefault(true);
|
ppe.setMasterByDefault(true);
|
||||||
|
|
||||||
String text = ppe.getText();
|
String text = ppe.getText();
|
||||||
assertContains(text, masterRandomText);
|
assertContains(text, masterRandomText);
|
||||||
assertContains(text, masterFooterText);
|
assertContains(text, masterFooterText);
|
||||||
ppe.close();
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void testMasterText() throws IOException {
|
public void testMasterText() throws IOException {
|
||||||
PowerPointExtractor ppe1 = openExtractor("master_text.ppt");
|
try (final SlideShowExtractor ppe = openExtractor("master_text.ppt")) {
|
||||||
|
|
||||||
// Initially not there
|
// Initially not there
|
||||||
String text = ppe1.getText();
|
String text = ppe.getText();
|
||||||
assertFalse(text.contains("Text that I added to the master slide"));
|
assertFalse(text.contains("Text that I added to the master slide"));
|
||||||
|
|
||||||
// Enable, shows up
|
// Enable, shows up
|
||||||
ppe1.setMasterByDefault(true);
|
ppe.setMasterByDefault(true);
|
||||||
text = ppe1.getText();
|
text = ppe.getText();
|
||||||
assertContains(text, "Text that I added to the master slide");
|
assertContains(text, "Text that I added to the master slide");
|
||||||
|
|
||||||
// Make sure placeholder text does not come out
|
// Make sure placeholder text does not come out
|
||||||
assertNotContained(text, "Click to edit Master");
|
assertNotContained(text, "Click to edit Master");
|
||||||
ppe1.close();
|
}
|
||||||
|
|
||||||
// Now with another file only containing master text
|
// Now with another file only containing master text
|
||||||
// Will always show up
|
// Will always show up
|
||||||
PowerPointExtractor ppe2 = openExtractor("WithMaster.ppt");
|
try (final SlideShowExtractor ppe = openExtractor("WithMaster.ppt")) {
|
||||||
String masterText = "Footer from the master slide";
|
String masterText = "Footer from the master slide";
|
||||||
|
|
||||||
text = ppe2.getText();
|
String text = ppe.getText();
|
||||||
assertContainsIgnoreCase(text, "master");
|
assertContainsIgnoreCase(text, "master");
|
||||||
assertContains(text, masterText);
|
assertContains(text, masterText);
|
||||||
ppe2.close();
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@ -360,8 +356,7 @@ public final class TestExtractor {
|
|||||||
*/
|
*/
|
||||||
@Test
|
@Test
|
||||||
public void testChineseText() throws IOException {
|
public void testChineseText() throws IOException {
|
||||||
PowerPointExtractor ppe = openExtractor("54880_chinese.ppt");
|
try (final SlideShowExtractor ppe = openExtractor("54880_chinese.ppt")) {
|
||||||
|
|
||||||
String text = ppe.getText();
|
String text = ppe.getText();
|
||||||
|
|
||||||
// Check for the english text line
|
// Check for the english text line
|
||||||
@ -375,7 +370,7 @@ public final class TestExtractor {
|
|||||||
|
|
||||||
// Check for the chinese only text line
|
// Check for the chinese only text line
|
||||||
assertContains(text, "\uff8a\uff9d\uff76\uff78");
|
assertContains(text, "\uff8a\uff9d\uff76\uff78");
|
||||||
ppe.close();
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@ -387,67 +382,59 @@ public final class TestExtractor {
|
|||||||
public void testDifferentPOIFS() throws IOException {
|
public void testDifferentPOIFS() throws IOException {
|
||||||
// Open the two filesystems
|
// Open the two filesystems
|
||||||
File pptFile = slTests.getFile("basic_test_ppt_file.ppt");
|
File pptFile = slTests.getFile("basic_test_ppt_file.ppt");
|
||||||
InputStream is1 = new FileInputStream(pptFile);
|
try (final InputStream is1 = new FileInputStream(pptFile);
|
||||||
OPOIFSFileSystem opoifs = new OPOIFSFileSystem(is1);
|
final NPOIFSFileSystem npoifs = new NPOIFSFileSystem(pptFile)) {
|
||||||
is1.close();
|
|
||||||
NPOIFSFileSystem npoifs = new NPOIFSFileSystem(pptFile);
|
final OPOIFSFileSystem opoifs = new OPOIFSFileSystem(is1);
|
||||||
|
|
||||||
DirectoryNode[] files = {opoifs.getRoot(), npoifs.getRoot()};
|
DirectoryNode[] files = {opoifs.getRoot(), npoifs.getRoot()};
|
||||||
|
|
||||||
// Open directly
|
// Open directly
|
||||||
for (DirectoryNode dir : files) {
|
for (DirectoryNode dir : files) {
|
||||||
PowerPointExtractor extractor = new PowerPointExtractor(dir);
|
try (SlideShow<?,?> ppt = SlideShowFactory.create(dir);
|
||||||
|
SlideShowExtractor<?,?> extractor = new SlideShowExtractor(ppt)) {
|
||||||
assertEquals(expectText, extractor.getText());
|
assertEquals(expectText, extractor.getText());
|
||||||
}
|
}
|
||||||
|
|
||||||
// Open via a HSLFSlideShow
|
|
||||||
for (DirectoryNode dir : files) {
|
|
||||||
HSLFSlideShowImpl slideshow = new HSLFSlideShowImpl(dir);
|
|
||||||
PowerPointExtractor extractor = new PowerPointExtractor(slideshow);
|
|
||||||
assertEquals(expectText, extractor.getText());
|
|
||||||
extractor.close();
|
|
||||||
slideshow.close();
|
|
||||||
}
|
}
|
||||||
|
}
|
||||||
npoifs.close();
|
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void testTable() throws Exception {
|
public void testTable() throws Exception {
|
||||||
PowerPointExtractor ppe1 = openExtractor("54111.ppt");
|
try (SlideShowExtractor ppe = openExtractor("54111.ppt")) {
|
||||||
String text1 = ppe1.getText();
|
String text = ppe.getText();
|
||||||
String target1 = "TH Cell 1\tTH Cell 2\tTH Cell 3\tTH Cell 4\n"+
|
String target = "TH Cell 1\tTH Cell 2\tTH Cell 3\tTH Cell 4\n" +
|
||||||
"Row 1, Cell 1\tRow 1, Cell 2\tRow 1, Cell 3\tRow 1, Cell 4\n" +
|
"Row 1, Cell 1\tRow 1, Cell 2\tRow 1, Cell 3\tRow 1, Cell 4\n" +
|
||||||
"Row 2, Cell 1\tRow 2, Cell 2\tRow 2, Cell 3\tRow 2, Cell 4\n" +
|
"Row 2, Cell 1\tRow 2, Cell 2\tRow 2, Cell 3\tRow 2, Cell 4\n" +
|
||||||
"Row 3, Cell 1\tRow 3, Cell 2\tRow 3, Cell 3\tRow 3, Cell 4\n" +
|
"Row 3, Cell 1\tRow 3, Cell 2\tRow 3, Cell 3\tRow 3, Cell 4\n" +
|
||||||
"Row 4, Cell 1\tRow 4, Cell 2\tRow 4, Cell 3\tRow 4, Cell 4\n" +
|
"Row 4, Cell 1\tRow 4, Cell 2\tRow 4, Cell 3\tRow 4, Cell 4\n" +
|
||||||
"Row 5, Cell 1\tRow 5, Cell 2\tRow 5, Cell 3\tRow 5, Cell 4\n";
|
"Row 5, Cell 1\tRow 5, Cell 2\tRow 5, Cell 3\tRow 5, Cell 4\n";
|
||||||
assertContains(text1, target1);
|
assertContains(text, target);
|
||||||
ppe1.close();
|
}
|
||||||
|
|
||||||
PowerPointExtractor ppe2 = openExtractor("54722.ppt");
|
try (SlideShowExtractor ppe = openExtractor("54722.ppt")) {
|
||||||
String text2 = ppe2.getText();
|
String text = ppe.getText();
|
||||||
|
|
||||||
String target2 = "this\tText\tis\twithin\ta\n" +
|
String target = "this\tText\tis\twithin\ta\n" +
|
||||||
"table\t1\t2\t3\t4";
|
"table\t1\t2\t3\t4";
|
||||||
assertContains(text2, target2);
|
assertContains(text, target);
|
||||||
ppe2.close();
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// bug 60003
|
// bug 60003
|
||||||
@Test
|
@Test
|
||||||
public void testExtractMasterSlideFooterText() throws Exception {
|
public void testExtractMasterSlideFooterText() throws Exception {
|
||||||
PowerPointExtractor ppe = openExtractor("60003.ppt");
|
try (SlideShowExtractor ppe = openExtractor("60003.ppt")) {
|
||||||
ppe.setMasterByDefault(true);
|
ppe.setMasterByDefault(true);
|
||||||
|
|
||||||
String text = ppe.getText();
|
String text = ppe.getText();
|
||||||
assertContains(text, "Prague");
|
assertContains(text, "Prague");
|
||||||
ppe.close();
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void testExtractGroupedShapeText() throws Exception {
|
public void testExtractGroupedShapeText() throws Exception {
|
||||||
try (final PowerPointExtractor ppe = openExtractor("bug62092.ppt")) {
|
try (final SlideShowExtractor ppe = openExtractor("bug62092.ppt")) {
|
||||||
final String text = ppe.getText();
|
final String text = ppe.getText();
|
||||||
|
|
||||||
//this tests that we're ignoring text shapes at depth=0
|
//this tests that we're ignoring text shapes at depth=0
|
||||||
|
@ -73,6 +73,7 @@ import org.apache.poi.poifs.macros.VBAMacroReader;
|
|||||||
import org.apache.poi.sl.draw.DrawFactory;
|
import org.apache.poi.sl.draw.DrawFactory;
|
||||||
import org.apache.poi.sl.draw.DrawPaint;
|
import org.apache.poi.sl.draw.DrawPaint;
|
||||||
import org.apache.poi.sl.draw.DrawTextParagraph;
|
import org.apache.poi.sl.draw.DrawTextParagraph;
|
||||||
|
import org.apache.poi.sl.extractor.SlideShowExtractor;
|
||||||
import org.apache.poi.sl.usermodel.ColorStyle;
|
import org.apache.poi.sl.usermodel.ColorStyle;
|
||||||
import org.apache.poi.sl.usermodel.PaintStyle;
|
import org.apache.poi.sl.usermodel.PaintStyle;
|
||||||
import org.apache.poi.sl.usermodel.PaintStyle.SolidPaint;
|
import org.apache.poi.sl.usermodel.PaintStyle.SolidPaint;
|
||||||
@ -800,18 +801,18 @@ public final class TestBugs {
|
|||||||
String files[] = { "bug58718_008524.ppt","bug58718_008558.ppt","bug58718_349008.ppt","bug58718_008495.ppt", };
|
String files[] = { "bug58718_008524.ppt","bug58718_008558.ppt","bug58718_349008.ppt","bug58718_008495.ppt", };
|
||||||
for (String f : files) {
|
for (String f : files) {
|
||||||
File sample = HSLFTestDataSamples.getSampleFile(f);
|
File sample = HSLFTestDataSamples.getSampleFile(f);
|
||||||
PowerPointExtractor ex = new PowerPointExtractor(sample.getAbsolutePath());
|
try (SlideShowExtractor ex = new SlideShowExtractor(SlideShowFactory.create(sample))) {
|
||||||
assertNotNull(ex.getText());
|
assertNotNull(ex.getText());
|
||||||
ex.close();
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void bug58733() throws IOException {
|
public void bug58733() throws IOException {
|
||||||
File sample = HSLFTestDataSamples.getSampleFile("bug58733_671884.ppt");
|
File sample = HSLFTestDataSamples.getSampleFile("bug58733_671884.ppt");
|
||||||
PowerPointExtractor ex = new PowerPointExtractor(sample.getAbsolutePath());
|
try (SlideShowExtractor ex = new SlideShowExtractor(SlideShowFactory.create(sample))) {
|
||||||
assertNotNull(ex.getText());
|
assertNotNull(ex.getText());
|
||||||
ex.close();
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
|
Binary file not shown.
Loading…
Reference in New Issue
Block a user