#62319 - Decommission XSLF-/PowerPointExtractor

git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@1829653 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Andreas Beeker 2018-04-20 12:52:59 +00:00
parent bc436fcc3d
commit ab390ce170
27 changed files with 824 additions and 1248 deletions

View File

@ -330,8 +330,6 @@ public class TestAllFiles {
);
private static final Set<String> IGNORED = unmodifiableHashSet(
// need JDK8+ - https://bugs.openjdk.java.net/browse/JDK-8038081
"slideshow/42474-2.ppt",
// OPC handler works / XSSF handler fails
"spreadsheet/57181.xlsm",
"spreadsheet/61300.xls"//intentionally fuzzed -- used to cause infinite loop

View File

@ -24,6 +24,7 @@ import java.io.FileInputStream;
import java.io.InputStream;
import org.apache.poi.extractor.ExtractorFactory;
import org.apache.poi.sl.extractor.SlideShowExtractor;
import org.apache.poi.xslf.extractor.XSLFPowerPointExtractor;
import org.apache.poi.xslf.usermodel.XMLSlideShow;
import org.apache.poi.xslf.usermodel.XSLFSlideShow;
@ -53,12 +54,19 @@ public class XSLFFileHandler extends SlideShowHandler {
// additionally try the other getText() methods
try (XSLFPowerPointExtractor extractor = (XSLFPowerPointExtractor) ExtractorFactory.createExtractor(file)) {
try (SlideShowExtractor extractor = ExtractorFactory.createExtractor(file)) {
assertNotNull(extractor);
extractor.setSlidesByDefault(true);
extractor.setNotesByDefault(true);
extractor.setMasterByDefault(true);
assertNotNull(extractor.getText(true, true, true));
assertEquals("With all options disabled we should not get text",
"", extractor.getText(false, false, false));
assertNotNull(extractor.getText());
extractor.setSlidesByDefault(false);
extractor.setNotesByDefault(false);
extractor.setMasterByDefault(false);
assertEquals("With all options disabled we should not get text", "", extractor.getText());
}
}

View File

@ -105,6 +105,7 @@ public abstract class POIOLE2TextExtractor extends POITextExtractor {
*
* @return the underlying POIDocument
*/
@Override
public POIDocument getDocument() {
return document;
}

View File

@ -74,4 +74,9 @@ public abstract class POITextExtractor implements Closeable {
fsToClose.close();
}
}
/**
* @return the processed document
*/
public abstract Object getDocument();
}

View File

@ -115,26 +115,23 @@ public class OLE2ExtractorFactory {
return threadPreferEventExtractors.get();
}
public static POIOLE2TextExtractor createExtractor(POIFSFileSystem fs) throws IOException {
// Only ever an OLE2 one from the root of the FS
return (POIOLE2TextExtractor)createExtractor(fs.getRoot());
public static <T extends POITextExtractor> T createExtractor(POIFSFileSystem fs) throws IOException {
return (T)createExtractor(fs.getRoot());
}
public static POIOLE2TextExtractor createExtractor(NPOIFSFileSystem fs) throws IOException {
// Only ever an OLE2 one from the root of the FS
return (POIOLE2TextExtractor)createExtractor(fs.getRoot());
public static <T extends POITextExtractor> T createExtractor(NPOIFSFileSystem fs) throws IOException {
return (T)createExtractor(fs.getRoot());
}
public static POIOLE2TextExtractor createExtractor(OPOIFSFileSystem fs) throws IOException {
// Only ever an OLE2 one from the root of the FS
return (POIOLE2TextExtractor)createExtractor(fs.getRoot());
public static <T extends POITextExtractor> T createExtractor(OPOIFSFileSystem fs) throws IOException {
return (T)createExtractor(fs.getRoot());
}
public static POITextExtractor createExtractor(InputStream input) throws IOException {
public static <T extends POITextExtractor> T createExtractor(InputStream input) throws IOException {
Class<?> cls = getOOXMLClass();
if (cls != null) {
// Use Reflection to get us the full OOXML-enabled version
try {
Method m = cls.getDeclaredMethod("createExtractor", InputStream.class);
return (POITextExtractor)m.invoke(null, input);
return (T)m.invoke(null, input);
} catch (IllegalArgumentException iae) {
throw iae;
} catch (Exception e) {

View File

@ -44,8 +44,30 @@ public class DocumentFactoryHelper {
* @throws IOException If an error occurs while decrypting or if the password does not match
*/
public static InputStream getDecryptedStream(final NPOIFSFileSystem fs, String password)
throws IOException {
// wrap the stream in a FilterInputStream to close the NPOIFSFileSystem
// as well when the resulting OPCPackage is closed
return new FilterInputStream(getDecryptedStream(fs.getRoot(), password)) {
@Override
public void close() throws IOException {
fs.close();
super.close();
}
};
}
/**
* Wrap the OLE2 data of the DirectoryNode into a decrypted stream by using
* the given password.
*
* @param root The OLE2 directory node for the document
* @param password The password, null if the default password should be used
* @return A stream for reading the decrypted data
* @throws IOException If an error occurs while decrypting or if the password does not match
*/
public static InputStream getDecryptedStream(final DirectoryNode root, String password)
throws IOException {
EncryptionInfo info = new EncryptionInfo(fs);
EncryptionInfo info = new EncryptionInfo(root);
Decryptor d = Decryptor.getInstance(info);
try {
@ -58,21 +80,11 @@ public class DocumentFactoryHelper {
}
if (passwordCorrect) {
// wrap the stream in a FilterInputStream to close the NPOIFSFileSystem
// as well when the resulting OPCPackage is closed
return new FilterInputStream(d.getDataStream(fs.getRoot())) {
@Override
public void close() throws IOException {
fs.close();
super.close();
}
};
return d.getDataStream(root);
} else if (password != null) {
throw new EncryptedDocumentException("Password incorrect");
} else {
if (password != null)
throw new EncryptedDocumentException("Password incorrect");
else
throw new EncryptedDocumentException("The supplied spreadsheet is protected, but no password was supplied");
throw new EncryptedDocumentException("The supplied spreadsheet is protected, but no password was supplied");
}
} catch (GeneralSecurityException e) {
throw new IOException(e);

View File

@ -1,3 +1,20 @@
/* ====================================================================
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==================================================================== */
package org.apache.poi.sl.extractor;
import java.util.ArrayList;
@ -48,6 +65,16 @@ public class SlideShowExtractor<
this.slideshow = slideshow;
}
/**
* Returns the persisted document backing the slideshow
*
* @return the opened document
*/
@Override
public final Object getDocument() {
return slideshow.getPersistDocument();
}
/**
* Should a call to getText() return slide text? Default is yes
*/
@ -219,7 +246,6 @@ public class SlideShowExtractor<
return;
}
for (final P para : paraList) {
final int oldLen = sb.length();
for (final TextRun tr : para) {
final String str = tr.getRawText().replace("\r", "");
final String newStr;

View File

@ -126,4 +126,13 @@ public interface SlideShow<
* @since POI 4.0.0
*/
POITextExtractor getMetadataTextExtractor();
/**
* @return the instance which handles the persisting of the slideshow,
* which is either a subclass of {@link org.apache.poi.POIDocument}
* or {@link org.apache.poi.POIXMLDocument}
*
* @since POI 4.0.0
*/
Object getPersistDocument();
}

View File

@ -60,13 +60,40 @@ public class SlideShowFactory {
* @throws IOException if an error occurs while reading the data
*/
public static SlideShow<?,?> create(final NPOIFSFileSystem fs, String password) throws IOException {
DirectoryNode root = fs.getRoot();
return create(fs.getRoot(), password);
}
/**
* Creates a SlideShow from the given {@link DirectoryNode}.
*
* @param root The {@link DirectoryNode} to start reading the document from
*
* @return The created SlideShow
*
* @throws IOException if an error occurs while reading the data
*/
public static SlideShow<?,?> create(final DirectoryNode root) throws IOException {
return create(root, null);
}
/**
* Creates a SlideShow from the given {@link DirectoryNode}, which may
* be password protected
*
* @param root The {@link DirectoryNode} to start reading the document from
* @param password The password that should be used or null if no password is necessary.
*
* @return The created SlideShow
*
* @throws IOException if an error occurs while reading the data
*/
public static SlideShow<?,?> create(final DirectoryNode root, String password) throws IOException {
// Encrypted OOXML files go inside OLE2 containers, is this one?
if (root.hasEntry(Decryptor.DEFAULT_POIFS_ENTRY)) {
InputStream stream = null;
try {
stream = DocumentFactoryHelper.getDecryptedStream(fs, password);
stream = DocumentFactoryHelper.getDecryptedStream(root, password);
return createXSLFSlideShow(stream);
} finally {
@ -82,7 +109,7 @@ public class SlideShowFactory {
passwordSet = true;
}
try {
return createHSLFSlideShow(fs);
return createHSLFSlideShow(root);
} finally {
if (passwordSet) {
Biff8EncryptionKey.setCurrentUserPassword(null);

View File

@ -68,6 +68,7 @@ public abstract class POIXMLTextExtractor extends POITextExtractor {
*
* @return the opened document
*/
@Override
public final POIXMLDocument getDocument() {
return _document;
}

View File

@ -51,6 +51,7 @@ import org.apache.poi.poifs.filesystem.NotOLE2FileException;
import org.apache.poi.poifs.filesystem.OPOIFSFileSystem;
import org.apache.poi.poifs.filesystem.OfficeXmlFileException;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
import org.apache.poi.sl.extractor.SlideShowExtractor;
import org.apache.poi.util.IOUtils;
import org.apache.poi.util.NotImplemented;
import org.apache.poi.util.POILogFactory;
@ -58,6 +59,7 @@ import org.apache.poi.util.POILogger;
import org.apache.poi.util.Removal;
import org.apache.poi.xdgf.extractor.XDGFVisioExtractor;
import org.apache.poi.xslf.extractor.XSLFPowerPointExtractor;
import org.apache.poi.xslf.usermodel.XMLSlideShow;
import org.apache.poi.xslf.usermodel.XSLFRelation;
import org.apache.poi.xslf.usermodel.XSLFSlideShow;
import org.apache.poi.xssf.extractor.XSSFBEventBasedExcelExtractor;
@ -127,20 +129,20 @@ public class ExtractorFactory {
return OLE2ExtractorFactory.getPreferEventExtractor();
}
public static POITextExtractor createExtractor(File f) throws IOException, OpenXML4JException, XmlException {
public static <T extends POITextExtractor> T createExtractor(File f) throws IOException, OpenXML4JException, XmlException {
NPOIFSFileSystem fs = null;
try {
fs = new NPOIFSFileSystem(f);
if (fs.getRoot().hasEntry(Decryptor.DEFAULT_POIFS_ENTRY)) {
return createEncryptedOOXMLExtractor(fs);
return (T)createEncryptedOOXMLExtractor(fs);
}
POIOLE2TextExtractor extractor = createExtractor(fs);
POITextExtractor extractor = createExtractor(fs);
extractor.setFilesystem(fs);
return extractor;
return (T)extractor;
} catch (OfficeXmlFileException e) {
// ensure file-handle release
IOUtils.closeQuietly(fs);
return createExtractor(OPCPackage.open(f.toString(), PackageAccess.READ));
return (T)createExtractor(OPCPackage.open(f.toString(), PackageAccess.READ));
} catch (NotOLE2FileException ne) {
// ensure file-handle release
IOUtils.closeQuietly(fs);
@ -179,7 +181,7 @@ public class ExtractorFactory {
* @throws XmlException If an XML parsing error occurs.
* @throws IllegalArgumentException If no matching file type could be found.
*/
public static POIXMLTextExtractor createExtractor(OPCPackage pkg) throws IOException, OpenXML4JException, XmlException {
public static POITextExtractor createExtractor(OPCPackage pkg) throws IOException, OpenXML4JException, XmlException {
try {
// Check for the normal Office core document
PackageRelationshipCollection core;
@ -226,13 +228,13 @@ public class ExtractorFactory {
// Is it XSLF?
for (XSLFRelation rel : XSLFPowerPointExtractor.SUPPORTED_TYPES) {
if ( rel.getContentType().equals( contentType ) ) {
return new XSLFPowerPointExtractor(pkg);
return new SlideShowExtractor(new XMLSlideShow(pkg));
}
}
// special handling for SlideShow-Theme-files,
if (XSLFRelation.THEME_MANAGER.getContentType().equals(contentType)) {
return new XSLFPowerPointExtractor(new XSLFSlideShow(pkg));
return new SlideShowExtractor(new XMLSlideShow(pkg));
}
// How about xlsb?
@ -252,28 +254,28 @@ public class ExtractorFactory {
}
}
public static POIOLE2TextExtractor createExtractor(POIFSFileSystem fs) throws IOException, OpenXML4JException, XmlException {
return OLE2ExtractorFactory.createExtractor(fs);
public static <T extends POITextExtractor> T createExtractor(POIFSFileSystem fs) throws IOException, OpenXML4JException, XmlException {
return createExtractor(fs.getRoot());
}
public static POIOLE2TextExtractor createExtractor(NPOIFSFileSystem fs) throws IOException, OpenXML4JException, XmlException {
return OLE2ExtractorFactory.createExtractor(fs);
public static <T extends POITextExtractor> T createExtractor(NPOIFSFileSystem fs) throws IOException, OpenXML4JException, XmlException {
return createExtractor(fs.getRoot());
}
public static POIOLE2TextExtractor createExtractor(OPOIFSFileSystem fs) throws IOException, OpenXML4JException, XmlException {
return OLE2ExtractorFactory.createExtractor(fs);
public static <T extends POITextExtractor> T createExtractor(OPOIFSFileSystem fs) throws IOException, OpenXML4JException, XmlException {
return createExtractor(fs.getRoot());
}
public static POITextExtractor createExtractor(DirectoryNode poifsDir) throws IOException, OpenXML4JException, XmlException
public static <T extends POITextExtractor> T createExtractor(DirectoryNode poifsDir) throws IOException, OpenXML4JException, XmlException
{
// First, check for OOXML
for (String entryName : poifsDir.getEntryNames()) {
if (entryName.equals("Package")) {
OPCPackage pkg = OPCPackage.open(poifsDir.createDocumentInputStream("Package"));
return createExtractor(pkg);
return (T)createExtractor(pkg);
}
}
// If not, ask the OLE2 code to check, with Scratchpad if possible
return OLE2ExtractorFactory.createExtractor(poifsDir);
return (T)OLE2ExtractorFactory.createExtractor(poifsDir);
}
/**
@ -403,7 +405,7 @@ public class ExtractorFactory {
throw new IllegalStateException("Not yet supported");
}
private static POIXMLTextExtractor createEncryptedOOXMLExtractor(NPOIFSFileSystem fs)
private static POITextExtractor createEncryptedOOXMLExtractor(NPOIFSFileSystem fs)
throws IOException {
String pass = Biff8EncryptionKey.getCurrentUserPassword();
if (pass == null) {

View File

@ -37,7 +37,7 @@ import org.apache.xmlbeans.XmlException;
* @deprecated use {@link SlideShowExtractor}
*/
@Deprecated
@Removal(version="4.2.0")
@Removal(version="5.0.0")
public class XSLFPowerPointExtractor extends POIXMLTextExtractor {
public static final XSLFRelation[] SUPPORTED_TYPES = new XSLFRelation[]{
XSLFRelation.MAIN, XSLFRelation.MACRO, XSLFRelation.MACRO_TEMPLATE,

View File

@ -631,4 +631,9 @@ public class XMLSlideShow extends POIXMLDocument
public POIXMLPropertiesTextExtractor getMetadataTextExtractor() {
return new POIXMLPropertiesTextExtractor(this);
}
@Override
public Object getPersistDocument() {
return this;
}
}

View File

@ -1,3 +1,20 @@
/* ====================================================================
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==================================================================== */
package org.apache.poi.xslf.usermodel;
import static org.apache.poi.xslf.usermodel.XSLFShape.PML_NS;

View File

@ -182,12 +182,20 @@ implements Slide<XSLFShape,XSLFTextParagraph> {
*/
public XSLFCommentAuthors getCommentAuthorsPart() {
if(_commentAuthors == null) {
// first scan the slide relations
for (POIXMLDocumentPart p : getRelations()) {
if (p instanceof XSLFCommentAuthors) {
_commentAuthors = (XSLFCommentAuthors)p;
return _commentAuthors;
}
}
// then scan the presentation relations
for (POIXMLDocumentPart p : getSlideShow().getRelations()) {
if (p instanceof XSLFCommentAuthors) {
_commentAuthors = (XSLFCommentAuthors)p;
return _commentAuthors;
}
}
}
return null;

View File

@ -120,10 +120,10 @@ public class TestHxxFEncryption {
public void newPassword(String newPass) throws IOException, OpenXML4JException, XmlException {
Biff8EncryptionKey.setCurrentUserPassword(password);
File f = sampleDir.getFile(file);
POIOLE2TextExtractor te1 = (POIOLE2TextExtractor)ExtractorFactory.createExtractor(f);
POITextExtractor te1 = ExtractorFactory.createExtractor(f);
Biff8EncryptionKey.setCurrentUserPassword(newPass);
ByteArrayOutputStream bos = new ByteArrayOutputStream();
POIDocument doc = te1.getDocument();
POIDocument doc = (POIDocument)te1.getDocument();
doc.write(bos);
doc.close();
te1.close();
@ -140,25 +140,25 @@ public class TestHxxFEncryption {
ByteArrayOutputStream bos = new ByteArrayOutputStream();
Biff8EncryptionKey.setCurrentUserPassword(password);
File f = sampleDir.getFile(file);
POIOLE2TextExtractor te1 = (POIOLE2TextExtractor)ExtractorFactory.createExtractor(f);
POITextExtractor te1 = ExtractorFactory.createExtractor(f);
// first remove encryption
Biff8EncryptionKey.setCurrentUserPassword(null);
POIDocument doc = te1.getDocument();
POIDocument doc = (POIDocument)te1.getDocument();
doc.write(bos);
doc.close();
te1.close();
// then use default setting, which is cryptoapi
String newPass = "newPass";
POIOLE2TextExtractor te2 = (POIOLE2TextExtractor)ExtractorFactory.createExtractor(new ByteArrayInputStream(bos.toByteArray()));
POITextExtractor te2 = ExtractorFactory.createExtractor(new ByteArrayInputStream(bos.toByteArray()));
Biff8EncryptionKey.setCurrentUserPassword(newPass);
doc = te2.getDocument();
doc = (POIDocument)te2.getDocument();
bos.reset();
doc.write(bos);
doc.close();
te2.close();
// and finally update cryptoapi setting
POIOLE2TextExtractor te3 = (POIOLE2TextExtractor)ExtractorFactory.createExtractor(new ByteArrayInputStream(bos.toByteArray()));
doc = te3.getDocument();
POITextExtractor te3 = ExtractorFactory.createExtractor(new ByteArrayInputStream(bos.toByteArray()));
doc = (POIDocument)te3.getDocument();
// need to cache data (i.e. read all data) before changing the key size
if (doc instanceof HSLFSlideShowImpl) {
HSLFSlideShowImpl hss = (HSLFSlideShowImpl)doc;
@ -175,8 +175,8 @@ public class TestHxxFEncryption {
doc.close();
te3.close();
// check the setting
POIOLE2TextExtractor te4 = (POIOLE2TextExtractor)ExtractorFactory.createExtractor(new ByteArrayInputStream(bos.toByteArray()));
doc = te4.getDocument();
POITextExtractor te4 = ExtractorFactory.createExtractor(new ByteArrayInputStream(bos.toByteArray()));
doc = (POIDocument)te4.getDocument();
ei = doc.getEncryptionInfo();
assertNotNull(ei);
assertTrue(ei.getHeader() instanceof CryptoAPIEncryptionHeader);

View File

@ -50,6 +50,7 @@ import org.apache.poi.openxml4j.opc.OPCPackage;
import org.apache.poi.openxml4j.opc.PackagePartName;
import org.apache.poi.openxml4j.opc.PackagingURIHelper;
import org.apache.poi.sl.draw.DrawPaint;
import org.apache.poi.sl.extractor.SlideShowExtractor;
import org.apache.poi.sl.usermodel.PaintStyle;
import org.apache.poi.sl.usermodel.PaintStyle.SolidPaint;
import org.apache.poi.sl.usermodel.PaintStyle.TexturePaint;
@ -221,28 +222,27 @@ public class TestXSLFBugs {
* rID2 -> slide3.xml
*/
@Test
public void bug54916() throws Exception {
XMLSlideShow ss = XSLFTestDataSamples.openSampleDocument("OverlappingRelations.pptx");
XSLFSlide slide;
public void bug54916() throws IOException {
try (XMLSlideShow ss = XSLFTestDataSamples.openSampleDocument("OverlappingRelations.pptx")) {
XSLFSlide slide;
// Should find 4 slides
assertEquals(4, ss.getSlides().size());
// Should find 4 slides
assertEquals(4, ss.getSlides().size());
// Check the text, to see we got them in order
slide = ss.getSlides().get(0);
assertContains(getSlideText(slide), "POI cannot read this");
// Check the text, to see we got them in order
slide = ss.getSlides().get(0);
assertContains(getSlideText(ss, slide), "POI cannot read this");
slide = ss.getSlides().get(1);
assertContains(getSlideText(slide), "POI can read this");
assertContains(getSlideText(slide), "Has a relationship to another slide");
slide = ss.getSlides().get(1);
assertContains(getSlideText(ss, slide), "POI can read this");
assertContains(getSlideText(ss, slide), "Has a relationship to another slide");
slide = ss.getSlides().get(2);
assertContains(getSlideText(slide), "POI can read this");
slide = ss.getSlides().get(2);
assertContains(getSlideText(ss, slide), "POI can read this");
slide = ss.getSlides().get(3);
assertContains(getSlideText(slide), "POI can read this");
ss.close();
slide = ss.getSlides().get(3);
assertContains(getSlideText(ss, slide), "POI can read this");
}
}
/**
@ -311,8 +311,15 @@ public class TestXSLFBugs {
ss.close();
}
protected String getSlideText(XSLFSlide slide) {
return XSLFPowerPointExtractor.getText(slide, true, false, false);
protected String getSlideText(XMLSlideShow ppt, XSLFSlide slide) throws IOException {
try (SlideShowExtractor extr = new SlideShowExtractor(ppt)) {
// do not auto-close the slideshow
extr.setFilesystem(null);
extr.setSlidesByDefault(true);
extr.setNotesByDefault(false);
extr.setMasterByDefault(false);
return extr.getText(slide);
}
}
@Test
@ -458,7 +465,7 @@ public class TestXSLFBugs {
for (int i = 0; i < slideTexts.length; i++) {
XSLFSlide slide = ss.getSlides().get(i);
assertContains(getSlideText(slide), slideTexts[i]);
assertContains(getSlideText(ss, slide), slideTexts[i]);
}
}

View File

@ -24,16 +24,17 @@ import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertNotNull;
import static org.junit.Assert.assertTrue;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import org.apache.poi.POIDataSamples;
import org.apache.poi.POITextExtractor;
import org.apache.poi.extractor.ExtractorFactory;
import org.apache.poi.openxml4j.exceptions.OpenXML4JException;
import org.apache.poi.openxml4j.opc.OPCPackage;
import org.apache.poi.sl.extractor.SlideShowExtractor;
import org.apache.poi.xslf.usermodel.XMLSlideShow;
import org.apache.xmlbeans.XmlException;
import org.junit.Ignore;
import org.junit.Test;
/**
@ -44,188 +45,189 @@ public class TestXSLFPowerPointExtractor {
/**
* Get text out of the simple file
* @throws XmlException
* @throws OpenXML4JException
*/
@Test
public void testGetSimpleText()
throws IOException, XmlException, OpenXML4JException {
XMLSlideShow xmlA = openPPTX("sample.pptx");
@SuppressWarnings("resource")
OPCPackage pkg = xmlA.getPackage();
public void testGetSimpleText() throws IOException {
try (XMLSlideShow xmlA = openPPTX("sample.pptx");
SlideShowExtractor extractor = new SlideShowExtractor(xmlA)) {
new XSLFPowerPointExtractor(xmlA).close();
new XSLFPowerPointExtractor(pkg).close();
extractor.getText();
XSLFPowerPointExtractor extractor =
new XSLFPowerPointExtractor(xmlA);
extractor.getText();
String text = extractor.getText();
assertTrue(text.length() > 0);
String text = extractor.getText();
assertTrue(text.length() > 0);
// Check Basics
assertStartsWith(text, "Lorem ipsum dolor sit amet\n");
assertContains(text, "amet\n\n");
// Check Basics
assertStartsWith(text, "Lorem ipsum dolor sit amet\n");
assertContains(text, "amet\n\n");
// Our placeholder master text
// This shouldn't show up in the output
// String masterText =
// "Click to edit Master title style\n" +
// "Click to edit Master subtitle style\n" +
// "\n\n\n\n\n\n" +
// "Click to edit Master title style\n" +
// "Click to edit Master text styles\n" +
// "Second level\n" +
// "Third level\n" +
// "Fourth level\n" +
// "Fifth level\n";
// Our placeholder master text
// This shouldn't show up in the output
// String masterText =
// "Click to edit Master title style\n" +
// "Click to edit Master subtitle style\n" +
// "\n\n\n\n\n\n" +
// "Click to edit Master title style\n" +
// "Click to edit Master text styles\n" +
// "Second level\n" +
// "Third level\n" +
// "Fourth level\n" +
// "Fifth level\n";
// Just slides, no notes
extractor.setSlidesByDefault(true);
extractor.setNotesByDefault(false);
extractor.setMasterByDefault(false);
text = extractor.getText();
String slideText =
"Lorem ipsum dolor sit amet\n" +
"Nunc at risus vel erat tempus posuere. Aenean non ante.\n" +
"\n" +
"Lorem ipsum dolor sit amet\n" +
"Lorem\n" +
"ipsum\n" +
"dolor\n" +
"sit\n" +
"amet\n" +
"\n";
assertEquals(slideText, text);
// Just slides, no notes
text = extractor.getText(true, false, false);
String slideText =
"Lorem ipsum dolor sit amet\n" +
"Nunc at risus vel erat tempus posuere. Aenean non ante.\n" +
"\n" +
"Lorem ipsum dolor sit amet\n" +
"Lorem\n" +
"ipsum\n" +
"dolor\n" +
"sit\n" +
"amet\n" +
"\n";
assertEquals(slideText, text);
// Just notes, no slides
extractor.setSlidesByDefault(false);
extractor.setNotesByDefault(true);
text = extractor.getText();
assertEquals("\n\n1\n\n\n2\n", text);
// Just notes, no slides
text = extractor.getText(false, true);
assertEquals("\n\n1\n\n\n2\n", text);
// Both
extractor.setSlidesByDefault(true);
extractor.setNotesByDefault(true);
text = extractor.getText();
String bothText =
"Lorem ipsum dolor sit amet\n" +
"Nunc at risus vel erat tempus posuere. Aenean non ante.\n" +
"\n\n\n1\n" +
"Lorem ipsum dolor sit amet\n" +
"Lorem\n" +
"ipsum\n" +
"dolor\n" +
"sit\n" +
"amet\n" +
"\n\n\n2\n";
assertEquals(bothText, text);
// Both
text = extractor.getText(true, true, false);
String bothText =
"Lorem ipsum dolor sit amet\n" +
"Nunc at risus vel erat tempus posuere. Aenean non ante.\n" +
"\n\n\n1\n" +
"Lorem ipsum dolor sit amet\n" +
"Lorem\n" +
"ipsum\n" +
"dolor\n" +
"sit\n" +
"amet\n" +
"\n\n\n2\n";
assertEquals(bothText, text);
// With Slides and Master Text
extractor.setSlidesByDefault(true);
extractor.setNotesByDefault(false);
extractor.setMasterByDefault(true);
text = extractor.getText();
String smText =
"Lorem ipsum dolor sit amet\n" +
"Nunc at risus vel erat tempus posuere. Aenean non ante.\n" +
"\n" +
"Lorem ipsum dolor sit amet\n" +
"Lorem\n" +
"ipsum\n" +
"dolor\n" +
"sit\n" +
"amet\n" +
"\n";
assertEquals(smText, text);
// With Slides and Master Text
text = extractor.getText(true, false, true);
String smText =
"Lorem ipsum dolor sit amet\n" +
"Nunc at risus vel erat tempus posuere. Aenean non ante.\n" +
"\n" +
"Lorem ipsum dolor sit amet\n" +
"Lorem\n" +
"ipsum\n" +
"dolor\n" +
"sit\n" +
"amet\n" +
"\n";
assertEquals(smText, text);
// With Slides, Notes and Master Text
extractor.setSlidesByDefault(true);
extractor.setNotesByDefault(true);
extractor.setMasterByDefault(true);
text = extractor.getText();
String snmText =
"Lorem ipsum dolor sit amet\n" +
"Nunc at risus vel erat tempus posuere. Aenean non ante.\n" +
"\n\n\n1\n" +
"Lorem ipsum dolor sit amet\n" +
"Lorem\n" +
"ipsum\n" +
"dolor\n" +
"sit\n" +
"amet\n" +
"\n\n\n2\n";
assertEquals(snmText, text);
// With Slides, Notes and Master Text
text = extractor.getText(true, true, true);
String snmText =
"Lorem ipsum dolor sit amet\n" +
"Nunc at risus vel erat tempus posuere. Aenean non ante.\n" +
"\n\n\n1\n" +
"Lorem ipsum dolor sit amet\n" +
"Lorem\n" +
"ipsum\n" +
"dolor\n" +
"sit\n" +
"amet\n" +
"\n\n\n2\n";
assertEquals(snmText, text);
// Via set defaults
extractor.setSlidesByDefault(false);
extractor.setNotesByDefault(true);
text = extractor.getText();
assertEquals("\n\n1\n\n\n2\n", text);
extractor.close();
xmlA.close();
// Via set defaults
extractor.setSlidesByDefault(false);
extractor.setNotesByDefault(true);
text = extractor.getText();
assertEquals("\n\n1\n\n\n2\n", text);
}
}
@Test
public void testGetComments() throws IOException {
XMLSlideShow xml = openPPTX("45545_Comment.pptx");
XSLFPowerPointExtractor extractor = new XSLFPowerPointExtractor(xml);
try (XMLSlideShow xml = openPPTX("45545_Comment.pptx");
SlideShowExtractor extractor = new SlideShowExtractor(xml)) {
extractor.setCommentsByDefault(true);
String text = extractor.getText();
assertTrue(text.length() > 0);
String text = extractor.getText();
assertTrue(text.length() > 0);
// Check comments are there
assertContains(text, "testdoc");
assertContains(text, "test phrase");
// Check comments are there
assertContains(text, "testdoc");
assertContains(text, "test phrase");
// Check the authors came through too
assertContains(text, "XPVMWARE01");
extractor.close();
xml.close();
// Check the authors came through too
assertContains(text, "XPVMWARE01");
}
}
@Test
@Ignore("currently slidelayouts aren't yet supported")
public void testGetMasterText() throws Exception {
XMLSlideShow xml = openPPTX("WithMaster.pptx");
XSLFPowerPointExtractor extractor = new XSLFPowerPointExtractor(xml);
extractor.setSlidesByDefault(true);
extractor.setNotesByDefault(false);
extractor.setMasterByDefault(true);
try (XMLSlideShow xml = openPPTX("WithMaster.pptx");
SlideShowExtractor extractor = new SlideShowExtractor(xml)) {
extractor.setSlidesByDefault(true);
extractor.setNotesByDefault(false);
extractor.setMasterByDefault(true);
String text = extractor.getText();
assertTrue(text.length() > 0);
// Check master text is there
assertContains(text, "Footer from the master slide");
String text = extractor.getText();
assertTrue(text.length() > 0);
// Theme text shouldn't show up
// String themeText =
// "Theme Master Title\n" +
// "Theme Master first level\n" +
// "And the 2nd level\n" +
// "Our 3rd level goes here\n" +
// "And onto the 4th, such fun....\n" +
// "Finally is the Fifth level\n";
// Check master text is there
assertContains(text, "Footer from the master slide");
// Check the whole text
String wholeText =
"First page title\n" +
"First page subtitle\n" +
"This is the Master Title\n" +
"This text comes from the Master Slide\n" +
"\n" +
// TODO Detect we didn't have a title, and include the master one
"2nd page subtitle\n" +
"Footer from the master slide\n" +
"This is the Master Title\n" +
"This text comes from the Master Slide\n";
assertEquals(wholeText, text);
// Theme text shouldn't show up
// String themeText =
// "Theme Master Title\n" +
// "Theme Master first level\n" +
// "And the 2nd level\n" +
// "Our 3rd level goes here\n" +
// "And onto the 4th, such fun....\n" +
// "Finally is the Fifth level\n";
extractor.close();
xml.close();
// Check the whole text
String wholeText =
"First page title\n" +
"First page subtitle\n" +
"This is the Master Title\n" +
"This text comes from the Master Slide\n" +
"\n" +
// TODO Detect we didn't have a title, and include the master one
"2nd page subtitle\n" +
"Footer from the master slide\n" +
"This is the Master Title\n" +
"This text comes from the Master Slide\n";
assertEquals(wholeText, text);
}
}
// Extracts the text of present1.pptx and checks that it is non-empty and
// contains the string "TEST".
// NOTE(review): this span is rendered diff output without +/- markers; the
// explicit-close lines (XSLFPowerPointExtractor) and the try-with-resources
// lines (SlideShowExtractor) are two variants of the same body, interleaved.
@Test
public void testTable() throws Exception {
XMLSlideShow xml = openPPTX("present1.pptx");
XSLFPowerPointExtractor extractor = new XSLFPowerPointExtractor(xml);
try (XMLSlideShow xml = openPPTX("present1.pptx");
SlideShowExtractor extractor = new SlideShowExtractor(xml)) {
String text = extractor.getText();
assertTrue(text.length() > 0);
String text = extractor.getText();
assertTrue(text.length() > 0);
// Check comments are there
assertContains(text, "TEST");
extractor.close();
xml.close();
// Check comments are there
assertContains(text, "TEST");
}
}
/**
@ -241,74 +243,76 @@ public class TestXSLFPowerPointExtractor {
};
for(String extension : extensions) {
String filename = "testPPT." + extension;
XMLSlideShow xml = openPPTX(filename);
XSLFPowerPointExtractor extractor = new XSLFPowerPointExtractor(xml);
String text = extractor.getText();
if (extension.equals("thmx")) {
// Theme file doesn't have any textual content
assertEquals(filename, 0, text.length());
continue;
try (XMLSlideShow xml = openPPTX(filename);
SlideShowExtractor extractor = new SlideShowExtractor(xml)) {
String text = extractor.getText();
if (extension.equals("thmx")) {
// Theme file doesn't have any textual content
assertEquals(filename, 0, text.length());
continue;
}
assertTrue(filename, text.length() > 0);
assertContains(filename, text, "Attachment Test");
assertContains(filename, text, "This is a test file data with the same content");
assertContains(filename, text, "content parsing");
assertContains(filename, text, "Different words to test against");
assertContains(filename, text, "Mystery");
}
assertTrue(filename, text.length() > 0);
assertContains(filename, text, "Attachment Test");
assertContains(filename, text, "This is a test file data with the same content");
assertContains(filename, text, "content parsing");
assertContains(filename, text, "Different words to test against");
assertContains(filename, text, "Mystery");
extractor.close();
xml.close();
}
}
// Checks where the string "testdoc" shows up in the extracted text:
// - 45541_Header.pptx: absent with the default settings, present once only
//   notes are extracted;
// - 45541_Footer.pptx: absent with the default settings, with notes only and
//   with master only.
// NOTE(review): rendered diff without +/- markers; the cast-based
// XSLFPowerPointExtractor calls and the setter-based SlideShowExtractor calls
// are two interleaved variants of the same test (hence two signatures).
@Test
public void test45541() throws Exception {
public void test45541() throws IOException, OpenXML4JException, XmlException {
// extract text from a powerpoint that has a header in the notes-element
POITextExtractor extr = ExtractorFactory.createExtractor(
slTests.getFile("45541_Header.pptx"));
String text = extr.getText();
assertNotNull(text);
assertFalse("Had: " + text, text.contains("testdoc"));
final File headerFile = slTests.getFile("45541_Header.pptx");
try (final SlideShowExtractor extr = ExtractorFactory.createExtractor(headerFile)) {
String text = extr.getText();
assertNotNull(text);
assertFalse("Had: " + text, text.contains("testdoc"));
text = ((XSLFPowerPointExtractor)extr).getText(false, true);
assertContains(text, "testdoc");
extr.close();
assertNotNull(text);
// notes only: the header text is expected to be present now
extr.setSlidesByDefault(false);
extr.setNotesByDefault(true);
text = extr.getText();
assertContains(text, "testdoc");
assertNotNull(text);
}
// extract text from a powerpoint that has a footer in the master-slide
extr = ExtractorFactory.createExtractor(
slTests.getFile("45541_Footer.pptx"));
text = extr.getText();
assertNotContained(text, "testdoc");
final File footerFile = slTests.getFile("45541_Footer.pptx");
try (SlideShowExtractor extr = ExtractorFactory.createExtractor(footerFile)) {
String text = extr.getText();
assertNotContained(text, "testdoc");
text = ((XSLFPowerPointExtractor)extr).getText(false, true);
assertNotContained(text, "testdoc");
// notes only
extr.setSlidesByDefault(false);
extr.setNotesByDefault(true);
text = extr.getText();
assertNotContained(text, "testdoc");
text = ((XSLFPowerPointExtractor)extr).getText(false, false, true);
assertNotContained(text, "testdoc");
extr.close();
// master only
extr.setSlidesByDefault(false);
extr.setNotesByDefault(false);
extr.setMasterByDefault(true);
text = extr.getText();
assertNotContained(text, "testdoc");
}
}
// Smoke test: extracting the text of bug54570.pptx must return a non-null
// string. Nothing about the content of the text is asserted.
// NOTE(review): rendered diff without +/- markers; the explicit-close variant
// and the try-with-resources variant are interleaved.
@Test
public void bug54570() throws IOException {
XMLSlideShow xml = openPPTX("bug54570.pptx");
XSLFPowerPointExtractor extractor = new XSLFPowerPointExtractor(xml);
String text = extractor.getText();
assertNotNull(text);
extractor.close();
xml.close();
try (XMLSlideShow xml = openPPTX("bug54570.pptx");
SlideShowExtractor extractor = new SlideShowExtractor(xml)) {
String text = extractor.getText();
assertNotNull(text);
}
}
// Opens the named resource through slTests and builds an XMLSlideShow from the
// stream. The stream is closed before the method returns, either by the
// finally block or by try-with-resources (both variants of this diff are shown
// interleaved below); the returned slideshow is left for the caller to close.
private XMLSlideShow openPPTX(String file) throws IOException {
InputStream is = slTests.openResourceAsStream(file);
try {
try (InputStream is = slTests.openResourceAsStream(file)) {
return new XMLSlideShow(is);
} finally {
is.close();
}
}
}

View File

@ -38,6 +38,8 @@ import org.apache.poi.hwpf.extractor.WordExtractor;
import org.apache.poi.poifs.filesystem.DirectoryEntry;
import org.apache.poi.poifs.filesystem.DirectoryNode;
import org.apache.poi.poifs.filesystem.Entry;
import org.apache.poi.sl.extractor.SlideShowExtractor;
import org.apache.poi.sl.usermodel.SlideShowFactory;
/**
* Scratchpad-specific logic for {@link OLE2ExtractorFactory} and
@ -65,7 +67,7 @@ public class OLE2ScratchpadExtractorFactory {
}
if (poifsDir.hasEntry(HSLFSlideShow.POWERPOINT_DOCUMENT)) {
return new PowerPointExtractor(poifsDir);
return new SlideShowExtractor(SlideShowFactory.create(poifsDir));
}
if (poifsDir.hasEntry("VisioDocument")) {

View File

@ -34,6 +34,7 @@ import org.apache.poi.poifs.filesystem.NPOIFSFileSystem;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
import org.apache.poi.sl.extractor.SlideShowExtractor;
import org.apache.poi.sl.usermodel.SlideShowFactory;
import org.apache.poi.util.Removal;
/**
* This class can be used to extract text from a PowerPoint file. Can optionally
@ -43,6 +44,7 @@ import org.apache.poi.sl.usermodel.SlideShowFactory;
*/
@SuppressWarnings("WeakerAccess")
@Deprecated
@Removal(version="5.0.0")
public final class PowerPointExtractor extends POIOLE2TextExtractor {
private final SlideShowExtractor<HSLFShape,HSLFTextParagraph> delegate;

View File

@ -1139,4 +1139,9 @@ public final class HSLFSlideShow implements SlideShow<HSLFShape,HSLFTextParagrap
// Closing the slideshow is delegated to the wrapped _hslfSlideShow.
public void close() throws IOException {
_hslfSlideShow.close();
}
// Returns whatever getSlideShowImpl() yields as the persist document.
// NOTE(review): the declaring interface of this override is not visible in
// this hunk - confirm the expected contract there.
@Override
public Object getPersistDocument() {
return getSlideShowImpl();
}
}

View File

@ -19,8 +19,8 @@ package org.apache.poi.hslf.usermodel;
import java.io.IOException;
import org.apache.poi.poifs.filesystem.DirectoryNode;
import org.apache.poi.poifs.filesystem.NPOIFSFileSystem;
import org.apache.poi.sl.usermodel.SlideShow;
import org.apache.poi.sl.usermodel.SlideShowFactory;
import org.apache.poi.util.Internal;
@ -31,12 +31,20 @@ import org.apache.poi.util.Internal;
// NOTE(review): rendered diff without +/- markers. The first javadoc body and
// the first createSlideShow signature below each appear twice (two interleaved
// variants); the variants of the signature differ in the declared return type
// (SlideShow<?,?> vs. HSLFSlideShow) and in the final modifier of the parameter.
@Internal
public class HSLFSlideShowFactory extends SlideShowFactory {
/**
* Creates a HSLFSlideShow from the given NPOIFSFileSystem
* <p>Note that in order to properly release resources the
* SlideShow should be closed after use.
* Creates a HSLFSlideShow from the given NPOIFSFileSystem<p>
* Note that in order to properly release resources the
* SlideShow should be closed after use.
*/
public static SlideShow<?,?> createSlideShow(NPOIFSFileSystem fs) throws IOException {
public static HSLFSlideShow createSlideShow(final NPOIFSFileSystem fs) throws IOException {
// simply wraps the filesystem in a new HSLFSlideShow
return new HSLFSlideShow(fs);
}
/**
* Creates a HSLFSlideShow from the given DirectoryNode<p>
* Note that in order to properly release resources the
* SlideShow should be closed after use.
*/
public static HSLFSlideShow createSlideShow(final DirectoryNode root) throws IOException {
// simply wraps the directory node in a new HSLFSlideShow
return new HSLFSlideShow(root);
}
}

View File

@ -846,9 +846,13 @@ public final class HSLFSlideShowImpl extends POIDocument implements Closeable {
@Override
public void close() throws IOException {
// Closes the NPOIFSFileSystem obtained from getDirectory().getFileSystem()
// when it is non-null.
// NOTE(review): rendered diff without +/- markers; the unconditional variant
// (first three lines) and the variant guarded by getParent() == null are
// interleaved below.
NPOIFSFileSystem fs = getDirectory().getFileSystem();
if (fs != null) {
fs.close();
// only close the filesystem, if we are based on the root node.
// embedded documents/slideshows shouldn't close the parent container
if (getDirectory().getParent() == null) {
NPOIFSFileSystem fs = getDirectory().getFileSystem();
if (fs != null) {
fs.close();
}
}
}

View File

@ -42,6 +42,10 @@ import org.apache.poi.poifs.filesystem.DirectoryNode;
import org.apache.poi.poifs.filesystem.NPOIFSFileSystem;
import org.apache.poi.poifs.filesystem.OPOIFSFileSystem;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
import org.apache.poi.sl.extractor.SlideShowExtractor;
import org.apache.poi.sl.usermodel.ObjectShape;
import org.apache.poi.sl.usermodel.SlideShow;
import org.apache.poi.sl.usermodel.SlideShowFactory;
import org.apache.poi.util.IOUtils;
import org.junit.Test;
@ -76,43 +80,46 @@ public final class TestExtractor {
// ppe.close();
// }
// Test helper: opens the named resource through slTests and returns a text
// extractor for it. The input stream is closed before returning (finally block
// or try-with-resources); the returned extractor is left for the caller to close.
// NOTE(review): rendered diff without +/- markers; the PowerPointExtractor
// variant and the SlideShowExtractor/SlideShowFactory variant are interleaved.
private PowerPointExtractor openExtractor(String fileName) throws IOException {
InputStream is = slTests.openResourceAsStream(fileName);
try {
return new PowerPointExtractor(is);
} finally {
is.close();
private SlideShowExtractor<?,?> openExtractor(String fileName) throws IOException {
try (InputStream is = slTests.openResourceAsStream(fileName)) {
return new SlideShowExtractor(SlideShowFactory.create(is));
}
}
// Compares the default getText() output of two sample files with the expected
// strings expectText and expectText2 (both defined outside this hunk).
// NOTE(review): rendered diff without +/- markers; explicit-close and
// try-with-resources variants are interleaved.
@Test
public void testReadSheetText() throws IOException {
// Basic 2 page example
PowerPointExtractor ppe = openExtractor("basic_test_ppt_file.ppt");
assertEquals(expectText, ppe.getText());
ppe.close();
try (SlideShowExtractor ppe = openExtractor("basic_test_ppt_file.ppt")) {
assertEquals(expectText, ppe.getText());
}
// 1 page example with text boxes
PowerPointExtractor ppe2 = openExtractor("with_textbox.ppt");
assertEquals(expectText2, ppe2.getText());
ppe2.close();
try (SlideShowExtractor ppe = openExtractor("with_textbox.ppt")) {
assertEquals(expectText2, ppe.getText());
}
}
// Checks the notes-only extraction: the two-page sample must yield exactly the
// two notes texts, the text-box sample must yield the empty string.
// NOTE(review): rendered diff without +/- markers; the getNotes() variant and
// the variant that enables notes and disables slides/master before calling
// getText() are interleaved.
@Test
public void testReadNoteText() throws IOException {
// Basic 2 page example
PowerPointExtractor ppe = openExtractor("basic_test_ppt_file.ppt");
String notesText = ppe.getNotes();
String expText = "\nThese are the notes for page 1\n\nThese are the notes on page two, again lacking formatting\n";
assertEquals(expText, notesText);
ppe.close();
try (SlideShowExtractor ppe = openExtractor("basic_test_ppt_file.ppt")) {
ppe.setNotesByDefault(true);
ppe.setSlidesByDefault(false);
ppe.setMasterByDefault(false);
String notesText = ppe.getText();
String expText = "\nThese are the notes for page 1\n\nThese are the notes on page two, again lacking formatting\n";
assertEquals(expText, notesText);
}
// Other one doesn't have notes
PowerPointExtractor ppe2 = openExtractor("with_textbox.ppt");
notesText = ppe2.getNotes();
expText = "";
assertEquals(expText, notesText);
ppe2.close();
try (SlideShowExtractor ppe = openExtractor("with_textbox.ppt")) {
ppe.setNotesByDefault(true);
ppe.setSlidesByDefault(false);
ppe.setMasterByDefault(false);
String notesText = ppe.getText();
String expText = "";
assertEquals(expText, notesText);
}
}
@Test
@ -126,19 +133,19 @@ public final class TestExtractor {
"\nThese are the notes on page two, again lacking formatting\n"
};
PowerPointExtractor ppe = openExtractor("basic_test_ppt_file.ppt");
ppe.setSlidesByDefault(true);
ppe.setNotesByDefault(false);
assertEquals(slText[0] + slText[1], ppe.getText());
try (SlideShowExtractor ppe = openExtractor("basic_test_ppt_file.ppt")) {
ppe.setSlidesByDefault(true);
ppe.setNotesByDefault(false);
assertEquals(slText[0] + slText[1], ppe.getText());
ppe.setSlidesByDefault(false);
ppe.setNotesByDefault(true);
assertEquals(ntText[0] + ntText[1], ppe.getText());
ppe.setSlidesByDefault(false);
ppe.setNotesByDefault(true);
assertEquals(ntText[0] + ntText[1], ppe.getText());
ppe.setSlidesByDefault(true);
ppe.setNotesByDefault(true);
assertEquals(slText[0] + ntText[0] + slText[1] + ntText[1], ppe.getText());
ppe.close();
ppe.setSlidesByDefault(true);
ppe.setNotesByDefault(true);
assertEquals(slText[0] + ntText[0] + slText[1] + ntText[1], ppe.getText());
}
}
/**
@ -149,45 +156,46 @@ public final class TestExtractor {
*/
@Test
public void testMissingCoreRecords() throws IOException {
// Extracts slide text and notes text separately from missing_core_records.ppt:
// both must be non-null, the notes text must be empty and the slide text must
// contain a known phrase.
// NOTE(review): rendered diff without +/- markers; the getText(true, false) /
// getNotes() variant and the setter-based variant are interleaved.
PowerPointExtractor ppe = openExtractor("missing_core_records.ppt");
try (SlideShowExtractor<?,?> ppe = openExtractor("missing_core_records.ppt")) {
// slides only
ppe.setSlidesByDefault(true);
ppe.setNotesByDefault(false);
String text = ppe.getText();
// notes only
ppe.setSlidesByDefault(false);
ppe.setNotesByDefault(true);
String nText = ppe.getText();
String text = ppe.getText(true, false);
String nText = ppe.getNotes();
assertNotNull(text);
assertNotNull(nText);
assertNotNull(text);
assertNotNull(nText);
// Notes records were corrupt, so don't expect any
// NOTE(review): arguments are in (actual, expected) order; JUnit's
// assertEquals expects (expected, actual) - only the failure message is affected.
assertEquals(nText.length(), 0);
// Notes records were corrupt, so don't expect any
// NOTE(review): arguments are in (actual, expected) order; JUnit's
// assertEquals expects (expected, actual) - only the failure message is affected.
assertEquals(nText.length(), 0);
// Slide records were fine
assertContains(text, "Using Disease Surveillance and Response");
ppe.close();
// Slide records were fine
assertContains(text, "Using Disease Surveillance and Response");
}
}
// Opens excel_with_embeded.xls as a POIFS filesystem and, for the two embedded
// entries MBD0000A3B6 and MBD0000A3B3, checks that the entry holds a
// PowerPoint document stream and that its extracted text equals the expected
// string.
// NOTE(review): rendered diff without +/- markers; one variant uses the private
// helper assertExtractFromEmbedded(...), the other inlines the check as a loop
// over TEST_SET (entry name at even index, expected text at the following odd
// index). Both are interleaved below.
@Test
public void testExtractFromEmbeded() throws IOException {
InputStream is = POIDataSamples.getSpreadSheetInstance().openResourceAsStream("excel_with_embeded.xls");
POIFSFileSystem fs = new POIFSFileSystem(is);
DirectoryNode root = fs.getRoot();
PowerPointExtractor ppe1 = assertExtractFromEmbedded(root, "MBD0000A3B6", "Sample PowerPoint file\nThis is the 1st file\nNot much too it\n");
PowerPointExtractor ppe2 = assertExtractFromEmbedded(root, "MBD0000A3B3", "Sample PowerPoint file\nThis is the 2nd file\nNot much too it either\n");
ppe2.close();
ppe1.close();
fs.close();
}
private PowerPointExtractor assertExtractFromEmbedded(DirectoryNode root, String entryName, String expected)
throws IOException {
DirectoryNode dir = (DirectoryNode)root.getEntry(entryName);
assertTrue(dir.hasEntry(HSLFSlideShow.POWERPOINT_DOCUMENT));
try (final InputStream is = POIDataSamples.getSpreadSheetInstance().openResourceAsStream("excel_with_embeded.xls");
final POIFSFileSystem fs = new POIFSFileSystem(is)) {
final DirectoryNode root = fs.getRoot();
// Check the first file
HSLFSlideShowImpl ppt = new HSLFSlideShowImpl(dir);
PowerPointExtractor ppe = new PowerPointExtractor(ppt);
assertEquals(expected, ppe.getText(true, false));
return ppe;
// flat list of pairs: entry name, expected text
final String[] TEST_SET = {
"MBD0000A3B6", "Sample PowerPoint file\nThis is the 1st file\nNot much too it\n",
"MBD0000A3B3", "Sample PowerPoint file\nThis is the 2nd file\nNot much too it either\n"
};
// step by two to walk the pairs
for (int i=0; i<TEST_SET.length; i+=2) {
DirectoryNode dir = (DirectoryNode)root.getEntry(TEST_SET[i]);
assertTrue(dir.hasEntry(HSLFSlideShow.POWERPOINT_DOCUMENT));
try (final SlideShow<?,?> ppt = SlideShowFactory.create(dir);
final SlideShowExtractor<?,?> ppe = new SlideShowExtractor(ppt)) {
assertEquals(TEST_SET[i+1], ppe.getText());
}
}
}
}
/**
@ -195,32 +203,32 @@ public final class TestExtractor {
*/
@Test
public void testExtractFromOwnEmbeded() throws IOException {
// Expects 6 OLE shapes in ppt_with_embeded.ppt. Each shape's object data is
// opened with the class matching its instance name ("Worksheet", "Document",
// "Presentation") and counted; 2 of each kind are expected.
// NOTE(review): rendered diff without +/- markers; the HSLFObjectShape-typed
// loop and the ObjectShape-typed loop (with a cast for getInstanceName()) are
// two interleaved variants of the same body.
PowerPointExtractor ppe = openExtractor("ppt_with_embeded.ppt");
List<HSLFObjectShape> shapes = ppe.getOLEShapes();
assertEquals("Expected 6 ole shapes", 6, shapes.size());
int num_ppt = 0, num_doc = 0, num_xls = 0;
for (HSLFObjectShape ole : shapes) {
String name = ole.getInstanceName();
InputStream data = ole.getObjectData().getInputStream();
if ("Worksheet".equals(name)) {
HSSFWorkbook wb = new HSSFWorkbook(data);
num_xls++;
wb.close();
} else if ("Document".equals(name)) {
HWPFDocument doc = new HWPFDocument(data);
num_doc++;
doc.close();
} else if ("Presentation".equals(name)) {
num_ppt++;
HSLFSlideShow ppt = new HSLFSlideShow(data);
ppt.close();
try (SlideShowExtractor<?,?> ppe = openExtractor("ppt_with_embeded.ppt")) {
List<? extends ObjectShape> shapes = ppe.getOLEShapes();
assertEquals("Expected 6 ole shapes", 6, shapes.size());
int num_ppt = 0, num_doc = 0, num_xls = 0;
for (ObjectShape ole : shapes) {
// getInstanceName() is called on HSLFObjectShape, hence the cast
String name = ((HSLFObjectShape)ole).getInstanceName();
InputStream data = ole.getObjectData().getInputStream();
if ("Worksheet".equals(name)) {
HSSFWorkbook wb = new HSSFWorkbook(data);
num_xls++;
wb.close();
} else if ("Document".equals(name)) {
HWPFDocument doc = new HWPFDocument(data);
num_doc++;
doc.close();
} else if ("Presentation".equals(name)) {
num_ppt++;
HSLFSlideShow ppt = new HSLFSlideShow(data);
ppt.close();
}
data.close();
}
data.close();
assertEquals("Expected 2 embedded Word Documents", 2, num_doc);
assertEquals("Expected 2 embedded Excel Spreadsheets", 2, num_xls);
assertEquals("Expected 2 embedded PowerPoint Presentations", 2, num_ppt);
}
assertEquals("Expected 2 embedded Word Documents", 2, num_doc);
assertEquals("Expected 2 embedded Excel Spreadsheets", 2, num_xls);
assertEquals("Expected 2 embedded PowerPoint Presentations", 2, num_ppt);
ppe.close();
}
/**
@ -228,11 +236,11 @@ public final class TestExtractor {
*/
@Test
public void test52991() throws IOException {
// Copies the object data of every OLE shape in badzip.ppt into a throw-away
// ByteArrayOutputStream. There are no assertions; the test only fails if an
// exception is thrown while reading.
// NOTE(review): rendered diff without +/- markers; explicit-close and
// try-with-resources variants are interleaved.
PowerPointExtractor ppe = openExtractor("badzip.ppt");
for (HSLFObjectShape shape : ppe.getOLEShapes()) {
IOUtils.copy(shape.getObjectData().getInputStream(), new ByteArrayOutputStream());
try (SlideShowExtractor<?,?> ppe = openExtractor("badzip.ppt")) {
for (ObjectShape shape : ppe.getOLEShapes()) {
IOUtils.copy(shape.getObjectData().getInputStream(), new ByteArrayOutputStream());
}
}
ppe.close();
}
/**
@ -240,27 +248,27 @@ public final class TestExtractor {
*/
@Test
public void testWithComments() throws IOException {
// For WithComments.ppt and 45543.ppt: the comment text must be absent from
// getText() by default and present after setCommentsByDefault(true).
// NOTE(review): rendered diff without +/- markers; the ppe1/ppe2 explicit-close
// variant and the try-with-resources variant are interleaved.
PowerPointExtractor ppe1 = openExtractor("WithComments.ppt");
String text = ppe1.getText();
assertFalse("Comments not in by default", text.contains("This is a test comment"));
try (final SlideShowExtractor ppe = openExtractor("WithComments.ppt")) {
String text = ppe.getText();
assertFalse("Comments not in by default", text.contains("This is a test comment"));
ppe1.setCommentsByDefault(true);
ppe.setCommentsByDefault(true);
text = ppe1.getText();
assertContains(text, "This is a test comment");
ppe1.close();
text = ppe.getText();
assertContains(text, "This is a test comment");
}
// And another file
PowerPointExtractor ppe2 = openExtractor("45543.ppt");
text = ppe2.getText();
assertFalse("Comments not in by default", text.contains("testdoc"));
try (SlideShowExtractor ppe = openExtractor("45543.ppt")) {
String text = ppe.getText();
assertFalse("Comments not in by default", text.contains("testdoc"));
ppe2.setCommentsByDefault(true);
ppe.setCommentsByDefault(true);
text = ppe2.getText();
assertContains(text, "testdoc");
ppe2.close();
text = ppe.getText();
assertContains(text, "testdoc");
}
}
/**
@ -268,48 +276,37 @@ public final class TestExtractor {
*/
@Test
public void testHeaderFooter() throws IOException {
// For 45537_Header.ppt and 45537_Footer.ppt: the notes header / footer text
// must be "testdoc test phrase", must be absent from getText() by default and
// present after setNotesByDefault(true).
// NOTE(review): rendered diff without +/- markers; the inline ppe1/ppe2 variant
// and the variant that moves the shared extractor checks into
// testHeaderFooterInner(...) are interleaved, so the helper's signature appears
// in the middle of this span.
String text;
// With a header on the notes
InputStream is1 = slTests.openResourceAsStream("45537_Header.ppt");
HSLFSlideShow ppt1 = new HSLFSlideShow(is1);
is1.close();
assertNotNull(ppt1.getNotesHeadersFooters());
assertEquals("testdoc test phrase", ppt1.getNotesHeadersFooters().getHeaderText());
try (InputStream is = slTests.openResourceAsStream("45537_Header.ppt");
HSLFSlideShow ppt = new HSLFSlideShow(is)) {
PowerPointExtractor ppe1 = new PowerPointExtractor(ppt1.getSlideShowImpl());
assertNotNull(ppt.getNotesHeadersFooters());
assertEquals("testdoc test phrase", ppt.getNotesHeadersFooters().getHeaderText());
text = ppe1.getText();
assertFalse("Header shouldn't be there by default\n" + text, text.contains("testdoc"));
assertFalse("Header shouldn't be there by default\n" + text, text.contains("test phrase"));
ppe1.setNotesByDefault(true);
text = ppe1.getText();
assertContains(text, "testdoc");
assertContains(text, "test phrase");
ppe1.close();
ppt1.close();
testHeaderFooterInner(ppt);
}
// And with a footer, also on notes
InputStream is2 = slTests.openResourceAsStream("45537_Footer.ppt");
HSLFSlideShow ppt2 = new HSLFSlideShow(is2);
is2.close();
assertNotNull(ppt2.getNotesHeadersFooters());
assertEquals("testdoc test phrase", ppt2.getNotesHeadersFooters().getFooterText());
ppt2.close();
try (final InputStream is = slTests.openResourceAsStream("45537_Footer.ppt");
final HSLFSlideShow ppt = new HSLFSlideShow(is)) {
assertNotNull(ppt.getNotesHeadersFooters());
assertEquals("testdoc test phrase", ppt.getNotesHeadersFooters().getFooterText());
PowerPointExtractor ppe2 = openExtractor("45537_Footer.ppt");
testHeaderFooterInner(ppt);
}
}
text = ppe2.getText();
assertFalse("Header shouldn't be there by default\n" + text, text.contains("testdoc"));
assertFalse("Header shouldn't be there by default\n" + text, text.contains("test phrase"));
// Shared check: text is absent by default, present once notes are enabled.
private void testHeaderFooterInner(final HSLFSlideShow ppt) throws IOException {
try (final SlideShowExtractor<?,?> ppe = new SlideShowExtractor(ppt)) {
String text = ppe.getText();
assertFalse("Header shouldn't be there by default\n" + text, text.contains("testdoc"));
assertFalse("Header shouldn't be there by default\n" + text, text.contains("test phrase"));
ppe2.setNotesByDefault(true);
text = ppe2.getText();
assertContains(text, "testdoc");
assertContains(text, "test phrase");
ppe2.close();
ppe.setNotesByDefault(true);
text = ppe.getText();
assertContains(text, "testdoc");
assertContains(text, "test phrase");
}
}
@SuppressWarnings("unused")
@ -318,41 +315,40 @@ public final class TestExtractor {
String masterTitleText = "This is the Master Title";
String masterRandomText = "This text comes from the Master Slide";
String masterFooterText = "Footer from the master slide";
PowerPointExtractor ppe = openExtractor("WithMaster.ppt");
ppe.setMasterByDefault(true);
try (final SlideShowExtractor ppe = openExtractor("WithMaster.ppt")) {
ppe.setMasterByDefault(true);
String text = ppe.getText();
assertContains(text, masterRandomText);
assertContains(text, masterFooterText);
ppe.close();
String text = ppe.getText();
assertContains(text, masterRandomText);
assertContains(text, masterFooterText);
}
}
// Checks master-slide text handling on two samples: for master_text.ppt the
// added master text only appears after setMasterByDefault(true) and the
// "Click to edit Master" placeholder text never appears; for WithMaster.ppt the
// master footer text is asserted with the default settings.
// NOTE(review): rendered diff without +/- markers; the ppe1/ppe2 explicit-close
// variant and the try-with-resources variant are interleaved.
@Test
public void testMasterText() throws IOException {
PowerPointExtractor ppe1 = openExtractor("master_text.ppt");
try (final SlideShowExtractor ppe = openExtractor("master_text.ppt")) {
// Initially not there
String text = ppe.getText();
assertFalse(text.contains("Text that I added to the master slide"));
// Initially not there
String text = ppe1.getText();
assertFalse(text.contains("Text that I added to the master slide"));
// Enable, shows up
ppe.setMasterByDefault(true);
text = ppe.getText();
assertContains(text, "Text that I added to the master slide");
// Enable, shows up
ppe1.setMasterByDefault(true);
text = ppe1.getText();
assertContains(text, "Text that I added to the master slide");
// Make sure placeholder text does not come out
assertNotContained(text, "Click to edit Master");
ppe1.close();
// Make sure placeholder text does not come out
assertNotContained(text, "Click to edit Master");
}
// Now with another file only containing master text
// Will always show up
PowerPointExtractor ppe2 = openExtractor("WithMaster.ppt");
String masterText = "Footer from the master slide";
try (final SlideShowExtractor ppe = openExtractor("WithMaster.ppt")) {
String masterText = "Footer from the master slide";
text = ppe2.getText();
assertContainsIgnoreCase(text, "master");
assertContains(text, masterText);
ppe2.close();
String text = ppe.getText();
assertContainsIgnoreCase(text, "master");
assertContains(text, masterText);
}
}
/**
@ -360,22 +356,21 @@ public final class TestExtractor {
*/
@Test
public void testChineseText() throws IOException {
// Extracts 54880_chinese.ppt with the default settings and checks that both
// the ASCII and the non-ASCII parts of the text survive extraction.
// NOTE(review): rendered diff without +/- markers; explicit-close and
// try-with-resources variants are interleaved.
PowerPointExtractor ppe = openExtractor("54880_chinese.ppt");
try (final SlideShowExtractor ppe = openExtractor("54880_chinese.ppt")) {
String text = ppe.getText();
String text = ppe.getText();
// Check for the english text line
assertContains(text, "Single byte");
// Check for the english text line
assertContains(text, "Single byte");
// Check for the english text in the mixed line
assertContains(text, "Mix");
// Check for the english text in the mixed line
assertContains(text, "Mix");
// Check for the chinese text in the mixed line
assertContains(text, "\u8868");
// Check for the chinese text in the mixed line
assertContains(text, "\u8868");
// Check for the chinese only text line
// NOTE(review): U+FF8A U+FF9D U+FF76 U+FF78 are halfwidth Katakana code
// points, so "chinese only" may be a misnomer - confirm against the sample.
assertContains(text, "\uff8a\uff9d\uff76\uff78");
ppe.close();
// Check for the chinese only text line
// NOTE(review): U+FF8A U+FF9D U+FF76 U+FF78 are halfwidth Katakana code
// points, so "chinese only" may be a misnomer - confirm against the sample.
assertContains(text, "\uff8a\uff9d\uff76\uff78");
}
}
/**
@ -387,67 +382,59 @@ public final class TestExtractor {
public void testDifferentPOIFS() throws IOException {
// Opens basic_test_ppt_file.ppt once through OPOIFSFileSystem (stream based)
// and once through NPOIFSFileSystem (file based) and checks that extraction
// from either root directory yields expectText.
// NOTE(review): rendered diff without +/- markers; the PowerPointExtractor
// variant and the SlideShowFactory/SlideShowExtractor variant are interleaved.
// The "Open via a HSLFSlideShow" loop has no counterpart in the
// try-with-resources variant and appears to belong only to the older variant.
// Open the two filesystems
File pptFile = slTests.getFile("basic_test_ppt_file.ppt");
InputStream is1 = new FileInputStream(pptFile);
OPOIFSFileSystem opoifs = new OPOIFSFileSystem(is1);
is1.close();
NPOIFSFileSystem npoifs = new NPOIFSFileSystem(pptFile);
DirectoryNode[] files = { opoifs.getRoot(), npoifs.getRoot() };
try (final InputStream is1 = new FileInputStream(pptFile);
final NPOIFSFileSystem npoifs = new NPOIFSFileSystem(pptFile)) {
// Open directly
for (DirectoryNode dir : files) {
PowerPointExtractor extractor = new PowerPointExtractor(dir);
assertEquals(expectText, extractor.getText());
final OPOIFSFileSystem opoifs = new OPOIFSFileSystem(is1);
DirectoryNode[] files = {opoifs.getRoot(), npoifs.getRoot()};
// Open directly
for (DirectoryNode dir : files) {
try (SlideShow<?,?> ppt = SlideShowFactory.create(dir);
SlideShowExtractor<?,?> extractor = new SlideShowExtractor(ppt)) {
assertEquals(expectText, extractor.getText());
}
}
}
// Open via a HSLFSlideShow
for (DirectoryNode dir : files) {
HSLFSlideShowImpl slideshow = new HSLFSlideShowImpl(dir);
PowerPointExtractor extractor = new PowerPointExtractor(slideshow);
assertEquals(expectText, extractor.getText());
extractor.close();
slideshow.close();
}
npoifs.close();
}
// Checks table extraction: cells of a row are expected to be separated by
// tabs and rows by newlines, for the samples 54111.ppt and 54722.ppt.
// NOTE(review): rendered diff without +/- markers; the ppe1/ppe2 explicit-close
// variant and the try-with-resources variant are interleaved.
@Test
public void testTable() throws Exception {
PowerPointExtractor ppe1 = openExtractor("54111.ppt");
String text1 = ppe1.getText();
String target1 = "TH Cell 1\tTH Cell 2\tTH Cell 3\tTH Cell 4\n"+
"Row 1, Cell 1\tRow 1, Cell 2\tRow 1, Cell 3\tRow 1, Cell 4\n"+
"Row 2, Cell 1\tRow 2, Cell 2\tRow 2, Cell 3\tRow 2, Cell 4\n"+
"Row 3, Cell 1\tRow 3, Cell 2\tRow 3, Cell 3\tRow 3, Cell 4\n"+
"Row 4, Cell 1\tRow 4, Cell 2\tRow 4, Cell 3\tRow 4, Cell 4\n"+
"Row 5, Cell 1\tRow 5, Cell 2\tRow 5, Cell 3\tRow 5, Cell 4\n";
assertContains(text1, target1);
ppe1.close();
try (SlideShowExtractor ppe = openExtractor("54111.ppt")) {
String text = ppe.getText();
String target = "TH Cell 1\tTH Cell 2\tTH Cell 3\tTH Cell 4\n" +
"Row 1, Cell 1\tRow 1, Cell 2\tRow 1, Cell 3\tRow 1, Cell 4\n" +
"Row 2, Cell 1\tRow 2, Cell 2\tRow 2, Cell 3\tRow 2, Cell 4\n" +
"Row 3, Cell 1\tRow 3, Cell 2\tRow 3, Cell 3\tRow 3, Cell 4\n" +
"Row 4, Cell 1\tRow 4, Cell 2\tRow 4, Cell 3\tRow 4, Cell 4\n" +
"Row 5, Cell 1\tRow 5, Cell 2\tRow 5, Cell 3\tRow 5, Cell 4\n";
assertContains(text, target);
}
PowerPointExtractor ppe2 = openExtractor("54722.ppt");
String text2 = ppe2.getText();
try (SlideShowExtractor ppe = openExtractor("54722.ppt")) {
String text = ppe.getText();
String target2 = "this\tText\tis\twithin\ta\n" +
"table\t1\t2\t3\t4";
assertContains(text2, target2);
ppe2.close();
String target = "this\tText\tis\twithin\ta\n" +
"table\t1\t2\t3\t4";
assertContains(text, target);
}
}
// bug 60003
// With master extraction enabled, the text of 60003.ppt must contain "Prague".
// NOTE(review): rendered diff without +/- markers; explicit-close and
// try-with-resources variants are interleaved.
@Test
public void testExtractMasterSlideFooterText() throws Exception {
PowerPointExtractor ppe = openExtractor("60003.ppt");
ppe.setMasterByDefault(true);
try (SlideShowExtractor ppe = openExtractor("60003.ppt")) {
ppe.setMasterByDefault(true);
String text = ppe.getText();
assertContains(text, "Prague");
ppe.close();
String text = ppe.getText();
assertContains(text, "Prague");
}
}
@Test
public void testExtractGroupedShapeText() throws Exception {
try (final PowerPointExtractor ppe = openExtractor("bug62092.ppt")) {
try (final SlideShowExtractor ppe = openExtractor("bug62092.ppt")) {
final String text = ppe.getText();
//this tests that we're ignoring text shapes at depth=0

View File

@ -73,6 +73,7 @@ import org.apache.poi.poifs.macros.VBAMacroReader;
import org.apache.poi.sl.draw.DrawFactory;
import org.apache.poi.sl.draw.DrawPaint;
import org.apache.poi.sl.draw.DrawTextParagraph;
import org.apache.poi.sl.extractor.SlideShowExtractor;
import org.apache.poi.sl.usermodel.ColorStyle;
import org.apache.poi.sl.usermodel.PaintStyle;
import org.apache.poi.sl.usermodel.PaintStyle.SolidPaint;
@ -800,18 +801,18 @@ public final class TestBugs {
String files[] = { "bug58718_008524.ppt","bug58718_008558.ppt","bug58718_349008.ppt","bug58718_008495.ppt", };
for (String f : files) {
File sample = HSLFTestDataSamples.getSampleFile(f);
PowerPointExtractor ex = new PowerPointExtractor(sample.getAbsolutePath());
assertNotNull(ex.getText());
ex.close();
try (SlideShowExtractor ex = new SlideShowExtractor(SlideShowFactory.create(sample))) {
assertNotNull(ex.getText());
}
}
}
// Smoke test: extracting the text of bug58733_671884.ppt must return a
// non-null string.
// NOTE(review): rendered diff without +/- markers; the path-based
// PowerPointExtractor variant and the SlideShowFactory.create(File) variant
// wrapped in try-with-resources are interleaved.
@Test
public void bug58733() throws IOException {
File sample = HSLFTestDataSamples.getSampleFile("bug58733_671884.ppt");
PowerPointExtractor ex = new PowerPointExtractor(sample.getAbsolutePath());
assertNotNull(ex.getText());
ex.close();
try (SlideShowExtractor ex = new SlideShowExtractor(SlideShowFactory.create(sample))) {
assertNotNull(ex.getText());
}
}
@Test

Binary file not shown.