From 1d38f5c329a735338bb9a7c0a1a21ff908a9ddfa Mon Sep 17 00:00:00 2001
From: Andreas Beeker
Date: Sat, 8 Oct 2016 18:08:25 +0000
Subject: [PATCH] Bug 60003 - Regression: HSLF Powerpoint text extractor from footer of master slide

git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@1763927 13f79535-47bb-0310-9956-ffa450edef68
---
NOTE(review): the line structure of this patch was reconstructed from a
whitespace-collapsed copy and cross-checked against every hunk header and the
diffstat below (74 insertions, 25 deletions). Leading indentation (tabs vs.
spaces, nesting depth) could not be recovered and is a best guess, so apply
with `git apply --ignore-whitespace` / `git am --ignore-whitespace`.
The collapsed copy also appears to have lost text in angle brackets: the
author e-mail, the type parameters in the `HSLFMetroShape>` hunk header, and
possibly generic arguments on `List _slides` and `new HSLFMetroShape(sh)`.
These are left exactly as found - confirm against upstream r1763927.

 .../hslf/extractor/PowerPointExtractor.java   | 63 ++++++++++++++-----
 .../apache/poi/hslf/model/HSLFMetroShape.java | 20 +++---
 .../poi/hslf/extractor/TestExtractor.java     | 16 ++++-
 3 files changed, 74 insertions(+), 25 deletions(-)

diff --git a/src/scratchpad/src/org/apache/poi/hslf/extractor/PowerPointExtractor.java b/src/scratchpad/src/org/apache/poi/hslf/extractor/PowerPointExtractor.java
index 44aa8a2d3..8085482ec 100644
--- a/src/scratchpad/src/org/apache/poi/hslf/extractor/PowerPointExtractor.java
+++ b/src/scratchpad/src/org/apache/poi/hslf/extractor/PowerPointExtractor.java
@@ -17,21 +17,43 @@
 
 package org.apache.poi.hslf.extractor;
 
-import java.io.*;
-import java.util.*;
+import java.io.File;
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.ArrayList;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Set;
 
 import org.apache.poi.POIOLE2TextExtractor;
-import org.apache.poi.hslf.model.*;
-import org.apache.poi.hslf.usermodel.*;
-import org.apache.poi.poifs.filesystem.*;
+import org.apache.poi.hslf.model.Comment;
+import org.apache.poi.hslf.model.HSLFMetroShape;
+import org.apache.poi.hslf.model.HeadersFooters;
+import org.apache.poi.hslf.model.OLEShape;
+import org.apache.poi.hslf.usermodel.HSLFMasterSheet;
+import org.apache.poi.hslf.usermodel.HSLFNotes;
+import org.apache.poi.hslf.usermodel.HSLFShape;
+import org.apache.poi.hslf.usermodel.HSLFSlide;
+import org.apache.poi.hslf.usermodel.HSLFSlideMaster;
+import org.apache.poi.hslf.usermodel.HSLFSlideShow;
+import org.apache.poi.hslf.usermodel.HSLFSlideShowImpl;
+import org.apache.poi.hslf.usermodel.HSLFTable;
+import org.apache.poi.hslf.usermodel.HSLFTableCell;
+import org.apache.poi.hslf.usermodel.HSLFTextParagraph;
+import org.apache.poi.hslf.usermodel.HSLFTextShape;
+import org.apache.poi.poifs.filesystem.DirectoryNode;
+import org.apache.poi.poifs.filesystem.NPOIFSFileSystem;
+import org.apache.poi.poifs.filesystem.POIFSFileSystem;
+import org.apache.poi.util.POILogFactory;
+import org.apache.poi.util.POILogger;
 
 /**
  * This class can be used to extract text from a PowerPoint file. Can optionally
  * also get the notes from one.
- *
- * @author Nick Burch
  */
 public final class PowerPointExtractor extends POIOLE2TextExtractor {
+    private static final POILogger LOG = POILogFactory.getLogger(PowerPointExtractor.class);
+
     private final HSLFSlideShowImpl _hslfshow;
     private final HSLFSlideShow _show;
     private final List _slides;
@@ -207,20 +229,27 @@ public final class PowerPointExtractor extends POIOLE2TextExtractor {
             for (HSLFSlideMaster master : _show.getSlideMasters()) {
                 for(HSLFShape sh : master.getShapes()){
                     if(sh instanceof HSLFTextShape){
-                        if(HSLFMasterSheet.isPlaceholder(sh)) {
-                            // don't bother about boiler
-                            // plate text on master
-                            // sheets
+                        HSLFTextShape hsh = (HSLFTextShape)sh;
+                        final String text = hsh.getText();
+                        if (text == null || "".equals(text) || "*".equals(text)) {
                             continue;
                         }
-                        HSLFTextShape tsh = (HSLFTextShape)sh;
-                        String text = tsh.getText();
-                        if (text != null){
-                            ret.append(text);
-                            if (!text.endsWith("\n")) {
-                                ret.append("\n");
+
+                        if (HSLFMasterSheet.isPlaceholder(sh)) {
+                            // check for metro shape of complex placeholder
+                            boolean isMetro = new HSLFMetroShape(sh).hasMetroBlob();
+
+                            if (!isMetro) {
+                                // don't bother about boiler plate text on master sheets
+                                LOG.log(POILogger.INFO, "Ignoring boiler plate (placeholder) text on slide master:", text);
+                                continue;
                             }
                         }
+
+                        ret.append(text);
+                        if (!text.endsWith("\n")) {
+                            ret.append("\n");
+                        }
                     }
                 }
             }
diff --git a/src/scratchpad/src/org/apache/poi/hslf/model/HSLFMetroShape.java b/src/scratchpad/src/org/apache/poi/hslf/model/HSLFMetroShape.java
index e5d9f93a9..894818eb4 100644
--- a/src/scratchpad/src/org/apache/poi/hslf/model/HSLFMetroShape.java
+++ b/src/scratchpad/src/org/apache/poi/hslf/model/HSLFMetroShape.java
@@ -47,14 +47,20 @@ public class HSLFMetroShape> {
      * @return the bytes of the metro blob, which are bytes of an OPCPackage, i.e. a zip stream
      */
     public byte[] getMetroBytes() {
+        EscherComplexProperty ep = getMetroProp();
+        return (ep == null) ? null : ep.getComplexData();
+    }
+
+    /**
+     * @return if there's a metro blob to extract
+     */
+    public boolean hasMetroBlob() {
+        return getMetroProp() != null;
+    }
+
+    private EscherComplexProperty getMetroProp() {
         AbstractEscherOptRecord opt = shape.getEscherChild(EscherTertiaryOptRecord.RECORD_ID);
-        if (opt != null) {
-            EscherComplexProperty ep = (EscherComplexProperty)opt.lookup(EscherProperties.GROUPSHAPE__METROBLOB);
-            if (ep != null) {
-                return ep.getComplexData();
-            }
-        }
-        return null;
+        return (opt == null) ? null : (EscherComplexProperty)opt.lookup(EscherProperties.GROUPSHAPE__METROBLOB);
     }
 
     /**
diff --git a/src/scratchpad/testcases/org/apache/poi/hslf/extractor/TestExtractor.java b/src/scratchpad/testcases/org/apache/poi/hslf/extractor/TestExtractor.java
index 5e6b71928..0cf408a8f 100644
--- a/src/scratchpad/testcases/org/apache/poi/hslf/extractor/TestExtractor.java
+++ b/src/scratchpad/testcases/org/apache/poi/hslf/extractor/TestExtractor.java
@@ -431,5 +431,19 @@ public final class TestExtractor {
         String target = "this\tText\tis\twithin\ta\n"+
                 "table\t1\t2\t3\t4";
         assertTrue(text.contains(target));
-	}
+    }
+
+    // bug 60003
+    @Test
+    public void testExtractMasterSlideFooterText() throws Exception {
+        HSLFSlideShowImpl hslf = new HSLFSlideShowImpl(slTests.openResourceAsStream("60003.ppt"));
+        ppe.close();
+
+        ppe = new PowerPointExtractor(hslf);
+        ppe.setMasterByDefault(true);
+
+        String text = ppe.getText();
+        assertContains(text, "Prague");
+        hslf.close();
+    }
 }