Bug 60003 - Regression: HSLF Powerpoint text extractor from footer of master slide

git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@1763927 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Andreas Beeker 2016-10-08 18:08:25 +00:00
parent cb03495d36
commit 1d38f5c329
3 changed files with 74 additions and 25 deletions

View File

@ -17,21 +17,43 @@
package org.apache.poi.hslf.extractor;
import java.io.*;
import java.util.*;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import org.apache.poi.POIOLE2TextExtractor;
import org.apache.poi.hslf.model.*;
import org.apache.poi.hslf.usermodel.*;
import org.apache.poi.poifs.filesystem.*;
import org.apache.poi.hslf.model.Comment;
import org.apache.poi.hslf.model.HSLFMetroShape;
import org.apache.poi.hslf.model.HeadersFooters;
import org.apache.poi.hslf.model.OLEShape;
import org.apache.poi.hslf.usermodel.HSLFMasterSheet;
import org.apache.poi.hslf.usermodel.HSLFNotes;
import org.apache.poi.hslf.usermodel.HSLFShape;
import org.apache.poi.hslf.usermodel.HSLFSlide;
import org.apache.poi.hslf.usermodel.HSLFSlideMaster;
import org.apache.poi.hslf.usermodel.HSLFSlideShow;
import org.apache.poi.hslf.usermodel.HSLFSlideShowImpl;
import org.apache.poi.hslf.usermodel.HSLFTable;
import org.apache.poi.hslf.usermodel.HSLFTableCell;
import org.apache.poi.hslf.usermodel.HSLFTextParagraph;
import org.apache.poi.hslf.usermodel.HSLFTextShape;
import org.apache.poi.poifs.filesystem.DirectoryNode;
import org.apache.poi.poifs.filesystem.NPOIFSFileSystem;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
import org.apache.poi.util.POILogFactory;
import org.apache.poi.util.POILogger;
/**
* This class can be used to extract text from a PowerPoint file. Can optionally
* also get the notes from one.
*
* @author Nick Burch
*/
public final class PowerPointExtractor extends POIOLE2TextExtractor {
private static final POILogger LOG = POILogFactory.getLogger(PowerPointExtractor.class);
private final HSLFSlideShowImpl _hslfshow;
private final HSLFSlideShow _show;
private final List<HSLFSlide> _slides;
@ -207,20 +229,27 @@ public final class PowerPointExtractor extends POIOLE2TextExtractor {
for (HSLFSlideMaster master : _show.getSlideMasters()) {
for(HSLFShape sh : master.getShapes()){
if(sh instanceof HSLFTextShape){
if(HSLFMasterSheet.isPlaceholder(sh)) {
// don't bother about boiler
// plate text on master
// sheets
HSLFTextShape hsh = (HSLFTextShape)sh;
final String text = hsh.getText();
if (text == null || "".equals(text) || "*".equals(text)) {
continue;
}
HSLFTextShape tsh = (HSLFTextShape)sh;
String text = tsh.getText();
if (text != null){
ret.append(text);
if (!text.endsWith("\n")) {
ret.append("\n");
if (HSLFMasterSheet.isPlaceholder(sh)) {
// check for metro shape of complex placeholder
boolean isMetro = new HSLFMetroShape<HSLFShape>(sh).hasMetroBlob();
if (!isMetro) {
// don't bother about boiler plate text on master sheets
LOG.log(POILogger.INFO, "Ignoring boiler plate (placeholder) text on slide master:", text);
continue;
}
}
ret.append(text);
if (!text.endsWith("\n")) {
ret.append("\n");
}
}
}
}

View File

@ -47,14 +47,20 @@ public class HSLFMetroShape<T extends Shape<?,?>> {
* @return the bytes of the metro blob, which are bytes of an OPCPackage, i.e. a zip stream
*/
public byte[] getMetroBytes() {
    final EscherComplexProperty metroProp = getMetroProp();
    // without a metro property there are no bytes to hand out
    if (metroProp == null) {
        return null;
    }
    return metroProp.getComplexData();
}
/**
 * Checks for the presence of a metro blob without extracting its bytes.
 *
 * @return {@code true} if a metro blob property was found, {@code false} otherwise
 */
public boolean hasMetroBlob() {
    final EscherComplexProperty metroProp = getMetroProp();
    return metroProp != null;
}
/**
 * Looks up the metro blob property of the wrapped shape.
 *
 * @return the complex property stored under {@link EscherProperties#GROUPSHAPE__METROBLOB}
 *  in the tertiary options record of the shape, or {@code null} if the shape has no such
 *  record or the record doesn't contain the property
 */
private EscherComplexProperty getMetroProp() {
    AbstractEscherOptRecord opt = shape.getEscherChild(EscherTertiaryOptRecord.RECORD_ID);
    // a shape without tertiary options can't carry a metro blob
    return (opt == null) ? null : (EscherComplexProperty)opt.lookup(EscherProperties.GROUPSHAPE__METROBLOB);
}
/**

View File

@ -431,5 +431,19 @@ public final class TestExtractor {
String target = "this\tText\tis\twithin\ta\n"+
"table\t1\t2\t3\t4";
assertTrue(text.contains(target));
}
}
// bug 60003 - footer text of the master slide has to be part of the extracted text
@Test
public void testExtractMasterSlideFooterText() throws Exception {
    HSLFSlideShowImpl hslf = new HSLFSlideShowImpl(slTests.openResourceAsStream("60003.ppt"));
    try {
        // close the previously assigned extractor before replacing it
        ppe.close();
        ppe = new PowerPointExtractor(hslf);
        ppe.setMasterByDefault(true);

        String text = ppe.getText();
        assertContains(text, "Prague");
    } finally {
        // close the slideshow even if the extraction or the assertion fails
        hslf.close();
    }
}
}