XSLF text extraction improvements relating to TIKA-712

git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@1175887 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Nick Burch 2011-09-26 14:37:50 +00:00
parent 8de1ce7c6a
commit 9e1ca2caea
5 changed files with 139 additions and 40 deletions

View File

@ -23,6 +23,8 @@ import org.apache.poi.openxml4j.exceptions.OpenXML4JException;
import org.apache.poi.openxml4j.opc.OPCPackage; import org.apache.poi.openxml4j.opc.OPCPackage;
import org.apache.poi.xslf.XSLFSlideShow; import org.apache.poi.xslf.XSLFSlideShow;
import org.apache.poi.xslf.usermodel.DrawingParagraph; import org.apache.poi.xslf.usermodel.DrawingParagraph;
import org.apache.poi.xslf.usermodel.DrawingTextBody;
import org.apache.poi.xslf.usermodel.DrawingTextPlaceholder;
import org.apache.poi.xslf.usermodel.XMLSlideShow; import org.apache.poi.xslf.usermodel.XMLSlideShow;
import org.apache.poi.xslf.usermodel.XSLFCommentAuthors; import org.apache.poi.xslf.usermodel.XSLFCommentAuthors;
import org.apache.poi.xslf.usermodel.XSLFComments; import org.apache.poi.xslf.usermodel.XSLFComments;
@ -30,6 +32,7 @@ import org.apache.poi.xslf.usermodel.XSLFCommonSlideData;
import org.apache.poi.xslf.usermodel.XSLFNotes; import org.apache.poi.xslf.usermodel.XSLFNotes;
import org.apache.poi.xslf.usermodel.XSLFRelation; import org.apache.poi.xslf.usermodel.XSLFRelation;
import org.apache.poi.xslf.usermodel.XSLFSlide; import org.apache.poi.xslf.usermodel.XSLFSlide;
import org.apache.poi.xslf.usermodel.XSLFSlideLayout;
import org.apache.poi.xslf.usermodel.XSLFSlideMaster; import org.apache.poi.xslf.usermodel.XSLFSlideMaster;
import org.apache.xmlbeans.XmlException; import org.apache.xmlbeans.XmlException;
import org.openxmlformats.schemas.presentationml.x2006.main.CTComment; import org.openxmlformats.schemas.presentationml.x2006.main.CTComment;
@ -124,6 +127,7 @@ public class XSLFPowerPointExtractor extends POIXMLTextExtractor {
try { try {
XSLFNotes notes = slide.getNotes(); XSLFNotes notes = slide.getNotes();
XSLFComments comments = slide.getComments(); XSLFComments comments = slide.getComments();
XSLFSlideLayout layout = slide.getSlideLayout();
XSLFSlideMaster master = slide.getMasterSheet(); XSLFSlideMaster master = slide.getMasterSheet();
// TODO Do the slide's name // TODO Do the slide's name
@ -131,11 +135,16 @@ public class XSLFPowerPointExtractor extends POIXMLTextExtractor {
// Do the slide's text if requested // Do the slide's text if requested
if (slideText) { if (slideText) {
extractText(slide.getCommonSlideData(), text); extractText(slide.getCommonSlideData(), false, text);
// If there's a master sheet and it's requested, grab text from there // If requested, get text from the master and its layout
if(masterText && master != null) { if(masterText) {
extractText(master.getCommonSlideData(), text); if(layout != null) {
extractText(layout.getCommonSlideData(), true, text);
}
if(master != null) {
extractText(master.getCommonSlideData(), true, text);
}
} }
// If the slide has comments, do those too // If the slide has comments, do those too
@ -158,7 +167,7 @@ public class XSLFPowerPointExtractor extends POIXMLTextExtractor {
// Do the notes if requested // Do the notes if requested
if (notesText && notes != null) { if (notesText && notes != null) {
extractText(notes.getCommonSlideData(), text); extractText(notes.getCommonSlideData(), false, text);
} }
} catch (Exception e) { } catch (Exception e) {
throw new RuntimeException(e); throw new RuntimeException(e);
@ -168,10 +177,20 @@ public class XSLFPowerPointExtractor extends POIXMLTextExtractor {
return text.toString(); return text.toString();
} }
private void extractText(XSLFCommonSlideData data, StringBuffer text) { private void extractText(XSLFCommonSlideData data, boolean skipPlaceholders, StringBuffer text) {
for (DrawingParagraph p : data.getText()) { for(DrawingTextBody textBody : data.getDrawingText()) {
if(skipPlaceholders && textBody instanceof DrawingTextPlaceholder) {
DrawingTextPlaceholder ph = (DrawingTextPlaceholder)textBody;
if(! ph.isPlaceholderCustom()) {
// Skip non-customised placeholder text
continue;
}
}
for (DrawingParagraph p : textBody.getParagraphs()) {
text.append(p.getText()); text.append(p.getText());
text.append("\n"); text.append("\n");
} }
} }
}
} }

View File

@ -17,11 +17,11 @@
package org.apache.poi.xslf.usermodel; package org.apache.poi.xslf.usermodel;
import java.util.List;
import org.openxmlformats.schemas.drawingml.x2006.main.CTTextBody; import org.openxmlformats.schemas.drawingml.x2006.main.CTTextBody;
import org.openxmlformats.schemas.drawingml.x2006.main.CTTextParagraph; import org.openxmlformats.schemas.drawingml.x2006.main.CTTextParagraph;
import java.util.List;
public class DrawingTextBody { public class DrawingTextBody {
private final CTTextBody textBody; private final CTTextBody textBody;

View File

@ -0,0 +1,57 @@
/* ====================================================================
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==================================================================== */
package org.apache.poi.xslf.usermodel;
import org.openxmlformats.schemas.drawingml.x2006.main.CTTextBody;
import org.openxmlformats.schemas.presentationml.x2006.main.CTPlaceholder;
import org.openxmlformats.schemas.presentationml.x2006.main.STPlaceholderType;
/**
 * A {@link DrawingTextBody} which is a placeholder: the text body is kept
 * together with the {@link CTPlaceholder} it was constructed with, so callers
 * can query the placeholder's type and custom-prompt flag.
 *
 * @author nick
 */
public class DrawingTextPlaceholder extends DrawingTextBody {
    private final CTPlaceholder placeholder;

    /**
     * @param textBody the underlying text body, passed on to {@link DrawingTextBody}
     * @param placeholder the placeholder definition associated with that text body
     */
    public DrawingTextPlaceholder(CTTextBody textBody, CTPlaceholder placeholder) {
        super(textBody);
        this.placeholder = placeholder;
    }

    /**
     * What kind of placeholder is this, as a string?
     *
     * @return the string form of the placeholder's type value
     */
    public String getPlaceholderType() {
        STPlaceholderType.Enum type = placeholder.getType();
        return type.toString();
    }

    /**
     * What kind of placeholder is this, as the schema enum value?
     *
     * @return the placeholder's type exactly as reported by the underlying bean
     */
    public STPlaceholderType.Enum getPlaceholderTypeEnum() {
        STPlaceholderType.Enum type = placeholder.getType();
        return type;
    }

    /**
     * Is the PlaceHolder text customised?
     *
     * @return the value of the placeholder's "has custom prompt" flag
     */
    public boolean isPlaceholderCustom() {
        boolean customPrompt = placeholder.getHasCustomPrompt();
        return customPrompt;
    }
}

View File

@ -26,6 +26,7 @@ import org.apache.xmlbeans.impl.values.XmlAnyTypeImpl;
import org.openxmlformats.schemas.drawingml.x2006.main.CTGraphicalObjectData; import org.openxmlformats.schemas.drawingml.x2006.main.CTGraphicalObjectData;
import org.openxmlformats.schemas.drawingml.x2006.main.CTTable; import org.openxmlformats.schemas.drawingml.x2006.main.CTTable;
import org.openxmlformats.schemas.drawingml.x2006.main.CTTextBody; import org.openxmlformats.schemas.drawingml.x2006.main.CTTextBody;
import org.openxmlformats.schemas.presentationml.x2006.main.CTApplicationNonVisualDrawingProps;
import org.openxmlformats.schemas.presentationml.x2006.main.CTCommonSlideData; import org.openxmlformats.schemas.presentationml.x2006.main.CTCommonSlideData;
import org.openxmlformats.schemas.presentationml.x2006.main.CTGraphicalObjectFrame; import org.openxmlformats.schemas.presentationml.x2006.main.CTGraphicalObjectFrame;
import org.openxmlformats.schemas.presentationml.x2006.main.CTGroupShape; import org.openxmlformats.schemas.presentationml.x2006.main.CTGroupShape;
@ -43,10 +44,10 @@ public class XSLFCommonSlideData {
this.data = data; this.data = data;
} }
public List<DrawingParagraph> getText() { public List<DrawingTextBody> getDrawingText() {
CTGroupShape gs = data.getSpTree(); CTGroupShape gs = data.getSpTree();
List<DrawingParagraph> out = new ArrayList<DrawingParagraph>(); List<DrawingTextBody> out = new ArrayList<DrawingTextBody>();
processShape(gs, out); processShape(gs, out);
@ -77,8 +78,7 @@ public class XSLFCommonSlideData {
for (DrawingTableRow row : table.getRows()) { for (DrawingTableRow row : table.getRows()) {
for (DrawingTableCell cell : row.getCells()) { for (DrawingTableCell cell : row.getCells()) {
DrawingTextBody textBody = cell.getTextBody(); DrawingTextBody textBody = cell.getTextBody();
out.add(textBody);
out.addAll(Arrays.asList(textBody.getParagraphs()));
} }
} }
} }
@ -89,19 +89,31 @@ public class XSLFCommonSlideData {
return out; return out;
} }
/**
 * Flattens the text bodies returned by {@link #getDrawingText()} into a single
 * list of paragraphs, keeping the text bodies' order and each body's own
 * paragraph order.
 *
 * @return a new list holding every paragraph of every text body
 */
public List<DrawingParagraph> getText() {
    List<DrawingParagraph> flattened = new ArrayList<DrawingParagraph>();
    for (DrawingTextBody body : getDrawingText()) {
        // Append this body's paragraphs one at a time, in their original order
        for (DrawingParagraph paragraph : body.getParagraphs()) {
            flattened.add(paragraph);
        }
    }
    return flattened;
}
private void processShape(CTGroupShape gs, List<DrawingParagraph> out) { private void processShape(CTGroupShape gs, List<DrawingTextBody> out) {
List<CTShape> shapes = gs.getSpList(); List<CTShape> shapes = gs.getSpList();
for (int i = 0; i < shapes.size(); i++) { for (CTShape shape : shapes) {
CTTextBody ctTextBody = shapes.get(i).getTxBody(); CTTextBody ctTextBody = shape.getTxBody();
if (ctTextBody==null) { if (ctTextBody==null) {
continue; continue;
} }
DrawingTextBody textBody = new DrawingTextBody(ctTextBody); DrawingTextBody textBody;
CTApplicationNonVisualDrawingProps nvpr = shape.getNvSpPr().getNvPr();
out.addAll(Arrays.asList(textBody.getParagraphs())); if(nvpr.isSetPh()) {
} textBody = new DrawingTextPlaceholder(ctTextBody, nvpr.getPh());
} else {
textBody = new DrawingTextBody(ctTextBody);
} }
out.add(textBody);
}
}
} }

View File

@ -58,9 +58,13 @@ public class TestXSLFPowerPointExtractor extends TestCase {
assertTrue(text.startsWith("Lorem ipsum dolor sit amet\n")); assertTrue(text.startsWith("Lorem ipsum dolor sit amet\n"));
assertTrue(text.contains("amet\n\n")); assertTrue(text.contains("amet\n\n"));
// Our master text, for tests // Our placeholder master text
// This shouldn't show up in the output
String masterText = String masterText =
"Click to edit Master title style\n" + "Click to edit Master title style\n" +
"Click to edit Master subtitle style\n" +
"\n\n\n\n\n\n" +
"Click to edit Master title style\n" +
"Click to edit Master text styles\n" + "Click to edit Master text styles\n" +
"Second level\n" + "Second level\n" +
"Third level\n" + "Third level\n" +
@ -111,17 +115,13 @@ public class TestXSLFPowerPointExtractor extends TestCase {
"Lorem ipsum dolor sit amet\n" + "Lorem ipsum dolor sit amet\n" +
"Nunc at risus vel erat tempus posuere. Aenean non ante.\n" + "Nunc at risus vel erat tempus posuere. Aenean non ante.\n" +
"\n" + "\n" +
masterText +
"\n\n\n" +
"Lorem ipsum dolor sit amet\n" + "Lorem ipsum dolor sit amet\n" +
"Lorem\n" + "Lorem\n" +
"ipsum\n" + "ipsum\n" +
"dolor\n" + "dolor\n" +
"sit\n" + "sit\n" +
"amet\n" + "amet\n" +
"\n" + "\n"
masterText +
"\n\n\n"
, text , text
); );
@ -131,17 +131,14 @@ public class TestXSLFPowerPointExtractor extends TestCase {
"Lorem ipsum dolor sit amet\n" + "Lorem ipsum dolor sit amet\n" +
"Nunc at risus vel erat tempus posuere. Aenean non ante.\n" + "Nunc at risus vel erat tempus posuere. Aenean non ante.\n" +
"\n" + "\n" +
masterText + "\n\n" +
"\n\n\n\n\n" +
"Lorem ipsum dolor sit amet\n" + "Lorem ipsum dolor sit amet\n" +
"Lorem\n" + "Lorem\n" +
"ipsum\n" + "ipsum\n" +
"dolor\n" + "dolor\n" +
"sit\n" + "sit\n" +
"amet\n" + "amet\n" +
"\n" + "\n\n\n"
masterText +
"\n\n\n\n\n"
, text , text
); );
@ -176,6 +173,9 @@ public class TestXSLFPowerPointExtractor extends TestCase {
new XSLFSlideShow(OPCPackage.open(slTests.openResourceAsStream("WithMaster.pptx"))); new XSLFSlideShow(OPCPackage.open(slTests.openResourceAsStream("WithMaster.pptx")));
XSLFPowerPointExtractor extractor = XSLFPowerPointExtractor extractor =
new XSLFPowerPointExtractor(xml); new XSLFPowerPointExtractor(xml);
extractor.setSlidesByDefault(true);
extractor.setNotesByDefault(false);
extractor.setMasterByDefault(true);
String text = extractor.getText(); String text = extractor.getText();
assertTrue(text.length() > 0); assertTrue(text.length() > 0);
@ -184,16 +184,27 @@ public class TestXSLFPowerPointExtractor extends TestCase {
assertTrue("Unable to find expected word in text\n" + text, assertTrue("Unable to find expected word in text\n" + text,
text.contains("Footer from the master slide")); text.contains("Footer from the master slide"));
// Theme text shouldn't show up
String themeText =
"Theme Master Title\n" +
"Theme Master first level\n" +
"And the 2nd level\n" +
"Our 3rd level goes here\n" +
"And onto the 4th, such fun….\n" +
"Finally is the Fifth level\n";
// Check the whole text // Check the whole text
assertEquals( assertEquals(
"First page title\n" + "First page title\n" +
"First page subtitle\n" + "First page subtitle\n" +
// "This text comes from the Master Slide\n" + // TODO "This is the Master Title\n" +
// "This is the Master Title\n" + // TODO "This text comes from the Master Slide\n" +
"\n" + // TODO Should be the above "\n" +
// TODO Detect we didn't have a title, and include the master one
"2nd page subtitle\n" + "2nd page subtitle\n" +
// "This text comes from the Master Slide\n" + // TODO "Footer from the master slide\n" +
"Footer from the master slide\n" "This is the Master Title\n" +
"This text comes from the Master Slide\n"
, text , text
); );
} }