XSLF text extraction improvements relating to TIKA-712

git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@1175887 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Nick Burch 2011-09-26 14:37:50 +00:00
parent 8de1ce7c6a
commit 9e1ca2caea
5 changed files with 139 additions and 40 deletions

View File

@ -23,6 +23,8 @@ import org.apache.poi.openxml4j.exceptions.OpenXML4JException;
import org.apache.poi.openxml4j.opc.OPCPackage;
import org.apache.poi.xslf.XSLFSlideShow;
import org.apache.poi.xslf.usermodel.DrawingParagraph;
import org.apache.poi.xslf.usermodel.DrawingTextBody;
import org.apache.poi.xslf.usermodel.DrawingTextPlaceholder;
import org.apache.poi.xslf.usermodel.XMLSlideShow;
import org.apache.poi.xslf.usermodel.XSLFCommentAuthors;
import org.apache.poi.xslf.usermodel.XSLFComments;
@ -30,6 +32,7 @@ import org.apache.poi.xslf.usermodel.XSLFCommonSlideData;
import org.apache.poi.xslf.usermodel.XSLFNotes;
import org.apache.poi.xslf.usermodel.XSLFRelation;
import org.apache.poi.xslf.usermodel.XSLFSlide;
import org.apache.poi.xslf.usermodel.XSLFSlideLayout;
import org.apache.poi.xslf.usermodel.XSLFSlideMaster;
import org.apache.xmlbeans.XmlException;
import org.openxmlformats.schemas.presentationml.x2006.main.CTComment;
@ -124,6 +127,7 @@ public class XSLFPowerPointExtractor extends POIXMLTextExtractor {
try {
XSLFNotes notes = slide.getNotes();
XSLFComments comments = slide.getComments();
XSLFSlideLayout layout = slide.getSlideLayout();
XSLFSlideMaster master = slide.getMasterSheet();
// TODO Do the slide's name
@ -131,11 +135,16 @@ public class XSLFPowerPointExtractor extends POIXMLTextExtractor {
// Do the slide's text if requested
if (slideText) {
extractText(slide.getCommonSlideData(), text);
extractText(slide.getCommonSlideData(), false, text);
// If there's a master sheet and it's requested, grab text from there
if(masterText && master != null) {
extractText(master.getCommonSlideData(), text);
// If requested, get text from the master and its layout
if(masterText) {
if(layout != null) {
extractText(layout.getCommonSlideData(), true, text);
}
if(master != null) {
extractText(master.getCommonSlideData(), true, text);
}
}
// If the slide has comments, do those too
@ -158,7 +167,7 @@ public class XSLFPowerPointExtractor extends POIXMLTextExtractor {
// Do the notes if requested
if (notesText && notes != null) {
extractText(notes.getCommonSlideData(), text);
extractText(notes.getCommonSlideData(), false, text);
}
} catch (Exception e) {
throw new RuntimeException(e);
@ -168,10 +177,20 @@ public class XSLFPowerPointExtractor extends POIXMLTextExtractor {
return text.toString();
}
private void extractText(XSLFCommonSlideData data, StringBuffer text) {
for (DrawingParagraph p : data.getText()) {
private void extractText(XSLFCommonSlideData data, boolean skipPlaceholders, StringBuffer text) {
for(DrawingTextBody textBody : data.getDrawingText()) {
if(skipPlaceholders && textBody instanceof DrawingTextPlaceholder) {
DrawingTextPlaceholder ph = (DrawingTextPlaceholder)textBody;
if(! ph.isPlaceholderCustom()) {
// Skip non-customised placeholder text
continue;
}
}
for (DrawingParagraph p : textBody.getParagraphs()) {
text.append(p.getText());
text.append("\n");
}
}
}
}
}
}

View File

@ -17,16 +17,16 @@
package org.apache.poi.xslf.usermodel;
import java.util.List;
import org.openxmlformats.schemas.drawingml.x2006.main.CTTextBody;
import org.openxmlformats.schemas.drawingml.x2006.main.CTTextParagraph;
import java.util.List;
public class DrawingTextBody {
private final CTTextBody textBody;
public DrawingTextBody(CTTextBody textBody) {
this.textBody = textBody;
this.textBody = textBody;
}
public DrawingParagraph[] getParagraphs() {

View File

@ -0,0 +1,57 @@
/* ====================================================================
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==================================================================== */
package org.apache.poi.xslf.usermodel;
import org.openxmlformats.schemas.drawingml.x2006.main.CTTextBody;
import org.openxmlformats.schemas.presentationml.x2006.main.CTPlaceholder;
import org.openxmlformats.schemas.presentationml.x2006.main.STPlaceholderType;
/**
 * A {@link DrawingTextBody} that belongs to a placeholder shape. It keeps
 * hold of the {@link CTPlaceholder} definition alongside the text so that
 * callers can query the placeholder's type and custom-prompt flag.
 *
 * @author nick
 */
public class DrawingTextPlaceholder extends DrawingTextBody {
    private final CTPlaceholder ctPlaceholder;

    /**
     * @param textBody the text body, handed on to {@link DrawingTextBody}
     * @param placeholder the placeholder definition the accessors read from
     */
    public DrawingTextPlaceholder(CTTextBody textBody, CTPlaceholder placeholder) {
        super(textBody);
        ctPlaceholder = placeholder;
    }

    /**
     * Returns the placeholder's type in its string form.
     *
     * @return the result of calling {@code toString()} on the type enum
     */
    public String getPlaceholderType() {
        STPlaceholderType.Enum type = ctPlaceholder.getType();
        return type.toString();
    }

    /**
     * Returns the placeholder's type as the schema enum value.
     *
     * @return the type exactly as reported by the underlying placeholder
     */
    public STPlaceholderType.Enum getPlaceholderTypeEnum() {
        return ctPlaceholder.getType();
    }

    /**
     * Is the placeholder text customised?
     *
     * @return the value of the placeholder's {@code hasCustomPrompt} flag
     */
    public boolean isPlaceholderCustom() {
        boolean customPrompt = ctPlaceholder.getHasCustomPrompt();
        return customPrompt;
    }
}

View File

@ -26,6 +26,7 @@ import org.apache.xmlbeans.impl.values.XmlAnyTypeImpl;
import org.openxmlformats.schemas.drawingml.x2006.main.CTGraphicalObjectData;
import org.openxmlformats.schemas.drawingml.x2006.main.CTTable;
import org.openxmlformats.schemas.drawingml.x2006.main.CTTextBody;
import org.openxmlformats.schemas.presentationml.x2006.main.CTApplicationNonVisualDrawingProps;
import org.openxmlformats.schemas.presentationml.x2006.main.CTCommonSlideData;
import org.openxmlformats.schemas.presentationml.x2006.main.CTGraphicalObjectFrame;
import org.openxmlformats.schemas.presentationml.x2006.main.CTGroupShape;
@ -42,11 +43,11 @@ public class XSLFCommonSlideData {
public XSLFCommonSlideData(CTCommonSlideData data) {
this.data = data;
}
public List<DrawingParagraph> getText() {
public List<DrawingTextBody> getDrawingText() {
CTGroupShape gs = data.getSpTree();
List<DrawingParagraph> out = new ArrayList<DrawingParagraph>();
List<DrawingTextBody> out = new ArrayList<DrawingTextBody>();
processShape(gs, out);
@ -77,8 +78,7 @@ public class XSLFCommonSlideData {
for (DrawingTableRow row : table.getRows()) {
for (DrawingTableCell cell : row.getCells()) {
DrawingTextBody textBody = cell.getTextBody();
out.addAll(Arrays.asList(textBody.getParagraphs()));
out.add(textBody);
}
}
}
@ -89,19 +89,31 @@ public class XSLFCommonSlideData {
return out;
}
/**
 * Flattens the text bodies returned by {@link #getDrawingText()} into a
 * single list of paragraphs, preserving the order in which they appear.
 *
 * @return a new list holding every paragraph of every text body
 */
public List<DrawingParagraph> getText() {
    List<DrawingParagraph> result = new ArrayList<DrawingParagraph>();
    for (DrawingTextBody body : getDrawingText()) {
        // Copy the paragraphs over one by one, in array order
        for (DrawingParagraph paragraph : body.getParagraphs()) {
            result.add(paragraph);
        }
    }
    return result;
}
private void processShape(CTGroupShape gs, List<DrawingParagraph> out) {
private void processShape(CTGroupShape gs, List<DrawingTextBody> out) {
List<CTShape> shapes = gs.getSpList();
for (int i = 0; i < shapes.size(); i++) {
CTTextBody ctTextBody = shapes.get(i).getTxBody();
for (CTShape shape : shapes) {
CTTextBody ctTextBody = shape.getTxBody();
if (ctTextBody==null) {
continue;
}
DrawingTextBody textBody;
CTApplicationNonVisualDrawingProps nvpr = shape.getNvSpPr().getNvPr();
if(nvpr.isSetPh()) {
textBody = new DrawingTextPlaceholder(ctTextBody, nvpr.getPh());
} else {
textBody = new DrawingTextBody(ctTextBody);
}
DrawingTextBody textBody = new DrawingTextBody(ctTextBody);
out.addAll(Arrays.asList(textBody.getParagraphs()));
out.add(textBody);
}
}
}

View File

@ -58,9 +58,13 @@ public class TestXSLFPowerPointExtractor extends TestCase {
assertTrue(text.startsWith("Lorem ipsum dolor sit amet\n"));
assertTrue(text.contains("amet\n\n"));
// Our master text, for tests
// Our placeholder master text
// This shouldn't show up in the output
String masterText =
"Click to edit Master title style\n" +
"Click to edit Master subtitle style\n" +
"\n\n\n\n\n\n" +
"Click to edit Master title style\n" +
"Click to edit Master text styles\n" +
"Second level\n" +
"Third level\n" +
@ -111,17 +115,13 @@ public class TestXSLFPowerPointExtractor extends TestCase {
"Lorem ipsum dolor sit amet\n" +
"Nunc at risus vel erat tempus posuere. Aenean non ante.\n" +
"\n" +
masterText +
"\n\n\n" +
"Lorem ipsum dolor sit amet\n" +
"Lorem\n" +
"ipsum\n" +
"dolor\n" +
"sit\n" +
"amet\n" +
"\n" +
masterText +
"\n\n\n"
"\n"
, text
);
@ -131,17 +131,14 @@ public class TestXSLFPowerPointExtractor extends TestCase {
"Lorem ipsum dolor sit amet\n" +
"Nunc at risus vel erat tempus posuere. Aenean non ante.\n" +
"\n" +
masterText +
"\n\n\n\n\n" +
"\n\n" +
"Lorem ipsum dolor sit amet\n" +
"Lorem\n" +
"ipsum\n" +
"dolor\n" +
"sit\n" +
"amet\n" +
"\n" +
masterText +
"\n\n\n\n\n"
"\n\n\n"
, text
);
@ -176,6 +173,9 @@ public class TestXSLFPowerPointExtractor extends TestCase {
new XSLFSlideShow(OPCPackage.open(slTests.openResourceAsStream("WithMaster.pptx")));
XSLFPowerPointExtractor extractor =
new XSLFPowerPointExtractor(xml);
extractor.setSlidesByDefault(true);
extractor.setNotesByDefault(false);
extractor.setMasterByDefault(true);
String text = extractor.getText();
assertTrue(text.length() > 0);
@ -183,17 +183,28 @@ public class TestXSLFPowerPointExtractor extends TestCase {
// Check master text is there
assertTrue("Unable to find expected word in text\n" + text,
text.contains("Footer from the master slide"));
// Theme text shouldn't show up
String themeText =
"Theme Master Title\n" +
"Theme Master first level\n" +
"And the 2nd level\n" +
"Our 3rd level goes here\n" +
"And onto the 4th, such fun….\n" +
"Finally is the Fifth level\n";
// Check the whole text
assertEquals(
"First page title\n" +
"First page subtitle\n" +
// "This text comes from the Master Slide\n" + // TODO
// "This is the Master Title\n" + // TODO
"\n" + // TODO Should be the above
"This is the Master Title\n" +
"This text comes from the Master Slide\n" +
"\n" +
// TODO Detect we didn't have a title, and include the master one
"2nd page subtitle\n" +
// "This text comes from the Master Slide\n" + // TODO
"Footer from the master slide\n"
"Footer from the master slide\n" +
"This is the Master Title\n" +
"This text comes from the Master Slide\n"
, text
);
}