XSLF text extraction improvements relating to TIKA-712
git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@1175887 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
8de1ce7c6a
commit
9e1ca2caea
@ -23,6 +23,8 @@ import org.apache.poi.openxml4j.exceptions.OpenXML4JException;
|
||||
import org.apache.poi.openxml4j.opc.OPCPackage;
|
||||
import org.apache.poi.xslf.XSLFSlideShow;
|
||||
import org.apache.poi.xslf.usermodel.DrawingParagraph;
|
||||
import org.apache.poi.xslf.usermodel.DrawingTextBody;
|
||||
import org.apache.poi.xslf.usermodel.DrawingTextPlaceholder;
|
||||
import org.apache.poi.xslf.usermodel.XMLSlideShow;
|
||||
import org.apache.poi.xslf.usermodel.XSLFCommentAuthors;
|
||||
import org.apache.poi.xslf.usermodel.XSLFComments;
|
||||
@ -30,6 +32,7 @@ import org.apache.poi.xslf.usermodel.XSLFCommonSlideData;
|
||||
import org.apache.poi.xslf.usermodel.XSLFNotes;
|
||||
import org.apache.poi.xslf.usermodel.XSLFRelation;
|
||||
import org.apache.poi.xslf.usermodel.XSLFSlide;
|
||||
import org.apache.poi.xslf.usermodel.XSLFSlideLayout;
|
||||
import org.apache.poi.xslf.usermodel.XSLFSlideMaster;
|
||||
import org.apache.xmlbeans.XmlException;
|
||||
import org.openxmlformats.schemas.presentationml.x2006.main.CTComment;
|
||||
@ -124,6 +127,7 @@ public class XSLFPowerPointExtractor extends POIXMLTextExtractor {
|
||||
try {
|
||||
XSLFNotes notes = slide.getNotes();
|
||||
XSLFComments comments = slide.getComments();
|
||||
XSLFSlideLayout layout = slide.getSlideLayout();
|
||||
XSLFSlideMaster master = slide.getMasterSheet();
|
||||
|
||||
// TODO Do the slide's name
|
||||
@ -131,11 +135,16 @@ public class XSLFPowerPointExtractor extends POIXMLTextExtractor {
|
||||
|
||||
// Do the slide's text if requested
|
||||
if (slideText) {
|
||||
extractText(slide.getCommonSlideData(), text);
|
||||
extractText(slide.getCommonSlideData(), false, text);
|
||||
|
||||
// If there's a master sheet and it's requested, grab text from there
|
||||
if(masterText && master != null) {
|
||||
extractText(master.getCommonSlideData(), text);
|
||||
// If requested, get text from the master and its layout
|
||||
if(masterText) {
|
||||
if(layout != null) {
|
||||
extractText(layout.getCommonSlideData(), true, text);
|
||||
}
|
||||
if(master != null) {
|
||||
extractText(master.getCommonSlideData(), true, text);
|
||||
}
|
||||
}
|
||||
|
||||
// If the slide has comments, do those too
|
||||
@ -158,7 +167,7 @@ public class XSLFPowerPointExtractor extends POIXMLTextExtractor {
|
||||
|
||||
// Do the notes if requested
|
||||
if (notesText && notes != null) {
|
||||
extractText(notes.getCommonSlideData(), text);
|
||||
extractText(notes.getCommonSlideData(), false, text);
|
||||
}
|
||||
} catch (Exception e) {
|
||||
throw new RuntimeException(e);
|
||||
@ -168,10 +177,20 @@ public class XSLFPowerPointExtractor extends POIXMLTextExtractor {
|
||||
return text.toString();
|
||||
}
|
||||
|
||||
private void extractText(XSLFCommonSlideData data, StringBuffer text) {
|
||||
for (DrawingParagraph p : data.getText()) {
|
||||
private void extractText(XSLFCommonSlideData data, boolean skipPlaceholders, StringBuffer text) {
|
||||
for(DrawingTextBody textBody : data.getDrawingText()) {
|
||||
if(skipPlaceholders && textBody instanceof DrawingTextPlaceholder) {
|
||||
DrawingTextPlaceholder ph = (DrawingTextPlaceholder)textBody;
|
||||
if(! ph.isPlaceholderCustom()) {
|
||||
// Skip non-customised placeholder text
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
for (DrawingParagraph p : textBody.getParagraphs()) {
|
||||
text.append(p.getText());
|
||||
text.append("\n");
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -17,16 +17,16 @@
|
||||
|
||||
package org.apache.poi.xslf.usermodel;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
import org.openxmlformats.schemas.drawingml.x2006.main.CTTextBody;
|
||||
import org.openxmlformats.schemas.drawingml.x2006.main.CTTextParagraph;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
public class DrawingTextBody {
|
||||
private final CTTextBody textBody;
|
||||
|
||||
public DrawingTextBody(CTTextBody textBody) {
|
||||
this.textBody = textBody;
|
||||
this.textBody = textBody;
|
||||
}
|
||||
|
||||
public DrawingParagraph[] getParagraphs() {
|
||||
|
@ -0,0 +1,57 @@
|
||||
/* ====================================================================
|
||||
Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
contributor license agreements. See the NOTICE file distributed with
|
||||
this work for additional information regarding copyright ownership.
|
||||
The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
(the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
==================================================================== */
|
||||
|
||||
package org.apache.poi.xslf.usermodel;
|
||||
|
||||
import org.openxmlformats.schemas.drawingml.x2006.main.CTTextBody;
|
||||
import org.openxmlformats.schemas.presentationml.x2006.main.CTPlaceholder;
|
||||
import org.openxmlformats.schemas.presentationml.x2006.main.STPlaceholderType;
|
||||
|
||||
/**
|
||||
* A {@link DrawingTextBody} which is a placeholder
|
||||
* @author nick
|
||||
*
|
||||
*/
|
||||
public class DrawingTextPlaceholder extends DrawingTextBody {
|
||||
private final CTPlaceholder placeholder;
|
||||
|
||||
public DrawingTextPlaceholder(CTTextBody textBody, CTPlaceholder placeholder) {
|
||||
super(textBody);
|
||||
this.placeholder = placeholder;
|
||||
}
|
||||
|
||||
/**
|
||||
* What kind of placeholder is this?
|
||||
*/
|
||||
public String getPlaceholderType() {
|
||||
return placeholder.getType().toString();
|
||||
}
|
||||
|
||||
/**
|
||||
* What kind of placeholder is this?
|
||||
*/
|
||||
public STPlaceholderType.Enum getPlaceholderTypeEnum() {
|
||||
return placeholder.getType();
|
||||
}
|
||||
|
||||
/**
|
||||
* Is the PlaceHolder text customised?
|
||||
*/
|
||||
public boolean isPlaceholderCustom() {
|
||||
return placeholder.getHasCustomPrompt();
|
||||
}
|
||||
}
|
@ -26,6 +26,7 @@ import org.apache.xmlbeans.impl.values.XmlAnyTypeImpl;
|
||||
import org.openxmlformats.schemas.drawingml.x2006.main.CTGraphicalObjectData;
|
||||
import org.openxmlformats.schemas.drawingml.x2006.main.CTTable;
|
||||
import org.openxmlformats.schemas.drawingml.x2006.main.CTTextBody;
|
||||
import org.openxmlformats.schemas.presentationml.x2006.main.CTApplicationNonVisualDrawingProps;
|
||||
import org.openxmlformats.schemas.presentationml.x2006.main.CTCommonSlideData;
|
||||
import org.openxmlformats.schemas.presentationml.x2006.main.CTGraphicalObjectFrame;
|
||||
import org.openxmlformats.schemas.presentationml.x2006.main.CTGroupShape;
|
||||
@ -42,11 +43,11 @@ public class XSLFCommonSlideData {
|
||||
public XSLFCommonSlideData(CTCommonSlideData data) {
|
||||
this.data = data;
|
||||
}
|
||||
|
||||
public List<DrawingParagraph> getText() {
|
||||
|
||||
public List<DrawingTextBody> getDrawingText() {
|
||||
CTGroupShape gs = data.getSpTree();
|
||||
|
||||
List<DrawingParagraph> out = new ArrayList<DrawingParagraph>();
|
||||
List<DrawingTextBody> out = new ArrayList<DrawingTextBody>();
|
||||
|
||||
processShape(gs, out);
|
||||
|
||||
@ -77,8 +78,7 @@ public class XSLFCommonSlideData {
|
||||
for (DrawingTableRow row : table.getRows()) {
|
||||
for (DrawingTableCell cell : row.getCells()) {
|
||||
DrawingTextBody textBody = cell.getTextBody();
|
||||
|
||||
out.addAll(Arrays.asList(textBody.getParagraphs()));
|
||||
out.add(textBody);
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -89,19 +89,31 @@ public class XSLFCommonSlideData {
|
||||
|
||||
return out;
|
||||
}
|
||||
public List<DrawingParagraph> getText() {
|
||||
List<DrawingParagraph> paragraphs = new ArrayList<DrawingParagraph>();
|
||||
for(DrawingTextBody textBody : getDrawingText()) {
|
||||
paragraphs.addAll(Arrays.asList(textBody.getParagraphs()));
|
||||
}
|
||||
return paragraphs;
|
||||
}
|
||||
|
||||
private void processShape(CTGroupShape gs, List<DrawingParagraph> out) {
|
||||
private void processShape(CTGroupShape gs, List<DrawingTextBody> out) {
|
||||
List<CTShape> shapes = gs.getSpList();
|
||||
for (int i = 0; i < shapes.size(); i++) {
|
||||
CTTextBody ctTextBody = shapes.get(i).getTxBody();
|
||||
for (CTShape shape : shapes) {
|
||||
CTTextBody ctTextBody = shape.getTxBody();
|
||||
if (ctTextBody==null) {
|
||||
continue;
|
||||
}
|
||||
|
||||
DrawingTextBody textBody;
|
||||
CTApplicationNonVisualDrawingProps nvpr = shape.getNvSpPr().getNvPr();
|
||||
if(nvpr.isSetPh()) {
|
||||
textBody = new DrawingTextPlaceholder(ctTextBody, nvpr.getPh());
|
||||
} else {
|
||||
textBody = new DrawingTextBody(ctTextBody);
|
||||
}
|
||||
|
||||
DrawingTextBody textBody = new DrawingTextBody(ctTextBody);
|
||||
|
||||
out.addAll(Arrays.asList(textBody.getParagraphs()));
|
||||
out.add(textBody);
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
@ -58,9 +58,13 @@ public class TestXSLFPowerPointExtractor extends TestCase {
|
||||
assertTrue(text.startsWith("Lorem ipsum dolor sit amet\n"));
|
||||
assertTrue(text.contains("amet\n\n"));
|
||||
|
||||
// Our master text, for tests
|
||||
// Our placeholder master text
|
||||
// This shouldn't show up in the output
|
||||
String masterText =
|
||||
"Click to edit Master title style\n" +
|
||||
"Click to edit Master subtitle style\n" +
|
||||
"\n\n\n\n\n\n" +
|
||||
"Click to edit Master title style\n" +
|
||||
"Click to edit Master text styles\n" +
|
||||
"Second level\n" +
|
||||
"Third level\n" +
|
||||
@ -111,17 +115,13 @@ public class TestXSLFPowerPointExtractor extends TestCase {
|
||||
"Lorem ipsum dolor sit amet\n" +
|
||||
"Nunc at risus vel erat tempus posuere. Aenean non ante.\n" +
|
||||
"\n" +
|
||||
masterText +
|
||||
"\n\n\n" +
|
||||
"Lorem ipsum dolor sit amet\n" +
|
||||
"Lorem\n" +
|
||||
"ipsum\n" +
|
||||
"dolor\n" +
|
||||
"sit\n" +
|
||||
"amet\n" +
|
||||
"\n" +
|
||||
masterText +
|
||||
"\n\n\n"
|
||||
"\n"
|
||||
, text
|
||||
);
|
||||
|
||||
@ -131,17 +131,14 @@ public class TestXSLFPowerPointExtractor extends TestCase {
|
||||
"Lorem ipsum dolor sit amet\n" +
|
||||
"Nunc at risus vel erat tempus posuere. Aenean non ante.\n" +
|
||||
"\n" +
|
||||
masterText +
|
||||
"\n\n\n\n\n" +
|
||||
"\n\n" +
|
||||
"Lorem ipsum dolor sit amet\n" +
|
||||
"Lorem\n" +
|
||||
"ipsum\n" +
|
||||
"dolor\n" +
|
||||
"sit\n" +
|
||||
"amet\n" +
|
||||
"\n" +
|
||||
masterText +
|
||||
"\n\n\n\n\n"
|
||||
"\n\n\n"
|
||||
, text
|
||||
);
|
||||
|
||||
@ -176,6 +173,9 @@ public class TestXSLFPowerPointExtractor extends TestCase {
|
||||
new XSLFSlideShow(OPCPackage.open(slTests.openResourceAsStream("WithMaster.pptx")));
|
||||
XSLFPowerPointExtractor extractor =
|
||||
new XSLFPowerPointExtractor(xml);
|
||||
extractor.setSlidesByDefault(true);
|
||||
extractor.setNotesByDefault(false);
|
||||
extractor.setMasterByDefault(true);
|
||||
|
||||
String text = extractor.getText();
|
||||
assertTrue(text.length() > 0);
|
||||
@ -183,17 +183,28 @@ public class TestXSLFPowerPointExtractor extends TestCase {
|
||||
// Check master text is there
|
||||
assertTrue("Unable to find expected word in text\n" + text,
|
||||
text.contains("Footer from the master slide"));
|
||||
|
||||
// Theme text shouldn't show up
|
||||
String themeText =
|
||||
"Theme Master Title\n" +
|
||||
"Theme Master first level\n" +
|
||||
"And the 2nd level\n" +
|
||||
"Our 3rd level goes here\n" +
|
||||
"And onto the 4th, such fun….\n" +
|
||||
"Finally is the Fifth level\n";
|
||||
|
||||
// Check the whole text
|
||||
assertEquals(
|
||||
"First page title\n" +
|
||||
"First page subtitle\n" +
|
||||
// "This text comes from the Master Slide\n" + // TODO
|
||||
// "This is the Master Title\n" + // TODO
|
||||
"\n" + // TODO Should be the above
|
||||
"This is the Master Title\n" +
|
||||
"This text comes from the Master Slide\n" +
|
||||
"\n" +
|
||||
// TODO Detect we didn't have a title, and include the master one
|
||||
"2nd page subtitle\n" +
|
||||
// "This text comes from the Master Slide\n" + // TODO
|
||||
"Footer from the master slide\n"
|
||||
"Footer from the master slide\n" +
|
||||
"This is the Master Title\n" +
|
||||
"This text comes from the Master Slide\n"
|
||||
, text
|
||||
);
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user