XSLF text extraction improvements relating to TIKA-712
git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@1175887 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
8de1ce7c6a
commit
9e1ca2caea
@ -23,6 +23,8 @@ import org.apache.poi.openxml4j.exceptions.OpenXML4JException;
|
|||||||
import org.apache.poi.openxml4j.opc.OPCPackage;
|
import org.apache.poi.openxml4j.opc.OPCPackage;
|
||||||
import org.apache.poi.xslf.XSLFSlideShow;
|
import org.apache.poi.xslf.XSLFSlideShow;
|
||||||
import org.apache.poi.xslf.usermodel.DrawingParagraph;
|
import org.apache.poi.xslf.usermodel.DrawingParagraph;
|
||||||
|
import org.apache.poi.xslf.usermodel.DrawingTextBody;
|
||||||
|
import org.apache.poi.xslf.usermodel.DrawingTextPlaceholder;
|
||||||
import org.apache.poi.xslf.usermodel.XMLSlideShow;
|
import org.apache.poi.xslf.usermodel.XMLSlideShow;
|
||||||
import org.apache.poi.xslf.usermodel.XSLFCommentAuthors;
|
import org.apache.poi.xslf.usermodel.XSLFCommentAuthors;
|
||||||
import org.apache.poi.xslf.usermodel.XSLFComments;
|
import org.apache.poi.xslf.usermodel.XSLFComments;
|
||||||
@ -30,6 +32,7 @@ import org.apache.poi.xslf.usermodel.XSLFCommonSlideData;
|
|||||||
import org.apache.poi.xslf.usermodel.XSLFNotes;
|
import org.apache.poi.xslf.usermodel.XSLFNotes;
|
||||||
import org.apache.poi.xslf.usermodel.XSLFRelation;
|
import org.apache.poi.xslf.usermodel.XSLFRelation;
|
||||||
import org.apache.poi.xslf.usermodel.XSLFSlide;
|
import org.apache.poi.xslf.usermodel.XSLFSlide;
|
||||||
|
import org.apache.poi.xslf.usermodel.XSLFSlideLayout;
|
||||||
import org.apache.poi.xslf.usermodel.XSLFSlideMaster;
|
import org.apache.poi.xslf.usermodel.XSLFSlideMaster;
|
||||||
import org.apache.xmlbeans.XmlException;
|
import org.apache.xmlbeans.XmlException;
|
||||||
import org.openxmlformats.schemas.presentationml.x2006.main.CTComment;
|
import org.openxmlformats.schemas.presentationml.x2006.main.CTComment;
|
||||||
@ -124,6 +127,7 @@ public class XSLFPowerPointExtractor extends POIXMLTextExtractor {
|
|||||||
try {
|
try {
|
||||||
XSLFNotes notes = slide.getNotes();
|
XSLFNotes notes = slide.getNotes();
|
||||||
XSLFComments comments = slide.getComments();
|
XSLFComments comments = slide.getComments();
|
||||||
|
XSLFSlideLayout layout = slide.getSlideLayout();
|
||||||
XSLFSlideMaster master = slide.getMasterSheet();
|
XSLFSlideMaster master = slide.getMasterSheet();
|
||||||
|
|
||||||
// TODO Do the slide's name
|
// TODO Do the slide's name
|
||||||
@ -131,11 +135,16 @@ public class XSLFPowerPointExtractor extends POIXMLTextExtractor {
|
|||||||
|
|
||||||
// Do the slide's text if requested
|
// Do the slide's text if requested
|
||||||
if (slideText) {
|
if (slideText) {
|
||||||
extractText(slide.getCommonSlideData(), text);
|
extractText(slide.getCommonSlideData(), false, text);
|
||||||
|
|
||||||
// If there's a master sheet and it's requested, grab text from there
|
// If requested, get text from the master and its layout
|
||||||
if(masterText && master != null) {
|
if(masterText) {
|
||||||
extractText(master.getCommonSlideData(), text);
|
if(layout != null) {
|
||||||
|
extractText(layout.getCommonSlideData(), true, text);
|
||||||
|
}
|
||||||
|
if(master != null) {
|
||||||
|
extractText(master.getCommonSlideData(), true, text);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// If the slide has comments, do those too
|
// If the slide has comments, do those too
|
||||||
@ -158,7 +167,7 @@ public class XSLFPowerPointExtractor extends POIXMLTextExtractor {
|
|||||||
|
|
||||||
// Do the notes if requested
|
// Do the notes if requested
|
||||||
if (notesText && notes != null) {
|
if (notesText && notes != null) {
|
||||||
extractText(notes.getCommonSlideData(), text);
|
extractText(notes.getCommonSlideData(), false, text);
|
||||||
}
|
}
|
||||||
} catch (Exception e) {
|
} catch (Exception e) {
|
||||||
throw new RuntimeException(e);
|
throw new RuntimeException(e);
|
||||||
@ -168,10 +177,20 @@ public class XSLFPowerPointExtractor extends POIXMLTextExtractor {
|
|||||||
return text.toString();
|
return text.toString();
|
||||||
}
|
}
|
||||||
|
|
||||||
private void extractText(XSLFCommonSlideData data, StringBuffer text) {
|
private void extractText(XSLFCommonSlideData data, boolean skipPlaceholders, StringBuffer text) {
|
||||||
for (DrawingParagraph p : data.getText()) {
|
for(DrawingTextBody textBody : data.getDrawingText()) {
|
||||||
|
if(skipPlaceholders && textBody instanceof DrawingTextPlaceholder) {
|
||||||
|
DrawingTextPlaceholder ph = (DrawingTextPlaceholder)textBody;
|
||||||
|
if(! ph.isPlaceholderCustom()) {
|
||||||
|
// Skip non-customised placeholder text
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
for (DrawingParagraph p : textBody.getParagraphs()) {
|
||||||
text.append(p.getText());
|
text.append(p.getText());
|
||||||
text.append("\n");
|
text.append("\n");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
@ -17,16 +17,16 @@
|
|||||||
|
|
||||||
package org.apache.poi.xslf.usermodel;
|
package org.apache.poi.xslf.usermodel;
|
||||||
|
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
import org.openxmlformats.schemas.drawingml.x2006.main.CTTextBody;
|
import org.openxmlformats.schemas.drawingml.x2006.main.CTTextBody;
|
||||||
import org.openxmlformats.schemas.drawingml.x2006.main.CTTextParagraph;
|
import org.openxmlformats.schemas.drawingml.x2006.main.CTTextParagraph;
|
||||||
|
|
||||||
import java.util.List;
|
|
||||||
|
|
||||||
public class DrawingTextBody {
|
public class DrawingTextBody {
|
||||||
private final CTTextBody textBody;
|
private final CTTextBody textBody;
|
||||||
|
|
||||||
public DrawingTextBody(CTTextBody textBody) {
|
public DrawingTextBody(CTTextBody textBody) {
|
||||||
this.textBody = textBody;
|
this.textBody = textBody;
|
||||||
}
|
}
|
||||||
|
|
||||||
public DrawingParagraph[] getParagraphs() {
|
public DrawingParagraph[] getParagraphs() {
|
||||||
|
@ -0,0 +1,57 @@
|
|||||||
|
/* ====================================================================
|
||||||
|
Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
contributor license agreements. See the NOTICE file distributed with
|
||||||
|
this work for additional information regarding copyright ownership.
|
||||||
|
The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
(the "License"); you may not use this file except in compliance with
|
||||||
|
the License. You may obtain a copy of the License at
|
||||||
|
|
||||||
|
http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
|
||||||
|
Unless required by applicable law or agreed to in writing, software
|
||||||
|
distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
See the License for the specific language governing permissions and
|
||||||
|
limitations under the License.
|
||||||
|
==================================================================== */
|
||||||
|
|
||||||
|
package org.apache.poi.xslf.usermodel;
|
||||||
|
|
||||||
|
import org.openxmlformats.schemas.drawingml.x2006.main.CTTextBody;
|
||||||
|
import org.openxmlformats.schemas.presentationml.x2006.main.CTPlaceholder;
|
||||||
|
import org.openxmlformats.schemas.presentationml.x2006.main.STPlaceholderType;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* A {@link DrawingTextBody} which is a placeholder
|
||||||
|
* @author nick
|
||||||
|
*
|
||||||
|
*/
|
||||||
|
public class DrawingTextPlaceholder extends DrawingTextBody {
|
||||||
|
private final CTPlaceholder placeholder;
|
||||||
|
|
||||||
|
public DrawingTextPlaceholder(CTTextBody textBody, CTPlaceholder placeholder) {
|
||||||
|
super(textBody);
|
||||||
|
this.placeholder = placeholder;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* What kind of placeholder is this, as the String form of its type?
|
||||||
|
*/
|
||||||
|
public String getPlaceholderType() {
|
||||||
|
return placeholder.getType().toString();
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* What kind of placeholder is this, as the schema's placeholder type enum?
|
||||||
|
*/
|
||||||
|
public STPlaceholderType.Enum getPlaceholderTypeEnum() {
|
||||||
|
return placeholder.getType();
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Is the PlaceHolder text customised?
|
||||||
|
*/
|
||||||
|
public boolean isPlaceholderCustom() {
|
||||||
|
return placeholder.getHasCustomPrompt();
|
||||||
|
}
|
||||||
|
}
|
@ -26,6 +26,7 @@ import org.apache.xmlbeans.impl.values.XmlAnyTypeImpl;
|
|||||||
import org.openxmlformats.schemas.drawingml.x2006.main.CTGraphicalObjectData;
|
import org.openxmlformats.schemas.drawingml.x2006.main.CTGraphicalObjectData;
|
||||||
import org.openxmlformats.schemas.drawingml.x2006.main.CTTable;
|
import org.openxmlformats.schemas.drawingml.x2006.main.CTTable;
|
||||||
import org.openxmlformats.schemas.drawingml.x2006.main.CTTextBody;
|
import org.openxmlformats.schemas.drawingml.x2006.main.CTTextBody;
|
||||||
|
import org.openxmlformats.schemas.presentationml.x2006.main.CTApplicationNonVisualDrawingProps;
|
||||||
import org.openxmlformats.schemas.presentationml.x2006.main.CTCommonSlideData;
|
import org.openxmlformats.schemas.presentationml.x2006.main.CTCommonSlideData;
|
||||||
import org.openxmlformats.schemas.presentationml.x2006.main.CTGraphicalObjectFrame;
|
import org.openxmlformats.schemas.presentationml.x2006.main.CTGraphicalObjectFrame;
|
||||||
import org.openxmlformats.schemas.presentationml.x2006.main.CTGroupShape;
|
import org.openxmlformats.schemas.presentationml.x2006.main.CTGroupShape;
|
||||||
@ -43,10 +44,10 @@ public class XSLFCommonSlideData {
|
|||||||
this.data = data;
|
this.data = data;
|
||||||
}
|
}
|
||||||
|
|
||||||
public List<DrawingParagraph> getText() {
|
public List<DrawingTextBody> getDrawingText() {
|
||||||
CTGroupShape gs = data.getSpTree();
|
CTGroupShape gs = data.getSpTree();
|
||||||
|
|
||||||
List<DrawingParagraph> out = new ArrayList<DrawingParagraph>();
|
List<DrawingTextBody> out = new ArrayList<DrawingTextBody>();
|
||||||
|
|
||||||
processShape(gs, out);
|
processShape(gs, out);
|
||||||
|
|
||||||
@ -77,8 +78,7 @@ public class XSLFCommonSlideData {
|
|||||||
for (DrawingTableRow row : table.getRows()) {
|
for (DrawingTableRow row : table.getRows()) {
|
||||||
for (DrawingTableCell cell : row.getCells()) {
|
for (DrawingTableCell cell : row.getCells()) {
|
||||||
DrawingTextBody textBody = cell.getTextBody();
|
DrawingTextBody textBody = cell.getTextBody();
|
||||||
|
out.add(textBody);
|
||||||
out.addAll(Arrays.asList(textBody.getParagraphs()));
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -89,19 +89,31 @@ public class XSLFCommonSlideData {
|
|||||||
|
|
||||||
return out;
|
return out;
|
||||||
}
|
}
|
||||||
|
public List<DrawingParagraph> getText() {
|
||||||
|
List<DrawingParagraph> paragraphs = new ArrayList<DrawingParagraph>();
|
||||||
|
for(DrawingTextBody textBody : getDrawingText()) {
|
||||||
|
paragraphs.addAll(Arrays.asList(textBody.getParagraphs()));
|
||||||
|
}
|
||||||
|
return paragraphs;
|
||||||
|
}
|
||||||
|
|
||||||
private void processShape(CTGroupShape gs, List<DrawingParagraph> out) {
|
private void processShape(CTGroupShape gs, List<DrawingTextBody> out) {
|
||||||
List<CTShape> shapes = gs.getSpList();
|
List<CTShape> shapes = gs.getSpList();
|
||||||
for (int i = 0; i < shapes.size(); i++) {
|
for (CTShape shape : shapes) {
|
||||||
CTTextBody ctTextBody = shapes.get(i).getTxBody();
|
CTTextBody ctTextBody = shape.getTxBody();
|
||||||
if (ctTextBody==null) {
|
if (ctTextBody==null) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
DrawingTextBody textBody = new DrawingTextBody(ctTextBody);
|
DrawingTextBody textBody;
|
||||||
|
CTApplicationNonVisualDrawingProps nvpr = shape.getNvSpPr().getNvPr();
|
||||||
|
if(nvpr.isSetPh()) {
|
||||||
|
textBody = new DrawingTextPlaceholder(ctTextBody, nvpr.getPh());
|
||||||
|
} else {
|
||||||
|
textBody = new DrawingTextBody(ctTextBody);
|
||||||
|
}
|
||||||
|
|
||||||
out.addAll(Arrays.asList(textBody.getParagraphs()));
|
out.add(textBody);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
@ -58,9 +58,13 @@ public class TestXSLFPowerPointExtractor extends TestCase {
|
|||||||
assertTrue(text.startsWith("Lorem ipsum dolor sit amet\n"));
|
assertTrue(text.startsWith("Lorem ipsum dolor sit amet\n"));
|
||||||
assertTrue(text.contains("amet\n\n"));
|
assertTrue(text.contains("amet\n\n"));
|
||||||
|
|
||||||
// Our master text, for tests
|
// Our placeholder master text
|
||||||
|
// This shouldn't show up in the output
|
||||||
String masterText =
|
String masterText =
|
||||||
"Click to edit Master title style\n" +
|
"Click to edit Master title style\n" +
|
||||||
|
"Click to edit Master subtitle style\n" +
|
||||||
|
"\n\n\n\n\n\n" +
|
||||||
|
"Click to edit Master title style\n" +
|
||||||
"Click to edit Master text styles\n" +
|
"Click to edit Master text styles\n" +
|
||||||
"Second level\n" +
|
"Second level\n" +
|
||||||
"Third level\n" +
|
"Third level\n" +
|
||||||
@ -111,17 +115,13 @@ public class TestXSLFPowerPointExtractor extends TestCase {
|
|||||||
"Lorem ipsum dolor sit amet\n" +
|
"Lorem ipsum dolor sit amet\n" +
|
||||||
"Nunc at risus vel erat tempus posuere. Aenean non ante.\n" +
|
"Nunc at risus vel erat tempus posuere. Aenean non ante.\n" +
|
||||||
"\n" +
|
"\n" +
|
||||||
masterText +
|
|
||||||
"\n\n\n" +
|
|
||||||
"Lorem ipsum dolor sit amet\n" +
|
"Lorem ipsum dolor sit amet\n" +
|
||||||
"Lorem\n" +
|
"Lorem\n" +
|
||||||
"ipsum\n" +
|
"ipsum\n" +
|
||||||
"dolor\n" +
|
"dolor\n" +
|
||||||
"sit\n" +
|
"sit\n" +
|
||||||
"amet\n" +
|
"amet\n" +
|
||||||
"\n" +
|
"\n"
|
||||||
masterText +
|
|
||||||
"\n\n\n"
|
|
||||||
, text
|
, text
|
||||||
);
|
);
|
||||||
|
|
||||||
@ -131,17 +131,14 @@ public class TestXSLFPowerPointExtractor extends TestCase {
|
|||||||
"Lorem ipsum dolor sit amet\n" +
|
"Lorem ipsum dolor sit amet\n" +
|
||||||
"Nunc at risus vel erat tempus posuere. Aenean non ante.\n" +
|
"Nunc at risus vel erat tempus posuere. Aenean non ante.\n" +
|
||||||
"\n" +
|
"\n" +
|
||||||
masterText +
|
"\n\n" +
|
||||||
"\n\n\n\n\n" +
|
|
||||||
"Lorem ipsum dolor sit amet\n" +
|
"Lorem ipsum dolor sit amet\n" +
|
||||||
"Lorem\n" +
|
"Lorem\n" +
|
||||||
"ipsum\n" +
|
"ipsum\n" +
|
||||||
"dolor\n" +
|
"dolor\n" +
|
||||||
"sit\n" +
|
"sit\n" +
|
||||||
"amet\n" +
|
"amet\n" +
|
||||||
"\n" +
|
"\n\n\n"
|
||||||
masterText +
|
|
||||||
"\n\n\n\n\n"
|
|
||||||
, text
|
, text
|
||||||
);
|
);
|
||||||
|
|
||||||
@ -176,6 +173,9 @@ public class TestXSLFPowerPointExtractor extends TestCase {
|
|||||||
new XSLFSlideShow(OPCPackage.open(slTests.openResourceAsStream("WithMaster.pptx")));
|
new XSLFSlideShow(OPCPackage.open(slTests.openResourceAsStream("WithMaster.pptx")));
|
||||||
XSLFPowerPointExtractor extractor =
|
XSLFPowerPointExtractor extractor =
|
||||||
new XSLFPowerPointExtractor(xml);
|
new XSLFPowerPointExtractor(xml);
|
||||||
|
extractor.setSlidesByDefault(true);
|
||||||
|
extractor.setNotesByDefault(false);
|
||||||
|
extractor.setMasterByDefault(true);
|
||||||
|
|
||||||
String text = extractor.getText();
|
String text = extractor.getText();
|
||||||
assertTrue(text.length() > 0);
|
assertTrue(text.length() > 0);
|
||||||
@ -184,16 +184,27 @@ public class TestXSLFPowerPointExtractor extends TestCase {
|
|||||||
assertTrue("Unable to find expected word in text\n" + text,
|
assertTrue("Unable to find expected word in text\n" + text,
|
||||||
text.contains("Footer from the master slide"));
|
text.contains("Footer from the master slide"));
|
||||||
|
|
||||||
|
// Theme text shouldn't show up
|
||||||
|
String themeText =
|
||||||
|
"Theme Master Title\n" +
|
||||||
|
"Theme Master first level\n" +
|
||||||
|
"And the 2nd level\n" +
|
||||||
|
"Our 3rd level goes here\n" +
|
||||||
|
"And onto the 4th, such fun….\n" +
|
||||||
|
"Finally is the Fifth level\n";
|
||||||
|
|
||||||
// Check the whole text
|
// Check the whole text
|
||||||
assertEquals(
|
assertEquals(
|
||||||
"First page title\n" +
|
"First page title\n" +
|
||||||
"First page subtitle\n" +
|
"First page subtitle\n" +
|
||||||
// "This text comes from the Master Slide\n" + // TODO
|
"This is the Master Title\n" +
|
||||||
// "This is the Master Title\n" + // TODO
|
"This text comes from the Master Slide\n" +
|
||||||
"\n" + // TODO Should be the above
|
"\n" +
|
||||||
|
// TODO Detect we didn't have a title, and include the master one
|
||||||
"2nd page subtitle\n" +
|
"2nd page subtitle\n" +
|
||||||
// "This text comes from the Master Slide\n" + // TODO
|
"Footer from the master slide\n" +
|
||||||
"Footer from the master slide\n"
|
"This is the Master Title\n" +
|
||||||
|
"This text comes from the Master Slide\n"
|
||||||
, text
|
, text
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user