diff --git a/src/documentation/content/xdocs/changes.xml b/src/documentation/content/xdocs/changes.xml index ac5d2f636..0317cd9d2 100644 --- a/src/documentation/content/xdocs/changes.xml +++ b/src/documentation/content/xdocs/changes.xml @@ -37,6 +37,7 @@ + 45545 - Improve XSLF usermodel support, and include XSLF comments in extracted text 45540 - Fix XSSF header and footer support, and include headers and footers in the output of XSSFExcelExtractor 45431 - Support for .xlsm files, sufficient for simple files to be loaded by excel without warning New class org.apache.poi.hssf.record.RecordFormatException, which DDF uses instead of the HSSF version, and the HSSF version inherits from diff --git a/src/documentation/content/xdocs/status.xml b/src/documentation/content/xdocs/status.xml index a9e1cbdc6..ce1869e2c 100644 --- a/src/documentation/content/xdocs/status.xml +++ b/src/documentation/content/xdocs/status.xml @@ -34,6 +34,7 @@ + 45545 - Improve XSLF usermodel support, and include XSLF comments in extracted text 45540 - Fix XSSF header and footer support, and include headers and footers in the output of XSSFExcelExtractor 45431 - Support for .xlsm files, sufficient for simple files to be loaded by excel without warning New class org.apache.poi.hssf.record.RecordFormatException, which DDF uses instead of the HSSF version, and the HSSF version inherits from diff --git a/src/ooxml/java/org/apache/poi/xslf/extractor/XSLFPowerPointExtractor.java b/src/ooxml/java/org/apache/poi/xslf/extractor/XSLFPowerPointExtractor.java index 0f28c2fdc..4366f82e7 100644 --- a/src/ooxml/java/org/apache/poi/xslf/extractor/XSLFPowerPointExtractor.java +++ b/src/ooxml/java/org/apache/poi/xslf/extractor/XSLFPowerPointExtractor.java @@ -16,18 +16,20 @@ ==================================================================== */ package org.apache.poi.xslf.extractor; -import java.io.File; import java.io.IOException; -import org.apache.poi.POIXMLDocument; import org.apache.poi.POIXMLTextExtractor; import org.apache.poi.xslf.XSLFSlideShow; +import org.apache.poi.xslf.usermodel.XMLSlideShow; +import org.apache.poi.xslf.usermodel.XSLFSlide; import org.apache.xmlbeans.XmlException; import org.openxml4j.exceptions.OpenXML4JException; import org.openxml4j.opc.Package; import org.openxmlformats.schemas.drawingml.x2006.main.CTRegularTextRun; import org.openxmlformats.schemas.drawingml.x2006.main.CTTextBody; import org.openxmlformats.schemas.drawingml.x2006.main.CTTextParagraph; +import org.openxmlformats.schemas.presentationml.x2006.main.CTComment; +import org.openxmlformats.schemas.presentationml.x2006.main.CTCommentList; import org.openxmlformats.schemas.presentationml.x2006.main.CTGroupShape; import org.openxmlformats.schemas.presentationml.x2006.main.CTNotesSlide; import org.openxmlformats.schemas.presentationml.x2006.main.CTShape; @@ -35,17 +37,20 @@ import org.openxmlformats.schemas.presentationml.x2006.main.CTSlide; import org.openxmlformats.schemas.presentationml.x2006.main.CTSlideIdListEntry; public class XSLFPowerPointExtractor extends POIXMLTextExtractor { - private XSLFSlideShow slideshow; + private XMLSlideShow slideshow; private boolean slidesByDefault = true; private boolean notesByDefault = false; + public XSLFPowerPointExtractor(XMLSlideShow slideshow) { + super(slideshow._getXSLFSlideShow()); + this.slideshow = slideshow; + } + public XSLFPowerPointExtractor(XSLFSlideShow slideshow) throws XmlException, IOException { + this(new XMLSlideShow(slideshow)); + } public XSLFPowerPointExtractor(Package container) throws XmlException, OpenXML4JException, IOException { this(new XSLFSlideShow(container)); } - public XSLFPowerPointExtractor(XSLFSlideShow slideshow) { - super(slideshow); - this.slideshow = slideshow; - } public static void main(String[] args) throws Exception { if(args.length < 1) { @@ -88,18 +93,32 @@ public class XSLFPowerPointExtractor extends POIXMLTextExtractor { */ public String getText(boolean slideText, boolean notesText) { StringBuffer text = new StringBuffer(); - - CTSlideIdListEntry[] slideRefs = - slideshow.getSlideReferences().getSldIdArray(); - for (int i = 0; i < slideRefs.length; i++) { + + XSLFSlide[] slides = slideshow.getSlides(); + for(int i = 0; i < slides.length; i++) { + CTSlide rawSlide = slides[i]._getCTSlide(); + CTSlideIdListEntry slideId = slides[i]._getCTSlideId(); + try { - CTSlide slide = - slideshow.getSlide(slideRefs[i]); + // For now, still very low level CTNotesSlide notes = - slideshow.getNotes(slideRefs[i]); + slideshow._getXSLFSlideShow().getNotes(slideId); + CTCommentList comments = + slideshow._getXSLFSlideShow().getSlideComments(slideId); if(slideText) { - extractText(slide.getCSld().getSpTree(), text); + extractText(rawSlide.getCSld().getSpTree(), text); + + // Comments too for the slide + if(comments != null) { + for(CTComment comment : comments.getCmArray()) { + // TODO - comment authors too + // (They're in another stream) + text.append( + comment.getText() + "\n" + ); + } + } } if(notesText && notes != null) { extractText(notes.getCSld().getSpTree(), text); diff --git a/src/ooxml/testcases/org/apache/poi/xslf/extractor/TestXSLFPowerPointExtractor.java b/src/ooxml/testcases/org/apache/poi/xslf/extractor/TestXSLFPowerPointExtractor.java index fb10a2421..ac3135637 100644 --- a/src/ooxml/testcases/org/apache/poi/xslf/extractor/TestXSLFPowerPointExtractor.java +++ b/src/ooxml/testcases/org/apache/poi/xslf/extractor/TestXSLFPowerPointExtractor.java @@ -108,4 +108,22 @@ public class TestXSLFPowerPointExtractor extends TestCase { "\n\n\n\n", text ); } + + public void testGetComments() throws Exception { + File file = new File( + System.getProperty("HSLF.testdata.path") + + File.separator + "45545_Comment.pptx" + ); + assertTrue(file.exists()); + + xmlA = new XSLFSlideShow(file.toString()); + XSLFPowerPointExtractor extractor = + new XSLFPowerPointExtractor(xmlA); + + String text = extractor.getText(); + assertTrue(text.length() > 0); + + // Check comments are there + assertTrue("Unable to find expected word in text\n" + text, text.contains("testdoc")); + } }