diff --git a/build.xml b/build.xml
index a11bb9426..5989ddad3 100644
--- a/build.xml
+++ b/build.xml
@@ -1124,6 +1124,21 @@ FORREST_HOME environment variable!
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/src/scratchpad/ooxml-src/org/apache/poi/hslf/extractor/HXFPowerPointExtractor.java b/src/scratchpad/ooxml-src/org/apache/poi/hslf/extractor/HXFPowerPointExtractor.java
new file mode 100644
index 000000000..b0e736401
--- /dev/null
+++ b/src/scratchpad/ooxml-src/org/apache/poi/hslf/extractor/HXFPowerPointExtractor.java
@@ -0,0 +1,122 @@
+/* ====================================================================
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+==================================================================== */
+package org.apache.poi.hslf.extractor;
+
+import java.io.File;
+import java.io.IOException;
+
+import org.apache.poi.POIXMLTextExtractor;
+import org.apache.poi.hslf.HSLFXML;
+import org.apache.poi.hslf.usermodel.HSLFXMLSlideShow;
+import org.apache.poi.hxf.HXFDocument;
+import org.apache.xmlbeans.XmlException;
+import org.openxml4j.exceptions.OpenXML4JException;
+import org.openxml4j.opc.Package;
+import org.openxmlformats.schemas.drawingml.x2006.main.CTRegularTextRun;
+import org.openxmlformats.schemas.drawingml.x2006.main.CTTextBody;
+import org.openxmlformats.schemas.drawingml.x2006.main.CTTextParagraph;
+import org.openxmlformats.schemas.presentationml.x2006.main.CTGroupShape;
+import org.openxmlformats.schemas.presentationml.x2006.main.CTNotesSlide;
+import org.openxmlformats.schemas.presentationml.x2006.main.CTShape;
+import org.openxmlformats.schemas.presentationml.x2006.main.CTSlide;
+import org.openxmlformats.schemas.presentationml.x2006.main.CTSlideIdListEntry;
+
+public class HXFPowerPointExtractor extends POIXMLTextExtractor {
+ private HSLFXMLSlideShow slideshow;
+
+ public HXFPowerPointExtractor(Package container) throws XmlException, OpenXML4JException, IOException {
+ this(new HSLFXMLSlideShow(
+ new HSLFXML(container)
+ ));
+ }
+ public HXFPowerPointExtractor(HSLFXMLSlideShow slideshow) {
+ super(slideshow);
+ this.slideshow = slideshow;
+ }
+
+ public static void main(String[] args) throws Exception {
+ if(args.length < 1) {
+ System.err.println("Use:");
+ System.err.println(" HXFPowerPointExtractor ");
+ System.exit(1);
+ }
+ POIXMLTextExtractor extractor =
+ new HXFPowerPointExtractor(HXFDocument.openPackage(
+ new File(args[0])
+ ));
+ System.out.println(extractor.getText());
+ }
+
+ /**
+ * Gets the slide and notes text
+ */
+ public String getText() {
+ return getText(true, true);
+ }
+
+ /**
+ * Gets the requested text from the file
+ * @param slideText Should we retrieve text from slides?
+ * @param notesText Should we retrieve text from notes?
+ */
+ public String getText(boolean slideText, boolean notesText) {
+ StringBuffer text = new StringBuffer();
+
+ CTSlideIdListEntry[] slideRefs =
+ slideshow._getHSLFXML().getSlideReferences().getSldIdArray();
+ for (int i = 0; i < slideRefs.length; i++) {
+ try {
+ CTSlide slide =
+ slideshow._getHSLFXML().getSlide(slideRefs[i]);
+ CTNotesSlide notes =
+ slideshow._getHSLFXML().getNotes(slideRefs[i]);
+
+ if(slideText) {
+ extractText(slide.getCSld().getSpTree(), text);
+ }
+ if(notesText && notes != null) {
+ extractText(notes.getCSld().getSpTree(), text);
+ }
+ } catch(Exception e) {
+ throw new RuntimeException(e);
+ }
+ }
+
+ return text.toString();
+ }
+
+ private void extractText(CTGroupShape gs, StringBuffer text) {
+ CTShape[] shapes = gs.getSpArray();
+ for (int i = 0; i < shapes.length; i++) {
+ CTTextBody textBody =
+ shapes[i].getTxBody();
+ if(textBody != null) {
+ CTTextParagraph[] paras =
+ textBody.getPArray();
+ for (int j = 0; j < paras.length; j++) {
+ CTRegularTextRun[] textRuns =
+ paras[j].getRArray();
+ for (int k = 0; k < textRuns.length; k++) {
+ text.append( textRuns[k].getT() );
+ }
+ // End each paragraph with a new line
+ text.append("\n");
+ }
+ }
+ }
+ }
+}
diff --git a/src/scratchpad/ooxml-testcases/org/apache/poi/hslf/TestHSLFXML.java b/src/scratchpad/ooxml-testcases/org/apache/poi/hslf/TestHSLFXML.java
index 11e7efd28..9c122da6d 100644
--- a/src/scratchpad/ooxml-testcases/org/apache/poi/hslf/TestHSLFXML.java
+++ b/src/scratchpad/ooxml-testcases/org/apache/poi/hslf/TestHSLFXML.java
@@ -21,7 +21,6 @@ import java.io.File;
import org.apache.poi.hxf.HXFDocument;
import org.openxml4j.opc.Package;
import org.openxml4j.opc.PackagePart;
-import org.openxmlformats.schemas.presentationml.x2006.main.CTSlideIdList;
import org.openxmlformats.schemas.presentationml.x2006.main.CTSlideIdListEntry;
import org.openxmlformats.schemas.presentationml.x2006.main.CTSlideMasterIdListEntry;
diff --git a/src/scratchpad/ooxml-testcases/org/apache/poi/hslf/extractor/TestHXFPowerPointExtractor.java b/src/scratchpad/ooxml-testcases/org/apache/poi/hslf/extractor/TestHXFPowerPointExtractor.java
new file mode 100644
index 000000000..7c96c2986
--- /dev/null
+++ b/src/scratchpad/ooxml-testcases/org/apache/poi/hslf/extractor/TestHXFPowerPointExtractor.java
@@ -0,0 +1,101 @@
+/* ====================================================================
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+==================================================================== */
+package org.apache.poi.hslf.extractor;
+
+import java.io.File;
+
+import org.apache.poi.hslf.HSLFXML;
+import org.apache.poi.hslf.usermodel.HSLFXMLSlideShow;
+import org.apache.poi.hxf.HXFDocument;
+
+import junit.framework.TestCase;
+
+/**
+ * Tests for HXFPowerPointExtractor, run against the sample.pptx test file
+ */
+public class TestHXFPowerPointExtractor extends TestCase {
+    /**
+     * Expected text of the first slide (split inferred from the combined output)
+     */
+    private static final String FIRST_SLIDE =
+        "Lorem ipsum dolor sit amet\n" +
+        "Nunc at risus vel erat tempus posuere. Aenean non ante.\n" +
+        "\n";
+    /**
+     * Expected text of the second slide
+     */
+    private static final String SECOND_SLIDE =
+        "Lorem ipsum dolor sit amet\n" +
+        "Lorem\n" +
+        "ipsum\n" +
+        "dolor\n" +
+        "sit\n" +
+        "amet\n" +
+        "\n";
+    /**
+     * Expected notes text per slide: newlines only, no other characters
+     */
+    private static final String NOTES = "\n\n";
+
+    /**
+     * A simple file
+     */
+    private HSLFXML xmlA;
+
+    /**
+     * Opens sample.pptx from the directory named by the
+     *  HSLF.testdata.path system property
+     */
+    protected void setUp() throws Exception {
+        super.setUp();
+
+        String dataDir = System.getProperty("HSLF.testdata.path");
+        File sample = new File(dataDir + File.separator + "sample.pptx");
+        xmlA = new HSLFXML(HXFDocument.openPackage(sample));
+    }
+
+    /**
+     * Get text out of the simple file
+     */
+    public void testGetSimpleText() throws Exception {
+        // Both constructors should accept the sample file
+        new HXFPowerPointExtractor(xmlA.getPackage());
+        new HXFPowerPointExtractor(new HSLFXMLSlideShow(xmlA));
+
+        HXFPowerPointExtractor extractor =
+            new HXFPowerPointExtractor(xmlA.getPackage());
+        extractor.getText();
+
+        // Default extraction: check basics
+        String everything = extractor.getText();
+        assertTrue(everything.length() > 0);
+        assertTrue(everything.startsWith("Lorem ipsum dolor sit amet\n"));
+        assertTrue(everything.endsWith("amet\n\n\n\n"));
+
+        // Just slides, no notes
+        assertEquals(FIRST_SLIDE + SECOND_SLIDE, extractor.getText(true, false));
+
+        // Just notes, no slides
+        assertEquals(NOTES + NOTES, extractor.getText(false, true));
+
+        // Both: each slide's text is followed by its notes
+        assertEquals(
+            FIRST_SLIDE + NOTES + SECOND_SLIDE + NOTES,
+            extractor.getText(true, true)
+        );
+    }
+}