Add an OOXML pptx text extractor, with a test. Also add a jar-ooxml ant task

git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@607572 13f79535-47bb-0310-9956-ffa450edef68
2007-12-30 18:11:55 +00:00 · 2007-12-30 18:11:55 +00:00 · c6d225103f
commit c6d225103f
parent e5fe6f32bd
4 changed files with 238 additions and 1 deletions
--- a/build.xml
+++ b/build.xml
@ -1124,6 +1124,21 @@ FORREST_HOME environment variable!</echo>
            </manifest>        
        </jar>
    </target>
+    <!-- Builds the "-ooxml" jar under ${dist.dir} once compile-ooxml has run.
+         The jar bundles everything found in ${ooxml.output.dir} plus the
+         contents of legal/, and its manifest carries the same
+         ${version.id}-${DSTAMP} stamp that is used in the jar file name. -->
+    <target name="jar-ooxml" depends="compile-ooxml" description="Creates the ooxml jar files for distribution">
+        <jar destfile="${dist.dir}/${jar.name}-ooxml-${version.id}-${DSTAMP}.jar">
+			<fileset dir="${ooxml.output.dir}" />
+			<fileset dir="legal/" />
+            <manifest>
+                <attribute name="Built-By" value="${user.name}"/>
+                <attribute name="Specification-Title" value="Apache POI"/>
+                <attribute name="Specification-Version" value="${version.id}-${DSTAMP}"/>
+                <attribute name="Specification-Vendor" value="Apache"/>
+                <attribute name="Implementation-Title" value="Apache POI"/>
+                <attribute name="Implementation-Version" value="${version.id}-${DSTAMP}"/>
+                <attribute name="Implementation-Vendor" value="Apache"/>
+            </manifest>        
+        </jar>
+    </target>

  <target name="dist" depends="clean, fail-unless-tools-are-available, compile, site, jar"
    description="Creates the entire distribution into build/dist, from scratch">
--- a/src/scratchpad/ooxml-src/org/apache/poi/hslf/extractor/HXFPowerPointExtractor.java
+++ b/src/scratchpad/ooxml-src/org/apache/poi/hslf/extractor/HXFPowerPointExtractor.java
@ -0,0 +1,122 @@
+/* ====================================================================
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+==================================================================== */
+package org.apache.poi.hslf.extractor;
+
+import java.io.File;
+import java.io.IOException;
+
+import org.apache.poi.POIXMLTextExtractor;
+import org.apache.poi.hslf.HSLFXML;
+import org.apache.poi.hslf.usermodel.HSLFXMLSlideShow;
+import org.apache.poi.hxf.HXFDocument;
+import org.apache.xmlbeans.XmlException;
+import org.openxml4j.exceptions.OpenXML4JException;
+import org.openxml4j.opc.Package;
+import org.openxmlformats.schemas.drawingml.x2006.main.CTRegularTextRun;
+import org.openxmlformats.schemas.drawingml.x2006.main.CTTextBody;
+import org.openxmlformats.schemas.drawingml.x2006.main.CTTextParagraph;
+import org.openxmlformats.schemas.presentationml.x2006.main.CTGroupShape;
+import org.openxmlformats.schemas.presentationml.x2006.main.CTNotesSlide;
+import org.openxmlformats.schemas.presentationml.x2006.main.CTShape;
+import org.openxmlformats.schemas.presentationml.x2006.main.CTSlide;
+import org.openxmlformats.schemas.presentationml.x2006.main.CTSlideIdListEntry;
+
+public class HXFPowerPointExtractor extends POIXMLTextExtractor {
+	private HSLFXMLSlideShow slideshow;
+	
+	public HXFPowerPointExtractor(Package container) throws XmlException, OpenXML4JException, IOException {
+		this(new HSLFXMLSlideShow(
+				new HSLFXML(container)
+		));
+	}
+	public HXFPowerPointExtractor(HSLFXMLSlideShow slideshow) {
+		super(slideshow);
+		this.slideshow = slideshow;
+	}
+
+	public static void main(String[] args) throws Exception {
+		if(args.length < 1) {
+			System.err.println("Use:");
+			System.err.println("  HXFPowerPointExtractor <filename.pptx>");
+			System.exit(1);
+		}
+		POIXMLTextExtractor extractor = 
+			new HXFPowerPointExtractor(HXFDocument.openPackage(
+					new File(args[0])
+			));
+		System.out.println(extractor.getText());
+	}
+	
+	/**
+	 * Gets the slide and notes text
+	 */
+	public String getText() {
+		return getText(true, true);
+	}
+	
+	/**
+	 * Gets the requested text from the file
+	 * @param slideText Should we retrieve text from slides?
+	 * @param notesText Should we retrieve text from notes?
+	 */
+	public String getText(boolean slideText, boolean notesText) {
+		StringBuffer text = new StringBuffer();
+		
+		CTSlideIdListEntry[] slideRefs =
+			slideshow._getHSLFXML().getSlideReferences().getSldIdArray();
+		for (int i = 0; i < slideRefs.length; i++) {
+			try {
+				CTSlide slide =
+					slideshow._getHSLFXML().getSlide(slideRefs[i]);
+				CTNotesSlide notes = 
+					slideshow._getHSLFXML().getNotes(slideRefs[i]);
+				
+				if(slideText) {
+					extractText(slide.getCSld().getSpTree(), text);
+				}
+				if(notesText && notes != null) {
+					extractText(notes.getCSld().getSpTree(), text);
+				}
+			} catch(Exception e) {
+				throw new RuntimeException(e);
+			}
+		}
+		
+		return text.toString();
+	}
+	
+	private void extractText(CTGroupShape gs, StringBuffer text) {
+		CTShape[] shapes = gs.getSpArray();
+		for (int i = 0; i < shapes.length; i++) {
+			CTTextBody textBody =
+				shapes[i].getTxBody();
+			if(textBody != null) {
+				CTTextParagraph[] paras = 
+					textBody.getPArray();
+				for (int j = 0; j < paras.length; j++) {
+					CTRegularTextRun[] textRuns =
+						paras[j].getRArray();
+					for (int k = 0; k < textRuns.length; k++) {
+						text.append( textRuns[k].getT() );
+					}
+					// End each paragraph with a new line
+					text.append("\n");
+				}
+			}
+		}
+	}
+}
--- a/src/scratchpad/ooxml-testcases/org/apache/poi/hslf/TestHSLFXML.java
+++ b/src/scratchpad/ooxml-testcases/org/apache/poi/hslf/TestHSLFXML.java
@ -21,7 +21,6 @@ import java.io.File;
 import org.apache.poi.hxf.HXFDocument;
 import org.openxml4j.opc.Package;
 import org.openxml4j.opc.PackagePart;
-import org.openxmlformats.schemas.presentationml.x2006.main.CTSlideIdList;
 import org.openxmlformats.schemas.presentationml.x2006.main.CTSlideIdListEntry;
 import org.openxmlformats.schemas.presentationml.x2006.main.CTSlideMasterIdListEntry;

--- a/src/scratchpad/ooxml-testcases/org/apache/poi/hslf/extractor/TestHXFPowerPointExtractor.java
+++ b/src/scratchpad/ooxml-testcases/org/apache/poi/hslf/extractor/TestHXFPowerPointExtractor.java
@ -0,0 +1,101 @@
+/* ====================================================================
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+==================================================================== */
+package org.apache.poi.hslf.extractor;
+
+import java.io.File;
+
+import org.apache.poi.hslf.HSLFXML;
+import org.apache.poi.hslf.usermodel.HSLFXMLSlideShow;
+import org.apache.poi.hxf.HXFDocument;
+
+import junit.framework.TestCase;
+
+/**
+ * Tests for HXFPowerPointExtractor
+ */
+public class TestHXFPowerPointExtractor extends TestCase {
+	/**
+	 * Wrapper around the simple sample file
+	 */
+	private HSLFXML xmlA;
+
+	/**
+	 * Opens sample.pptx from the directory named by the
+	 *  HSLF.testdata.path system property
+	 */
+	protected void setUp() throws Exception {
+		super.setUp();
+		
+		String dataDir = System.getProperty("HSLF.testdata.path");
+		File fileA = new File(dataDir + File.separator + "sample.pptx");
+		
+		xmlA = new HSLFXML(HXFDocument.openPackage(fileA));
+	}
+
+	/**
+	 * Checks the text extracted from the simple file, for
+	 *  slides only, notes only, and both together
+	 */
+	public void testGetSimpleText() throws Exception {
+		// Both kinds of constructor should be usable
+		new HXFPowerPointExtractor(xmlA.getPackage());
+		new HXFPowerPointExtractor(new HSLFXMLSlideShow(xmlA));
+		
+		HXFPowerPointExtractor extractor = 
+			new HXFPowerPointExtractor(xmlA.getPackage());
+		extractor.getText();
+		
+		// Text expected from the shapes of each of the two slides
+		String firstSlide =
+			"Lorem ipsum dolor sit amet\n" +
+			"Nunc at risus vel erat tempus posuere. Aenean non ante.\n";
+		String secondSlide =
+			"Lorem ipsum dolor sit amet\n" +
+			"Lorem\n" +
+			"ipsum\n" +
+			"dolor\n" +
+			"sit\n" +
+			"amet\n";
+		
+		// Default extraction: check the basics
+		String text = extractor.getText();
+		assertTrue(text.length() > 0);
+		assertTrue(text.startsWith("Lorem ipsum dolor sit amet\n"));
+		assertTrue(text.endsWith("amet\n\n\n\n"));
+		
+		// Slides without their notes
+		String slidesOnly = firstSlide + "\n" + secondSlide + "\n";
+		text = extractor.getText(true, false);
+		assertEquals(slidesOnly, text);
+		
+		// Notes without their slides
+		String notesOnly = "\n\n\n\n";
+		text = extractor.getText(false, true);
+		assertEquals(notesOnly, text);
+		
+		// Slides and notes together
+		String slidesAndNotes = firstSlide + "\n\n\n" + secondSlide + "\n\n\n";
+		text = extractor.getText(true, true);
+		assertEquals(slidesAndNotes, text);
+	}
+}