OOXML pptx text extractor, and test. Also add jar-ooxml ant task
git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@607572 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
e5fe6f32bd
commit
c6d225103f
15
build.xml
15
build.xml
@ -1124,6 +1124,21 @@ FORREST_HOME environment variable!</echo>
|
|||||||
</manifest>
|
</manifest>
|
||||||
</jar>
|
</jar>
|
||||||
</target>
|
</target>
|
||||||
|
<!--
  jar-ooxml: bundles everything under ${ooxml.output.dir} plus the legal/
  directory into ${dist.dir}/${jar.name}-ooxml-${version.id}-${DSTAMP}.jar.
  Runs after compile-ooxml. The manifest carries the same specification and
  implementation attributes (title, version, vendor) for the jar.
-->
<target
    name="jar-ooxml"
    depends="compile-ooxml"
    description="Creates the ooxml jar files for distribution">
  <jar
      destfile="${dist.dir}/${jar.name}-ooxml-${version.id}-${DSTAMP}.jar">
    <!-- Jar contents: the ooxml build output, then the legal/ files -->
    <fileset dir="${ooxml.output.dir}" />
    <fileset dir="legal/" />
    <manifest>
      <attribute name="Built-By" value="${user.name}"/>
      <!-- Specification-* entries -->
      <attribute name="Specification-Title" value="Apache POI"/>
      <attribute name="Specification-Version" value="${version.id}-${DSTAMP}"/>
      <attribute name="Specification-Vendor" value="Apache"/>
      <!-- Implementation-* entries -->
      <attribute name="Implementation-Title" value="Apache POI"/>
      <attribute name="Implementation-Version" value="${version.id}-${DSTAMP}"/>
      <attribute name="Implementation-Vendor" value="Apache"/>
    </manifest>
  </jar>
</target>
|
||||||
|
|
||||||
<target name="dist" depends="clean, fail-unless-tools-are-available, compile, site, jar"
|
<target name="dist" depends="clean, fail-unless-tools-are-available, compile, site, jar"
|
||||||
description="Creates the entire distribution into build/dist, from scratch">
|
description="Creates the entire distribution into build/dist, from scratch">
|
||||||
|
@ -0,0 +1,122 @@
|
|||||||
|
/* ====================================================================
|
||||||
|
Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
contributor license agreements. See the NOTICE file distributed with
|
||||||
|
this work for additional information regarding copyright ownership.
|
||||||
|
The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
(the "License"); you may not use this file except in compliance with
|
||||||
|
the License. You may obtain a copy of the License at
|
||||||
|
|
||||||
|
http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
|
||||||
|
Unless required by applicable law or agreed to in writing, software
|
||||||
|
distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
See the License for the specific language governing permissions and
|
||||||
|
limitations under the License.
|
||||||
|
==================================================================== */
|
||||||
|
package org.apache.poi.hslf.extractor;
|
||||||
|
|
||||||
|
import java.io.File;
|
||||||
|
import java.io.IOException;
|
||||||
|
|
||||||
|
import org.apache.poi.POIXMLTextExtractor;
|
||||||
|
import org.apache.poi.hslf.HSLFXML;
|
||||||
|
import org.apache.poi.hslf.usermodel.HSLFXMLSlideShow;
|
||||||
|
import org.apache.poi.hxf.HXFDocument;
|
||||||
|
import org.apache.xmlbeans.XmlException;
|
||||||
|
import org.openxml4j.exceptions.OpenXML4JException;
|
||||||
|
import org.openxml4j.opc.Package;
|
||||||
|
import org.openxmlformats.schemas.drawingml.x2006.main.CTRegularTextRun;
|
||||||
|
import org.openxmlformats.schemas.drawingml.x2006.main.CTTextBody;
|
||||||
|
import org.openxmlformats.schemas.drawingml.x2006.main.CTTextParagraph;
|
||||||
|
import org.openxmlformats.schemas.presentationml.x2006.main.CTGroupShape;
|
||||||
|
import org.openxmlformats.schemas.presentationml.x2006.main.CTNotesSlide;
|
||||||
|
import org.openxmlformats.schemas.presentationml.x2006.main.CTShape;
|
||||||
|
import org.openxmlformats.schemas.presentationml.x2006.main.CTSlide;
|
||||||
|
import org.openxmlformats.schemas.presentationml.x2006.main.CTSlideIdListEntry;
|
||||||
|
|
||||||
|
public class HXFPowerPointExtractor extends POIXMLTextExtractor {
|
||||||
|
private HSLFXMLSlideShow slideshow;
|
||||||
|
|
||||||
|
public HXFPowerPointExtractor(Package container) throws XmlException, OpenXML4JException, IOException {
|
||||||
|
this(new HSLFXMLSlideShow(
|
||||||
|
new HSLFXML(container)
|
||||||
|
));
|
||||||
|
}
|
||||||
|
public HXFPowerPointExtractor(HSLFXMLSlideShow slideshow) {
|
||||||
|
super(slideshow);
|
||||||
|
this.slideshow = slideshow;
|
||||||
|
}
|
||||||
|
|
||||||
|
public static void main(String[] args) throws Exception {
|
||||||
|
if(args.length < 1) {
|
||||||
|
System.err.println("Use:");
|
||||||
|
System.err.println(" HXFPowerPointExtractor <filename.pptx>");
|
||||||
|
System.exit(1);
|
||||||
|
}
|
||||||
|
POIXMLTextExtractor extractor =
|
||||||
|
new HXFPowerPointExtractor(HXFDocument.openPackage(
|
||||||
|
new File(args[0])
|
||||||
|
));
|
||||||
|
System.out.println(extractor.getText());
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Gets the slide and notes text
|
||||||
|
*/
|
||||||
|
public String getText() {
|
||||||
|
return getText(true, true);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Gets the requested text from the file
|
||||||
|
* @param slideText Should we retrieve text from slides?
|
||||||
|
* @param notesText Should we retrieve text from notes?
|
||||||
|
*/
|
||||||
|
public String getText(boolean slideText, boolean notesText) {
|
||||||
|
StringBuffer text = new StringBuffer();
|
||||||
|
|
||||||
|
CTSlideIdListEntry[] slideRefs =
|
||||||
|
slideshow._getHSLFXML().getSlideReferences().getSldIdArray();
|
||||||
|
for (int i = 0; i < slideRefs.length; i++) {
|
||||||
|
try {
|
||||||
|
CTSlide slide =
|
||||||
|
slideshow._getHSLFXML().getSlide(slideRefs[i]);
|
||||||
|
CTNotesSlide notes =
|
||||||
|
slideshow._getHSLFXML().getNotes(slideRefs[i]);
|
||||||
|
|
||||||
|
if(slideText) {
|
||||||
|
extractText(slide.getCSld().getSpTree(), text);
|
||||||
|
}
|
||||||
|
if(notesText && notes != null) {
|
||||||
|
extractText(notes.getCSld().getSpTree(), text);
|
||||||
|
}
|
||||||
|
} catch(Exception e) {
|
||||||
|
throw new RuntimeException(e);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return text.toString();
|
||||||
|
}
|
||||||
|
|
||||||
|
private void extractText(CTGroupShape gs, StringBuffer text) {
|
||||||
|
CTShape[] shapes = gs.getSpArray();
|
||||||
|
for (int i = 0; i < shapes.length; i++) {
|
||||||
|
CTTextBody textBody =
|
||||||
|
shapes[i].getTxBody();
|
||||||
|
if(textBody != null) {
|
||||||
|
CTTextParagraph[] paras =
|
||||||
|
textBody.getPArray();
|
||||||
|
for (int j = 0; j < paras.length; j++) {
|
||||||
|
CTRegularTextRun[] textRuns =
|
||||||
|
paras[j].getRArray();
|
||||||
|
for (int k = 0; k < textRuns.length; k++) {
|
||||||
|
text.append( textRuns[k].getT() );
|
||||||
|
}
|
||||||
|
// End each paragraph with a new line
|
||||||
|
text.append("\n");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
@ -21,7 +21,6 @@ import java.io.File;
|
|||||||
import org.apache.poi.hxf.HXFDocument;
|
import org.apache.poi.hxf.HXFDocument;
|
||||||
import org.openxml4j.opc.Package;
|
import org.openxml4j.opc.Package;
|
||||||
import org.openxml4j.opc.PackagePart;
|
import org.openxml4j.opc.PackagePart;
|
||||||
import org.openxmlformats.schemas.presentationml.x2006.main.CTSlideIdList;
|
|
||||||
import org.openxmlformats.schemas.presentationml.x2006.main.CTSlideIdListEntry;
|
import org.openxmlformats.schemas.presentationml.x2006.main.CTSlideIdListEntry;
|
||||||
import org.openxmlformats.schemas.presentationml.x2006.main.CTSlideMasterIdListEntry;
|
import org.openxmlformats.schemas.presentationml.x2006.main.CTSlideMasterIdListEntry;
|
||||||
|
|
||||||
|
@ -0,0 +1,101 @@
|
|||||||
|
/* ====================================================================
|
||||||
|
Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
contributor license agreements. See the NOTICE file distributed with
|
||||||
|
this work for additional information regarding copyright ownership.
|
||||||
|
The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
(the "License"); you may not use this file except in compliance with
|
||||||
|
the License. You may obtain a copy of the License at
|
||||||
|
|
||||||
|
http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
|
||||||
|
Unless required by applicable law or agreed to in writing, software
|
||||||
|
distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
See the License for the specific language governing permissions and
|
||||||
|
limitations under the License.
|
||||||
|
==================================================================== */
|
||||||
|
package org.apache.poi.hslf.extractor;
|
||||||
|
|
||||||
|
import java.io.File;
|
||||||
|
|
||||||
|
import org.apache.poi.hslf.HSLFXML;
|
||||||
|
import org.apache.poi.hslf.usermodel.HSLFXMLSlideShow;
|
||||||
|
import org.apache.poi.hxf.HXFDocument;
|
||||||
|
|
||||||
|
import junit.framework.TestCase;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Tests for HXFPowerPointExtractor
|
||||||
|
*/
|
||||||
|
public class TestHXFPowerPointExtractor extends TestCase {
|
||||||
|
/**
|
||||||
|
* A simple file
|
||||||
|
*/
|
||||||
|
private HSLFXML xmlA;
|
||||||
|
|
||||||
|
protected void setUp() throws Exception {
|
||||||
|
super.setUp();
|
||||||
|
|
||||||
|
File fileA = new File(
|
||||||
|
System.getProperty("HSLF.testdata.path") +
|
||||||
|
File.separator + "sample.pptx"
|
||||||
|
);
|
||||||
|
|
||||||
|
xmlA = new HSLFXML(HXFDocument.openPackage(fileA));
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Get text out of the simple file
|
||||||
|
*/
|
||||||
|
public void testGetSimpleText() throws Exception {
|
||||||
|
new HXFPowerPointExtractor(xmlA.getPackage());
|
||||||
|
new HXFPowerPointExtractor(new HSLFXMLSlideShow(xmlA));
|
||||||
|
|
||||||
|
HXFPowerPointExtractor extractor =
|
||||||
|
new HXFPowerPointExtractor(xmlA.getPackage());
|
||||||
|
extractor.getText();
|
||||||
|
|
||||||
|
String text = extractor.getText();
|
||||||
|
assertTrue(text.length() > 0);
|
||||||
|
|
||||||
|
// Check Basics
|
||||||
|
assertTrue(text.startsWith("Lorem ipsum dolor sit amet\n"));
|
||||||
|
assertTrue(text.endsWith("amet\n\n\n\n"));
|
||||||
|
|
||||||
|
// Just slides, no notes
|
||||||
|
text = extractor.getText(true, false);
|
||||||
|
assertEquals(
|
||||||
|
"Lorem ipsum dolor sit amet\n" +
|
||||||
|
"Nunc at risus vel erat tempus posuere. Aenean non ante.\n" +
|
||||||
|
"\n" +
|
||||||
|
"Lorem ipsum dolor sit amet\n" +
|
||||||
|
"Lorem\n" +
|
||||||
|
"ipsum\n" +
|
||||||
|
"dolor\n" +
|
||||||
|
"sit\n" +
|
||||||
|
"amet\n" +
|
||||||
|
"\n", text
|
||||||
|
);
|
||||||
|
|
||||||
|
// Just notes, no slides
|
||||||
|
text = extractor.getText(false, true);
|
||||||
|
assertEquals(
|
||||||
|
"\n\n\n\n", text
|
||||||
|
);
|
||||||
|
|
||||||
|
// Both
|
||||||
|
text = extractor.getText(true, true);
|
||||||
|
assertEquals(
|
||||||
|
"Lorem ipsum dolor sit amet\n" +
|
||||||
|
"Nunc at risus vel erat tempus posuere. Aenean non ante.\n" +
|
||||||
|
"\n\n\n" +
|
||||||
|
"Lorem ipsum dolor sit amet\n" +
|
||||||
|
"Lorem\n" +
|
||||||
|
"ipsum\n" +
|
||||||
|
"dolor\n" +
|
||||||
|
"sit\n" +
|
||||||
|
"amet\n" +
|
||||||
|
"\n\n\n", text
|
||||||
|
);
|
||||||
|
}
|
||||||
|
}
|
Loading…
Reference in New Issue
Block a user