As mentioned on dev@ - zap the old, initial OOXML code out of trunk. It isn't compatible with the new code in the ooxml branch, which may catch users out. Anyone wanting OOXML support should be using the code in the branch, to get updates and lots more features.

git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@650915 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Nick Burch 2008-04-23 15:45:42 +00:00
parent 5e615eeeba
commit 71413188e1
23 changed files with 2 additions and 2496 deletions

176
build.xml
View File

@ -74,7 +74,6 @@ under the License.
<property name="main.output.dir" location="build/classes"/>
<property name="main.output.test.dir" location="build/test-classes"/>
<property name="main.lib" location="lib"/>
<property name="ooxml.lib" location="ooxml-lib"/>
<property name="main.reports.test" location="build/test-results"/>
<property name="main.jar1.dir" location="${main.lib}/commons-logging-1.1.jar"/>
<property name="main.jar1.url" value="${repository}/commons-logging/jars/commons-logging-1.1.jar"/>
@ -124,33 +123,6 @@ under the License.
<property name="examples.jar3.url" value="${repository}/commons-lang/jars/commons-lang-2.1.jar"/>
<property name="examples.testokfile" location="build/examples-testokfile.txt"/>
<!-- Experimental OOXML support: -->
<!-- Source, test, report and output locations used by the OOXML targets -->
<property name="ooxml.src" location="src/scratchpad/ooxml-src"/>
<property name="ooxml.src.test" location="src/scratchpad/ooxml-testcases"/>
<property name="ooxml.reports.test" location="build/ooxml-test-results"/>
<property name="ooxml.output.dir" location="build/ooxml-classes"/>
<property name="ooxml.output.test.dir" location="build/ooxml-test-classes"/>
<property name="ooxml.testokfile" location="build/ooxml-testokfile.txt"/>
<!-- For each jar kept in ${ooxml.lib}: its local path (.dir) and its download URL (.url) -->
<property name="ooxml.jar1.dir" location="${ooxml.lib}/dom4j-1.6.1.jar"/>
<property name="ooxml.jar1.url" value="${repository}/dom4j/jars/dom4j-1.6.1.jar"/>
<property name="ooxml.jar2.dir" location="${ooxml.lib}/jaxen-1.1.jar"/>
<property name="ooxml.jar2.url" value="${repository}/jaxen/jars/jaxen-1.1.jar"/>
<property name="ooxml.jar3.dir" location="${ooxml.lib}/xmlbeans-2.3.0.jar"/>
<property name="ooxml.jar3.url" value="${repository}/org.apache.xmlbeans/jars/xmlbeans-2.3.0.jar"/>
<property name="ooxml.jar4.dir" location="${ooxml.lib}/jsr173_1.0_api.jar"/>
<property name="ooxml.jar4.url" value="${repository}/xmlbeans/jars/jsr173_1.0_api.jar"/>
<!-- No official release of openxml4j yet -->
<!-- NOTE(review): the local file name (alpha-080124) and the file name in the URL (prealpha-071224) differ; confirm which build is intended -->
<property name="ooxml.jar5.dir" location="${ooxml.lib}/openxml4j-bin-alpha-080124.jar"/>
<property name="ooxml.jar5.url" value="http://people.apache.org/~nick/openxml4j-bin-prealpha-071224.jar"/>
<!-- See http://www.ecma-international.org/publications/standards/Ecma-376.htm -->
<!-- "Copy these file(s), free of charge" -->
<!-- ozip is the downloaded outer zip, izip the schema zip held inside it, and jar the compiled schemas -->
<property name="ooxml.xsds.ozip" location="${ooxml.lib}/OfficeOpenXML-Part4.zip"/>
<property name="ooxml.xsds.izip" location="${ooxml.lib}/OfficeOpenXML-XMLSchema.zip"/>
<property name="ooxml.xsds.url" value="http://www.ecma-international.org/publications/files/ECMA-ST/Office%20Open%20XML%20Part%204%20(DOCX).zip" />
<property name="ooxml.xsds.jar" location="${ooxml.lib}/ooxml-schemas.jar"/>
<property name="build.site" location="build/tmp/site/build/site"/>
<property name="build.site.src" location="build/tmp/site"/>
<property name="junit.report.dir" location="${build.site}/junit"/>
@ -195,15 +167,6 @@ under the License.
<pathelement location="${contrib.output.test.dir}"/>
</path>
<!-- Classpath for the OOXML code: the main and scratchpad classpaths, plus every jar found in ${ooxml.lib} -->
<path id="ooxml.classpath">
<path refid="main.classpath"/>
<path refid="scratchpad.classpath"/>
<fileset dir="${ooxml.lib}">
<include name="*.jar" />
</fileset>
</path>
<path id="examples.classpath">
<path refid="main.classpath"/>
<pathelement location="${main.output.dir}"/>
@ -268,15 +231,12 @@ under the License.
<mkdir dir="${scratchpad.output.dir}"/>
<mkdir dir="${contrib.output.dir}"/>
<mkdir dir="${examples.output.dir}"/>
<mkdir dir="${ooxml.output.dir}"/>
<mkdir dir="${main.output.test.dir}"/>
<mkdir dir="${contrib.output.test.dir}"/>
<mkdir dir="${scratchpad.output.test.dir}"/>
<mkdir dir="${ooxml.output.test.dir}"/>
<mkdir dir="${main.reports.test}"/>
<mkdir dir="${scratchpad.reports.test}"/>
<mkdir dir="${contrib.reports.test}"/>
<mkdir dir="${ooxml.reports.test}"/>
<mkdir dir="${junit.report.dir}"/>
<mkdir dir="${jdepend.report.dir}"/>
<mkdir dir="${jdepend.report.out.dir}"/>
@ -311,11 +271,6 @@ under the License.
<available file="${contrib.jar2.dir}"/>
<available file="${contrib.jar3.dir}"/>
<available file="${junit.jar1.dir}"/>
<available file="${ooxml.jar1.dir}"/>
<available file="${ooxml.jar2.dir}"/>
<available file="${ooxml.jar3.dir}"/>
<available file="${ooxml.jar4.dir}"/>
<available file="${ooxml.jar5.dir}"/>
</and>
<isset property="disconnected"/>
</or>
@ -330,67 +285,8 @@ under the License.
<get src="${contrib.jar2.url}" dest="${contrib.jar2.dir}"/>
<get src="${contrib.jar3.url}" dest="${contrib.jar3.dir}"/>
<get src="${junit.jar1.url}" dest="${junit.jar1.dir}"/>
<get src="${ooxml.jar1.url}" dest="${ooxml.jar1.dir}"/>
<get src="${ooxml.jar2.url}" dest="${ooxml.jar2.dir}"/>
<get src="${ooxml.jar3.url}" dest="${ooxml.jar3.dir}"/>
<get src="${ooxml.jar4.url}" dest="${ooxml.jar4.dir}"/>
<get src="${ooxml.jar5.url}" dest="${ooxml.jar5.dir}"/>
</target>
<!-- Sets ooxml-xsds.present when the schema zip ${ooxml.xsds.izip} already exists, -->
<!-- or when the "disconnected" property is set -->
<target name="check-ooxml-xsds">
<condition property="ooxml-xsds.present">
<or>
<and>
<available file="${ooxml.xsds.izip}"/>
</and>
<isset property="disconnected"/>
</or>
</condition>
</target>
<!-- Downloads the outer zip from ${ooxml.xsds.url} and unpacks only the -->
<!-- OfficeOpenXML-XMLSchema.zip entry from it into ${ooxml.lib}. -->
<!-- Skipped when ooxml-xsds.present is set. -->
<target name="fetch-ooxml-xsds" unless="ooxml-xsds.present"
description="Fetches needed OOXML xsd files from the Internet">
<get src="${ooxml.xsds.url}" dest="${ooxml.xsds.ozip}"/>
<unzip src="${ooxml.xsds.ozip}" dest="${ooxml.lib}">
<patternset>
<include name="OfficeOpenXML-XMLSchema.zip" />
</patternset>
</unzip>
</target>
<!-- Sets ooxml-compiled-xsds.present when the compiled schema jar ${ooxml.xsds.jar} -->
<!-- already exists, or when the "disconnected" property is set -->
<target name="check-compiled-ooxml-xsds">
<condition property="ooxml-compiled-xsds.present">
<or>
<and>
<available file="${ooxml.xsds.jar}"/>
</and>
<isset property="disconnected"/>
</or>
</condition>
</target>
<!-- Unzips ${ooxml.xsds.izip} into build/ooxml-xsds/ and compiles the schemas found -->
<!-- there into ${ooxml.xsds.jar} with the "xmlbean" Ant task. -->
<!-- Skipped when ooxml-compiled-xsds.present is set. -->
<target name="compile-ooxml-xsds" unless="ooxml-compiled-xsds.present"
depends="check-jars,fetch-jars,check-ooxml-xsds,fetch-ooxml-xsds,check-compiled-ooxml-xsds"
description="Unpacks the OOXML xsd files, and compiles them into XmlBeans">
<!-- The task class is loaded from the ooxml.jar3 and ooxml.jar4 jars -->
<taskdef name="xmlbean"
classname="org.apache.xmlbeans.impl.tool.XMLBean"
classpath="${ooxml.jar3.dir}:${ooxml.jar4.dir}" />
<unzip src="${ooxml.xsds.izip}" dest="build/ooxml-xsds/" />
<!--
schema="build/ooxml-xsds/"
schema="build/ooxml-xsds/sml-workbook.xsd"
-->
<!-- failonerror is false, so a failure of this task does not stop the build; -->
<!-- the task is forked with a 512m maximum memory setting -->
<xmlbean
schema="build/ooxml-xsds/"
destfile="${ooxml.xsds.jar}"
javasource="1.4"
failonerror="false"
fork="true"
memoryMaximumSize="512m"
>
<classpath refid="ooxml.classpath"/>
</xmlbean>
</target>
<target name="compile" depends="init, compile-main, compile-scratchpad,
compile-contrib, compile-examples"
description="Compiles the POI main classes, scratchpad, contrib, and examples"/>
@ -473,24 +369,6 @@ under the License.
</javac>
</target>
<!-- Compiles the OOXML sources into ${ooxml.output.dir}, then the OOXML test -->
<!-- sources into ${ooxml.output.test.dir} -->
<target name="compile-ooxml" depends="init, check-ooxml-xsds, fetch-ooxml-xsds, compile-ooxml-xsds, compile-main">
<!-- openxml4j requires java 1.5, so must we, for now -->
<javac target="1.5" source="1.5"
destdir="${ooxml.output.dir}" debug="on" srcdir="${ooxml.src}">
<classpath refid="ooxml.classpath"/>
</javac>
<!-- The tests are compiled against the classes built above, plus junit -->
<javac target="1.5" source="1.5"
failonerror="true" destdir="${ooxml.output.test.dir}" debug="on"
fork="yes" srcdir="${ooxml.src.test}">
<classpath>
<path refid="ooxml.classpath"/>
<pathelement location="${ooxml.output.dir}"/>
<pathelement location="${junit.jar1.dir}"/>
</classpath>
</javac>
</target>
<target name="test" depends="test-main,test-scratchpad,test-contrib"
description="Tests main, contrib and scratchpad"/>
@ -725,43 +603,6 @@ under the License.
<echo file="${contrib.testokfile}" append="false" message="testok"/>
</target>
<!-- Sets ooxml.test.notRequired when ${ooxml.testokfile} is up to date with -->
<!-- respect to the OOXML sources and test sources -->
<target name="-test-ooxml-check">
<uptodate property="ooxml.test.notRequired" targetfile="${ooxml.testokfile}">
<srcfiles dir="${ooxml.src}"/>
<srcfiles dir="${ooxml.src.test}"/>
</uptodate>
</target>
<!-- Runs the OOXML unit tests, unless ooxml.test.notRequired is set. -->
<!-- Sets ooxml.test.failed if any test fails. -->
<target name="test-ooxml" depends="compile-main,compile-ooxml,-test-ooxml-check" unless="ooxml.test.notRequired">
<junit printsummary="yes" fork="no" haltonfailure="${halt.on.test.failure}" failureproperty="ooxml.test.failed">
<classpath>
<path refid="ooxml.classpath"/>
<pathelement location="${main.output.dir}"/>
<pathelement location="${ooxml.output.dir}"/>
<pathelement location="${ooxml.output.test.dir}"/>
<pathelement location="${junit.jar1.dir}"/>
</classpath>
<!-- Test data directories, handed to the tests as system properties -->
<sysproperty key="HSSF.testdata.path" file="${main.src.test}/org/apache/poi/hssf/data"/>
<sysproperty key="HWPF.testdata.path" file="${scratchpad.src.test}/org/apache/poi/hwpf/data"/>
<sysproperty key="HSLF.testdata.path" file="${scratchpad.src.test}/org/apache/poi/hslf/data"/>
<sysproperty key="java.awt.headless" value="true"/>
<formatter type="plain"/>
<formatter type="xml"/>
<!-- Every Test*.java under the test sources is run, apart from AllTests.java -->
<batchtest todir="${ooxml.reports.test}">
<fileset dir="${ooxml.src.test}">
<include name="**/Test*.java"/>
<exclude name="**/AllTests.java"/>
</fileset>
</batchtest>
</junit>
<!-- Drop the old ok file; it is only written again if no test failed -->
<delete file="${ooxml.testokfile}"/>
<antcall target="-test-ooxml-write-testfile"/>
</target>
<!-- Writes "testok" to ${ooxml.testokfile}, unless ooxml.test.failed is set -->
<target name="-test-ooxml-write-testfile" unless="ooxml.test.failed">
<echo file="${ooxml.testokfile}" append="false" message="testok"/>
</target>
<target name="-check-docs">
<uptodate property="main.docs.notRequired" targetfile="${build.site}/index.html">
<srcfiles dir="${build.site.src}"/>
@ -1148,21 +989,6 @@ FORREST_HOME environment variable!</echo>
</manifest>
</jar>
</target>
<!-- Packages the contents of ${ooxml.output.dir} and of legal/ into a dated -->
<!-- ooxml jar under ${dist.dir}, with specification and implementation manifest entries -->
<target name="jar-ooxml" depends="compile-ooxml" description="Creates the ooxml jar files for distribution">
<jar destfile="${dist.dir}/${jar.name}-ooxml-${version.id}-${DSTAMP}.jar">
<fileset dir="${ooxml.output.dir}" />
<fileset dir="legal/" />
<manifest>
<attribute name="Built-By" value="${user.name}"/>
<attribute name="Specification-Title" value="Apache POI"/>
<attribute name="Specification-Version" value="${version.id}-${DSTAMP}"/>
<attribute name="Specification-Vendor" value="Apache"/>
<attribute name="Implementation-Title" value="Apache POI"/>
<attribute name="Implementation-Version" value="${version.id}-${DSTAMP}"/>
<attribute name="Implementation-Vendor" value="Apache"/>
</manifest>
</jar>
</target>
<target name="dist" depends="clean, fail-unless-tools-are-available, compile, site, jar"
description="Creates the entire distribution into build/dist, from scratch">
@ -1171,6 +997,7 @@ FORREST_HOME environment variable!</echo>
<zip destfile="${dist.dir}/${jar.name}-bin-${version.id}-${DSTAMP}.zip">
<zipfileset dir="legal/" prefix="${zipdir}" />
<zipfileset dir="lib/" prefix="${zipdir}/lib" />
<zipfileset dir="${build.site}" prefix="${zipdir}/docs"/>
<zipfileset file="${dist.dir}/${jar.name}-${version.id}-${DSTAMP}.jar" prefix="${zipdir}" />
<zipfileset file="${dist.dir}/${jar.name}-contrib-${version.id}-${DSTAMP}.jar" prefix="${zipdir}" />
@ -1193,6 +1020,7 @@ FORREST_HOME environment variable!</echo>
<tar destfile="${dist.dir}/${jar.name}-bin-${version.id}-${DSTAMP}.tar.gz"
compression="gzip">
<tarfileset dir="legal/" prefix="${zipdir}" />
<tarfileset dir="lib/" prefix="${zipdir}/lib" />
<tarfileset dir="${build.site}" prefix="${zipdir}/docs"/>
<tarfileset file="${dist.dir}/${jar.name}-${version.id}-${DSTAMP}.jar" prefix="${zipdir}" />
<tarfileset file="${dist.dir}/${jar.name}-contrib-${version.id}-${DSTAMP}.jar" prefix="${zipdir}" />

View File

@ -1,45 +0,0 @@
/* ====================================================================
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==================================================================== */
package org.apache.poi;
import org.apache.poi.hxf.HXFDocument;
/**
 * Base class for every UserModel POI XML (ooxml)
 *  implementation.
 * Provides, for the XML based classes, a function
 *  similar to that of {@link POIDocument}.
 */
public abstract class POIXMLDocument {
    /** The raw low level document wrapped by this object */
    private HXFDocument document;

    /**
     * Wraps the supplied raw HXFDocument in a new
     *  POI XML Document.
     *
     * @param document the underlying raw document to wrap
     */
    protected POIXMLDocument(HXFDocument document) {
        this.document = document;
    }

    /**
     * Exposes the wrapped HXFDocument, which is
     *  typically wanted by unit tests.
     *
     * @return the document that was handed to the constructor
     */
    public HXFDocument _getHXFDocument() {
        return this.document;
    }
}

View File

@ -1,31 +0,0 @@
/* ====================================================================
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==================================================================== */
package org.apache.poi;
/**
 * Common parent of the text extractors that work
 *  on a {@link POIXMLDocument}.
 */
public abstract class POIXMLTextExtractor extends POITextExtractor {
    /** The POIXMLDocument that is currently open */
    protected POIXMLDocument document;

    /**
     * Builds a text extractor for the supplied document.
     *
     * @param document the document this extractor works on
     */
    public POIXMLTextExtractor(POIXMLDocument document) {
        // The parent constructor is passed null; the document
        //  is held in this class's own field instead
        super(null);
        this.document = document;
    }
}

View File

@ -1,148 +0,0 @@
/* ====================================================================
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==================================================================== */
package org.apache.poi.hslf;
import java.io.IOException;
import org.apache.poi.hxf.HXFDocument;
import org.apache.xmlbeans.XmlException;
import org.openxml4j.exceptions.InvalidFormatException;
import org.openxml4j.exceptions.OpenXML4JException;
import org.openxml4j.opc.Package;
import org.openxml4j.opc.PackagePart;
import org.openxml4j.opc.PackageRelationshipCollection;
import org.openxmlformats.schemas.presentationml.x2006.main.CTNotesSlide;
import org.openxmlformats.schemas.presentationml.x2006.main.CTPresentation;
import org.openxmlformats.schemas.presentationml.x2006.main.CTSlide;
import org.openxmlformats.schemas.presentationml.x2006.main.CTSlideIdList;
import org.openxmlformats.schemas.presentationml.x2006.main.CTSlideIdListEntry;
import org.openxmlformats.schemas.presentationml.x2006.main.CTSlideMaster;
import org.openxmlformats.schemas.presentationml.x2006.main.CTSlideMasterIdList;
import org.openxmlformats.schemas.presentationml.x2006.main.CTSlideMasterIdListEntry;
import org.openxmlformats.schemas.presentationml.x2006.main.NotesDocument;
import org.openxmlformats.schemas.presentationml.x2006.main.PresentationDocument;
import org.openxmlformats.schemas.presentationml.x2006.main.SldDocument;
import org.openxmlformats.schemas.presentationml.x2006.main.SldMasterDocument;
/**
 * Experimental class to do low level processing
 *  of pptx files.
 *
 * If you are using these low level classes, then you
 *  will almost certainly need to refer to the OOXML
 *  specifications from
 *  http://www.ecma-international.org/publications/standards/Ecma-376.htm
 *
 * WARNING - APIs expected to change rapidly
 */
public class HSLFXML extends HXFDocument {
    public static final String MAIN_CONTENT_TYPE = "application/vnd.openxmlformats-officedocument.presentationml.presentation.main+xml";
    public static final String NOTES_CONTENT_TYPE = "application/vnd.openxmlformats-officedocument.presentationml.notesSlide+xml";
    public static final String SLIDE_CONTENT_TYPE = "application/vnd.openxmlformats-officedocument.presentationml.slide+xml";
    public static final String SLIDE_LAYOUT_RELATION_TYPE = "http://schemas.openxmlformats.org/officeDocument/2006/relationships/slideLayout";
    public static final String NOTES_RELATION_TYPE = "http://schemas.openxmlformats.org/officeDocument/2006/relationships/notesSlide";

    /** The parsed contents of the base part of the package */
    private PresentationDocument presentationDoc;

    /**
     * Opens the presentation held in the supplied package, and
     *  parses its base part.
     *
     * @param container the package holding the pptx file
     * @throws OpenXML4JException if thrown by the parent constructor
     * @throws IOException if the base part can't be read or closed
     * @throws XmlException if the base part isn't a valid presentation
     */
    public HSLFXML(Package container) throws OpenXML4JException, IOException, XmlException {
        super(container, MAIN_CONTENT_TYPE);

        java.io.InputStream in = basePart.getInputStream();
        try {
            presentationDoc = PresentationDocument.Factory.parse(in);
        } finally {
            // The part's stream is ours to release once parsing is over
            closePartStream(in);
        }
    }

    /**
     * Returns the low level presentation base object
     */
    public CTPresentation getPresentation() {
        return presentationDoc.getPresentation();
    }

    /**
     * Returns the references from the presentation to its
     *  slides.
     * You'll need these to figure out the slide ordering,
     *  and to get at the actual slides themselves
     */
    public CTSlideIdList getSlideReferences() {
        return getPresentation().getSldIdLst();
    }

    /**
     * Returns the references from the presentation to its
     *  slide masters.
     * You'll need these to get at the actual slide
     *  masters themselves
     */
    public CTSlideMasterIdList getSlideMasterReferences() {
        return getPresentation().getSldMasterIdLst();
    }

    /**
     * Returns the low level slide master object from
     *  the supplied slide master reference
     *
     * @param master the reference to the wanted slide master
     * @throws IOException if the master's part can't be read or closed
     * @throws XmlException if the master's part can't be parsed
     */
    public CTSlideMaster getSlideMaster(CTSlideMasterIdListEntry master) throws IOException, XmlException {
        PackagePart masterPart =
            getRelatedPackagePart(master.getId2());

        java.io.InputStream in = masterPart.getInputStream();
        try {
            return SldMasterDocument.Factory.parse(in).getSldMaster();
        } finally {
            closePartStream(in);
        }
    }

    /**
     * Returns the low level slide object from
     *  the supplied slide reference
     *
     * @param slide the reference to the wanted slide
     * @throws IOException if the slide's part can't be read or closed
     * @throws XmlException if the slide's part can't be parsed
     */
    public CTSlide getSlide(CTSlideIdListEntry slide) throws IOException, XmlException {
        PackagePart slidePart =
            getRelatedPackagePart(slide.getId2());

        java.io.InputStream in = slidePart.getInputStream();
        try {
            return SldDocument.Factory.parse(in).getSld();
        } finally {
            closePartStream(in);
        }
    }

    /**
     * Returns the low level notes object for the given
     *  slide, as found from the supplied slide reference
     *
     * @param slide the reference to the slide whose notes are wanted
     * @return the notes, or null if the slide has none
     * @throws IOException if the notes part can't be read or closed
     * @throws XmlException if the notes part can't be parsed
     * @throws IllegalStateException if the slide's relationships can't
     *  be read, or the slide has more than one notes relationship
     */
    public CTNotesSlide getNotes(CTSlideIdListEntry slide) throws IOException, XmlException {
        PackagePart slidePart =
            getRelatedPackagePart(slide.getId2());

        PackageRelationshipCollection notes;
        try {
            notes = slidePart.getRelationshipsByType(NOTES_RELATION_TYPE);
        } catch(InvalidFormatException e) {
            throw new IllegalStateException(e);
        }

        if(notes.size() == 0) {
            // No notes for this slide
            return null;
        }
        if(notes.size() > 1) {
            throw new IllegalStateException("Expecting 0 or 1 notes for a slide, but found " + notes.size());
        }

        PackagePart notesPart =
            getPackagePart(notes.getRelationship(0));

        java.io.InputStream in = notesPart.getInputStream();
        try {
            return NotesDocument.Factory.parse(in).getNotes();
        } finally {
            closePartStream(in);
        }
    }

    /**
     * Closes the supplied part stream, tolerating a null stream
     */
    private static void closePartStream(java.io.InputStream stream) throws IOException {
        if(stream != null) {
            stream.close();
        }
    }
}

View File

@ -1,139 +0,0 @@
/* ====================================================================
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==================================================================== */
package org.apache.poi.hslf.extractor;
import java.io.File;
import java.io.IOException;
import org.apache.poi.POIXMLTextExtractor;
import org.apache.poi.hslf.HSLFXML;
import org.apache.poi.hslf.usermodel.HSLFXMLSlideShow;
import org.apache.poi.hxf.HXFDocument;
import org.apache.xmlbeans.XmlException;
import org.openxml4j.exceptions.OpenXML4JException;
import org.openxml4j.opc.Package;
import org.openxmlformats.schemas.drawingml.x2006.main.CTRegularTextRun;
import org.openxmlformats.schemas.drawingml.x2006.main.CTTextBody;
import org.openxmlformats.schemas.drawingml.x2006.main.CTTextParagraph;
import org.openxmlformats.schemas.presentationml.x2006.main.CTGroupShape;
import org.openxmlformats.schemas.presentationml.x2006.main.CTNotesSlide;
import org.openxmlformats.schemas.presentationml.x2006.main.CTShape;
import org.openxmlformats.schemas.presentationml.x2006.main.CTSlide;
import org.openxmlformats.schemas.presentationml.x2006.main.CTSlideIdListEntry;
/**
 * Extracts the text of the slides, and optionally of the
 *  notes, from a pptx slideshow
 */
public class HXFPowerPointExtractor extends POIXMLTextExtractor {
    private HSLFXMLSlideShow slideshow;
    private boolean slidesByDefault = true;
    private boolean notesByDefault = false;

    /**
     * Creates an extractor for the slideshow held in the
     *  supplied package
     */
    public HXFPowerPointExtractor(Package container) throws XmlException, OpenXML4JException, IOException {
        this(new HSLFXMLSlideShow(
                new HSLFXML(container)
        ));
    }

    /**
     * Creates an extractor for the supplied slideshow
     */
    public HXFPowerPointExtractor(HSLFXMLSlideShow slideshow) {
        super(slideshow);
        this.slideshow = slideshow;
    }

    /**
     * Command line entry point: prints the text of the file
     *  named by the first argument, or a usage message
     *  (with exit code 1) when no argument is given
     */
    public static void main(String[] args) throws Exception {
        if(args.length < 1) {
            System.err.println("Use:");
            System.err.println("  HXFPowerPointExtractor <filename.pptx>");
            System.exit(1);
        }
        POIXMLTextExtractor extractor =
            new HXFPowerPointExtractor(HXFDocument.openPackage(
                    new File(args[0])
            ));
        System.out.println(extractor.getText());
    }

    /**
     * Should a call to getText() return slide text?
     * Default is yes
     */
    public void setSlidesByDefault(boolean slidesByDefault) {
        this.slidesByDefault = slidesByDefault;
    }

    /**
     * Should a call to getText() return notes text?
     * Default is no
     */
    public void setNotesByDefault(boolean notesByDefault) {
        this.notesByDefault = notesByDefault;
    }

    /**
     * Gets the text selected by the current defaults: slide
     *  text but no notes text, unless changed through
     *  {@link #setSlidesByDefault(boolean)} or
     *  {@link #setNotesByDefault(boolean)}
     */
    public String getText() {
        return getText(slidesByDefault, notesByDefault);
    }

    /**
     * Gets the requested text from the file
     * @param slideText Should we retrieve text from slides?
     * @param notesText Should we retrieve text from notes?
     * @return the text, with a new line after each paragraph
     * @throws RuntimeException if a slide or its notes can't be
     *  read or parsed; the underlying exception is the cause
     */
    public String getText(boolean slideText, boolean notesText) {
        StringBuilder text = new StringBuilder();
        if(!slideText && !notesText) {
            // Nothing was asked for, so don't touch the file at all
            return text.toString();
        }

        HSLFXML hslfXML = slideshow._getHSLFXML();
        // The slide list may be missing when the presentation declares no slides
        if(hslfXML.getSlideReferences() == null) {
            return text.toString();
        }
        CTSlideIdListEntry[] slideRefs =
            hslfXML.getSlideReferences().getSldIdArray();

        for (int i = 0; i < slideRefs.length; i++) {
            try {
                // Only load and parse the parts that were asked for
                if(slideText) {
                    CTSlide slide = hslfXML.getSlide(slideRefs[i]);
                    extractText(slide.getCSld().getSpTree(), text);
                }
                if(notesText) {
                    CTNotesSlide notes = hslfXML.getNotes(slideRefs[i]);
                    if(notes != null) {
                        extractText(notes.getCSld().getSpTree(), text);
                    }
                }
            } catch(IOException e) {
                throw new RuntimeException("Unable to read slide at index " + i, e);
            } catch(XmlException e) {
                throw new RuntimeException("Unable to parse slide at index " + i, e);
            }
        }

        return text.toString();
    }

    /**
     * Appends the text of every text run of every shape held
     *  directly in the group to the buffer, ending each
     *  paragraph with a new line
     */
    private void extractText(CTGroupShape gs, StringBuilder text) {
        CTShape[] shapes = gs.getSpArray();
        for (int i = 0; i < shapes.length; i++) {
            CTTextBody textBody =
                shapes[i].getTxBody();
            if(textBody == null) {
                // Shape without any text
                continue;
            }

            CTTextParagraph[] paras =
                textBody.getPArray();
            for (int j = 0; j < paras.length; j++) {
                CTRegularTextRun[] textRuns =
                    paras[j].getRArray();
                for (int k = 0; k < textRuns.length; k++) {
                    text.append( textRuns[k].getT() );
                }
                // End each paragraph with a new line
                text.append("\n");
            }
        }
    }
}

View File

@ -1,39 +0,0 @@
/* ====================================================================
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==================================================================== */
package org.apache.poi.hslf.usermodel;
import org.apache.poi.POIXMLDocument;
import org.apache.poi.hslf.HSLFXML;
/**
 * High level representation of a ooxml slideshow.
 * Most users will construct this object first, whether
 *  they are reading or writing a slideshow. It is also
 *  the top level object for creating new slides/etc.
 */
public class HSLFXMLSlideShow extends POIXMLDocument {
    /** The low level document this slideshow is built on */
    private HSLFXML hslfXML;

    /**
     * Builds a slideshow on top of the supplied low
     *  level document.
     *
     * @param xml the low level pptx document
     */
    public HSLFXMLSlideShow(HSLFXML xml) {
        super(xml);
        hslfXML = xml;
    }

    /**
     * Exposes the low level document handed to the
     *  constructor.
     *
     * @return the underlying HSLFXML
     */
    public HSLFXML _getHSLFXML() {
        return this.hslfXML;
    }
}

View File

@ -1,104 +0,0 @@
/* ====================================================================
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==================================================================== */
package org.apache.poi.hssf;
import java.io.IOException;
import org.apache.poi.hssf.model.SharedStringsTable;
import org.apache.poi.hxf.HXFDocument;
import org.apache.xmlbeans.XmlException;
import org.openxml4j.exceptions.OpenXML4JException;
import org.openxml4j.opc.Package;
import org.openxml4j.opc.PackagePart;
import org.openxmlformats.schemas.spreadsheetml.x2006.main.CTSheet;
import org.openxmlformats.schemas.spreadsheetml.x2006.main.CTSheets;
import org.openxmlformats.schemas.spreadsheetml.x2006.main.CTWorkbook;
import org.openxmlformats.schemas.spreadsheetml.x2006.main.CTWorksheet;
import org.openxmlformats.schemas.spreadsheetml.x2006.main.WorkbookDocument;
import org.openxmlformats.schemas.spreadsheetml.x2006.main.WorksheetDocument;
/**
 * Experimental class to do low level processing
 *  of xlsx files.
 *
 * If you are using these low level classes, then you
 *  will almost certainly need to refer to the OOXML
 *  specifications from
 *  http://www.ecma-international.org/publications/standards/Ecma-376.htm
 *
 * WARNING - APIs expected to change rapidly
 */
public class HSSFXML extends HXFDocument {
    public static final String MAIN_CONTENT_TYPE = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet.main+xml";
    public static final String SHEET_CONTENT_TYPE = "application/vnd.openxmlformats-officedocument.spreadsheetml.worksheet+xml";
    public static final String SHARED_STRINGS_CONTENT_TYPE = "application/vnd.openxmlformats-officedocument.spreadsheetml.sharedStrings+xml";
    public static final String SHARED_STRINGS_RELATION_TYPE = "http://schemas.openxmlformats.org/officeDocument/2006/relationships/sharedStrings";

    /** The parsed contents of the base part of the package */
    private WorkbookDocument workbookDoc;
    /** The shared strings, or null if the workbook has no shared strings part */
    private SharedStringsTable sharedStrings;

    /**
     * Opens the workbook held in the supplied package, parses
     *  its base part, and loads the shared strings table if
     *  the workbook has one.
     *
     * @param container the package holding the xlsx file
     * @throws OpenXML4JException if thrown while opening the package parts
     * @throws IOException if the base part can't be read or closed
     * @throws XmlException if the base part isn't a valid workbook
     */
    public HSSFXML(Package container) throws OpenXML4JException, IOException, XmlException {
        super(container, MAIN_CONTENT_TYPE);

        java.io.InputStream in = basePart.getInputStream();
        try {
            workbookDoc = WorkbookDocument.Factory.parse(in);
        } finally {
            // The part's stream is ours to release once parsing is over
            closePartStream(in);
        }

        // A missing shared strings part leaves the table as null
        PackagePart ssPart = getSinglePartByRelationType(SHARED_STRINGS_RELATION_TYPE, basePart);
        if (ssPart != null) {
            sharedStrings = new SharedStringsTable(ssPart);
        }
    }

    /**
     * Returns the low level workbook base object
     */
    public CTWorkbook getWorkbook() {
        return workbookDoc.getWorkbook();
    }

    /**
     * Returns the references from the workbook to its
     *  sheets.
     * You'll need these to figure out the sheet ordering,
     *  and to get at the actual sheets themselves
     */
    public CTSheets getSheetReferences() {
        return getWorkbook().getSheets();
    }

    /**
     * Returns the low level (work)sheet object from
     *  the supplied sheet reference
     *
     * @param sheet the reference to the wanted sheet
     * @throws IOException if the sheet's part can't be read or closed
     * @throws XmlException if the sheet's part can't be parsed
     */
    public CTWorksheet getSheet(CTSheet sheet) throws IOException, XmlException {
        PackagePart sheetPart =
            getRelatedPackagePart(sheet.getId());

        java.io.InputStream in = sheetPart.getInputStream();
        try {
            return WorksheetDocument.Factory.parse(in).getWorksheet();
        } finally {
            closePartStream(in);
        }
    }

    /**
     * Returns the shared string at the given index
     *
     * @param index the position of the wanted string in the table
     * @throws IllegalStateException if the workbook has no
     *  shared strings part
     */
    public String getSharedString(int index) {
        if (this.sharedStrings == null) {
            throw new IllegalStateException(
                    "Shared string " + index + " requested, but the workbook has no shared strings part");
        }
        return this.sharedStrings.get(index);
    }

    /**
     * Returns the shared strings table, which is null if the
     *  workbook has no shared strings part
     */
    protected SharedStringsTable _getSharedStringsTable() {
        return sharedStrings;
    }

    /**
     * Closes the supplied part stream, tolerating a null stream
     */
    private static void closePartStream(java.io.InputStream stream) throws IOException {
        if (stream != null) {
            stream.close();
        }
    }
}

View File

@ -1,133 +0,0 @@
/* ====================================================================
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==================================================================== */
package org.apache.poi.hssf.extractor;
import java.io.File;
import java.io.IOException;
import org.apache.poi.POIXMLTextExtractor;
import org.apache.poi.hssf.HSSFXML;
import org.apache.poi.hssf.usermodel.HSSFXMLCell;
import org.apache.poi.hssf.usermodel.HSSFXMLWorkbook;
import org.apache.poi.hxf.HXFDocument;
import org.apache.xmlbeans.XmlException;
import org.openxml4j.exceptions.OpenXML4JException;
import org.openxml4j.opc.Package;
import org.openxmlformats.schemas.spreadsheetml.x2006.main.CTCell;
import org.openxmlformats.schemas.spreadsheetml.x2006.main.CTRow;
import org.openxmlformats.schemas.spreadsheetml.x2006.main.CTSheet;
import org.openxmlformats.schemas.spreadsheetml.x2006.main.CTWorksheet;
/**
 * Helper class to extract text from an OOXML Excel file
 */
public class HXFExcelExtractor extends POIXMLTextExtractor {
    private HSSFXMLWorkbook workbook;
    private boolean includeSheetNames = true;
    private boolean formulasNotResults = false;

    /**
     * Builds an extractor for the workbook held in the
     *  supplied package
     */
    public HXFExcelExtractor(Package container) throws XmlException, OpenXML4JException, IOException {
        this(new HSSFXMLWorkbook(new HSSFXML(container)));
    }

    /**
     * Builds an extractor for the supplied workbook
     */
    public HXFExcelExtractor(HSSFXMLWorkbook workbook) {
        super(workbook);
        this.workbook = workbook;
    }

    /**
     * Command line entry point: prints the text of the file
     *  named by the first argument, or a usage message
     *  (with exit code 1) when no argument is given
     */
    public static void main(String[] args) throws Exception {
        if(args.length < 1) {
            System.err.println("Use:");
            System.err.println("  HXFExcelExtractor <filename.xlsx>");
            System.exit(1);
        }
        File input = new File(args[0]);
        POIXMLTextExtractor extractor =
            new HXFExcelExtractor(HXFDocument.openPackage(input));
        System.out.println(extractor.getText());
    }

    /**
     * Controls whether sheet names are included.
     * They are by default.
     */
    public void setIncludeSheetNames(boolean includeSheetNames) {
        this.includeSheetNames = includeSheetNames;
    }

    /**
     * Controls whether a formula cell gives the formula
     *  itself, rather than the result it produces.
     * By default it does not.
     */
    public void setFormulasNotResults(boolean formulasNotResults) {
        this.formulasNotResults = formulasNotResults;
    }

    /**
     * Retrieves the text contents of the file.
     * Sheets are separated by a blank line, rows end with
     *  a new line, and cells within a row are separated by tabs.
     */
    public String getText() {
        StringBuilder out = new StringBuilder();

        CTSheet[] sheets =
            workbook._getHSSFXML().getSheetReferences().getSheetArray();
        for(int sheetIdx = 0; sheetIdx < sheets.length; sheetIdx++) {
            CTSheet sheetRef = sheets[sheetIdx];
            try {
                CTWorksheet worksheet =
                    workbook._getHSSFXML().getSheet(sheetRef);
                CTRow[] rows =
                    worksheet.getSheetData().getRowArray();

                // Blank line between one sheet and the next
                if(sheetIdx != 0) {
                    out.append("\n");
                }
                if(includeSheetNames) {
                    out.append(sheetRef.getName()).append("\n");
                }

                for(CTRow row : rows) {
                    CTCell[] cells = row.getCArray();
                    for(int col = 0; col < cells.length; col++) {
                        CTCell cell = cells[col];
                        if(col != 0) {
                            out.append("\t");
                        }

                        // Formula cells give their formula only on request;
                        //  everything else goes through the usermodel cell
                        if(cell.getF() != null && formulasNotResults) {
                            out.append(cell.getF().getStringValue());
                        } else {
                            HSSFXMLCell userCell = new HSSFXMLCell(cell, workbook);
                            out.append(userCell.getStringValue());
                        }
                    }
                    out.append("\n");
                }
            } catch(Exception e) {
                throw new RuntimeException(e);
            }
        }

        return out.toString();
    }
}

View File

@ -1,78 +0,0 @@
/* ====================================================================
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==================================================================== */
package org.apache.poi.hssf.model;
import java.io.IOException;
import java.io.OutputStream;
import java.util.LinkedList;
import org.apache.xmlbeans.XmlException;
import org.openxml4j.opc.PackagePart;
import org.openxmlformats.schemas.spreadsheetml.x2006.main.CTRst;
import org.openxmlformats.schemas.spreadsheetml.x2006.main.CTSst;
import org.openxmlformats.schemas.spreadsheetml.x2006.main.SstDocument;
/**
 * The shared strings of a SpreadsheetML workbook, held as a list of
 * plain strings. The list is filled from the supplied package part on
 * construction and can be saved back to that same part with
 * {@link #write()}.
 */
public class SharedStringsTable extends LinkedList<String> {
    public static final String MAIN_SML_NS_URI = "http://schemas.openxmlformats.org/spreadsheetml/2006/main";

    /** The parsed shared strings document */
    private final SstDocument doc;
    /** The package part the document was read from, and is written back to */
    private final PackagePart part;

    /**
     * Parses the shared strings held in the given part and adds
     * them to this list, in document order.
     *
     * @param part the package part containing the shared strings XML
     * @throws IOException if the part cannot be read
     * @throws XmlException if the part does not parse as a shared strings document
     */
    public SharedStringsTable(PackagePart part) throws IOException, XmlException {
        this.part = part;
        doc = SstDocument.Factory.parse(
                part.getInputStream()
        );
        read();
    }

    /**
     * Copies the text of every string item of the parsed document into this list.
     */
    private void read() {
        for (CTRst item : doc.getSst().getSiArray()) {
            add(item.getT());
        }
    }

    /**
     * Writes the current shared strings table into
     * the associated OOXML PackagePart
     *
     * @throws IOException if the part cannot be written
     */
    public void write() throws IOException {
        CTSst sst = doc.getSst();

        // Remove the old list, back to front so the indexes stay valid
        for (int i = sst.sizeOfSiArray() - 1; i >= 0; i--) {
            sst.removeSi(i);
        }

        // Add the new one
        for (String s : this) {
            sst.addNewSi().setT(s);
        }

        // Update the counts
        sst.setCount(this.size());
        sst.setUniqueCount(this.size());

        // Write out, making sure the stream is released even if saving fails
        OutputStream out = part.getOutputStream();
        try {
            doc.save(out);
        } finally {
            out.close();
        }
    }
}

View File

@ -1,58 +0,0 @@
/* ====================================================================
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==================================================================== */
package org.apache.poi.hssf.usermodel;
import org.openxmlformats.schemas.spreadsheetml.x2006.main.CTCell;
import org.openxmlformats.schemas.spreadsheetml.x2006.main.STCellType;
/**
 * User facing wrapper around an underlying cell object
 */
public class HSSFXMLCell {
    /** The low level cell being wrapped */
    private CTCell cell;

    /** The workbook to which this cell belongs */
    private final HSSFXMLWorkbook workbook;

    /**
     * @param rawCell the low level cell to wrap
     * @param workbook the workbook the cell belongs to, used to resolve shared strings
     */
    public HSSFXMLCell(CTCell rawCell, HSSFXMLWorkbook workbook) {
        this.cell = rawCell;
        this.workbook = workbook;
    }

    /**
     * Formats the cell's contents, based on its type,
     * and returns it as a string.
     *
     * Shared string cells are resolved through the workbook, inline
     * string and numeric cells are returned as stored. Any other cell
     * type yields a placeholder message naming the unsupported type.
     */
    public String getStringValue() {
        switch (cell.getT().intValue()) {
            case STCellType.INT_S:
                // The value is an index into the workbook's shared strings
                return this.workbook.getSharedString(Integer.parseInt(cell.getV()));
            case STCellType.INT_INLINE_STR:
                // Inline strings keep their text in the <is> child rather than
                // in <v>; only fall back to <v> when no inline text is present
                if (cell.getIs() != null && cell.getIs().getT() != null) {
                    return cell.getIs().getT();
                }
                return cell.getV();
            case STCellType.INT_N:
                return cell.getV();
            // TODO: support other types
            default:
                return "UNSUPPORTED CELL TYPE: '" + cell.getT() + "'";
        }
    }

    /**
     * Returns the cell reference followed by the formatted cell value.
     */
    public String toString() {
        return cell.getR() + " - " + getStringValue();
    }
}

View File

@ -1,43 +0,0 @@
/* ====================================================================
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==================================================================== */
package org.apache.poi.hssf.usermodel;
import org.apache.poi.POIXMLDocument;
import org.apache.poi.hssf.HSSFXML;
/**
 * User level view of an OOXML workbook.
 * Typically the first object created when reading or writing a
 * workbook, and the starting point for reaching sheets and the
 * other parts of the file.
 */
public class HSSFXMLWorkbook extends POIXMLDocument {
    /** The low level document this workbook is built on */
    private HSSFXML lowLevelDoc;

    /**
     * Wraps the given low level document.
     */
    public HSSFXMLWorkbook(HSSFXML xml) {
        super(xml);
        lowLevelDoc = xml;
    }

    /**
     * Gives access to the underlying low level document.
     */
    public HSSFXML _getHSSFXML() {
        return this.lowLevelDoc;
    }

    /**
     * Looks up a shared string by index, delegating to the low level document.
     */
    public String getSharedString(int index) {
        String shared = lowLevelDoc.getSharedString(index);
        return shared;
    }
}

View File

@ -1,92 +0,0 @@
/* ====================================================================
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==================================================================== */
package org.apache.poi.hwpf;
import java.io.IOException;
import org.apache.poi.hxf.HXFDocument;
import org.apache.xmlbeans.XmlException;
import org.openxml4j.exceptions.InvalidFormatException;
import org.openxml4j.exceptions.OpenXML4JException;
import org.openxml4j.opc.Package;
import org.openxml4j.opc.PackagePart;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTBody;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTDocument1;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTStyles;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.DocumentDocument;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.StylesDocument;
/**
 * Experimental low level access to docx files.
 *
 * Anyone working at this level will almost certainly need the
 * OOXML specifications to hand, see
 * http://www.ecma-international.org/publications/standards/Ecma-376.htm
 *
 * WARNING - APIs expected to change rapidly
 */
public class HWPFXML extends HXFDocument {
    public static final String MAIN_CONTENT_TYPE = "application/vnd.openxmlformats-officedocument.wordprocessingml.document.main+xml";
    public static final String FOOTER_CONTENT_TYPE = "application/vnd.openxmlformats-officedocument.wordprocessingml.footer+xml";
    public static final String HEADER_CONTENT_TYPE = "application/vnd.openxmlformats-officedocument.wordprocessingml.header+xml";
    public static final String STYLES_CONTENT_TYPE = "application/vnd.openxmlformats-officedocument.wordprocessingml.styles+xml";
    public static final String STYLES_RELATION_TYPE = "http://schemas.openxmlformats.org/officeDocument/2006/relationships/styles";

    /** The parsed main document part */
    private DocumentDocument wordDoc;

    /**
     * Locates the main document part of the package and parses it.
     */
    public HWPFXML(Package container) throws OpenXML4JException, IOException, XmlException {
        super(container, MAIN_CONTENT_TYPE);
        this.wordDoc = DocumentDocument.Factory.parse(basePart.getInputStream());
    }

    /**
     * Gives the low level root object of the document.
     */
    public CTDocument1 getDocument() {
        CTDocument1 root = wordDoc.getDocument();
        return root;
    }

    /**
     * Gives the low level body held by the document root.
     */
    public CTBody getDocumentBody() {
        CTDocument1 root = getDocument();
        return root.getBody();
    }

    /**
     * Parses and returns the styles of the document. Exactly one
     * styles part must be related to the main document part,
     * otherwise an IllegalStateException is thrown.
     */
    public CTStyles getStyle() throws XmlException, IOException {
        PackagePart[] styleParts;
        try {
            styleParts = getRelatedByType(STYLES_RELATION_TYPE);
        } catch (InvalidFormatException e) {
            throw new IllegalStateException(e);
        }

        if (styleParts.length == 1) {
            return StylesDocument.Factory.parse(styleParts[0].getInputStream()).getStyles();
        }
        throw new IllegalStateException("Expecting one Styles document part, but found " + styleParts.length);
    }
}

View File

@ -1,87 +0,0 @@
/* ====================================================================
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==================================================================== */
package org.apache.poi.hwpf.extractor;
import java.io.File;
import java.io.IOException;
import org.apache.poi.POIXMLTextExtractor;
import org.apache.poi.hwpf.HWPFXML;
import org.apache.poi.hwpf.usermodel.HWPFXMLDocument;
import org.apache.poi.hxf.HXFDocument;
import org.apache.xmlbeans.XmlException;
import org.openxml4j.exceptions.OpenXML4JException;
import org.openxml4j.opc.Package;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTBody;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTP;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTR;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTText;
/**
 * Helper class to extract text from an OOXML Word file
 */
public class HXFWordExtractor extends POIXMLTextExtractor {
    private HWPFXMLDocument document;

    /**
     * Builds an extractor directly on top of an opened OOXML package.
     */
    public HXFWordExtractor(Package container) throws XmlException, OpenXML4JException, IOException {
        this(new HWPFXMLDocument(
                new HWPFXML(container)
        ));
    }

    /**
     * Builds an extractor for an already constructed document.
     */
    public HXFWordExtractor(HWPFXMLDocument document) {
        super(document);
        this.document = document;
    }

    /**
     * Command line entry point: prints the text of the file named
     * by the first argument to standard output.
     */
    public static void main(String[] args) throws Exception {
        if (args.length < 1) {
            System.err.println("Use:");
            // This is the Word extractor, so the expected input is a .docx file
            System.err.println(" HXFWordExtractor <filename.docx>");
            System.exit(1);
        }
        POIXMLTextExtractor extractor =
            new HXFWordExtractor(HXFDocument.openPackage(
                    new File(args[0])
            ));
        System.out.println(extractor.getText());
    }

    /**
     * Retrieves the text contents of the document body: the text of
     * every run of every paragraph, with a newline after each paragraph.
     */
    public String getText() {
        CTBody body = document._getHWPFXML().getDocumentBody();
        StringBuilder text = new StringBuilder();

        // Loop over paragraphs
        for (CTP paragraph : body.getPArray()) {
            // Loop over runs
            for (CTR run : paragraph.getRArray()) {
                // Loop over the text elements of the run
                for (CTText t : run.getTArray()) {
                    text.append(t.getStringValue());
                }
            }
            // New line after each paragraph.
            text.append("\n");
        }
        return text.toString();
    }
}

View File

@ -1,36 +0,0 @@
/* ====================================================================
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==================================================================== */
package org.apache.poi.hwpf.usermodel;
import org.apache.poi.POIXMLDocument;
import org.apache.poi.hwpf.HWPFXML;
/**
 * User level view of an OOXML text document.
 */
public class HWPFXMLDocument extends POIXMLDocument {
    /** The low level document this object is built on */
    private HWPFXML lowLevelDoc;

    /**
     * Wraps the given low level document.
     */
    public HWPFXMLDocument(HWPFXML xml) {
        super(xml);
        lowLevelDoc = xml;
    }

    /**
     * Gives access to the underlying low level document.
     */
    public HWPFXML _getHWPFXML() {
        return this.lowLevelDoc;
    }
}

View File

@ -1,272 +0,0 @@
/* ====================================================================
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==================================================================== */
package org.apache.poi.hxf;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.PushbackInputStream;
import java.util.ArrayList;
import org.apache.poi.POIXMLDocument;
import org.apache.poi.poifs.common.POIFSConstants;
import org.apache.poi.poifs.storage.HeaderBlockConstants;
import org.apache.poi.util.IOUtils;
import org.apache.poi.util.LongField;
import org.apache.xmlbeans.XmlException;
import org.dom4j.Document;
import org.dom4j.DocumentException;
import org.dom4j.io.SAXReader;
import org.openxml4j.exceptions.InvalidFormatException;
import org.openxml4j.exceptions.OpenXML4JException;
import org.openxml4j.opc.Package;
import org.openxml4j.opc.PackageAccess;
import org.openxml4j.opc.PackagePart;
import org.openxml4j.opc.PackagePartName;
import org.openxml4j.opc.PackageRelationship;
import org.openxml4j.opc.PackageRelationshipCollection;
import org.openxml4j.opc.PackagingURIHelper;
import org.openxml4j.opc.internal.PackagePropertiesPart;
import org.openxmlformats.schemas.officeDocument.x2006.extendedProperties.CTProperties;
import org.openxmlformats.schemas.officeDocument.x2006.extendedProperties.PropertiesDocument;
/**
 * Parent class of the low level interface to
 * all POI XML (OOXML) implementations.
 * Normal users should probably deal with things that
 * extends {@link POIXMLDocument}, unless they really
 * do need to get low level access to the files.
 *
 * If you are using these low level classes, then you
 * will almost certainly need to refer to the OOXML
 * specifications from
 * http://www.ecma-international.org/publications/standards/Ecma-376.htm
 *
 * WARNING - APIs expected to change rapidly
 */
public abstract class HXFDocument {
    public static final String CORE_PROPERTIES_REL_TYPE = "http://schemas.openxmlformats.org/package/2006/relationships/metadata/core-properties";
    public static final String EXTENDED_PROPERTIES_REL_TYPE = "http://schemas.openxmlformats.org/officeDocument/2006/relationships/extended-properties";

    /**
     * File package/container.
     */
    protected Package container;
    /**
     * The Package Part for our base document
     */
    protected PackagePart basePart;
    /**
     * The base document of this instance, eg Workbook for
     *  xlsx
     */
    protected Document baseDocument;

    /**
     * Locates the single part of the given content type within the
     * container, and parses it as the base document.
     *
     * @param container the opened OOXML package
     * @param baseContentType content type of the base document part
     * @throws OpenXML4JException if the base part cannot be read or parsed;
     *  the underlying exception is attached as the cause
     * @throws IllegalArgumentException if the container does not hold exactly
     *  one part of the requested content type
     */
    protected HXFDocument(Package container, String baseContentType) throws OpenXML4JException {
        this.container = container;

        // Find the base document
        basePart = getSinglePartByType(baseContentType);

        // And load it up
        try {
            SAXReader reader = new SAXReader();
            baseDocument = reader.read(basePart.getInputStream());
        } catch (DocumentException e) {
            throw asOpenXML4JException(e);
        } catch (IOException ioe) {
            throw asOpenXML4JException(ioe);
        }
    }

    /**
     * Builds an OpenXML4JException carrying the message of the supplied
     * exception, keeping that exception as the cause so the original
     * stack trace is not lost.
     */
    private static OpenXML4JException asOpenXML4JException(Exception cause) {
        OpenXML4JException wrapped = new OpenXML4JException(cause.getMessage());
        wrapped.initCause(cause);
        return wrapped;
    }

    /**
     * Checks that the supplied InputStream (which MUST
     *  support mark and reset, or be a PushbackInputStream)
     *  has a OOXML (zip) header at the start of it.
     * If your InputStream does not support mark / reset,
     *  then wrap it in a PushBackInputStream, then be
     *  sure to always use that, and not the original!
     * @param inp An InputStream which supports either mark/reset, or is a PushbackInputStream
     * @return true if the first four bytes match the OOXML file header
     * @throws IOException if the stream cannot be read or wound back
     */
    public static boolean hasOOXMLHeader(InputStream inp) throws IOException {
        // We want to peek at the first 4 bytes
        inp.mark(4);

        byte[] header = new byte[4];
        IOUtils.readFully(inp, header);

        // Wind back those 4 bytes
        if(inp instanceof PushbackInputStream) {
            PushbackInputStream pin = (PushbackInputStream)inp;
            pin.unread(header);
        } else {
            inp.reset();
        }

        // Did it match the ooxml zip signature?
        return (
            header[0] == POIFSConstants.OOXML_FILE_HEADER[0] &&
            header[1] == POIFSConstants.OOXML_FILE_HEADER[1] &&
            header[2] == POIFSConstants.OOXML_FILE_HEADER[2] &&
            header[3] == POIFSConstants.OOXML_FILE_HEADER[3]
        );
    }

    /**
     * Fetches the (single) PackagePart with the supplied
     *  content type.
     * @param contentType The content type to search for
     * @throws IllegalArgumentException If we don't find a single part of that type
     */
    private PackagePart getSinglePartByType(String contentType) throws IllegalArgumentException {
        ArrayList<PackagePart> parts =
            container.getPartsByContentType(contentType);
        if(parts.size() != 1) {
            throw new IllegalArgumentException("Expecting one entry with content type of " + contentType + ", but found " + parts.size());
        }
        return parts.get(0);
    }

    /**
     * Fetches the (single) PackagePart which is defined as
     *  the supplied relation content type of the specified part,
     *  or null if none found.
     * @param relationType The relation content type to search for
     * @param part The part whose relationships are searched
     * @throws IllegalArgumentException If we find more than one part of that type
     * TODO: this sucks! Make Package and PackagePart implement common intf that defines getRelationshipsByType & friends
     */
    protected PackagePart getSinglePartByRelationType(String relationType, PackagePart part) throws IllegalArgumentException, OpenXML4JException {
        return resolveSingleRelationship(
                part.getRelationshipsByType(relationType), relationType);
    }

    /**
     * Fetches the (single) PackagePart which is defined as
     *  the supplied relation content type of the base
     *  container, or null if none found.
     * @param relationType The relation content type to search for
     * @throws IllegalArgumentException If we find more than one part of that type
     */
    protected PackagePart getSinglePartByRelationType(String relationType) throws IllegalArgumentException, OpenXML4JException {
        return resolveSingleRelationship(
                container.getRelationshipsByType(relationType), relationType);
    }

    /**
     * Shared lookup behind both getSinglePartByRelationType variants:
     *  returns null for no relationships, the target part for exactly
     *  one, and rejects anything more.
     * @param rels The relationships found for the relation type
     * @param relationType The relation type searched for, used in the error message
     * @throws IllegalArgumentException If there is more than one relationship
     */
    private PackagePart resolveSingleRelationship(PackageRelationshipCollection rels, String relationType) throws IllegalArgumentException {
        if(rels.size() == 0) {
            return null;
        }
        if(rels.size() > 1) {
            throw new IllegalArgumentException("Found " + rels.size() + " relations for the type " + relationType + ", should only ever be one!");
        }
        return getPackagePart(rels.getRelationship(0));
    }

    /**
     * Retrieves the PackagePart for the given relation
     *  id. This will normally come from a r:id attribute
     *  on part of the base document.
     * @param partId The r:id pointing to the other PackagePart
     * @throws IllegalArgumentException If the base part has no relationship
     *  with that id, or the relationship's target part is missing
     */
    protected PackagePart getRelatedPackagePart(String partId) {
        PackageRelationship rel =
            basePart.getRelationship(partId);
        if(rel == null) {
            // Fail with a clear message rather than a NullPointerException further down
            throw new IllegalArgumentException("No relationship found with id " + partId);
        }
        return getPackagePart(rel);
    }

    /**
     * Retrieves the PackagePart for the given Relationship
     *  object. Normally you'll want to go via a content type
     *  or r:id to get one of those.
     * @throws IllegalArgumentException If the container holds no part
     *  at the relationship's target
     */
    protected PackagePart getPackagePart(PackageRelationship rel) {
        PackagePartName relName;
        try {
            relName = PackagingURIHelper.createPartName(rel.getTargetURI());
        } catch(InvalidFormatException e) {
            // Keep the original exception attached so the failure can be diagnosed
            InternalError error = new InternalError(e.getMessage());
            error.initCause(e);
            throw error;
        }

        PackagePart part = container.getPart(relName);
        if(part == null) {
            throw new IllegalArgumentException("No part found for rel " + rel);
        }
        return part;
    }

    /**
     * Retrieves all the PackageParts which are defined as
     *  relationships of the base document with the
     *  specified content type.
     * Note that, despite the parameter name, the value is handed to
     *  the base part's getRelationshipsByType lookup.
     */
    protected PackagePart[] getRelatedByType(String contentType) throws InvalidFormatException {
        PackageRelationshipCollection partsC =
            basePart.getRelationshipsByType(contentType);

        PackagePart[] parts = new PackagePart[partsC.size()];
        int count = 0;
        for (PackageRelationship rel : partsC) {
            parts[count] = getPackagePart(rel);
            count++;
        }
        return parts;
    }

    /**
     * Get the package container.
     * @return The package associated to this document.
     */
    public Package getPackage() {
        return container;
    }

    /**
     * Get the core document properties (core ooxml properties).
     * @return The core properties part, or null if the package has none
     */
    public PackagePropertiesPart getCoreProperties() throws OpenXML4JException, XmlException, IOException {
        PackagePart propsPart = getSinglePartByRelationType(CORE_PROPERTIES_REL_TYPE);
        if(propsPart == null) {
            return null;
        }
        return (PackagePropertiesPart)propsPart;
    }

    /**
     * Get the extended document properties (extended ooxml properties)
     * @return The extended properties, or null if the package has none
     */
    public CTProperties getExtendedProperties() throws OpenXML4JException, XmlException, IOException {
        PackagePart propsPart = getSinglePartByRelationType(EXTENDED_PROPERTIES_REL_TYPE);
        if(propsPart == null) {
            // Mirror getCoreProperties() instead of failing with a NullPointerException
            return null;
        }

        PropertiesDocument props = PropertiesDocument.Factory.parse(
                propsPart.getInputStream());
        return props.getProperties();
    }

    /**
     * Returns an opened OOXML Package for the supplied File.
     * The package is opened with read/write access.
     * @param f File to open
     */
    public static Package openPackage(File f) throws InvalidFormatException {
        return Package.open(f.toString(), PackageAccess.READ_WRITE);
    }
}

View File

@ -1,133 +0,0 @@
/* ====================================================================
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==================================================================== */
package org.apache.poi.hxf.dev;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.PrintStream;
import java.util.ArrayList;
import org.openxml4j.opc.Package;
import org.openxml4j.opc.PackageAccess;
import org.openxml4j.opc.PackagePart;
import org.openxml4j.opc.PackageRelationship;
import org.openxml4j.opc.PackageRelationshipCollection;
/**
 * Prints out the contents of a HXF (ooxml) container.
 * Useful for seeing what parts are defined, and how
 *  they're all related to each other.
 */
public class HXFLister {
    private Package container;
    private PrintStream disp;

    /**
     * Creates a lister which reports to standard output.
     */
    public HXFLister(Package container) {
        this(container, System.out);
    }

    /**
     * Creates a lister which reports to the supplied stream.
     */
    public HXFLister(Package container, PrintStream disp) {
        this.container = container;
        this.disp = disp;
    }

    /**
     * Figures out how big a given PackagePart is, by reading
     *  its contents through to the end. The stream opened on
     *  the part is always closed again, even if reading fails.
     * @param part The part to measure
     * @return The number of bytes read from the part
     * @throws IOException If the part cannot be read
     */
    public static long getSize(PackagePart part) throws IOException {
        InputStream in = part.getInputStream();
        try {
            byte[] b = new byte[8192];
            long size = 0;
            int read;
            while((read = in.read(b)) > -1) {
                size += read;
            }
            return size;
        } finally {
            in.close();
        }
    }

    /**
     * Displays information on all the different
     *  parts of the OOXML file container.
     */
    public void displayParts() throws Exception {
        ArrayList<PackagePart> parts = container.getParts();
        for (PackagePart part : parts) {
            disp.println(part.getPartName());
            disp.println("\t" + part.getContentType());

            // The core properties part is deliberately not measured
            if(! part.getPartName().toString().equals("/docProps/core.xml")) {
                disp.println("\t" + getSize(part) + " bytes");
            }

            if(! part.isRelationshipPart()) {
                disp.println("\t" + part.getRelationships().size() + " relations");
                for(PackageRelationship rel : part.getRelationships()) {
                    displayRelation(rel, "\t ");
                }
            }
        }
    }

    /**
     * Displays information on all the different
     *  relationships between different parts
     *  of the OOXML file container.
     */
    public void displayRelations() throws Exception {
        PackageRelationshipCollection rels =
            container.getRelationships();
        for (PackageRelationship rel : rels) {
            displayRelation(rel, "");
        }
    }

    /**
     * Prints the details of one relationship, with every
     *  line prefixed by the supplied indent.
     */
    private void displayRelation(PackageRelationship rel, String indent) {
        disp.println(indent+"Relationship:");
        disp.println(indent+"\tFrom: "+ rel.getSourceURI());
        disp.println(indent+"\tTo: " + rel.getTargetURI());
        disp.println(indent+"\tID: " + rel.getId());
        disp.println(indent+"\tMode: " + rel.getTargetMode());
        disp.println(indent+"\tType: " + rel.getRelationshipType());
    }

    /**
     * Command line entry point: lists the parts and then the
     *  relationships of the file named by the first argument.
     */
    public static void main(String[] args) throws Exception {
        if(args.length == 0) {
            System.err.println("Use:");
            System.err.println("\tjava HXFLister <filename>");
            System.exit(1);
        }

        File f = new File(args[0]);
        if(! f.exists()) {
            System.err.println("Error, file not found!");
            System.err.println("\t" + f.toString());
            System.exit(2);
        }

        HXFLister lister = new HXFLister(
                Package.open(f.toString(), PackageAccess.READ)
        );

        lister.disp.println(f.toString() + "\n");
        lister.displayParts();
        lister.disp.println();
        lister.displayRelations();
    }
}

View File

@ -1,127 +0,0 @@
/* ====================================================================
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==================================================================== */
package org.apache.poi.hslf;
import java.io.File;
import org.apache.poi.hxf.HXFDocument;
import org.openxml4j.opc.Package;
import org.openxml4j.opc.PackagePart;
import org.openxmlformats.schemas.presentationml.x2006.main.CTSlideIdListEntry;
import org.openxmlformats.schemas.presentationml.x2006.main.CTSlideMasterIdListEntry;
import junit.framework.TestCase;
/**
 * Tests for the low level HSLFXML access to pptx files,
 *  run against the sample.pptx test file.
 */
public class TestHSLFXML extends TestCase {
    private File sampleFile;

    protected void setUp() throws Exception {
        super.setUp();

        // The directory holding the test files comes from a system property
        sampleFile = new File(
                System.getProperty("HSLF.testdata.path") +
                File.separator + "sample.pptx"
        );
    }

    /**
     * At least one part of the package should have the main presentation content type
     */
    public void testContainsMainContentType() throws Exception {
        Package pack = HXFDocument.openPackage(sampleFile);

        boolean found = false;
        for(PackagePart part : pack.getParts()) {
            if(part.getContentType().equals(HSLFXML.MAIN_CONTENT_TYPE)) {
                found = true;
            }
            System.out.println(part);
        }
        assertTrue(found);
    }

    /**
     * The sample file should open, and have a presentation, slides and slide masters
     */
    public void testOpen() throws Exception {
        // Opening the package on its own should not fail
        HXFDocument.openPackage(sampleFile);

        HSLFXML xml;

        // With the finalised uri, should be fine
        xml = new HSLFXML(
                HXFDocument.openPackage(sampleFile)
        );

        // Check the core
        assertNotNull(xml.getPresentation());

        // Check it has some slides
        assertTrue(
            xml.getSlideReferences().sizeOfSldIdArray() > 0
        );
        assertTrue(
            xml.getSlideMasterReferences().sizeOfSldMasterIdArray() > 0
        );
    }

    /**
     * Checks the slide and slide master references, and that
     *  the objects they point to can be fetched
     */
    public void testSlideBasics() throws Exception {
        HSLFXML xml = new HSLFXML(
                HXFDocument.openPackage(sampleFile)
        );

        // Should have 1 master
        assertEquals(1, xml.getSlideMasterReferences().sizeOfSldMasterIdArray());
        assertEquals(1, xml.getSlideMasterReferences().getSldMasterIdArray().length);

        // Should have two slides
        assertEquals(2, xml.getSlideReferences().sizeOfSldIdArray());
        assertEquals(2, xml.getSlideReferences().getSldIdArray().length);

        // Check they're as expected
        CTSlideIdListEntry[] slides = xml.getSlideReferences().getSldIdArray();
        assertEquals(256, slides[0].getId());
        assertEquals(257, slides[1].getId());
        assertEquals("rId2", slides[0].getId2());
        assertEquals("rId3", slides[1].getId2());

        // Now get those objects
        assertNotNull(xml.getSlide(slides[0]));
        assertNotNull(xml.getSlide(slides[1]));

        // And check they have notes as expected
        assertNotNull(xml.getNotes(slides[0]));
        assertNotNull(xml.getNotes(slides[1]));

        // And again for the master
        CTSlideMasterIdListEntry[] masters =
            xml.getSlideMasterReferences().getSldMasterIdArray();
        // Upper case suffix, so the literal can't be misread as ending in a one
        assertEquals(2147483648L, masters[0].getId());
        assertEquals("rId1", masters[0].getId2());
        assertNotNull(xml.getSlideMaster(masters[0]));
    }

    /**
     * Checks the core and extended properties of the sample file
     */
    public void testMetadataBasics() throws Exception {
        HSLFXML xml = new HSLFXML(
                HXFDocument.openPackage(sampleFile)
        );

        assertNotNull(xml.getCoreProperties());
        assertNotNull(xml.getExtendedProperties());

        assertEquals("Microsoft Office PowerPoint", xml.getExtendedProperties().getApplication());
        assertEquals(0, xml.getExtendedProperties().getCharacters());
        assertEquals(0, xml.getExtendedProperties().getLines());

        assertNull(xml.getCoreProperties().getTitleProperty().getValue());
        assertNull(xml.getCoreProperties().getSubjectProperty().getValue());
    }
}

View File

@ -1,109 +0,0 @@
/* ====================================================================
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==================================================================== */
package org.apache.poi.hslf.extractor;
import java.io.File;
import org.apache.poi.hslf.HSLFXML;
import org.apache.poi.hslf.usermodel.HSLFXMLSlideShow;
import org.apache.poi.hxf.HXFDocument;
import junit.framework.TestCase;
/**
 * Tests for HXFPowerPointExtractor
 */
public class TestHXFPowerPointExtractor extends TestCase {
    /**
     * First block of text expected from the sample file
     * (presumably the contents of the first slide)
     */
    private static final String TEXT_BLOCK_ONE =
        "Lorem ipsum dolor sit amet\n" +
        "Nunc at risus vel erat tempus posuere. Aenean non ante.\n";

    /**
     * Second block of text expected from the sample file
     * (presumably the contents of the second slide)
     */
    private static final String TEXT_BLOCK_TWO =
        "Lorem ipsum dolor sit amet\n" +
        "Lorem\n" +
        "ipsum\n" +
        "dolor\n" +
        "sit\n" +
        "amet\n";

    /**
     * What the extractor is expected to return when only
     * notes are requested
     */
    private static final String NOTES_ONLY_TEXT = "\n\n\n\n";

    /**
     * The simple sample presentation, opened afresh for each test
     */
    private HSLFXML sample;

    /**
     * Opens sample.pptx from the directory named by the
     *  HSLF.testdata.path system property
     */
    protected void setUp() throws Exception {
        super.setUp();

        String samplePath =
            System.getProperty("HSLF.testdata.path") +
            File.separator + "sample.pptx";
        sample = new HSLFXML(HXFDocument.openPackage(new File(samplePath)));
    }

    /**
     * Extracts the text of the simple file, with the various
     *  combinations of slide text and notes text
     */
    public void testGetSimpleText() throws Exception {
        // Both kinds of constructor need to accept the sample
        new HXFPowerPointExtractor(sample.getPackage());
        new HXFPowerPointExtractor(new HSLFXMLSlideShow(sample));

        HXFPowerPointExtractor extractor =
            new HXFPowerPointExtractor(sample.getPackage());

        // The first result is thrown away, the text is then
        //  requested a second time
        extractor.getText();
        String extracted = extractor.getText();
        assertTrue(extracted.length() > 0);

        // The basics - how the text starts and ends
        assertTrue(extracted.startsWith("Lorem ipsum dolor sit amet\n"));
        assertTrue(extracted.endsWith("amet\n\n"));

        // Slides requested, notes not
        extracted = extractor.getText(true, false);
        assertEquals(
            TEXT_BLOCK_ONE + "\n" + TEXT_BLOCK_TWO + "\n",
            extracted
        );

        // Notes requested, slides not
        extracted = extractor.getText(false, true);
        assertEquals(NOTES_ONLY_TEXT, extracted);

        // Slides and notes together
        extracted = extractor.getText(true, true);
        assertEquals(
            TEXT_BLOCK_ONE + "\n\n\n" + TEXT_BLOCK_TWO + "\n\n\n",
            extracted
        );

        // The same notes-only request, this time through the defaults
        extractor.setSlidesByDefault(false);
        extractor.setNotesByDefault(true);
        extracted = extractor.getText();
        assertEquals(NOTES_ONLY_TEXT, extracted);
    }
}

View File

@ -1,160 +0,0 @@
/* ====================================================================
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==================================================================== */
package org.apache.poi.hssf;
import java.io.File;
import org.apache.poi.hssf.model.SharedStringsTable;
import org.apache.poi.hxf.HXFDocument;
import org.openxml4j.opc.Package;
import org.openxml4j.opc.PackagePart;
import org.openxmlformats.schemas.spreadsheetml.x2006.main.CTSheet;
import junit.framework.TestCase;
/**
 * Tests for HSSFXML, run against the sample.xlsx and
 *  sample-beta.xlsx files found in the directory named by
 *  the HSSF.testdata.path system property
 */
public class TestHSSFXML extends TestCase {
    /**
     * Uses the old style schemas.microsoft.com schema uri
     */
    private File sampleFileBeta;
    /**
     * Uses the new style schemas.openxmlformats.org schema uri
     */
    private File sampleFile;

    protected void setUp() throws Exception {
        super.setUp();

        sampleFile = new File(
                System.getProperty("HSSF.testdata.path") +
                File.separator + "sample.xlsx"
        );
        sampleFileBeta = new File(
                System.getProperty("HSSF.testdata.path") +
                File.separator + "sample-beta.xlsx"
        );
    }

    /**
     * Checks that at least one part of the sample package carries
     *  the main content type HSSFXML looks for
     */
    public void testContainsMainContentType() throws Exception {
        Package pack = HXFDocument.openPackage(sampleFile);

        boolean found = false;
        for(PackagePart part : pack.getParts()) {
            if(part.getContentType().equals(HSSFXML.MAIN_CONTENT_TYPE)) {
                found = true;
                // One match is all that's needed
                break;
            }
        }
        assertTrue(found);
    }

    /**
     * Checks which of the sample files can be opened, both as a
     *  plain package and as a HSSFXML
     */
    public void testOpen() throws Exception {
        // Both files should open fine as plain packages
        HXFDocument.openPackage(sampleFile);
        HXFDocument.openPackage(sampleFileBeta);

        HSSFXML xml;

        // With an old-style uri, as found in a file produced
        //  with the office 2007 beta, will fail, as we don't
        //  translate things
        try {
            xml = new HSSFXML(
                    HXFDocument.openPackage(sampleFileBeta)
            );
            fail("Opening a file using the old style schema uri should have failed");
        } catch(Exception expected) {
            // Good, the file was rejected. The failure raised by
            //  fail() is an Error rather than an Exception, so it
            //  isn't swallowed here
        }

        // With the finalised uri, should be fine
        xml = new HSSFXML(
                HXFDocument.openPackage(sampleFile)
        );

        // Check it has a workbook
        assertNotNull(xml.getWorkbook());
    }

    /**
     * Checks the sheet references of the sample file, and that
     *  each of them can be resolved to a sheet
     */
    public void testSheetBasics() throws Exception {
        HSSFXML xml = new HSSFXML(
                HXFDocument.openPackage(sampleFile)
        );

        // Should have three sheets
        assertEquals(3, xml.getSheetReferences().sizeOfSheetArray());
        assertEquals(3, xml.getSheetReferences().getSheetArray().length);

        // Check they're as expected
        CTSheet[] sheets = xml.getSheetReferences().getSheetArray();
        assertEquals("Sheet1", sheets[0].getName());
        assertEquals("Sheet2", sheets[1].getName());
        assertEquals("Sheet3", sheets[2].getName());
        assertEquals("rId1", sheets[0].getId());
        assertEquals("rId2", sheets[1].getId());
        assertEquals("rId3", sheets[2].getId());

        // Now get those objects
        assertNotNull(xml.getSheet(sheets[0]));
        assertNotNull(xml.getSheet(sheets[1]));
        assertNotNull(xml.getSheet(sheets[2]));
    }

    /**
     * Checks the core and extended properties of the sample file
     */
    public void testMetadataBasics() throws Exception {
        HSSFXML xml = new HSSFXML(
                HXFDocument.openPackage(sampleFile)
        );
        assertNotNull(xml.getCoreProperties());
        assertNotNull(xml.getExtendedProperties());

        assertEquals("Microsoft Excel", xml.getExtendedProperties().getApplication());
        assertEquals(0, xml.getExtendedProperties().getCharacters());
        assertEquals(0, xml.getExtendedProperties().getLines());

        // The sample has neither a title nor a subject value
        assertNull(xml.getCoreProperties().getTitleProperty().getValue());
        assertNull(xml.getCoreProperties().getSubjectProperty().getValue());
    }

    /**
     * Checks the shared strings table can be read, changed and
     *  written, and that the changes are seen when the package
     *  is wrapped again
     */
    public void testSharedStringBasics() throws Exception {
        HSSFXML xml = new HSSFXML(
                HXFDocument.openPackage(sampleFile)
        );
        assertNotNull(xml._getSharedStringsTable());

        SharedStringsTable sst = xml._getSharedStringsTable();
        assertEquals(10, sst.size());

        assertEquals("Lorem", sst.get(0));
        for(int i=0; i<sst.size(); i++) {
            assertEquals(sst.get(i), xml.getSharedString(i));
        }

        // Add a few more, then save and reload, checking
        //  changes have been kept
        sst.add("Foo");
        sst.add("Bar");
        sst.set(0, "LoremLorem");

        sst.write();

        xml = new HSSFXML(xml.getPackage());
        sst = xml._getSharedStringsTable();
        assertEquals(12, sst.size());

        assertEquals("LoremLorem", sst.get(0));
        for(int i=0; i<sst.size(); i++) {
            assertEquals(sst.get(i), xml.getSharedString(i));
        }
    }
}

View File

@ -1,196 +0,0 @@
/* ====================================================================
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==================================================================== */
package org.apache.poi.hssf.extractor;
import java.io.File;
import java.io.FileInputStream;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import junit.framework.TestCase;
import org.apache.poi.POITextExtractor;
import org.apache.poi.hssf.HSSFXML;
import org.apache.poi.hssf.usermodel.HSSFWorkbook;
import org.apache.poi.hssf.usermodel.HSSFXMLWorkbook;
import org.apache.poi.hxf.HXFDocument;
/**
 * Tests for HXFExcelExtractor
 */
public class TestHXFExcelExtractor extends TestCase {
    /**
     * Matches text which has a 13 (optionally with a decimal
     *  point and zeros after it), then whitespace, then Sheet3.
     * Compiled once here rather than on every loop iteration.
     */
    private static final Pattern THIRTEEN_THEN_SHEET3 =
        Pattern.compile(".*13(\\.0+)?\\s+Sheet3.*", Pattern.DOTALL);

    /**
     * A very simple file
     */
    private HSSFXML xmlA;
    /**
     * A fairly complex file
     */
    private HSSFXML xmlB;

    /**
     * A fairly simple file - ooxml
     */
    private HSSFXML simpleXLSX;
    /**
     * A fairly simple file - ole2
     */
    private HSSFWorkbook simpleXLS;

    /**
     * Opens the four sample files, all of which live in the
     *  directory named by the HSSF.testdata.path system property
     */
    protected void setUp() throws Exception {
        super.setUp();

        File fileA = new File(
                System.getProperty("HSSF.testdata.path") +
                File.separator + "sample.xlsx"
        );
        File fileB = new File(
                System.getProperty("HSSF.testdata.path") +
                File.separator + "AverageTaxRates.xlsx"
        );

        File fileSOOXML = new File(
                System.getProperty("HSSF.testdata.path") +
                File.separator + "SampleSS.xlsx"
        );
        File fileSOLE2 = new File(
                System.getProperty("HSSF.testdata.path") +
                File.separator + "SampleSS.xls"
        );

        xmlA = new HSSFXML(HXFDocument.openPackage(fileA));
        xmlB = new HSSFXML(HXFDocument.openPackage(fileB));

        simpleXLSX = new HSSFXML(HXFDocument.openPackage(fileSOOXML));

        // setUp runs before every test, so the stream must be closed
        //  or each test would leave a file handle open. The workbook
        //  is expected to have read everything it needs by the time
        //  its constructor returns.
        FileInputStream ole2Stream = new FileInputStream(fileSOLE2);
        try {
            simpleXLS = new HSSFWorkbook(ole2Stream);
        } finally {
            ole2Stream.close();
        }
    }

    /**
     * Get text out of the simple file
     */
    public void testGetSimpleText() throws Exception {
        // Both kinds of constructor need to work
        new HXFExcelExtractor(xmlA.getPackage());
        new HXFExcelExtractor(new HSSFXMLWorkbook(xmlA));

        HXFExcelExtractor extractor =
            new HXFExcelExtractor(xmlA.getPackage());
        extractor.getText();

        String text = extractor.getText();
        assertTrue(text.length() > 0);

        // Check sheet names
        assertTrue(text.startsWith("Sheet1"));
        assertTrue(text.endsWith("Sheet3\n"));

        // Now without, will have text
        extractor.setIncludeSheetNames(false);
        text = extractor.getText();
        assertEquals(
                "Lorem\t111\n" +
                "ipsum\t222\n" +
                "dolor\t333\n" +
                "sit\t444\n" +
                "amet\t555\n" +
                "consectetuer\t666\n" +
                "adipiscing\t777\n" +
                "elit\t888\n" +
                "Nunc\t999\n" +
                "at\t4995\n" +
                "\n\n", text);

        // Now get formulas not their values
        extractor.setFormulasNotResults(true);
        text = extractor.getText();
        assertEquals(
                "Lorem\t111\n" +
                "ipsum\t222\n" +
                "dolor\t333\n" +
                "sit\t444\n" +
                "amet\t555\n" +
                "consectetuer\t666\n" +
                "adipiscing\t777\n" +
                "elit\t888\n" +
                "Nunc\t999\n" +
                "at\tSUM(B1:B9)\n" +
                "\n\n", text);

        // With sheet names too
        extractor.setIncludeSheetNames(true);
        text = extractor.getText();
        assertEquals(
                "Sheet1\n" +
                "Lorem\t111\n" +
                "ipsum\t222\n" +
                "dolor\t333\n" +
                "sit\t444\n" +
                "amet\t555\n" +
                "consectetuer\t666\n" +
                "adipiscing\t777\n" +
                "elit\t888\n" +
                "Nunc\t999\n" +
                "at\tSUM(B1:B9)\n\n" +
                "Sheet2\n\n" +
                "Sheet3\n"
                , text);
    }

    /**
     * Get text out of the complex file
     */
    public void testGetComplexText() throws Exception {
        // Both kinds of constructor need to work
        new HXFExcelExtractor(xmlB.getPackage());
        new HXFExcelExtractor(new HSSFXMLWorkbook(xmlB));

        HXFExcelExtractor extractor =
            new HXFExcelExtractor(xmlB.getPackage());
        extractor.getText();

        String text = extractor.getText();
        assertTrue(text.length() > 0);

        // Might not have all formatting it should do!
        // TODO decide if we should really have the "null" in there
        assertTrue(text.startsWith(
                "Avgtxfull\n" +
                "null\t(iii) AVERAGE TAX RATES ON ANNUAL"
        ));
    }

    /**
     * Test that we return pretty much the same as
     *  ExcelExtractor does, when we're both passed
     *  the same file, just saved as xls and xlsx
     */
    public void testComparedToOLE2() throws Exception {
        HXFExcelExtractor ooxmlExtractor =
            new HXFExcelExtractor(simpleXLSX.getPackage());
        ExcelExtractor ole2Extractor =
            new ExcelExtractor(simpleXLS);

        POITextExtractor[] extractors =
            new POITextExtractor[] { ooxmlExtractor, ole2Extractor };
        for (int i = 0; i < extractors.length; i++) {
            POITextExtractor extractor = extractors[i];

            // Carriage returns and tabs are dropped before comparing
            String text = extractor.getText().replaceAll("[\r\t]", "");
            assertTrue(text.startsWith("First Sheet\nTest spreadsheet\n2nd row2nd row 2nd column\n"));

            Matcher m = THIRTEEN_THEN_SHEET3.matcher(text);
            assertTrue(m.matches());
        }
    }
}

View File

@ -1,110 +0,0 @@
/* ====================================================================
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==================================================================== */
package org.apache.poi.hwpf;
import java.io.File;
import org.apache.poi.hxf.HXFDocument;
import org.openxml4j.opc.Package;
import org.openxml4j.opc.PackagePart;
import junit.framework.TestCase;
/**
 * Tests for HWPFXML, run against the sample.docx and
 *  IllustrativeCases.docx files found in the directory named
 *  by the HWPF.testdata.path system property
 */
public class TestHWPFXML extends TestCase {
    /**
     * The simple sample document
     */
    private File sampleFile;
    /**
     * The more complex sample document
     */
    private File complexFile;

    protected void setUp() throws Exception {
        super.setUp();

        sampleFile = new File(
                System.getProperty("HWPF.testdata.path") +
                File.separator + "sample.docx"
        );
        complexFile = new File(
                System.getProperty("HWPF.testdata.path") +
                File.separator + "IllustrativeCases.docx"
        );
    }

    /**
     * Checks that at least one part of the sample package carries
     *  the main content type HWPFXML looks for
     */
    public void testContainsMainContentType() throws Exception {
        Package pack = HXFDocument.openPackage(sampleFile);

        boolean found = false;
        for(PackagePart part : pack.getParts()) {
            if(part.getContentType().equals(HWPFXML.MAIN_CONTENT_TYPE)) {
                found = true;
                // One match is all that's needed
                break;
            }
        }
        assertTrue(found);
    }

    /**
     * Checks that both sample files can be opened, and that the
     *  document, its body and its style are available for each
     */
    public void testOpen() throws Exception {
        // Both files should open fine as plain packages
        HXFDocument.openPackage(sampleFile);
        HXFDocument.openPackage(complexFile);

        HWPFXML xml;

        // Simple file
        xml = new HWPFXML(
                HXFDocument.openPackage(sampleFile)
        );
        // Check it has key parts
        assertNotNull(xml.getDocument());
        assertNotNull(xml.getDocumentBody());
        assertNotNull(xml.getStyle());

        // Complex file
        xml = new HWPFXML(
                HXFDocument.openPackage(complexFile)
        );
        assertNotNull(xml.getDocument());
        assertNotNull(xml.getDocumentBody());
        assertNotNull(xml.getStyle());
    }

    /**
     * Checks the core and extended properties of the simple file
     */
    public void testMetadataBasics() throws Exception {
        HWPFXML xml = new HWPFXML(
                HXFDocument.openPackage(sampleFile)
        );
        assertNotNull(xml.getCoreProperties());
        assertNotNull(xml.getExtendedProperties());

        assertEquals("Microsoft Office Word", xml.getExtendedProperties().getApplication());
        assertEquals(1315, xml.getExtendedProperties().getCharacters());
        assertEquals(10, xml.getExtendedProperties().getLines());

        // The simple file has neither a title nor a subject value
        assertNull(xml.getCoreProperties().getTitleProperty().getValue());
        assertNull(xml.getCoreProperties().getSubjectProperty().getValue());
    }

    /**
     * Checks the core and extended properties of the complex file
     */
    public void testMetadataComplex() throws Exception {
        HWPFXML xml = new HWPFXML(
                HXFDocument.openPackage(complexFile)
        );
        assertNotNull(xml.getCoreProperties());
        assertNotNull(xml.getExtendedProperties());

        assertEquals("Microsoft Office Outlook", xml.getExtendedProperties().getApplication());
        assertEquals(5184, xml.getExtendedProperties().getCharacters());
        assertEquals(0, xml.getExtendedProperties().getLines());

        // Here the title and subject are present, but hold just whitespace
        assertEquals(" ", xml.getCoreProperties().getTitleProperty().getValue());
        assertEquals(" ", xml.getCoreProperties().getSubjectProperty().getValue());
    }
}

View File

@ -1,117 +0,0 @@
/* ====================================================================
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==================================================================== */
package org.apache.poi.hwpf.extractor;
import java.io.File;
import org.apache.poi.hwpf.HWPFXML;
import org.apache.poi.hwpf.usermodel.HWPFXMLDocument;
import org.apache.poi.hxf.HXFDocument;
import junit.framework.TestCase;
/**
 * Tests for HXFWordExtractor
 */
public class TestHXFWordExtractor extends TestCase {
    /**
     * A very simple file
     */
    private HWPFXML xmlA;
    /**
     * A fairly complex file
     */
    private HWPFXML xmlB;

    /**
     * Opens both sample files, which live in the directory named
     *  by the HWPF.testdata.path system property
     */
    protected void setUp() throws Exception {
        super.setUp();

        File fileA = new File(
                System.getProperty("HWPF.testdata.path") +
                File.separator + "sample.docx"
        );
        File fileB = new File(
                System.getProperty("HWPF.testdata.path") +
                File.separator + "IllustrativeCases.docx"
        );

        xmlA = new HWPFXML(HXFDocument.openPackage(fileA));
        xmlB = new HWPFXML(HXFDocument.openPackage(fileB));
    }

    /**
     * Get text out of the simple file
     */
    public void testGetSimpleText() throws Exception {
        // Both kinds of constructor need to work
        new HXFWordExtractor(xmlA.getPackage());
        new HXFWordExtractor(new HWPFXMLDocument(xmlA));

        HXFWordExtractor extractor =
            new HXFWordExtractor(xmlA.getPackage());
        extractor.getText();

        String text = extractor.getText();
        assertTrue(text.length() > 0);

        // Check contents
        assertTrue(text.startsWith(
                "Lorem ipsum dolor sit amet, consectetuer adipiscing elit. Nunc at risus vel erat tempus posuere. Aenean non ante. Suspendisse vehicula dolor sit amet odio."
        ));
        assertTrue(text.endsWith(
                "Phasellus ultricies mi nec leo. Sed tempus. In sit amet lorem at velit faucibus vestibulum.\n"
        ));

        // Check number of paragraphs
        assertEquals(3, countNewlines(text));
    }

    /**
     * Tests getting the text out of a complex file
     */
    public void testGetComplexText() throws Exception {
        HXFWordExtractor extractor =
            new HXFWordExtractor(xmlB.getPackage());
        extractor.getText();

        String text = extractor.getText();
        assertTrue(text.length() > 0);

        char euro = '\u20ac';

        // Check contents
        assertTrue(text.startsWith(
                " \n(V) ILLUSTRATIVE CASES\n\n"
        ));
        assertTrue(text.endsWith(
                "As well as gaining "+euro+"90 from child benefit increases, he will also receive the early childhood supplement of "+euro+"250 per quarter for Vincent for the full four quarters of the year.\n\n\n\n \n\n\n"
        ));

        // Check number of paragraphs
        assertEquals(79, countNewlines(text));
    }

    /**
     * Counts the newline characters in the text, which the tests
     *  use as their measure of the number of paragraphs
     *
     * @param text the extracted text to look through
     * @return how many '\n' characters the text holds
     */
    private static int countNewlines(String text) {
        int newlines = 0;
        for (int i = 0; i < text.length(); i++) {
            if (text.charAt(i) == '\n') {
                newlines++;
            }
        }
        return newlines;
    }
}

View File

@ -1,65 +0,0 @@
/* ====================================================================
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==================================================================== */
package org.apache.poi.hxf;
import junit.framework.TestCase;
import java.io.*;
/**
 * Class to test that HXF correctly detects OOXML
 *  documents
 */
public class TestDetectAsOOXML extends TestCase
{
    /**
     * Directory holding the sample files, taken from the
     *  HSSF.testdata.path system property
     */
    public String dirname;

    public void setUp() {
        dirname = System.getProperty("HSSF.testdata.path");
    }

    /**
     * Checks that a sample OOXML file can be opened as a package
     */
    public void testOpensProperly() throws Exception
    {
        File f = new File(dirname + "/sample.xlsx");

        HXFDocument.openPackage(f);
    }

    /**
     * Checks that the OOXML header check says yes for an OOXML
     *  file, and no for an OLE2 file and for a text file
     */
    public void testDetectAsPOIFS() throws Exception {
        // ooxml file is
        assertTrue(hasOOXMLHeader("SampleSS.xlsx"));

        // xls file isn't
        assertFalse(hasOOXMLHeader("SampleSS.xls"));

        // text file isn't
        assertFalse(hasOOXMLHeader("SampleSS.txt"));
    }

    /**
     * Runs the OOXML header check against one of the sample
     *  files, making sure the file is closed again afterwards
     *  whatever the outcome
     *
     * @param filename name of a file within the test data directory
     * @return what HXFDocument.hasOOXMLHeader reported for the file
     */
    private boolean hasOOXMLHeader(String filename) throws Exception {
        InputStream in = new PushbackInputStream(
                new FileInputStream(dirname + "/" + filename), 10
        );
        try {
            return HXFDocument.hasOOXMLHeader(in);
        } finally {
            // Closing the wrapper closes the file stream underneath too
            in.close();
        }
    }
}