Support for extracting text from ooxml word documents
git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@607560 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
40235d1b5e
commit
faf602abf5
@ -16,12 +16,14 @@
|
|||||||
==================================================================== */
|
==================================================================== */
|
||||||
package org.apache.poi.hssf.extractor;
|
package org.apache.poi.hssf.extractor;
|
||||||
|
|
||||||
|
import java.io.File;
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
|
|
||||||
import org.apache.poi.POIXMLTextExtractor;
|
import org.apache.poi.POIXMLTextExtractor;
|
||||||
import org.apache.poi.hssf.HSSFXML;
|
import org.apache.poi.hssf.HSSFXML;
|
||||||
import org.apache.poi.hssf.usermodel.HSSFXMLCell;
|
import org.apache.poi.hssf.usermodel.HSSFXMLCell;
|
||||||
import org.apache.poi.hssf.usermodel.HSSFXMLWorkbook;
|
import org.apache.poi.hssf.usermodel.HSSFXMLWorkbook;
|
||||||
|
import org.apache.poi.hxf.HXFDocument;
|
||||||
import org.apache.xmlbeans.XmlException;
|
import org.apache.xmlbeans.XmlException;
|
||||||
import org.openxml4j.exceptions.OpenXML4JException;
|
import org.openxml4j.exceptions.OpenXML4JException;
|
||||||
import org.openxml4j.opc.Package;
|
import org.openxml4j.opc.Package;
|
||||||
@ -30,6 +32,9 @@ import org.openxmlformats.schemas.spreadsheetml.x2006.main.CTRow;
|
|||||||
import org.openxmlformats.schemas.spreadsheetml.x2006.main.CTSheet;
|
import org.openxmlformats.schemas.spreadsheetml.x2006.main.CTSheet;
|
||||||
import org.openxmlformats.schemas.spreadsheetml.x2006.main.CTWorksheet;
|
import org.openxmlformats.schemas.spreadsheetml.x2006.main.CTWorksheet;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Helper class to extract text from an OOXML Excel file
|
||||||
|
*/
|
||||||
public class HXFExcelExtractor extends POIXMLTextExtractor {
|
public class HXFExcelExtractor extends POIXMLTextExtractor {
|
||||||
private HSSFXMLWorkbook workbook;
|
private HSSFXMLWorkbook workbook;
|
||||||
private boolean includeSheetNames = true;
|
private boolean includeSheetNames = true;
|
||||||
@ -45,6 +50,19 @@ public class HXFExcelExtractor extends POIXMLTextExtractor {
|
|||||||
this.workbook = workbook;
|
this.workbook = workbook;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public static void main(String[] args) throws Exception {
|
||||||
|
if(args.length < 1) {
|
||||||
|
System.err.println("Use:");
|
||||||
|
System.err.println(" HXFExcelExtractor <filename.xlsx>");
|
||||||
|
System.exit(1);
|
||||||
|
}
|
||||||
|
POIXMLTextExtractor extractor =
|
||||||
|
new HXFExcelExtractor(HXFDocument.openPackage(
|
||||||
|
new File(args[0])
|
||||||
|
));
|
||||||
|
System.out.println(extractor.getText());
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Should sheet names be included? Default is true
|
* Should sheet names be included? Default is true
|
||||||
*/
|
*/
|
||||||
|
@ -19,6 +19,12 @@ package org.apache.poi.hssf.usermodel;
|
|||||||
import org.apache.poi.POIXMLDocument;
|
import org.apache.poi.POIXMLDocument;
|
||||||
import org.apache.poi.hssf.HSSFXML;
|
import org.apache.poi.hssf.HSSFXML;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* High level representation of an OOXML workbook.
|
||||||
|
* This is the first object most users will construct whether
|
||||||
|
* they are reading or writing a workbook. It is also the
|
||||||
|
* top level object for creating new sheets/etc.
|
||||||
|
*/
|
||||||
public class HSSFXMLWorkbook extends POIXMLDocument {
|
public class HSSFXMLWorkbook extends POIXMLDocument {
|
||||||
private HSSFXML hssfXML;
|
private HSSFXML hssfXML;
|
||||||
|
|
||||||
|
@ -0,0 +1,87 @@
|
|||||||
|
/* ====================================================================
|
||||||
|
Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
contributor license agreements. See the NOTICE file distributed with
|
||||||
|
this work for additional information regarding copyright ownership.
|
||||||
|
The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
(the "License"); you may not use this file except in compliance with
|
||||||
|
the License. You may obtain a copy of the License at
|
||||||
|
|
||||||
|
http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
|
||||||
|
Unless required by applicable law or agreed to in writing, software
|
||||||
|
distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
See the License for the specific language governing permissions and
|
||||||
|
limitations under the License.
|
||||||
|
==================================================================== */
|
||||||
|
package org.apache.poi.hwpf.extractor;
|
||||||
|
|
||||||
|
import java.io.File;
|
||||||
|
import java.io.IOException;
|
||||||
|
|
||||||
|
import org.apache.poi.POIXMLTextExtractor;
|
||||||
|
import org.apache.poi.hwpf.HWPFXML;
|
||||||
|
import org.apache.poi.hwpf.usermodel.HWPFXMLDocument;
|
||||||
|
import org.apache.poi.hxf.HXFDocument;
|
||||||
|
import org.apache.xmlbeans.XmlException;
|
||||||
|
import org.openxml4j.exceptions.OpenXML4JException;
|
||||||
|
import org.openxml4j.opc.Package;
|
||||||
|
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTBody;
|
||||||
|
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTP;
|
||||||
|
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTR;
|
||||||
|
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTText;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Helper class to extract text from an OOXML Word file
|
||||||
|
*/
|
||||||
|
public class HXFWordExtractor extends POIXMLTextExtractor {
|
||||||
|
private HWPFXMLDocument document;
|
||||||
|
|
||||||
|
public HXFWordExtractor(Package container) throws XmlException, OpenXML4JException, IOException {
|
||||||
|
this(new HWPFXMLDocument(
|
||||||
|
new HWPFXML(container)
|
||||||
|
));
|
||||||
|
}
|
||||||
|
public HXFWordExtractor(HWPFXMLDocument document) {
|
||||||
|
super(document);
|
||||||
|
this.document = document;
|
||||||
|
}
|
||||||
|
|
||||||
|
public static void main(String[] args) throws Exception {
|
||||||
|
if(args.length < 1) {
|
||||||
|
System.err.println("Use:");
|
||||||
|
System.err.println(" HXFWordExtractor <filename.xlsx>");
|
||||||
|
System.exit(1);
|
||||||
|
}
|
||||||
|
POIXMLTextExtractor extractor =
|
||||||
|
new HXFWordExtractor(HXFDocument.openPackage(
|
||||||
|
new File(args[0])
|
||||||
|
));
|
||||||
|
System.out.println(extractor.getText());
|
||||||
|
}
|
||||||
|
|
||||||
|
public String getText() {
|
||||||
|
CTBody body = document._getHWPFXML().getDocumentBody();
|
||||||
|
StringBuffer text = new StringBuffer();
|
||||||
|
|
||||||
|
// Loop over paragraphs
|
||||||
|
CTP[] ps = body.getPArray();
|
||||||
|
for (int i = 0; i < ps.length; i++) {
|
||||||
|
// Loop over ranges
|
||||||
|
CTR[] rs = ps[i].getRArray();
|
||||||
|
for (int j = 0; j < rs.length; j++) {
|
||||||
|
// Loop over text runs
|
||||||
|
CTText[] texts = rs[j].getTArray();
|
||||||
|
for (int k = 0; k < texts.length; k++) {
|
||||||
|
text.append(
|
||||||
|
texts[k].getStringValue()
|
||||||
|
);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// New line after each paragraph.
|
||||||
|
text.append("\n");
|
||||||
|
}
|
||||||
|
|
||||||
|
return text.toString();
|
||||||
|
}
|
||||||
|
}
|
@ -0,0 +1,36 @@
|
|||||||
|
/* ====================================================================
|
||||||
|
Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
contributor license agreements. See the NOTICE file distributed with
|
||||||
|
this work for additional information regarding copyright ownership.
|
||||||
|
The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
(the "License"); you may not use this file except in compliance with
|
||||||
|
the License. You may obtain a copy of the License at
|
||||||
|
|
||||||
|
http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
|
||||||
|
Unless required by applicable law or agreed to in writing, software
|
||||||
|
distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
See the License for the specific language governing permissions and
|
||||||
|
limitations under the License.
|
||||||
|
==================================================================== */
|
||||||
|
package org.apache.poi.hwpf.usermodel;
|
||||||
|
|
||||||
|
import org.apache.poi.POIXMLDocument;
|
||||||
|
import org.apache.poi.hwpf.HWPFXML;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* High level representation of a ooxml text document.
|
||||||
|
*/
|
||||||
|
public class HWPFXMLDocument extends POIXMLDocument {
|
||||||
|
private HWPFXML hwpfXML;
|
||||||
|
|
||||||
|
public HWPFXMLDocument(HWPFXML xml) {
|
||||||
|
super(xml);
|
||||||
|
this.hwpfXML = xml;
|
||||||
|
}
|
||||||
|
|
||||||
|
public HWPFXML _getHWPFXML() {
|
||||||
|
return hwpfXML;
|
||||||
|
}
|
||||||
|
}
|
@ -18,7 +18,6 @@ package org.apache.poi.hwpf;
|
|||||||
|
|
||||||
import java.io.File;
|
import java.io.File;
|
||||||
|
|
||||||
import org.apache.poi.hssf.HSSFXML;
|
|
||||||
import org.apache.poi.hxf.HXFDocument;
|
import org.apache.poi.hxf.HXFDocument;
|
||||||
import org.openxml4j.opc.Package;
|
import org.openxml4j.opc.Package;
|
||||||
import org.openxml4j.opc.PackagePart;
|
import org.openxml4j.opc.PackagePart;
|
||||||
|
@ -0,0 +1,117 @@
|
|||||||
|
/* ====================================================================
|
||||||
|
Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
contributor license agreements. See the NOTICE file distributed with
|
||||||
|
this work for additional information regarding copyright ownership.
|
||||||
|
The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
(the "License"); you may not use this file except in compliance with
|
||||||
|
the License. You may obtain a copy of the License at
|
||||||
|
|
||||||
|
http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
|
||||||
|
Unless required by applicable law or agreed to in writing, software
|
||||||
|
distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
See the License for the specific language governing permissions and
|
||||||
|
limitations under the License.
|
||||||
|
==================================================================== */
|
||||||
|
package org.apache.poi.hwpf.extractor;
|
||||||
|
|
||||||
|
import java.io.File;
|
||||||
|
|
||||||
|
import org.apache.poi.hwpf.HWPFXML;
|
||||||
|
import org.apache.poi.hwpf.usermodel.HWPFXMLDocument;
|
||||||
|
import org.apache.poi.hxf.HXFDocument;
|
||||||
|
|
||||||
|
import junit.framework.TestCase;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Tests for HXFWordExtractor
|
||||||
|
*/
|
||||||
|
public class TestHXFWordExtractor extends TestCase {
|
||||||
|
/**
|
||||||
|
* A very simple file
|
||||||
|
*/
|
||||||
|
private HWPFXML xmlA;
|
||||||
|
/**
|
||||||
|
* A fairly complex file
|
||||||
|
*/
|
||||||
|
private HWPFXML xmlB;
|
||||||
|
|
||||||
|
protected void setUp() throws Exception {
|
||||||
|
super.setUp();
|
||||||
|
|
||||||
|
File fileA = new File(
|
||||||
|
System.getProperty("HWPF.testdata.path") +
|
||||||
|
File.separator + "sample.docx"
|
||||||
|
);
|
||||||
|
File fileB = new File(
|
||||||
|
System.getProperty("HWPF.testdata.path") +
|
||||||
|
File.separator + "IllustrativeCases.docx"
|
||||||
|
);
|
||||||
|
|
||||||
|
xmlA = new HWPFXML(HXFDocument.openPackage(fileA));
|
||||||
|
xmlB = new HWPFXML(HXFDocument.openPackage(fileB));
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Get text out of the simple file
|
||||||
|
*/
|
||||||
|
public void testGetSimpleText() throws Exception {
|
||||||
|
new HXFWordExtractor(xmlA.getPackage());
|
||||||
|
new HXFWordExtractor(new HWPFXMLDocument(xmlA));
|
||||||
|
|
||||||
|
HXFWordExtractor extractor =
|
||||||
|
new HXFWordExtractor(xmlA.getPackage());
|
||||||
|
extractor.getText();
|
||||||
|
|
||||||
|
String text = extractor.getText();
|
||||||
|
assertTrue(text.length() > 0);
|
||||||
|
|
||||||
|
// Check contents
|
||||||
|
assertTrue(text.startsWith(
|
||||||
|
"Lorem ipsum dolor sit amet, consectetuer adipiscing elit. Nunc at risus vel erat tempus posuere. Aenean non ante. Suspendisse vehicula dolor sit amet odio."
|
||||||
|
));
|
||||||
|
assertTrue(text.endsWith(
|
||||||
|
"Phasellus ultricies mi nec leo. Sed tempus. In sit amet lorem at velit faucibus vestibulum.\n"
|
||||||
|
));
|
||||||
|
|
||||||
|
// Check number of paragraphs
|
||||||
|
int ps = 0;
|
||||||
|
char[] t = text.toCharArray();
|
||||||
|
for (int i = 0; i < t.length; i++) {
|
||||||
|
if(t[i] == '\n') { ps++; }
|
||||||
|
}
|
||||||
|
assertEquals(3, ps);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Tests getting the text out of a complex file
|
||||||
|
*/
|
||||||
|
public void testGetComplexText() throws Exception {
|
||||||
|
HXFWordExtractor extractor =
|
||||||
|
new HXFWordExtractor(xmlB.getPackage());
|
||||||
|
extractor.getText();
|
||||||
|
|
||||||
|
String text = extractor.getText();
|
||||||
|
assertTrue(text.length() > 0);
|
||||||
|
|
||||||
|
char euro = '\u20ac';
|
||||||
|
System.err.println("'"+text.substring(text.length() - 20) + "'");
|
||||||
|
|
||||||
|
// Check contents
|
||||||
|
assertTrue(text.startsWith(
|
||||||
|
" \n(V) ILLUSTRATIVE CASES\n\n"
|
||||||
|
));
|
||||||
|
assertTrue(text.endsWith(
|
||||||
|
"As well as gaining "+euro+"90 from child benefit increases, he will also receive the early childhood supplement of "+euro+"250 per quarter for Vincent for the full four quarters of the year.\n\n\n\n \n\n\n"
|
||||||
|
));
|
||||||
|
|
||||||
|
// Check number of paragraphs
|
||||||
|
int ps = 0;
|
||||||
|
char[] t = text.toCharArray();
|
||||||
|
for (int i = 0; i < t.length; i++) {
|
||||||
|
if(t[i] == '\n') { ps++; }
|
||||||
|
}
|
||||||
|
assertEquals(79, ps);
|
||||||
|
}
|
||||||
|
}
|
Loading…
Reference in New Issue
Block a user