Support for extracting text from OOXML Word documents
git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@607560 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
40235d1b5e
commit
faf602abf5
@ -16,12 +16,14 @@
|
||||
==================================================================== */
|
||||
package org.apache.poi.hssf.extractor;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
|
||||
import org.apache.poi.POIXMLTextExtractor;
|
||||
import org.apache.poi.hssf.HSSFXML;
|
||||
import org.apache.poi.hssf.usermodel.HSSFXMLCell;
|
||||
import org.apache.poi.hssf.usermodel.HSSFXMLWorkbook;
|
||||
import org.apache.poi.hxf.HXFDocument;
|
||||
import org.apache.xmlbeans.XmlException;
|
||||
import org.openxml4j.exceptions.OpenXML4JException;
|
||||
import org.openxml4j.opc.Package;
|
||||
@ -30,6 +32,9 @@ import org.openxmlformats.schemas.spreadsheetml.x2006.main.CTRow;
|
||||
import org.openxmlformats.schemas.spreadsheetml.x2006.main.CTSheet;
|
||||
import org.openxmlformats.schemas.spreadsheetml.x2006.main.CTWorksheet;
|
||||
|
||||
/**
|
||||
* Helper class to extract text from an OOXML Excel file
|
||||
*/
|
||||
public class HXFExcelExtractor extends POIXMLTextExtractor {
|
||||
private HSSFXMLWorkbook workbook;
|
||||
private boolean includeSheetNames = true;
|
||||
@ -44,6 +49,19 @@ public class HXFExcelExtractor extends POIXMLTextExtractor {
|
||||
super(workbook);
|
||||
this.workbook = workbook;
|
||||
}
|
||||
|
||||
public static void main(String[] args) throws Exception {
|
||||
if(args.length < 1) {
|
||||
System.err.println("Use:");
|
||||
System.err.println(" HXFExcelExtractor <filename.xlsx>");
|
||||
System.exit(1);
|
||||
}
|
||||
POIXMLTextExtractor extractor =
|
||||
new HXFExcelExtractor(HXFDocument.openPackage(
|
||||
new File(args[0])
|
||||
));
|
||||
System.out.println(extractor.getText());
|
||||
}
|
||||
|
||||
/**
|
||||
* Should sheet names be included? Default is true
|
||||
|
@ -19,6 +19,12 @@ package org.apache.poi.hssf.usermodel;
|
||||
import org.apache.poi.POIXMLDocument;
|
||||
import org.apache.poi.hssf.HSSFXML;
|
||||
|
||||
/**
|
||||
* High level representation of an OOXML workbook.
|
||||
* This is the first object most users will construct whether
|
||||
* they are reading or writing a workbook. It is also the
|
||||
* top level object for creating new sheets/etc.
|
||||
*/
|
||||
public class HSSFXMLWorkbook extends POIXMLDocument {
|
||||
private HSSFXML hssfXML;
|
||||
|
||||
|
@ -0,0 +1,87 @@
|
||||
/* ====================================================================
|
||||
Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
contributor license agreements. See the NOTICE file distributed with
|
||||
this work for additional information regarding copyright ownership.
|
||||
The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
(the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
==================================================================== */
|
||||
package org.apache.poi.hwpf.extractor;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
|
||||
import org.apache.poi.POIXMLTextExtractor;
|
||||
import org.apache.poi.hwpf.HWPFXML;
|
||||
import org.apache.poi.hwpf.usermodel.HWPFXMLDocument;
|
||||
import org.apache.poi.hxf.HXFDocument;
|
||||
import org.apache.xmlbeans.XmlException;
|
||||
import org.openxml4j.exceptions.OpenXML4JException;
|
||||
import org.openxml4j.opc.Package;
|
||||
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTBody;
|
||||
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTP;
|
||||
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTR;
|
||||
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTText;
|
||||
|
||||
/**
|
||||
* Helper class to extract text from an OOXML Word file
|
||||
*/
|
||||
public class HXFWordExtractor extends POIXMLTextExtractor {
|
||||
private HWPFXMLDocument document;
|
||||
|
||||
public HXFWordExtractor(Package container) throws XmlException, OpenXML4JException, IOException {
|
||||
this(new HWPFXMLDocument(
|
||||
new HWPFXML(container)
|
||||
));
|
||||
}
|
||||
public HXFWordExtractor(HWPFXMLDocument document) {
|
||||
super(document);
|
||||
this.document = document;
|
||||
}
|
||||
|
||||
public static void main(String[] args) throws Exception {
|
||||
if(args.length < 1) {
|
||||
System.err.println("Use:");
|
||||
System.err.println(" HXFWordExtractor <filename.xlsx>");
|
||||
System.exit(1);
|
||||
}
|
||||
POIXMLTextExtractor extractor =
|
||||
new HXFWordExtractor(HXFDocument.openPackage(
|
||||
new File(args[0])
|
||||
));
|
||||
System.out.println(extractor.getText());
|
||||
}
|
||||
|
||||
public String getText() {
|
||||
CTBody body = document._getHWPFXML().getDocumentBody();
|
||||
StringBuffer text = new StringBuffer();
|
||||
|
||||
// Loop over paragraphs
|
||||
CTP[] ps = body.getPArray();
|
||||
for (int i = 0; i < ps.length; i++) {
|
||||
// Loop over ranges
|
||||
CTR[] rs = ps[i].getRArray();
|
||||
for (int j = 0; j < rs.length; j++) {
|
||||
// Loop over text runs
|
||||
CTText[] texts = rs[j].getTArray();
|
||||
for (int k = 0; k < texts.length; k++) {
|
||||
text.append(
|
||||
texts[k].getStringValue()
|
||||
);
|
||||
}
|
||||
}
|
||||
// New line after each paragraph.
|
||||
text.append("\n");
|
||||
}
|
||||
|
||||
return text.toString();
|
||||
}
|
||||
}
|
@ -0,0 +1,36 @@
|
||||
/* ====================================================================
|
||||
Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
contributor license agreements. See the NOTICE file distributed with
|
||||
this work for additional information regarding copyright ownership.
|
||||
The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
(the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
==================================================================== */
|
||||
package org.apache.poi.hwpf.usermodel;
|
||||
|
||||
import org.apache.poi.POIXMLDocument;
|
||||
import org.apache.poi.hwpf.HWPFXML;
|
||||
|
||||
/**
|
||||
* High level representation of a ooxml text document.
|
||||
*/
|
||||
public class HWPFXMLDocument extends POIXMLDocument {
|
||||
private HWPFXML hwpfXML;
|
||||
|
||||
public HWPFXMLDocument(HWPFXML xml) {
|
||||
super(xml);
|
||||
this.hwpfXML = xml;
|
||||
}
|
||||
|
||||
public HWPFXML _getHWPFXML() {
|
||||
return hwpfXML;
|
||||
}
|
||||
}
|
@ -18,7 +18,6 @@ package org.apache.poi.hwpf;
|
||||
|
||||
import java.io.File;
|
||||
|
||||
import org.apache.poi.hssf.HSSFXML;
|
||||
import org.apache.poi.hxf.HXFDocument;
|
||||
import org.openxml4j.opc.Package;
|
||||
import org.openxml4j.opc.PackagePart;
|
||||
|
@ -0,0 +1,117 @@
|
||||
/* ====================================================================
|
||||
Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
contributor license agreements. See the NOTICE file distributed with
|
||||
this work for additional information regarding copyright ownership.
|
||||
The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
(the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
==================================================================== */
|
||||
package org.apache.poi.hwpf.extractor;
|
||||
|
||||
import java.io.File;
|
||||
|
||||
import org.apache.poi.hwpf.HWPFXML;
|
||||
import org.apache.poi.hwpf.usermodel.HWPFXMLDocument;
|
||||
import org.apache.poi.hxf.HXFDocument;
|
||||
|
||||
import junit.framework.TestCase;
|
||||
|
||||
/**
|
||||
* Tests for HXFWordExtractor
|
||||
*/
|
||||
public class TestHXFWordExtractor extends TestCase {
|
||||
/**
|
||||
* A very simple file
|
||||
*/
|
||||
private HWPFXML xmlA;
|
||||
/**
|
||||
* A fairly complex file
|
||||
*/
|
||||
private HWPFXML xmlB;
|
||||
|
||||
protected void setUp() throws Exception {
|
||||
super.setUp();
|
||||
|
||||
File fileA = new File(
|
||||
System.getProperty("HWPF.testdata.path") +
|
||||
File.separator + "sample.docx"
|
||||
);
|
||||
File fileB = new File(
|
||||
System.getProperty("HWPF.testdata.path") +
|
||||
File.separator + "IllustrativeCases.docx"
|
||||
);
|
||||
|
||||
xmlA = new HWPFXML(HXFDocument.openPackage(fileA));
|
||||
xmlB = new HWPFXML(HXFDocument.openPackage(fileB));
|
||||
}
|
||||
|
||||
/**
|
||||
* Get text out of the simple file
|
||||
*/
|
||||
public void testGetSimpleText() throws Exception {
|
||||
new HXFWordExtractor(xmlA.getPackage());
|
||||
new HXFWordExtractor(new HWPFXMLDocument(xmlA));
|
||||
|
||||
HXFWordExtractor extractor =
|
||||
new HXFWordExtractor(xmlA.getPackage());
|
||||
extractor.getText();
|
||||
|
||||
String text = extractor.getText();
|
||||
assertTrue(text.length() > 0);
|
||||
|
||||
// Check contents
|
||||
assertTrue(text.startsWith(
|
||||
"Lorem ipsum dolor sit amet, consectetuer adipiscing elit. Nunc at risus vel erat tempus posuere. Aenean non ante. Suspendisse vehicula dolor sit amet odio."
|
||||
));
|
||||
assertTrue(text.endsWith(
|
||||
"Phasellus ultricies mi nec leo. Sed tempus. In sit amet lorem at velit faucibus vestibulum.\n"
|
||||
));
|
||||
|
||||
// Check number of paragraphs
|
||||
int ps = 0;
|
||||
char[] t = text.toCharArray();
|
||||
for (int i = 0; i < t.length; i++) {
|
||||
if(t[i] == '\n') { ps++; }
|
||||
}
|
||||
assertEquals(3, ps);
|
||||
}
|
||||
|
||||
/**
|
||||
* Tests getting the text out of a complex file
|
||||
*/
|
||||
public void testGetComplexText() throws Exception {
|
||||
HXFWordExtractor extractor =
|
||||
new HXFWordExtractor(xmlB.getPackage());
|
||||
extractor.getText();
|
||||
|
||||
String text = extractor.getText();
|
||||
assertTrue(text.length() > 0);
|
||||
|
||||
char euro = '\u20ac';
|
||||
System.err.println("'"+text.substring(text.length() - 20) + "'");
|
||||
|
||||
// Check contents
|
||||
assertTrue(text.startsWith(
|
||||
" \n(V) ILLUSTRATIVE CASES\n\n"
|
||||
));
|
||||
assertTrue(text.endsWith(
|
||||
"As well as gaining "+euro+"90 from child benefit increases, he will also receive the early childhood supplement of "+euro+"250 per quarter for Vincent for the full four quarters of the year.\n\n\n\n \n\n\n"
|
||||
));
|
||||
|
||||
// Check number of paragraphs
|
||||
int ps = 0;
|
||||
char[] t = text.toCharArray();
|
||||
for (int i = 0; i < t.length; i++) {
|
||||
if(t[i] == '\n') { ps++; }
|
||||
}
|
||||
assertEquals(79, ps);
|
||||
}
|
||||
}
|
Loading…
Reference in New Issue
Block a user