New event based xssf text extractor (XSSFEventBasedExcelExtractor)

git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@903182 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Nick Burch 2010-01-26 11:39:44 +00:00
parent 90700558cd
commit a2e6cafca9
7 changed files with 513 additions and 8 deletions

View File

@ -34,6 +34,7 @@
<changes>
<release version="3.7-SNAPSHOT" date="2010-??-??">
<action dev="POI-DEVELOPERS" type="add">New event based xssf text extractor (XSSFEventBasedExcelExtractor)</action>
<action dev="POI-DEVELOPERS" type="add">ExtractorFactory can now be told to prefer Event Based extractors (current Excel only) on a per-thread or overall basis</action>
<action dev="POI-DEVELOPERS" type="fix">48544 - avoid failures in XLSX2CSV when shared string table is missing</action>
<action dev="POI-DEVELOPERS" type="fix">48571 - properly close all IO streams created in OPCPackage</action>

View File

@ -50,6 +50,7 @@ import org.apache.poi.poifs.filesystem.Entry;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
import org.apache.poi.xslf.XSLFSlideShow;
import org.apache.poi.xslf.extractor.XSLFPowerPointExtractor;
import org.apache.poi.xssf.extractor.XSSFEventBasedExcelExtractor;
import org.apache.poi.xssf.extractor.XSSFExcelExtractor;
import org.apache.poi.xssf.usermodel.XSSFRelation;
import org.apache.poi.xwpf.extractor.XWPFWordExtractor;
@ -161,8 +162,7 @@ public class ExtractorFactory {
corePart.getContentType().equals(XSSFRelation.TEMPLATE_WORKBOOK.getContentType()) ||
corePart.getContentType().equals(XSSFRelation.MACROS_WORKBOOK.getContentType())) {
if(getPreferEventExtractor()) {
// TODO
return new XSSFExcelExtractor(pkg);
return new XSSFEventBasedExcelExtractor(pkg);
} else {
return new XSSFExcelExtractor(pkg);
}

View File

@ -0,0 +1,357 @@
/* ====================================================================
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==================================================================== */
package org.apache.poi.xssf.extractor;
import java.io.IOException;
import java.io.InputStream;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.parsers.SAXParser;
import javax.xml.parsers.SAXParserFactory;
import org.apache.poi.POIXMLTextExtractor;
import org.apache.poi.openxml4j.exceptions.OpenXML4JException;
import org.apache.poi.openxml4j.opc.OPCPackage;
import org.apache.poi.ss.usermodel.BuiltinFormats;
import org.apache.poi.ss.usermodel.DataFormatter;
import org.apache.poi.xssf.eventusermodel.ReadOnlySharedStringsTable;
import org.apache.poi.xssf.eventusermodel.XSSFReader;
import org.apache.poi.xssf.model.StylesTable;
import org.apache.poi.xssf.usermodel.XSSFCellStyle;
import org.apache.poi.xssf.usermodel.XSSFRichTextString;
import org.apache.xmlbeans.XmlException;
import org.xml.sax.Attributes;
import org.xml.sax.ContentHandler;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.XMLReader;
import org.xml.sax.helpers.DefaultHandler;
/**
* Implementation of a text extractor from OOXML Excel
* files that uses SAX event based parsing.
*/
public class XSSFEventBasedExcelExtractor extends POIXMLTextExtractor {
private OPCPackage container;
private boolean includeSheetNames = true;
private boolean formulasNotResults = false;
/**
* These are the different kinds of cells we support.
* We keep track of the current one between
* the start and end.
*/
enum xssfDataType {
BOOLEAN,
ERROR,
FORMULA,
INLINE_STRING,
SST_STRING,
NUMBER,
}
public XSSFEventBasedExcelExtractor(String path) throws XmlException, OpenXML4JException, IOException {
this(OPCPackage.open(path));
}
public XSSFEventBasedExcelExtractor(OPCPackage container) throws XmlException, OpenXML4JException, IOException {
super(null);
this.container = container;
}
public static void main(String[] args) throws Exception {
if(args.length < 1) {
System.err.println("Use:");
System.err.println(" XSSFEventBasedExcelExtractor <filename.xlsx>");
System.exit(1);
}
POIXMLTextExtractor extractor =
new XSSFEventBasedExcelExtractor(args[0]);
System.out.println(extractor.getText());
}
/**
* Should sheet names be included? Default is true
*/
public void setIncludeSheetNames(boolean includeSheetNames) {
this.includeSheetNames = includeSheetNames;
}
/**
* Should we return the formula itself, and not
* the result it produces? Default is false
*/
public void setFormulasNotResults(boolean formulasNotResults) {
this.formulasNotResults = formulasNotResults;
}
/**
* Handler for sheets. Processes each row and cell,
* formatting Cells as best as it can.
*/
class MyXSSFSheetHandler extends DefaultHandler {
/**
* Table with the styles used for formatting
*/
private StylesTable stylesTable;
private ReadOnlySharedStringsTable sharedStringsTable;
/**
* Where our text is going
*/
private final StringBuffer output;
// Set when V start element is seen
private boolean vIsOpen;
// Set when F start element is seen
private boolean fIsOpen;
// Set when cell start element is seen;
// used when cell close element is seen.
private xssfDataType nextDataType;
// Used to format numeric cell values.
private short formatIndex;
private String formatString;
private final DataFormatter formatter;
// Gathers characters as they are seen.
private StringBuffer value = new StringBuffer();
private StringBuffer formula = new StringBuffer();
private boolean firstCellOfRow = true;
/**
* Accepts objects needed while parsing.
*
* @param styles Table of styles
* @param strings Table of shared strings
* @param cols Minimum number of columns to show
* @param target Sink for output
*/
public MyXSSFSheetHandler(
StylesTable styles,
ReadOnlySharedStringsTable strings,
StringBuffer output) {
this.stylesTable = styles;
this.sharedStringsTable = strings;
this.output = output;
this.nextDataType = xssfDataType.NUMBER;
this.formatter = new DataFormatter();
}
public void startElement(String uri, String localName, String name,
Attributes attributes) throws SAXException {
if ("inlineStr".equals(name) || "v".equals(name)) {
vIsOpen = true;
// Clear contents cache
value.setLength(0);
} else if ("f".equals(name)) {
// Clear contents cache
formula.setLength(0);
// Mark us as being a formula if not already
if(nextDataType == xssfDataType.NUMBER) {
nextDataType = xssfDataType.FORMULA;
}
// Decide where to get the formula string from
String type = attributes.getValue("t");
if(type != null && type.equals("shared")) {
System.err.println("Warning - shared formulas not yet supported!");
} else {
fIsOpen = true;
}
}
else if("row".equals(name)) {
firstCellOfRow = true;
}
// c => cell
else if ("c".equals(name)) {
// Set up defaults.
this.nextDataType = xssfDataType.NUMBER;
this.formatIndex = -1;
this.formatString = null;
String cellType = attributes.getValue("t");
String cellStyleStr = attributes.getValue("s");
if ("b".equals(cellType))
nextDataType = xssfDataType.BOOLEAN;
else if ("e".equals(cellType))
nextDataType = xssfDataType.ERROR;
else if ("inlineStr".equals(cellType))
nextDataType = xssfDataType.INLINE_STRING;
else if ("s".equals(cellType))
nextDataType = xssfDataType.SST_STRING;
else if ("str".equals(cellType))
nextDataType = xssfDataType.FORMULA;
else if (cellStyleStr != null) {
// Number, but almost certainly with a special style or format
int styleIndex = Integer.parseInt(cellStyleStr);
XSSFCellStyle style = stylesTable.getStyleAt(styleIndex);
this.formatIndex = style.getDataFormat();
this.formatString = style.getDataFormatString();
if (this.formatString == null)
this.formatString = BuiltinFormats.getBuiltinFormat(this.formatIndex);
}
}
}
public void endElement(String uri, String localName, String name)
throws SAXException {
String thisStr = null;
// v => contents of a cell
if ("v".equals(name)) {
vIsOpen = false;
// Process the value contents as required, now we have it all
switch (nextDataType) {
case BOOLEAN:
char first = value.charAt(0);
thisStr = first == '0' ? "FALSE" : "TRUE";
break;
case ERROR:
thisStr = "ERROR:" + value.toString();
break;
case FORMULA:
if(formulasNotResults) {
thisStr = formula.toString();
} else {
thisStr = value.toString();
}
break;
case INLINE_STRING:
// TODO: have seen an example of this, so it's untested.
XSSFRichTextString rtsi = new XSSFRichTextString(value.toString());
thisStr = rtsi.toString();
break;
case SST_STRING:
String sstIndex = value.toString();
try {
int idx = Integer.parseInt(sstIndex);
XSSFRichTextString rtss = new XSSFRichTextString(sharedStringsTable.getEntryAt(idx));
thisStr = rtss.toString();
}
catch (NumberFormatException ex) {
System.err.println("Failed to parse SST index '" + sstIndex + "': " + ex.toString());
}
break;
case NUMBER:
String n = value.toString();
if (this.formatString != null)
thisStr = formatter.formatRawCellContents(Double.parseDouble(n), this.formatIndex, this.formatString);
else
thisStr = n;
break;
default:
thisStr = "(TODO: Unexpected type: " + nextDataType + ")";
break;
}
// Output
if(!firstCellOfRow) {
output.append('\t');
}
firstCellOfRow = false;
output.append(thisStr);
} else if ("f".equals(name)) {
fIsOpen = false;
} else if ("row".equals(name)) {
// Finish the line
output.append('\n');
}
}
/**
* Captures characters only if a suitable element is open.
* Originally was just "v"; extended for inlineStr also.
*/
public void characters(char[] ch, int start, int length)
throws SAXException {
if (vIsOpen) {
value.append(ch, start, length);
}
if (fIsOpen) {
formula.append(ch, start, length);
}
}
}
/**
* Processes the given sheet
*/
public void processSheet(
StringBuffer output,
StylesTable styles,
ReadOnlySharedStringsTable strings,
InputStream sheetInputStream)
throws IOException, SAXException {
InputSource sheetSource = new InputSource(sheetInputStream);
SAXParserFactory saxFactory = SAXParserFactory.newInstance();
try {
SAXParser saxParser = saxFactory.newSAXParser();
XMLReader sheetParser = saxParser.getXMLReader();
ContentHandler handler = new MyXSSFSheetHandler(styles, strings, output);
sheetParser.setContentHandler(handler);
sheetParser.parse(sheetSource);
} catch(ParserConfigurationException e) {
throw new RuntimeException("SAX parser appears to be broken - " + e.getMessage());
}
}
/**
* Processes the file and returns the text
*/
public String getText() {
try {
ReadOnlySharedStringsTable strings = new ReadOnlySharedStringsTable(container);
XSSFReader xssfReader = new XSSFReader(container);
StylesTable styles = xssfReader.getStylesTable();
XSSFReader.SheetIterator iter = (XSSFReader.SheetIterator) xssfReader.getSheetsData();
StringBuffer text = new StringBuffer();
while (iter.hasNext()) {
InputStream stream = iter.next();
if(includeSheetNames) {
text.append(iter.getSheetName());
text.append('\n');
}
processSheet(text, styles, strings, stream);
stream.close();
}
return text.toString();
} catch(IOException e) {
System.err.println(e);
return null;
} catch(SAXException se) {
System.err.println(se);
return null;
} catch(OpenXML4JException o4je) {
System.err.println(o4je);
return null;
}
}
}

View File

@ -56,7 +56,7 @@ public class XSSFExcelExtractor extends POIXMLTextExtractor implements org.apach
public static void main(String[] args) throws Exception {
if(args.length < 1) {
System.err.println("Use:");
System.err.println(" HXFExcelExtractor <filename.xlsx>");
System.err.println(" XSSFExcelExtractor <filename.xlsx>");
System.exit(1);
}
POIXMLTextExtractor extractor =

View File

@ -32,6 +32,7 @@ import org.apache.poi.hssf.extractor.ExcelExtractor;
import org.apache.poi.hwpf.extractor.WordExtractor;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
import org.apache.poi.xslf.extractor.XSLFPowerPointExtractor;
import org.apache.poi.xssf.extractor.XSSFEventBasedExcelExtractor;
import org.apache.poi.xssf.extractor.XSSFExcelExtractor;
import org.apache.poi.xwpf.extractor.XWPFWordExtractor;
@ -427,7 +428,7 @@ public class TestExtractorFactory extends TestCase {
assertTrue(
ExtractorFactory.createExtractor(OPCPackage.open(xlsx.toString()))
instanceof XSSFExcelExtractor // TODO
instanceof XSSFEventBasedExcelExtractor
);
assertTrue(
ExtractorFactory.createExtractor(OPCPackage.open(xlsx.toString())).getText().length() > 200

View File

@ -19,18 +19,14 @@ package org.apache.poi.xssf;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import org.apache.poi.hssf.HSSFTestDataSamples;
import org.apache.poi.hssf.usermodel.HSSFWorkbook;
import org.apache.poi.openxml4j.exceptions.InvalidFormatException;
import org.apache.poi.openxml4j.opc.OPCPackage;
import org.apache.poi.ss.usermodel.Workbook;
import org.apache.poi.xssf.usermodel.XSSFWorkbook;
import org.apache.poi.util.TempFile;
/**
* Centralises logic for finding/opening sample files in the src/testcases/org/apache/poi/hssf/hssf/data folder.
@ -39,6 +35,15 @@ import org.apache.poi.util.TempFile;
*/
public class XSSFTestDataSamples {
public static OPCPackage openSamplePackage(String sampleName) {
try {
return OPCPackage.open(
HSSFTestDataSamples.openSampleFileStream(sampleName)
);
} catch(Exception e) {
throw new RuntimeException(e);
}
}
public static XSSFWorkbook openSampleWorkbook(String sampleName) {
InputStream is = HSSFTestDataSamples.openSampleFileStream(sampleName);
try {

View File

@ -0,0 +1,141 @@
/* ====================================================================
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==================================================================== */
package org.apache.poi.xssf.extractor;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import junit.framework.TestCase;
import org.apache.poi.POITextExtractor;
import org.apache.poi.hssf.HSSFTestDataSamples;
import org.apache.poi.hssf.extractor.ExcelExtractor;
import org.apache.poi.xssf.XSSFTestDataSamples;
/**
* Tests for {@link XSSFEventBasedExcelExtractor}
*/
public final class TestXSSFEventBasedExcelExtractor extends TestCase {
private static final XSSFEventBasedExcelExtractor getExtractor(String sampleName) throws Exception {
return new XSSFEventBasedExcelExtractor(XSSFTestDataSamples.
openSamplePackage(sampleName));
}
/**
* Get text out of the simple file
*/
public void testGetSimpleText() throws Exception {
// a very simple file
XSSFEventBasedExcelExtractor extractor = getExtractor("sample.xlsx");
extractor.getText();
String text = extractor.getText();
assertTrue(text.length() > 0);
// Check sheet names
assertTrue(text.startsWith("Sheet1"));
assertTrue(text.endsWith("Sheet3\n"));
// Now without, will have text
extractor.setIncludeSheetNames(false);
text = extractor.getText();
String CHUNK1 =
"Lorem\t111\n" +
"ipsum\t222\n" +
"dolor\t333\n" +
"sit\t444\n" +
"amet\t555\n" +
"consectetuer\t666\n" +
"adipiscing\t777\n" +
"elit\t888\n" +
"Nunc\t999\n";
String CHUNK2 =
"The quick brown fox jumps over the lazy dog\n" +
"hello, xssf hello, xssf\n" +
"hello, xssf hello, xssf\n" +
"hello, xssf hello, xssf\n" +
"hello, xssf hello, xssf\n";
assertEquals(
CHUNK1 +
"at\t4995\n" +
CHUNK2
, text);
// Now get formulas not their values
extractor.setFormulasNotResults(true);
text = extractor.getText();
assertEquals(
CHUNK1 +
"at\tSUM(B1:B9)\n" +
CHUNK2, text);
// With sheet names too
extractor.setIncludeSheetNames(true);
text = extractor.getText();
assertEquals(
"Sheet1\n" +
CHUNK1 +
"at\tSUM(B1:B9)\n" +
"rich test\n" +
CHUNK2 +
"Sheet3\n"
, text);
}
public void testGetComplexText() throws Exception {
// A fairly complex file
XSSFEventBasedExcelExtractor extractor = getExtractor("AverageTaxRates.xlsx");
extractor.getText();
String text = extractor.getText();
assertTrue(text.length() > 0);
// Might not have all formatting it should do!
assertTrue(text.startsWith(
"Avgtxfull\n" +
"(iii) AVERAGE TAX RATES ON ANNUAL"
));
}
/**
* Test that we return pretty much the same as
* ExcelExtractor does, when we're both passed
* the same file, just saved as xls and xlsx
*/
public void testComparedToOLE2() throws Exception {
// A fairly simple file - ooxml
XSSFEventBasedExcelExtractor ooxmlExtractor = getExtractor("SampleSS.xlsx");
ExcelExtractor ole2Extractor =
new ExcelExtractor(HSSFTestDataSamples.openSampleWorkbook("SampleSS.xls"));
POITextExtractor[] extractors =
new POITextExtractor[] { ooxmlExtractor, ole2Extractor };
for (int i = 0; i < extractors.length; i++) {
POITextExtractor extractor = extractors[i];
String text = extractor.getText().replaceAll("[\r\t]", "");
assertTrue(text.startsWith("First Sheet\nTest spreadsheet\n2nd row2nd row 2nd column\n"));
Pattern pattern = Pattern.compile(".*13(\\.0+)?\\s+Sheet3.*", Pattern.DOTALL);
Matcher m = pattern.matcher(text);
assertTrue(m.matches());
}
}
}