From a2e6cafca9e1d652b53d7d8476b009d6e62d2cb2 Mon Sep 17 00:00:00 2001 From: Nick Burch Date: Tue, 26 Jan 2010 11:39:44 +0000 Subject: [PATCH] New event based xssf text extractor (XSSFEventBasedExcelExtractor) git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@903182 13f79535-47bb-0310-9956-ffa450edef68 --- src/documentation/content/xdocs/status.xml | 1 + .../poi/extractor/ExtractorFactory.java | 4 +- .../XSSFEventBasedExcelExtractor.java | 357 ++++++++++++++++++ .../xssf/extractor/XSSFExcelExtractor.java | 2 +- .../poi/extractor/TestExtractorFactory.java | 3 +- .../apache/poi/xssf/XSSFTestDataSamples.java | 13 +- .../TestXSSFEventBasedExcelExtractor.java | 141 +++++++ 7 files changed, 513 insertions(+), 8 deletions(-) create mode 100644 src/ooxml/java/org/apache/poi/xssf/extractor/XSSFEventBasedExcelExtractor.java create mode 100644 src/ooxml/testcases/org/apache/poi/xssf/extractor/TestXSSFEventBasedExcelExtractor.java diff --git a/src/documentation/content/xdocs/status.xml b/src/documentation/content/xdocs/status.xml index 7842c6136..79da11ad8 100644 --- a/src/documentation/content/xdocs/status.xml +++ b/src/documentation/content/xdocs/status.xml @@ -34,6 +34,7 @@ + New event based xssf text extractor (XSSFEventBasedExcelExtractor) ExtractorFactory can now be told to prefer Event Based extractors (current Excel only) on a per-thread or overall basis 48544 - avoid failures in XLSX2CSV when shared string table is missing 48571 - properly close all IO streams created in OPCPackage diff --git a/src/ooxml/java/org/apache/poi/extractor/ExtractorFactory.java b/src/ooxml/java/org/apache/poi/extractor/ExtractorFactory.java index ed7f22ac8..ee3da8c61 100644 --- a/src/ooxml/java/org/apache/poi/extractor/ExtractorFactory.java +++ b/src/ooxml/java/org/apache/poi/extractor/ExtractorFactory.java @@ -50,6 +50,7 @@ import org.apache.poi.poifs.filesystem.Entry; import org.apache.poi.poifs.filesystem.POIFSFileSystem; import org.apache.poi.xslf.XSLFSlideShow; import 
org.apache.poi.xslf.extractor.XSLFPowerPointExtractor; +import org.apache.poi.xssf.extractor.XSSFEventBasedExcelExtractor; import org.apache.poi.xssf.extractor.XSSFExcelExtractor; import org.apache.poi.xssf.usermodel.XSSFRelation; import org.apache.poi.xwpf.extractor.XWPFWordExtractor; @@ -161,8 +162,7 @@ public class ExtractorFactory { corePart.getContentType().equals(XSSFRelation.TEMPLATE_WORKBOOK.getContentType()) || corePart.getContentType().equals(XSSFRelation.MACROS_WORKBOOK.getContentType())) { if(getPreferEventExtractor()) { - // TODO - return new XSSFExcelExtractor(pkg); + return new XSSFEventBasedExcelExtractor(pkg); } else { return new XSSFExcelExtractor(pkg); } diff --git a/src/ooxml/java/org/apache/poi/xssf/extractor/XSSFEventBasedExcelExtractor.java b/src/ooxml/java/org/apache/poi/xssf/extractor/XSSFEventBasedExcelExtractor.java new file mode 100644 index 000000000..9be971189 --- /dev/null +++ b/src/ooxml/java/org/apache/poi/xssf/extractor/XSSFEventBasedExcelExtractor.java @@ -0,0 +1,357 @@ +/* ==================================================================== + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+==================================================================== */
+package org.apache.poi.xssf.extractor;
+
+import java.io.IOException;
+import java.io.InputStream;
+
+import javax.xml.parsers.ParserConfigurationException;
+import javax.xml.parsers.SAXParser;
+import javax.xml.parsers.SAXParserFactory;
+
+import org.apache.poi.POIXMLTextExtractor;
+import org.apache.poi.openxml4j.exceptions.OpenXML4JException;
+import org.apache.poi.openxml4j.opc.OPCPackage;
+import org.apache.poi.ss.usermodel.BuiltinFormats;
+import org.apache.poi.ss.usermodel.DataFormatter;
+import org.apache.poi.xssf.eventusermodel.ReadOnlySharedStringsTable;
+import org.apache.poi.xssf.eventusermodel.XSSFReader;
+import org.apache.poi.xssf.model.StylesTable;
+import org.apache.poi.xssf.usermodel.XSSFCellStyle;
+import org.apache.poi.xssf.usermodel.XSSFRichTextString;
+import org.apache.xmlbeans.XmlException;
+import org.xml.sax.Attributes;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.InputSource;
+import org.xml.sax.SAXException;
+import org.xml.sax.XMLReader;
+import org.xml.sax.helpers.DefaultHandler;
+
+/**
+ * Implementation of a text extractor from OOXML Excel
+ *  files that uses SAX event based parsing.
+ *
+ * Each sheet is streamed through a SAX handler which writes one
+ *  line of text per row, with the cells of a row separated by tabs.
+ */
+public class XSSFEventBasedExcelExtractor extends POIXMLTextExtractor {
+    private final OPCPackage container;
+    private boolean includeSheetNames = true;
+    private boolean formulasNotResults = false;
+
+    /**
+     * These are the different kinds of cells we support.
+     * We keep track of the current one between
+     *  the start and end.
+     */
+    enum xssfDataType {
+        BOOLEAN,
+        ERROR,
+        FORMULA,
+        INLINE_STRING,
+        SST_STRING,
+        NUMBER,
+    }
+
+    /**
+     * Opens the package at the given path and extracts from it.
+     *
+     * @param path location of the file, handed to OPCPackage.open
+     */
+    public XSSFEventBasedExcelExtractor(String path) throws XmlException, OpenXML4JException, IOException {
+        this(OPCPackage.open(path));
+    }
+
+    /**
+     * Extracts from an already opened package. The package is only
+     *  stored here; it is read when {@link #getText()} is called.
+     *
+     * @param container the package to read sheets, styles and shared strings from
+     */
+    public XSSFEventBasedExcelExtractor(OPCPackage container) throws XmlException, OpenXML4JException, IOException {
+        super(null);
+        this.container = container;
+    }
+
+    /**
+     * Command line entry point: prints the extracted text of the
+     *  file named by the first argument to standard output.
+     */
+    public static void main(String[] args) throws Exception {
+        if(args.length < 1) {
+            System.err.println("Use:");
+            // The file argument is mandatory, so name it in the usage text
+            System.err.println("  XSSFEventBasedExcelExtractor <filename.xlsx>");
+            System.exit(1);
+        }
+        POIXMLTextExtractor extractor =
+            new XSSFEventBasedExcelExtractor(args[0]);
+        System.out.println(extractor.getText());
+    }
+
+    /**
+     * Should sheet names be included? Default is true
+     */
+    public void setIncludeSheetNames(boolean includeSheetNames) {
+        this.includeSheetNames = includeSheetNames;
+    }
+    /**
+     * Should we return the formula itself, and not
+     *  the result it produces? Default is false
+     */
+    public void setFormulasNotResults(boolean formulasNotResults) {
+        this.formulasNotResults = formulasNotResults;
+    }
+
+
+    /**
+     * Handler for sheets. Processes each row and cell,
+     *  formatting Cells as best as it can.
+     *
+     * This is an inner (non static) class because it reads the
+     *  formulasNotResults setting of the enclosing extractor.
+     */
+    class MyXSSFSheetHandler extends DefaultHandler {
+        /**
+         * Table with the styles used for formatting
+         */
+        private final StylesTable stylesTable;
+
+        /**
+         * Table used to resolve shared string indexes to text
+         */
+        private final ReadOnlySharedStringsTable sharedStringsTable;
+
+        /**
+         * Where our text is going
+         */
+        private final StringBuffer output;
+
+        // Set while character data should be gathered into "value"
+        //  (inside a v element, or inside a t element of an inline string)
+        private boolean vIsOpen;
+        // Set when F start element is seen
+        private boolean fIsOpen;
+        // Set while inside the "is" (inline string) element of a cell
+        private boolean isIsOpen;
+        // Set when the current cell's f element is marked t="shared"
+        private boolean sharedFormula;
+
+        // Set when cell start element is seen;
+        // used when cell close element is seen.
+        private xssfDataType nextDataType;
+
+        // Used to format numeric cell values.
+        private short formatIndex;
+        private String formatString;
+        private final DataFormatter formatter;
+
+        // Gathers characters as they are seen.
+        private StringBuffer value = new StringBuffer();
+        private StringBuffer formula = new StringBuffer();
+        private boolean firstCellOfRow = true;
+
+        /**
+         * Accepts objects needed while parsing.
+         *
+         * @param styles  Table of styles
+         * @param strings Table of shared strings
+         * @param output  Sink for the extracted text
+         */
+        public MyXSSFSheetHandler(
+                StylesTable styles,
+                ReadOnlySharedStringsTable strings,
+                StringBuffer output) {
+            this.stylesTable = styles;
+            this.sharedStringsTable = strings;
+            this.output = output;
+            this.nextDataType = xssfDataType.NUMBER;
+            this.formatter = new DataFormatter();
+        }
+
+        /**
+         * Tracks which element we are in, and works out the
+         *  type and number format of each cell as it starts.
+         */
+        @Override
+        public void startElement(String uri, String localName, String name,
+                Attributes attributes) throws SAXException {
+
+            if ("v".equals(name)) {
+                vIsOpen = true;
+                // Clear contents cache
+                value.setLength(0);
+            } else if ("is".equals(name)) {
+                // Inline string: "inlineStr" is only ever the value of the
+                //  cell's t attribute, the text itself sits in is/t elements
+                isIsOpen = true;
+                value.setLength(0);
+            } else if ("t".equals(name)) {
+                // Text run of an inline string. The cache is not cleared
+                //  here, so that several runs are concatenated.
+                // NOTE(review): text of phonetic runs, if present, is not
+                //  filtered out.
+                if (isIsOpen) {
+                    vIsOpen = true;
+                }
+            } else if ("f".equals(name)) {
+                // Clear contents cache
+                formula.setLength(0);
+
+                // Mark us as being a formula if not already
+                if(nextDataType == xssfDataType.NUMBER) {
+                    nextDataType = xssfDataType.FORMULA;
+                }
+
+                // Shared formulas are not expanded, but any text the
+                //  element does carry is still captured
+                String type = attributes.getValue("t");
+                sharedFormula = "shared".equals(type);
+                fIsOpen = true;
+            }
+            else if("row".equals(name)) {
+                firstCellOfRow = true;
+            }
+            // c => cell
+            else if ("c".equals(name)) {
+                // Set up defaults.
+                this.nextDataType = xssfDataType.NUMBER;
+                this.formatIndex = -1;
+                this.formatString = null;
+                // Don't let the formula of an earlier cell leak into this one
+                this.formula.setLength(0);
+                this.sharedFormula = false;
+
+                String cellType = attributes.getValue("t");
+                String cellStyleStr = attributes.getValue("s");
+                if ("b".equals(cellType))
+                    nextDataType = xssfDataType.BOOLEAN;
+                else if ("e".equals(cellType))
+                    nextDataType = xssfDataType.ERROR;
+                else if ("inlineStr".equals(cellType))
+                    nextDataType = xssfDataType.INLINE_STRING;
+                else if ("s".equals(cellType))
+                    nextDataType = xssfDataType.SST_STRING;
+                else if ("str".equals(cellType))
+                    nextDataType = xssfDataType.FORMULA;
+                else if (cellStyleStr != null) {
+                    // Number, but almost certainly with a special style or format
+                    int styleIndex = Integer.parseInt(cellStyleStr);
+                    XSSFCellStyle style = stylesTable.getStyleAt(styleIndex);
+                    this.formatIndex = style.getDataFormat();
+                    this.formatString = style.getDataFormatString();
+                    if (this.formatString == null)
+                        this.formatString = BuiltinFormats.getBuiltinFormat(this.formatIndex);
+                }
+            }
+        }
+
+        /**
+         * Emits the text of a cell once its value (or inline string)
+         *  is complete, and ends the output line when a row ends.
+         */
+        @Override
+        public void endElement(String uri, String localName, String name)
+                throws SAXException {
+            // v => contents of a cell
+            if ("v".equals(name)) {
+                vIsOpen = false;
+                appendCell(formatCellValue());
+            } else if ("t".equals(name)) {
+                if (isIsOpen) {
+                    vIsOpen = false;
+                }
+            } else if ("is".equals(name)) {
+                isIsOpen = false;
+                // TODO: have not seen an example of this, so it's untested.
+                XSSFRichTextString rtsi = new XSSFRichTextString(value.toString());
+                appendCell(rtsi.toString());
+            } else if ("f".equals(name)) {
+                fIsOpen = false;
+            } else if ("row".equals(name)) {
+                // Finish the line
+                output.append('\n');
+            }
+        }
+
+        /**
+         * Turns the gathered contents of the v element into the text
+         *  to show for the cell, based on the type of the current cell.
+         *
+         * @return the text for the cell, never null
+         */
+        private String formatCellValue() {
+            String raw = value.toString();
+            String thisStr;
+
+            switch (nextDataType) {
+                case BOOLEAN:
+                    // An empty value has no first character to test
+                    if (raw.length() == 0) {
+                        thisStr = "";
+                    } else {
+                        thisStr = raw.charAt(0) == '0' ? "FALSE" : "TRUE";
+                    }
+                    break;
+
+                case ERROR:
+                    thisStr = "ERROR:" + raw;
+                    break;
+
+                case FORMULA:
+                    if (formulasNotResults && sharedFormula && formula.length() == 0) {
+                        // Cell uses a shared formula defined elsewhere, we have
+                        //  no formula text for it so show the result instead
+                        System.err.println("Warning - shared formulas not yet supported!");
+                        thisStr = raw;
+                    } else if (formulasNotResults) {
+                        thisStr = formula.toString();
+                    } else {
+                        thisStr = raw;
+                    }
+                    break;
+
+                case INLINE_STRING:
+                    XSSFRichTextString rtsi = new XSSFRichTextString(raw);
+                    thisStr = rtsi.toString();
+                    break;
+
+                case SST_STRING:
+                    try {
+                        int idx = Integer.parseInt(raw);
+                        XSSFRichTextString rtss = new XSSFRichTextString(sharedStringsTable.getEntryAt(idx));
+                        thisStr = rtss.toString();
+                    }
+                    catch (NumberFormatException ex) {
+                        System.err.println("Failed to parse SST index '" + raw + "': " + ex.toString());
+                        // Emit an empty cell rather than the text "null"
+                        thisStr = "";
+                    }
+                    break;
+
+                case NUMBER:
+                    thisStr = raw;
+                    if (this.formatString != null) {
+                        try {
+                            thisStr = formatter.formatRawCellContents(
+                                    Double.parseDouble(raw), this.formatIndex, this.formatString);
+                        } catch (NumberFormatException ignored) {
+                            // Not a parseable number, keep the raw text instead
+                            //  of aborting the whole extraction
+                        }
+                    }
+                    break;
+
+                default:
+                    thisStr = "(TODO: Unexpected type: " + nextDataType + ")";
+                    break;
+            }
+            return thisStr;
+        }
+
+        /**
+         * Writes the text of one cell to the output, preceded by
+         *  a tab if it is not the first cell written for the row.
+         */
+        private void appendCell(String text) {
+            if(!firstCellOfRow) {
+                output.append('\t');
+            }
+            firstCellOfRow = false;
+
+            output.append(text);
+        }
+
+        /**
+         * Captures characters only if a suitable element is open:
+         *  cell values (and inline string text) go to the value
+         *  cache, formula text goes to the formula cache.
+         */
+        @Override
+        public void characters(char[] ch, int start, int length)
+                throws SAXException {
+            if (vIsOpen) {
+                value.append(ch, start, length);
+            }
+            if (fIsOpen) {
+                formula.append(ch, start, length);
+            }
+        }
+    }
+
+    /**
+     * Processes the given sheet, appending its text to the output.
+     *
+     * @param output           buffer the sheet text is appended to
+     * @param styles           table of styles, used to format numbers
+     * @param strings          table of shared strings
+     * @param sheetInputStream the XML of the sheet; not closed by this method
+     * @throws RuntimeException if no SAX parser can be configured
+     */
+    public void processSheet(
+            StringBuffer output,
+            StylesTable styles,
+            ReadOnlySharedStringsTable strings,
+            InputStream sheetInputStream)
+            throws IOException, SAXException {
+
+        InputSource sheetSource = new InputSource(sheetInputStream);
+        // NOTE(review): the parser is created with the JAXP defaults; if the
+        //  files can be untrusted, consider restricting DTDs / external entities
+        SAXParserFactory saxFactory = SAXParserFactory.newInstance();
+        try {
+            SAXParser saxParser = saxFactory.newSAXParser();
+            XMLReader sheetParser = saxParser.getXMLReader();
+            ContentHandler handler = new MyXSSFSheetHandler(styles, strings, output);
+            sheetParser.setContentHandler(handler);
+            sheetParser.parse(sheetSource);
+        } catch(ParserConfigurationException e) {
+            // Keep the original exception as the cause, for its stack trace
+            throw new RuntimeException("SAX parser appears to be broken - " + e.getMessage(), e);
+        }
+    }
+
+    /**
+     * Processes the file and returns the text.
+     *
+     * @return the text of all the sheets, or null if the file could not
+     *  be read or parsed (the problem is reported on System.err)
+     */
+    @Override
+    public String getText() {
+        try {
+            ReadOnlySharedStringsTable strings = new ReadOnlySharedStringsTable(container);
+            XSSFReader xssfReader = new XSSFReader(container);
+            StylesTable styles = xssfReader.getStylesTable();
+            XSSFReader.SheetIterator iter = (XSSFReader.SheetIterator) xssfReader.getSheetsData();
+
+            StringBuffer text = new StringBuffer();
+            while (iter.hasNext()) {
+                InputStream stream = iter.next();
+                try {
+                    if(includeSheetNames) {
+                        text.append(iter.getSheetName());
+                        text.append('\n');
+                    }
+                    processSheet(text, styles, strings, stream);
+                } finally {
+                    // Release the sheet stream even if parsing failed
+                    stream.close();
+                }
+            }
+
+            return text.toString();
+        } catch(IOException e) {
+            System.err.println(e);
+            return null;
+        } catch(SAXException se) {
+            System.err.println(se);
+            return null;
+        } catch(OpenXML4JException o4je) {
+            System.err.println(o4je);
+            return null;
+        }
+    }
+}
diff --git a/src/ooxml/java/org/apache/poi/xssf/extractor/XSSFExcelExtractor.java
b/src/ooxml/java/org/apache/poi/xssf/extractor/XSSFExcelExtractor.java index b4b3c9d78..0f089e2db 100644 --- a/src/ooxml/java/org/apache/poi/xssf/extractor/XSSFExcelExtractor.java +++ b/src/ooxml/java/org/apache/poi/xssf/extractor/XSSFExcelExtractor.java @@ -56,7 +56,7 @@ public class XSSFExcelExtractor extends POIXMLTextExtractor implements org.apach public static void main(String[] args) throws Exception { if(args.length < 1) { System.err.println("Use:"); - System.err.println(" HXFExcelExtractor "); + System.err.println(" XSSFExcelExtractor "); System.exit(1); } POIXMLTextExtractor extractor = diff --git a/src/ooxml/testcases/org/apache/poi/extractor/TestExtractorFactory.java b/src/ooxml/testcases/org/apache/poi/extractor/TestExtractorFactory.java index 81f55cc9f..f4f178f22 100644 --- a/src/ooxml/testcases/org/apache/poi/extractor/TestExtractorFactory.java +++ b/src/ooxml/testcases/org/apache/poi/extractor/TestExtractorFactory.java @@ -32,6 +32,7 @@ import org.apache.poi.hssf.extractor.ExcelExtractor; import org.apache.poi.hwpf.extractor.WordExtractor; import org.apache.poi.poifs.filesystem.POIFSFileSystem; import org.apache.poi.xslf.extractor.XSLFPowerPointExtractor; +import org.apache.poi.xssf.extractor.XSSFEventBasedExcelExtractor; import org.apache.poi.xssf.extractor.XSSFExcelExtractor; import org.apache.poi.xwpf.extractor.XWPFWordExtractor; @@ -427,7 +428,7 @@ public class TestExtractorFactory extends TestCase { assertTrue( ExtractorFactory.createExtractor(OPCPackage.open(xlsx.toString())) - instanceof XSSFExcelExtractor // TODO + instanceof XSSFEventBasedExcelExtractor ); assertTrue( ExtractorFactory.createExtractor(OPCPackage.open(xlsx.toString())).getText().length() > 200 diff --git a/src/ooxml/testcases/org/apache/poi/xssf/XSSFTestDataSamples.java b/src/ooxml/testcases/org/apache/poi/xssf/XSSFTestDataSamples.java index 166c3b35e..0d52565e7 100644 --- a/src/ooxml/testcases/org/apache/poi/xssf/XSSFTestDataSamples.java +++ 
b/src/ooxml/testcases/org/apache/poi/xssf/XSSFTestDataSamples.java @@ -19,18 +19,14 @@ package org.apache.poi.xssf; import java.io.ByteArrayInputStream; import java.io.ByteArrayOutputStream; -import java.io.File; -import java.io.FileOutputStream; import java.io.IOException; import java.io.InputStream; import org.apache.poi.hssf.HSSFTestDataSamples; import org.apache.poi.hssf.usermodel.HSSFWorkbook; -import org.apache.poi.openxml4j.exceptions.InvalidFormatException; import org.apache.poi.openxml4j.opc.OPCPackage; import org.apache.poi.ss.usermodel.Workbook; import org.apache.poi.xssf.usermodel.XSSFWorkbook; -import org.apache.poi.util.TempFile; /** * Centralises logic for finding/opening sample files in the src/testcases/org/apache/poi/hssf/hssf/data folder. @@ -39,6 +35,15 @@ import org.apache.poi.util.TempFile; */ public class XSSFTestDataSamples { + public static OPCPackage openSamplePackage(String sampleName) { + try { + return OPCPackage.open( + HSSFTestDataSamples.openSampleFileStream(sampleName) + ); + } catch(Exception e) { + throw new RuntimeException(e); + } + } public static XSSFWorkbook openSampleWorkbook(String sampleName) { InputStream is = HSSFTestDataSamples.openSampleFileStream(sampleName); try { diff --git a/src/ooxml/testcases/org/apache/poi/xssf/extractor/TestXSSFEventBasedExcelExtractor.java b/src/ooxml/testcases/org/apache/poi/xssf/extractor/TestXSSFEventBasedExcelExtractor.java new file mode 100644 index 000000000..c1b024727 --- /dev/null +++ b/src/ooxml/testcases/org/apache/poi/xssf/extractor/TestXSSFEventBasedExcelExtractor.java @@ -0,0 +1,141 @@ +/* ==================================================================== + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. 
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License. You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+==================================================================== */
+
+package org.apache.poi.xssf.extractor;
+
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import junit.framework.TestCase;
+
+import org.apache.poi.POITextExtractor;
+import org.apache.poi.hssf.HSSFTestDataSamples;
+import org.apache.poi.hssf.extractor.ExcelExtractor;
+import org.apache.poi.xssf.XSSFTestDataSamples;
+
+/**
+ * Tests for {@link XSSFEventBasedExcelExtractor}
+ */
+public final class TestXSSFEventBasedExcelExtractor extends TestCase {
+
+    /**
+     * Matches text in which a 13 (optionally written as 13.0, 13.00, ...)
+     *  is followed by whitespace and then the Sheet3 name.
+     */
+    private static final Pattern THIRTEEN_THEN_SHEET3 =
+        Pattern.compile(".*13(\\.0+)?\\s+Sheet3.*", Pattern.DOTALL);
+
+    /**
+     * Builds an extractor over the named sample file.
+     */
+    private static XSSFEventBasedExcelExtractor getExtractor(String sampleName) throws Exception {
+        return new XSSFEventBasedExcelExtractor(XSSFTestDataSamples.
+                openSamplePackage(sampleName));
+    }
+
+    /**
+     * Get text out of the simple file
+     */
+    public void testGetSimpleText() throws Exception {
+        // a very simple file
+        XSSFEventBasedExcelExtractor extractor = getExtractor("sample.xlsx");
+        // The result of this first call is discarded; the
+        //  call below has to work on the same extractor again
+        extractor.getText();
+
+        String text = extractor.getText();
+        assertNotNull(text);
+        assertTrue(text.length() > 0);
+
+        // Check sheet names
+        assertTrue(text.startsWith("Sheet1"));
+        assertTrue(text.endsWith("Sheet3\n"));
+
+        // Now without, will have text
+        extractor.setIncludeSheetNames(false);
+        text = extractor.getText();
+        String CHUNK1 =
+            "Lorem\t111\n" +
+            "ipsum\t222\n" +
+            "dolor\t333\n" +
+            "sit\t444\n" +
+            "amet\t555\n" +
+            "consectetuer\t666\n" +
+            "adipiscing\t777\n" +
+            "elit\t888\n" +
+            "Nunc\t999\n";
+        // The extractor separates the cells of a row with a tab. Use the
+        //  \t escape rather than a literal tab character, so the expected
+        //  text survives editors and tools that normalise whitespace.
+        String CHUNK2 =
+            "The quick brown fox jumps over the lazy dog\n" +
+            "hello, xssf\thello, xssf\n" +
+            "hello, xssf\thello, xssf\n" +
+            "hello, xssf\thello, xssf\n" +
+            "hello, xssf\thello, xssf\n";
+        assertEquals(
+                CHUNK1 +
+                "at\t4995\n" +
+                CHUNK2
+                , text);
+
+        // Now get formulas not their values
+        extractor.setFormulasNotResults(true);
+        text = extractor.getText();
+        assertEquals(
+                CHUNK1 +
+                "at\tSUM(B1:B9)\n" +
+                CHUNK2, text);
+
+        // With sheet names too
+        extractor.setIncludeSheetNames(true);
+        text = extractor.getText();
+        assertEquals(
+                "Sheet1\n" +
+                CHUNK1 +
+                "at\tSUM(B1:B9)\n" +
+                "rich test\n" +
+                CHUNK2 +
+                "Sheet3\n"
+                , text);
+    }
+
+    /**
+     * Get text out of a more complex file, and check how it starts
+     */
+    public void testGetComplexText() throws Exception {
+        // A fairly complex file
+        XSSFEventBasedExcelExtractor extractor = getExtractor("AverageTaxRates.xlsx");
+        // As above, the extractor is deliberately asked twice
+        extractor.getText();
+
+        String text = extractor.getText();
+        assertNotNull(text);
+        assertTrue(text.length() > 0);
+
+        // Might not have all formatting it should do!
+        assertTrue(text.startsWith(
+                "Avgtxfull\n" +
+                "(iii) AVERAGE TAX RATES ON ANNUAL"
+        ));
+    }
+
+    /**
+     * Test that we return pretty much the same as
+     *  ExcelExtractor does, when we're both passed
+     *  the same file, just saved as xls and xlsx
+     */
+    public void testComparedToOLE2() throws Exception {
+        // A fairly simple file - ooxml
+        XSSFEventBasedExcelExtractor ooxmlExtractor = getExtractor("SampleSS.xlsx");
+
+        ExcelExtractor ole2Extractor =
+            new ExcelExtractor(HSSFTestDataSamples.openSampleWorkbook("SampleSS.xls"));
+
+        POITextExtractor[] extractors =
+            new POITextExtractor[] { ooxmlExtractor, ole2Extractor };
+        for (int i = 0; i < extractors.length; i++) {
+            POITextExtractor extractor = extractors[i];
+
+            String rawText = extractor.getText();
+            assertNotNull(rawText);
+
+            // Cell separators and carriage returns differ, so drop them
+            String text = rawText.replaceAll("[\r\t]", "");
+            assertTrue(text.startsWith("First Sheet\nTest spreadsheet\n2nd row2nd row 2nd column\n"));
+            Matcher m = THIRTEEN_THEN_SHEET3.matcher(text);
+            assertTrue(m.matches());
+        }
+    }
+}