diff --git a/src/examples/src/org/apache/poi/xssf/eventusermodel/XLSX2CSV.java b/src/examples/src/org/apache/poi/xssf/eventusermodel/XLSX2CSV.java index e15b4ec0b..8e0007edf 100644 --- a/src/examples/src/org/apache/poi/xssf/eventusermodel/XLSX2CSV.java +++ b/src/examples/src/org/apache/poi/xssf/eventusermodel/XLSX2CSV.java @@ -23,32 +23,28 @@ import java.io.InputStream; import java.io.PrintStream; import javax.xml.parsers.ParserConfigurationException; -import javax.xml.parsers.SAXParser; -import javax.xml.parsers.SAXParserFactory; import org.apache.poi.openxml4j.exceptions.OpenXML4JException; import org.apache.poi.openxml4j.opc.OPCPackage; import org.apache.poi.openxml4j.opc.PackageAccess; -import org.apache.poi.ss.usermodel.BuiltinFormats; import org.apache.poi.ss.usermodel.DataFormatter; +import org.apache.poi.ss.util.CellReference; +import org.apache.poi.util.SAXHelper; import org.apache.poi.xssf.eventusermodel.XSSFSheetXMLHandler.SheetContentsHandler; import org.apache.poi.xssf.extractor.XSSFEventBasedExcelExtractor; import org.apache.poi.xssf.model.StylesTable; -import org.apache.poi.xssf.usermodel.XSSFCellStyle; -import org.apache.poi.xssf.usermodel.XSSFRichTextString; -import org.xml.sax.Attributes; +import org.apache.poi.xssf.usermodel.XSSFComment; import org.xml.sax.ContentHandler; import org.xml.sax.InputSource; import org.xml.sax.SAXException; import org.xml.sax.XMLReader; -import org.xml.sax.helpers.DefaultHandler; /** * A rudimentary XLSX -> CSV processor modeled on the - * POI sample program XLS2CSVmra by Nick Burch from the - * package org.apache.poi.hssf.eventusermodel.examples. - * Unlike the HSSF version, this one completely ignores - * missing rows. + * POI sample program XLS2CSVmra from the package + * org.apache.poi.hssf.eventusermodel.examples. + * As with the HSSF version, this tries to spot missing + * rows and cells, and output empty entries for them. *

* Data sheets are read using a SAX parser to keep the * memory footprint relatively small, so this should be @@ -61,263 +57,81 @@ import org.xml.sax.helpers.DefaultHandler; *

* For a more advanced implementation of SAX event parsing * of XLSX files, see {@link XSSFEventBasedExcelExtractor} - * and {@link XSSFSheetXMLHandler}. Note that some use cases, + * and {@link XSSFSheetXMLHandler}. Note that for many cases, * it may be possible to simply use those with a custom * {@link SheetContentsHandler} and no SAX code needed of * your own! */ public class XLSX2CSV { - /** - * The type of the data value is indicated by an attribute on the cell. - * The value is usually in a "v" element within the cell. + * Uses the XSSF Event SAX helpers to do most of the work + * of parsing the Sheet XML, and outputs the contents + * as a (basic) CSV. */ - enum xssfDataType { - BOOL, - ERROR, - FORMULA, - INLINESTR, - SSTINDEX, - NUMBER, + private class SheetToCSV implements SheetContentsHandler { + private boolean firstCellOfRow = false; + private int currentRow = -1; + private int currentCol = -1; + + private void outputMissingRows(int number) { + for (int i=0; i - * Also see Standard ECMA-376, 1st edition, part 4, pages 1928ff, at - * http://www.ecma-international.org/publications/standards/Ecma-376.htm - *

- * A web-friendly version is http://openiso.org/Ecma/376/Part4 - */ - class MyXSSFSheetHandler extends DefaultHandler { - - /** - * Table with styles - */ - private StylesTable stylesTable; - - /** - * Table with unique strings - */ - private ReadOnlySharedStringsTable sharedStringsTable; - - // Set when V start element is seen - private boolean vIsOpen; - - // Set when cell start element is seen; - // used when cell close element is seen. - private xssfDataType nextDataType; - - // Used to format numeric cell values. - private short formatIndex; - private String formatString; - private final DataFormatter formatter; - - private int thisColumn = -1; - // The last column printed to the output stream - private int lastColumnNumber = -1; - - // Gathers characters as they are seen. - private StringBuffer value; - - /** - * Accepts objects needed while parsing. - * - * @param styles Table of styles - * @param strings Table of shared strings - * @param cols Minimum number of columns to show - * @param target Sink for output - */ - public MyXSSFSheetHandler( - StylesTable styles, - ReadOnlySharedStringsTable strings) { - this.stylesTable = styles; - this.sharedStringsTable = strings; - this.value = new StringBuffer(); - this.nextDataType = xssfDataType.NUMBER; - this.formatter = new DataFormatter(); - } - - /* - * (non-Javadoc) - * @see org.xml.sax.helpers.DefaultHandler#startElement(java.lang.String, java.lang.String, java.lang.String, org.xml.sax.Attributes) - */ - public void startElement(String uri, String localName, String name, - Attributes attributes) throws SAXException { - - if ("inlineStr".equals(name) || "v".equals(name)) { - vIsOpen = true; - // Clear contents cache - value.setLength(0); - } - // c => cell - else if ("c".equals(name)) { - // Get the cell reference - String r = attributes.getValue("r"); - int firstDigit = -1; - for (int c = 0; c < r.length(); ++c) { - if (Character.isDigit(r.charAt(c))) { - firstDigit = c; - break; - } - } - thisColumn = nameToColumn(r.substring(0, firstDigit)); - - // Set up defaults. - this.nextDataType = xssfDataType.NUMBER; - this.formatIndex = -1; - this.formatString = null; - String cellType = attributes.getValue("t"); - String cellStyleStr = attributes.getValue("s"); - if ("b".equals(cellType)) { - nextDataType = xssfDataType.BOOL; - } else if ("e".equals(cellType)) { - nextDataType = xssfDataType.ERROR; - } else if ("inlineStr".equals(cellType)) { - nextDataType = xssfDataType.INLINESTR; - } else if ("s".equals(cellType)) { - nextDataType = xssfDataType.SSTINDEX; - } else if ("str".equals(cellType)) { - nextDataType = xssfDataType.FORMULA; - } else if (cellStyleStr != null) { - // It's a number, but almost certainly one - // with a special style or format - int styleIndex = Integer.parseInt(cellStyleStr); - XSSFCellStyle style = stylesTable.getStyleAt(styleIndex); - this.formatIndex = style.getDataFormat(); - this.formatString = style.getDataFormatString(); - if (this.formatString == null) { - this.formatString = BuiltinFormats.getBuiltinFormat(this.formatIndex); - } - } - } - - } - - /* - * (non-Javadoc) - * @see org.xml.sax.helpers.DefaultHandler#endElement(java.lang.String, java.lang.String, java.lang.String) - */ - public void endElement(String uri, String localName, String name) - throws SAXException { - - String thisStr = null; - - // v => contents of a cell - if ("v".equals(name)) { - // Process the value contents as required. - // Do now, as characters() may be called more than once - switch (nextDataType) { - - case BOOL: - char first = value.charAt(0); - thisStr = first == '0' ? "FALSE" : "TRUE"; - break; - - case ERROR: - thisStr = "\"ERROR:" + value.toString() + '"'; - break; - - case FORMULA: - // A formula could result in a string value, - // so always add double-quote characters. - thisStr = '"' + value.toString() + '"'; - break; - - case INLINESTR: - // TODO: have seen an example of this, so it's untested. - XSSFRichTextString rtsi = new XSSFRichTextString(value.toString()); - thisStr = '"' + rtsi.toString() + '"'; - break; - - case SSTINDEX: - String sstIndex = value.toString(); - try { - int idx = Integer.parseInt(sstIndex); - XSSFRichTextString rtss = new XSSFRichTextString(sharedStringsTable.getEntryAt(idx)); - thisStr = '"' + rtss.toString() + '"'; - } - catch (NumberFormatException ex) { - output.println("Failed to parse SST index '" + sstIndex + "': " + ex.toString()); - } - break; - - case NUMBER: - String n = value.toString(); - if (this.formatString != null && n.length() > 0) - thisStr = formatter.formatRawCellContents(Double.parseDouble(n), this.formatIndex, this.formatString); - else - thisStr = n; - break; - - default: - thisStr = "(TODO: Unexpected type: " + nextDataType + ")"; - break; - } - - // Output after we've seen the string contents - // Emit commas for any fields that were missing on this row - if (lastColumnNumber == -1) { - lastColumnNumber = 0; - } - for (int i = lastColumnNumber; i < thisColumn; ++i) - output.print(','); - - // Might be the empty string. - output.print(thisStr); - - // Update column - if (thisColumn > -1) - lastColumnNumber = thisColumn; - - } else if ("row".equals(name)) { - - // Print out any missing commas if needed - if (minColumns > 0) { - // Columns are 0 based - if (lastColumnNumber == -1) { - lastColumnNumber = 0; - } - for (int i = lastColumnNumber; i < minColumns; i++) { - output.print(','); - } - } - - // We're onto a new row - output.println(); - lastColumnNumber = -1; - } - - } - - /** - * Captures characters only if a suitable element is open. - * Originally was just "v"; extended for inlineStr also. - */ - public void characters(char[] ch, int start, int length) - throws SAXException { - if (vIsOpen) - value.append(ch, start, length); - } - - /** - * Converts an Excel column name like "C" to a zero-based index. - * - * @param name - * @return Index corresponding to the specified name - */ - private int nameToColumn(String name) { - int column = -1; - for (int i = 0; i < name.length(); ++i) { - int c = name.charAt(i); - column = (column + 1) * 26 + c - 'A'; - } - return column; - } - - } - /////////////////////////////////////// private final OPCPackage xlsxPackage; @@ -356,16 +170,20 @@ public class XLSX2CSV { public void processSheet( StylesTable styles, ReadOnlySharedStringsTable strings, + SheetContentsHandler sheetHandler, InputStream sheetInputStream) throws IOException, ParserConfigurationException, SAXException { - + DataFormatter formatter = new DataFormatter(); InputSource sheetSource = new InputSource(sheetInputStream); - SAXParserFactory saxFactory = SAXParserFactory.newInstance(); - SAXParser saxParser = saxFactory.newSAXParser(); - XMLReader sheetParser = saxParser.getXMLReader(); - ContentHandler handler = new MyXSSFSheetHandler(styles, strings); - sheetParser.setContentHandler(handler); - sheetParser.parse(sheetSource); + try { + XMLReader sheetParser = SAXHelper.newXMLReader(); + ContentHandler handler = new XSSFSheetXMLHandler( + styles, null, strings, sheetHandler, formatter, false); + sheetParser.setContentHandler(handler); + sheetParser.parse(sheetSource); + } catch(ParserConfigurationException e) { + throw new RuntimeException("SAX parser appears to be broken - " + e.getMessage()); + } } /** @@ -378,7 +196,6 @@ public class XLSX2CSV { */ public void process() throws IOException, OpenXML4JException, ParserConfigurationException, SAXException { - ReadOnlySharedStringsTable strings = new ReadOnlySharedStringsTable(this.xlsxPackage); XSSFReader xssfReader = new XSSFReader(this.xlsxPackage); StylesTable styles = xssfReader.getStylesTable(); @@ -389,7 +206,7 @@ public class XLSX2CSV { String sheetName = iter.getSheetName(); this.output.println(); this.output.println(sheetName + " [index=" + index + "]:"); - processSheet(styles, strings, stream); + processSheet(styles, strings, new SheetToCSV(), stream); stream.close(); ++index; } @@ -418,5 +235,4 @@ public class XLSX2CSV { xlsx2csv.process(); p.close(); } - } diff --git a/src/examples/src/org/apache/poi/xssf/eventusermodel/examples/FromHowTo.java b/src/examples/src/org/apache/poi/xssf/eventusermodel/examples/FromHowTo.java index 4df63448e..cd102144e 100644 --- a/src/examples/src/org/apache/poi/xssf/eventusermodel/examples/FromHowTo.java +++ b/src/examples/src/org/apache/poi/xssf/eventusermodel/examples/FromHowTo.java @@ -19,6 +19,7 @@ package org.apache.poi.xssf.eventusermodel.examples; import java.io.InputStream; import java.util.Iterator; +import org.apache.poi.xssf.eventusermodel.XLSX2CSV; import org.apache.poi.xssf.eventusermodel.XSSFReader; import org.apache.poi.xssf.model.SharedStringsTable; import org.apache.poi.xssf.usermodel.XSSFRichTextString; @@ -32,18 +33,21 @@ import org.xml.sax.helpers.DefaultHandler; import org.xml.sax.helpers.XMLReaderFactory; /** - * XSSF and SAX (Event API) + * XSSF and SAX (Event API) basic example. + * See {@link XLSX2CSV} for a fuller example of doing + * XSLX processing with the XSSF Event code. */ public class FromHowTo { - public void processOneSheet(String filename) throws Exception { + public void processFirstSheet(String filename) throws Exception { OPCPackage pkg = OPCPackage.open(filename); XSSFReader r = new XSSFReader( pkg ); SharedStringsTable sst = r.getSharedStringsTable(); XMLReader parser = fetchSheetParser(sst); - // rId2 found by processing the Workbook - // Seems to either be rId# or rSheet# + // To look up the Sheet Name / Sheet Order / rID, + // you need to process the core Workbook stream. + // Normally it's of the form rId# or rSheet# InputStream sheet2 = r.getSheet("rId2"); InputSource sheetSource = new InputSource(sheet2); parser.parse(sheetSource); @@ -133,7 +137,7 @@ public class FromHowTo { public static void main(String[] args) throws Exception { FromHowTo howto = new FromHowTo(); - howto.processOneSheet(args[0]); + howto.processFirstSheet(args[0]); howto.processAllSheets(args[0]); } }