diff --git a/src/documentation/content/xdocs/status.xml b/src/documentation/content/xdocs/status.xml index ddd44109b..7842c6136 100644 --- a/src/documentation/content/xdocs/status.xml +++ b/src/documentation/content/xdocs/status.xml @@ -34,6 +34,7 @@ + ExtractorFactory can now be told to prefer Event Based extractors (currently Excel only) on a per-thread or overall basis 48544 - avoid failures in XLSX2CSV when shared string table is missing 48571 - properly close all IO streams created in OPCPackage 48572 - always copy all declared inner classes and interfaces when generating poi-ooxml-schemas diff --git a/src/java/org/apache/poi/hssf/eventusermodel/HSSFEventFactory.java b/src/java/org/apache/poi/hssf/eventusermodel/HSSFEventFactory.java index a11aee96e..ede5612cb 100644 --- a/src/java/org/apache/poi/hssf/eventusermodel/HSSFEventFactory.java +++ b/src/java/org/apache/poi/hssf/eventusermodel/HSSFEventFactory.java @@ -22,6 +22,7 @@ import java.io.IOException; import org.apache.poi.hssf.eventusermodel.HSSFUserException; import org.apache.poi.hssf.record.*; +import org.apache.poi.poifs.filesystem.DirectoryNode; import org.apache.poi.poifs.filesystem.POIFSFileSystem; /** @@ -51,11 +52,33 @@ public class HSSFEventFactory { * @param fs a POIFS filesystem containing your workbook */ public void processWorkbookEvents(HSSFRequest req, POIFSFileSystem fs) throws IOException { - InputStream in = fs.createDocumentInputStream("Workbook"); - - processEvents(req, in); + processWorkbookEvents(req, fs.getRoot()); } + /** + * Processes a file into essentially record events. + * + * @param req an Instance of HSSFRequest which has your registered listeners + * @param dir a POIFS directory containing your workbook + */ + public void processWorkbookEvents(HSSFRequest req, DirectoryNode dir) throws IOException { + InputStream in = dir.createDocumentInputStream("Workbook"); + + processEvents(req, in); + } + + /** + * Processes a file into essentially record events. 
+ * + * @param req an Instance of HSSFRequest which has your registered listeners + * @param fs a POIFS filesystem containing your workbook + * @return numeric user-specified result code. + */ + public short abortableProcessWorkbookEvents(HSSFRequest req, POIFSFileSystem fs) + throws IOException, HSSFUserException { + return abortableProcessWorkbookEvents(req, fs.getRoot()); + } + /** * Processes a file into essentially record events. * @@ -63,9 +86,9 @@ public class HSSFEventFactory { * @param fs a POIFS filesystem containing your workbook * @return numeric user-specified result code. */ - public short abortableProcessWorkbookEvents(HSSFRequest req, POIFSFileSystem fs) + public short abortableProcessWorkbookEvents(HSSFRequest req, DirectoryNode dir) throws IOException, HSSFUserException { - InputStream in = fs.createDocumentInputStream("Workbook"); + InputStream in = dir.createDocumentInputStream("Workbook"); return abortableProcessEvents(req, in); } diff --git a/src/java/org/apache/poi/hssf/extractor/EventBasedExcelExtractor.java b/src/java/org/apache/poi/hssf/extractor/EventBasedExcelExtractor.java index 7b28ed55e..15560e19c 100644 --- a/src/java/org/apache/poi/hssf/extractor/EventBasedExcelExtractor.java +++ b/src/java/org/apache/poi/hssf/extractor/EventBasedExcelExtractor.java @@ -46,6 +46,7 @@ import org.apache.poi.hssf.record.SSTRecord; import org.apache.poi.hssf.record.StringRecord; import org.apache.poi.hssf.usermodel.HSSFDateUtil; import org.apache.poi.hssf.usermodel.HSSFWorkbook; +import org.apache.poi.poifs.filesystem.DirectoryNode; import org.apache.poi.poifs.filesystem.POIFSFileSystem; /** @@ -65,15 +66,28 @@ import org.apache.poi.poifs.filesystem.POIFSFileSystem; * http://svn.apache.org/repos/asf/poi/trunk/src/examples/src/org/apache/poi/hssf/eventusermodel/examples/XLS2CSVmra.java */ public class EventBasedExcelExtractor extends POIOLE2TextExtractor { + private DirectoryNode _dir; private POIFSFileSystem _fs; boolean _includeSheetNames = true; 
boolean _formulasNotResults = false; - public EventBasedExcelExtractor(POIFSFileSystem fs) { + public EventBasedExcelExtractor(DirectoryNode dir, POIFSFileSystem fs) { super(null); + _dir = dir; _fs = fs; } + public EventBasedExcelExtractor(POIFSFileSystem fs) { + this(fs.getRoot(), fs); + } + /** + * Return the underlying POIFS FileSystem of + * this document. + */ + public POIFSFileSystem getFileSystem() { + return _fs; + } + /** * Would return the document information metadata for the document, * if we supported it @@ -134,7 +148,7 @@ public class EventBasedExcelExtractor extends POIOLE2TextExtractor { HSSFRequest request = new HSSFRequest(); request.addListenerForAllRecords(ft); - factory.processWorkbookEvents(request, _fs); + factory.processWorkbookEvents(request, _dir); return tl; } diff --git a/src/ooxml/java/org/apache/poi/extractor/ExtractorFactory.java b/src/ooxml/java/org/apache/poi/extractor/ExtractorFactory.java index 24a2632be..ed7f22ac8 100644 --- a/src/ooxml/java/org/apache/poi/extractor/ExtractorFactory.java +++ b/src/ooxml/java/org/apache/poi/extractor/ExtractorFactory.java @@ -36,6 +36,7 @@ import org.apache.poi.hslf.extractor.PowerPointExtractor; import org.apache.poi.hsmf.MAPIMessage; import org.apache.poi.hsmf.datatypes.AttachmentChunks; import org.apache.poi.hsmf.extractor.OutlookTextExtactor; +import org.apache.poi.hssf.extractor.EventBasedExcelExtractor; import org.apache.poi.hssf.extractor.ExcelExtractor; import org.apache.poi.hwpf.extractor.WordExtractor; import org.apache.poi.openxml4j.exceptions.InvalidFormatException; @@ -63,6 +64,59 @@ public class ExtractorFactory { public static final String CORE_DOCUMENT_REL = "http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument"; + + /** Should this thread prefer event based over usermodel based extractors? 
*/ + private static final ThreadLocal threadPreferEventExtractors = new ThreadLocal() { + protected Boolean initialValue() { return Boolean.FALSE; } + }; + /** Should all threads prefer event based over usermodel based extractors? */ + private static Boolean allPreferEventExtractors; + + /** + * Should this thread prefer event based over usermodel based extractors? + * (usermodel extractors tend to be more accurate, but use more memory) + * Default is false. + */ + public static boolean getThreadPrefersEventExtractors() { + return threadPreferEventExtractors.get(); + } + /** + * Should all threads prefer event based over usermodel based extractors? + * (usermodel extractors tend to be more accurate, but use more memory) + * Default is to use the thread level setting, which defaults to false. + */ + public static Boolean getAllThreadsPreferEventExtractors() { + return allPreferEventExtractors; + } + + /** + * Should this thread prefer event based over usermodel based extractors? + * Will only be used if the All Threads setting is null. + */ + public static void setThreadPrefersEventExtractors(boolean preferEventExtractors) { + threadPreferEventExtractors.set(preferEventExtractors); + } + /** + * Should all threads prefer event based over usermodel based extractors? + * If set, will take precedence over the Thread level setting. + */ + public static void setAllThreadsPreferEventExtractors(Boolean preferEventExtractors) { + allPreferEventExtractors = preferEventExtractors; + } + + + /** + * Should this thread use event based extractors if available? + * Checks the all-threads one first, then thread specific. 
+ */ + protected static boolean getPreferEventExtractor() { + if(allPreferEventExtractors != null) { + return allPreferEventExtractors; + } + return threadPreferEventExtractors.get(); + } + + public static POITextExtractor createExtractor(File f) throws IOException, InvalidFormatException, OpenXML4JException, XmlException { InputStream inp = new PushbackInputStream( new FileInputStream(f), 8); @@ -106,7 +160,12 @@ public class ExtractorFactory { corePart.getContentType().equals(XSSFRelation.MACRO_ADDIN_WORKBOOK.getContentType()) || corePart.getContentType().equals(XSSFRelation.TEMPLATE_WORKBOOK.getContentType()) || corePart.getContentType().equals(XSSFRelation.MACROS_WORKBOOK.getContentType())) { - return new XSSFExcelExtractor(pkg); + if(getPreferEventExtractor()) { + // TODO + return new XSSFExcelExtractor(pkg); + } else { + return new XSSFExcelExtractor(pkg); + } } if(corePart.getContentType().equals(XWPFRelation.DOCUMENT.getContentType()) || @@ -148,7 +207,11 @@ public class ExtractorFactory { Entry entry = entries.next(); if(entry.getName().equals("Workbook")) { - return new ExcelExtractor(poifsDir, fs); + if(getPreferEventExtractor()) { + return new EventBasedExcelExtractor(poifsDir, fs); + } else { + return new ExcelExtractor(poifsDir, fs); + } } if(entry.getName().equals("WordDocument")) { return new WordExtractor(poifsDir, fs); diff --git a/src/ooxml/testcases/org/apache/poi/extractor/TestExtractorFactory.java b/src/ooxml/testcases/org/apache/poi/extractor/TestExtractorFactory.java index c127427a2..81f55cc9f 100644 --- a/src/ooxml/testcases/org/apache/poi/extractor/TestExtractorFactory.java +++ b/src/ooxml/testcases/org/apache/poi/extractor/TestExtractorFactory.java @@ -27,6 +27,7 @@ import org.apache.poi.hdgf.extractor.VisioTextExtractor; import org.apache.poi.hpbf.extractor.PublisherTextExtractor; import org.apache.poi.hslf.extractor.PowerPointExtractor; import org.apache.poi.hsmf.extractor.OutlookTextExtactor; +import 
org.apache.poi.hssf.extractor.EventBasedExcelExtractor; import org.apache.poi.hssf.extractor.ExcelExtractor; import org.apache.poi.hwpf.extractor.WordExtractor; import org.apache.poi.poifs.filesystem.POIFSFileSystem; @@ -390,6 +391,72 @@ public class TestExtractorFactory extends TestCase { // Good } } + + public void testPreferEventBased() throws Exception { + assertEquals(false, ExtractorFactory.getPreferEventExtractor()); + assertEquals(false, ExtractorFactory.getThreadPrefersEventExtractors()); + assertEquals(null, ExtractorFactory.getAllThreadsPreferEventExtractors()); + + ExtractorFactory.setThreadPrefersEventExtractors(true); + + assertEquals(true, ExtractorFactory.getPreferEventExtractor()); + assertEquals(true, ExtractorFactory.getThreadPrefersEventExtractors()); + assertEquals(null, ExtractorFactory.getAllThreadsPreferEventExtractors()); + + ExtractorFactory.setAllThreadsPreferEventExtractors(false); + + assertEquals(false, ExtractorFactory.getPreferEventExtractor()); + assertEquals(true, ExtractorFactory.getThreadPrefersEventExtractors()); + assertEquals(Boolean.FALSE, ExtractorFactory.getAllThreadsPreferEventExtractors()); + + ExtractorFactory.setAllThreadsPreferEventExtractors(null); + + assertEquals(true, ExtractorFactory.getPreferEventExtractor()); + assertEquals(true, ExtractorFactory.getThreadPrefersEventExtractors()); + assertEquals(null, ExtractorFactory.getAllThreadsPreferEventExtractors()); + + + // Check we get the right extractors now + assertTrue( + ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(xls))) + instanceof EventBasedExcelExtractor + ); + assertTrue( + ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(xls))).getText().length() > 200 + ); + + assertTrue( + ExtractorFactory.createExtractor(OPCPackage.open(xlsx.toString())) + instanceof XSSFExcelExtractor // TODO + ); + assertTrue( + ExtractorFactory.createExtractor(OPCPackage.open(xlsx.toString())).getText().length() > 200 + ); + + + // 
Put back to normal + ExtractorFactory.setThreadPrefersEventExtractors(false); + assertEquals(false, ExtractorFactory.getPreferEventExtractor()); + assertEquals(false, ExtractorFactory.getThreadPrefersEventExtractors()); + assertEquals(null, ExtractorFactory.getAllThreadsPreferEventExtractors()); + + // And back + assertTrue( + ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(xls))) + instanceof ExcelExtractor + ); + assertTrue( + ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(xls))).getText().length() > 200 + ); + + assertTrue( + ExtractorFactory.createExtractor(OPCPackage.open(xlsx.toString())) + instanceof XSSFExcelExtractor + ); + assertTrue( + ExtractorFactory.createExtractor(OPCPackage.open(xlsx.toString())).getText().length() > 200 + ); + } /** * Test embeded docs text extraction. For now, only