Most of the support suggested by Phil Varner on the list - ExtractorFactory can now be told to prefer Event Based extractors (currently Excel only) on a per-thread or overall basis

git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@902927 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Nick Burch 2010-01-25 19:02:13 +00:00
parent 7854649016
commit 6e2c32e1c5
5 changed files with 177 additions and 9 deletions

View File

@ -34,6 +34,7 @@
<changes> <changes>
<release version="3.7-SNAPSHOT" date="2010-??-??"> <release version="3.7-SNAPSHOT" date="2010-??-??">
<action dev="POI-DEVELOPERS" type="add">ExtractorFactory can now be told to prefer Event Based extractors (currently Excel only) on a per-thread or overall basis</action>
<action dev="POI-DEVELOPERS" type="fix">48544 - avoid failures in XLSX2CSV when shared string table is missing</action> <action dev="POI-DEVELOPERS" type="fix">48544 - avoid failures in XLSX2CSV when shared string table is missing</action>
<action dev="POI-DEVELOPERS" type="fix">48571 - properly close all IO streams created in OPCPackage</action> <action dev="POI-DEVELOPERS" type="fix">48571 - properly close all IO streams created in OPCPackage</action>
<action dev="POI-DEVELOPERS" type="fix">48572 - always copy all declared inner classes and interfaces when generating poi-ooxml-schemas</action> <action dev="POI-DEVELOPERS" type="fix">48572 - always copy all declared inner classes and interfaces when generating poi-ooxml-schemas</action>

View File

@ -22,6 +22,7 @@ import java.io.IOException;
import org.apache.poi.hssf.eventusermodel.HSSFUserException; import org.apache.poi.hssf.eventusermodel.HSSFUserException;
import org.apache.poi.hssf.record.*; import org.apache.poi.hssf.record.*;
import org.apache.poi.poifs.filesystem.DirectoryNode;
import org.apache.poi.poifs.filesystem.POIFSFileSystem; import org.apache.poi.poifs.filesystem.POIFSFileSystem;
/** /**
@ -51,11 +52,33 @@ public class HSSFEventFactory {
* @param fs a POIFS filesystem containing your workbook * @param fs a POIFS filesystem containing your workbook
*/ */
public void processWorkbookEvents(HSSFRequest req, POIFSFileSystem fs) throws IOException { public void processWorkbookEvents(HSSFRequest req, POIFSFileSystem fs) throws IOException {
InputStream in = fs.createDocumentInputStream("Workbook"); processWorkbookEvents(req, fs.getRoot());
processEvents(req, in);
} }
/**
 * Processes the "Workbook" stream of a POIFS directory into record events.
 *
 * @param req an Instance of HSSFRequest which has your registered listeners
 * @param dir a POIFS directory containing your workbook
 * @throws IOException if an I/O error occurs while opening, reading or
 *         closing the "Workbook" stream
 */
public void processWorkbookEvents(HSSFRequest req, DirectoryNode dir) throws IOException {
    InputStream in = dir.createDocumentInputStream("Workbook");
    try {
        processEvents(req, in);
    } finally {
        // Release the stream even when record processing fails part way through
        in.close();
    }
}
/**
 * Processes a file into essentially record events, handing the work to the
 * DirectoryNode based variant using the root directory of the filesystem.
 *
 * @param req an Instance of HSSFRequest which has your registered listeners
 * @param fs a POIFS filesystem containing your workbook
 * @return numeric user-specified result code.
 */
public short abortableProcessWorkbookEvents(HSSFRequest req, POIFSFileSystem fs)
        throws IOException, HSSFUserException {
    DirectoryNode root = fs.getRoot();
    return abortableProcessWorkbookEvents(req, root);
}
/** /**
* Processes a file into essentially record events. * Processes a file into essentially record events.
* *
@ -63,9 +86,9 @@ public class HSSFEventFactory {
* @param fs a POIFS filesystem containing your workbook * @param fs a POIFS filesystem containing your workbook
* @return numeric user-specified result code. * @return numeric user-specified result code.
*/ */
public short abortableProcessWorkbookEvents(HSSFRequest req, POIFSFileSystem fs) public short abortableProcessWorkbookEvents(HSSFRequest req, DirectoryNode dir)
throws IOException, HSSFUserException { throws IOException, HSSFUserException {
InputStream in = fs.createDocumentInputStream("Workbook"); InputStream in = dir.createDocumentInputStream("Workbook");
return abortableProcessEvents(req, in); return abortableProcessEvents(req, in);
} }

View File

@ -46,6 +46,7 @@ import org.apache.poi.hssf.record.SSTRecord;
import org.apache.poi.hssf.record.StringRecord; import org.apache.poi.hssf.record.StringRecord;
import org.apache.poi.hssf.usermodel.HSSFDateUtil; import org.apache.poi.hssf.usermodel.HSSFDateUtil;
import org.apache.poi.hssf.usermodel.HSSFWorkbook; import org.apache.poi.hssf.usermodel.HSSFWorkbook;
import org.apache.poi.poifs.filesystem.DirectoryNode;
import org.apache.poi.poifs.filesystem.POIFSFileSystem; import org.apache.poi.poifs.filesystem.POIFSFileSystem;
/** /**
@ -65,15 +66,28 @@ import org.apache.poi.poifs.filesystem.POIFSFileSystem;
* http://svn.apache.org/repos/asf/poi/trunk/src/examples/src/org/apache/poi/hssf/eventusermodel/examples/XLS2CSVmra.java</link> * http://svn.apache.org/repos/asf/poi/trunk/src/examples/src/org/apache/poi/hssf/eventusermodel/examples/XLS2CSVmra.java</link>
*/ */
public class EventBasedExcelExtractor extends POIOLE2TextExtractor { public class EventBasedExcelExtractor extends POIOLE2TextExtractor {
private DirectoryNode _dir;
private POIFSFileSystem _fs; private POIFSFileSystem _fs;
boolean _includeSheetNames = true; boolean _includeSheetNames = true;
boolean _formulasNotResults = false; boolean _formulasNotResults = false;
public EventBasedExcelExtractor(POIFSFileSystem fs) { public EventBasedExcelExtractor(DirectoryNode dir, POIFSFileSystem fs) {
super(null); super(null);
_dir = dir;
_fs = fs; _fs = fs;
} }
/**
 * Creates an extractor for a whole POIFS filesystem, using the root
 * directory of the filesystem as the directory to read from.
 *
 * @param fs the POIFS filesystem containing the workbook
 */
public EventBasedExcelExtractor(POIFSFileSystem fs) {
this(fs.getRoot(), fs);
}
/**
 * Gives access to the POIFS FileSystem that this
 * extractor was constructed with.
 *
 * @return the underlying POIFS FileSystem of this document
 */
public POIFSFileSystem getFileSystem() {
    return this._fs;
}
/** /**
* Would return the document information metadata for the document, * Would return the document information metadata for the document,
* if we supported it * if we supported it
@ -134,7 +148,7 @@ public class EventBasedExcelExtractor extends POIOLE2TextExtractor {
HSSFRequest request = new HSSFRequest(); HSSFRequest request = new HSSFRequest();
request.addListenerForAllRecords(ft); request.addListenerForAllRecords(ft);
factory.processWorkbookEvents(request, _fs); factory.processWorkbookEvents(request, _dir);
return tl; return tl;
} }

View File

@ -36,6 +36,7 @@ import org.apache.poi.hslf.extractor.PowerPointExtractor;
import org.apache.poi.hsmf.MAPIMessage; import org.apache.poi.hsmf.MAPIMessage;
import org.apache.poi.hsmf.datatypes.AttachmentChunks; import org.apache.poi.hsmf.datatypes.AttachmentChunks;
import org.apache.poi.hsmf.extractor.OutlookTextExtactor; import org.apache.poi.hsmf.extractor.OutlookTextExtactor;
import org.apache.poi.hssf.extractor.EventBasedExcelExtractor;
import org.apache.poi.hssf.extractor.ExcelExtractor; import org.apache.poi.hssf.extractor.ExcelExtractor;
import org.apache.poi.hwpf.extractor.WordExtractor; import org.apache.poi.hwpf.extractor.WordExtractor;
import org.apache.poi.openxml4j.exceptions.InvalidFormatException; import org.apache.poi.openxml4j.exceptions.InvalidFormatException;
@ -63,6 +64,59 @@ public class ExtractorFactory {
public static final String CORE_DOCUMENT_REL = public static final String CORE_DOCUMENT_REL =
"http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument"; "http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument";
/**
 * Should this thread prefer event based over usermodel based extractors?
 * Every thread starts out with a value of false.
 */
private static final ThreadLocal<Boolean> threadPreferEventExtractors = new ThreadLocal<Boolean>() {
    @Override
    protected Boolean initialValue() { return Boolean.FALSE; }
};
/**
 * Should all threads prefer event based over usermodel based extractors?
 * Null means "not set", in which case the thread level setting is used.
 * Declared volatile because it is written and read by different threads.
 */
private static volatile Boolean allPreferEventExtractors;

/**
 * Should this thread prefer event based over usermodel based extractors?
 * (usermodel extractors tend to be more accurate, but use more memory)
 * Default is false.
 *
 * @return the setting for the calling thread
 */
public static boolean getThreadPrefersEventExtractors() {
    return threadPreferEventExtractors.get().booleanValue();
}
/**
 * Should all threads prefer event based over usermodel based extractors?
 * (usermodel extractors tend to be more accurate, but use more memory)
 * Default is to use the thread level setting, which defaults to false.
 *
 * @return the all-threads setting, or null if it has not been set
 */
public static Boolean getAllThreadsPreferEventExtractors() {
    return allPreferEventExtractors;
}

/**
 * Should this thread prefer event based over usermodel based extractors?
 * Will only be used if the All Threads setting is null.
 *
 * @param preferEventExtractors the new setting for the calling thread
 */
public static void setThreadPrefersEventExtractors(boolean preferEventExtractors) {
    threadPreferEventExtractors.set(Boolean.valueOf(preferEventExtractors));
}
/**
 * Should all threads prefer event based over usermodel based extractors?
 * If set, will take preference over the Thread level setting.
 *
 * @param preferEventExtractors the new all-threads setting, or null to
 *        fall back to the thread level settings
 */
public static void setAllThreadsPreferEventExtractors(Boolean preferEventExtractors) {
    allPreferEventExtractors = preferEventExtractors;
}

/**
 * Should this thread use event based extractors if available?
 * Checks the all-threads one first, then thread specific.
 *
 * @return the effective preference for the calling thread
 */
protected static boolean getPreferEventExtractor() {
    // Read the shared field exactly once: if another thread reset it to null
    // between a null check and the unboxing return, a second read would
    // throw a NullPointerException.
    Boolean allThreads = allPreferEventExtractors;
    if(allThreads != null) {
        return allThreads.booleanValue();
    }
    return threadPreferEventExtractors.get().booleanValue();
}
public static POITextExtractor createExtractor(File f) throws IOException, InvalidFormatException, OpenXML4JException, XmlException { public static POITextExtractor createExtractor(File f) throws IOException, InvalidFormatException, OpenXML4JException, XmlException {
InputStream inp = new PushbackInputStream( InputStream inp = new PushbackInputStream(
new FileInputStream(f), 8); new FileInputStream(f), 8);
@ -106,7 +160,12 @@ public class ExtractorFactory {
corePart.getContentType().equals(XSSFRelation.MACRO_ADDIN_WORKBOOK.getContentType()) || corePart.getContentType().equals(XSSFRelation.MACRO_ADDIN_WORKBOOK.getContentType()) ||
corePart.getContentType().equals(XSSFRelation.TEMPLATE_WORKBOOK.getContentType()) || corePart.getContentType().equals(XSSFRelation.TEMPLATE_WORKBOOK.getContentType()) ||
corePart.getContentType().equals(XSSFRelation.MACROS_WORKBOOK.getContentType())) { corePart.getContentType().equals(XSSFRelation.MACROS_WORKBOOK.getContentType())) {
return new XSSFExcelExtractor(pkg); if(getPreferEventExtractor()) {
// TODO: return an event based XSSF extractor here once one is available
return new XSSFExcelExtractor(pkg);
} else {
return new XSSFExcelExtractor(pkg);
}
} }
if(corePart.getContentType().equals(XWPFRelation.DOCUMENT.getContentType()) || if(corePart.getContentType().equals(XWPFRelation.DOCUMENT.getContentType()) ||
@ -148,7 +207,11 @@ public class ExtractorFactory {
Entry entry = entries.next(); Entry entry = entries.next();
if(entry.getName().equals("Workbook")) { if(entry.getName().equals("Workbook")) {
return new ExcelExtractor(poifsDir, fs); if(getPreferEventExtractor()) {
return new EventBasedExcelExtractor(poifsDir, fs);
} else {
return new ExcelExtractor(poifsDir, fs);
}
} }
if(entry.getName().equals("WordDocument")) { if(entry.getName().equals("WordDocument")) {
return new WordExtractor(poifsDir, fs); return new WordExtractor(poifsDir, fs);

View File

@ -27,6 +27,7 @@ import org.apache.poi.hdgf.extractor.VisioTextExtractor;
import org.apache.poi.hpbf.extractor.PublisherTextExtractor; import org.apache.poi.hpbf.extractor.PublisherTextExtractor;
import org.apache.poi.hslf.extractor.PowerPointExtractor; import org.apache.poi.hslf.extractor.PowerPointExtractor;
import org.apache.poi.hsmf.extractor.OutlookTextExtactor; import org.apache.poi.hsmf.extractor.OutlookTextExtactor;
import org.apache.poi.hssf.extractor.EventBasedExcelExtractor;
import org.apache.poi.hssf.extractor.ExcelExtractor; import org.apache.poi.hssf.extractor.ExcelExtractor;
import org.apache.poi.hwpf.extractor.WordExtractor; import org.apache.poi.hwpf.extractor.WordExtractor;
import org.apache.poi.poifs.filesystem.POIFSFileSystem; import org.apache.poi.poifs.filesystem.POIFSFileSystem;
@ -390,6 +391,72 @@ public class TestExtractorFactory extends TestCase {
// Good // Good
} }
} }
/**
 * Checks the thread level and all-threads "prefer event based extractors"
 * settings of ExtractorFactory, and that the factory hands back the
 * matching kind of extractor for each setting.
 * The settings are static, so they are always put back to their defaults
 * at the end, even if one of the assertions fails.
 */
public void testPreferEventBased() throws Exception {
    assertFalse(ExtractorFactory.getPreferEventExtractor());
    assertFalse(ExtractorFactory.getThreadPrefersEventExtractors());
    assertNull(ExtractorFactory.getAllThreadsPreferEventExtractors());

    try {
        ExtractorFactory.setThreadPrefersEventExtractors(true);
        assertTrue(ExtractorFactory.getPreferEventExtractor());
        assertTrue(ExtractorFactory.getThreadPrefersEventExtractors());
        assertNull(ExtractorFactory.getAllThreadsPreferEventExtractors());

        // The all-threads setting wins over the thread level one
        ExtractorFactory.setAllThreadsPreferEventExtractors(false);
        assertFalse(ExtractorFactory.getPreferEventExtractor());
        assertTrue(ExtractorFactory.getThreadPrefersEventExtractors());
        assertEquals(Boolean.FALSE, ExtractorFactory.getAllThreadsPreferEventExtractors());

        // Clearing the all-threads setting exposes the thread level one again
        ExtractorFactory.setAllThreadsPreferEventExtractors(null);
        assertTrue(ExtractorFactory.getPreferEventExtractor());
        assertTrue(ExtractorFactory.getThreadPrefersEventExtractors());
        assertNull(ExtractorFactory.getAllThreadsPreferEventExtractors());

        // Check we get the right extractors now
        assertTrue(
                ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(xls)))
                instanceof EventBasedExcelExtractor
        );
        assertTrue(
                ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(xls))).getText().length() > 200
        );
        assertTrue(
                ExtractorFactory.createExtractor(OPCPackage.open(xlsx.toString()))
                instanceof XSSFExcelExtractor // TODO
        );
        assertTrue(
                ExtractorFactory.createExtractor(OPCPackage.open(xlsx.toString())).getText().length() > 200
        );

        // Put back to normal
        ExtractorFactory.setThreadPrefersEventExtractors(false);
        assertFalse(ExtractorFactory.getPreferEventExtractor());
        assertFalse(ExtractorFactory.getThreadPrefersEventExtractors());
        assertNull(ExtractorFactory.getAllThreadsPreferEventExtractors());

        // And back
        assertTrue(
                ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(xls)))
                instanceof ExcelExtractor
        );
        assertTrue(
                ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(xls))).getText().length() > 200
        );
        assertTrue(
                ExtractorFactory.createExtractor(OPCPackage.open(xlsx.toString()))
                instanceof XSSFExcelExtractor
        );
        assertTrue(
                ExtractorFactory.createExtractor(OPCPackage.open(xlsx.toString())).getText().length() > 200
        );
    } finally {
        // Never leak a changed preference into the tests that run after this one
        ExtractorFactory.setAllThreadsPreferEventExtractors(null);
        ExtractorFactory.setThreadPrefersEventExtractors(false);
    }
}
/** /**
* Test embeded docs text extraction. For now, only * Test embeded docs text extraction. For now, only