Most of the support suggested by Phil Varner on the list - ExtractorFactory can now be told to prefer Event Based extractors (currently Excel only) on a per-thread or overall basis
git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@902927 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
7854649016
commit
6e2c32e1c5
@ -34,6 +34,7 @@
|
||||
|
||||
<changes>
|
||||
<release version="3.7-SNAPSHOT" date="2010-??-??">
|
||||
<action dev="POI-DEVELOPERS" type="add">ExtractorFactory can now be told to prefer Event Based extractors (currently Excel only) on a per-thread or overall basis</action>
|
||||
<action dev="POI-DEVELOPERS" type="fix">48544 - avoid failures in XLSX2CSV when shared string table is missing</action>
|
||||
<action dev="POI-DEVELOPERS" type="fix">48571 - properly close all IO streams created in OPCPackage</action>
|
||||
<action dev="POI-DEVELOPERS" type="fix">48572 - always copy all declared inner classes and interfaces when generating poi-ooxml-schemas</action>
|
||||
|
@ -22,6 +22,7 @@ import java.io.IOException;
|
||||
|
||||
import org.apache.poi.hssf.eventusermodel.HSSFUserException;
|
||||
import org.apache.poi.hssf.record.*;
|
||||
import org.apache.poi.poifs.filesystem.DirectoryNode;
|
||||
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
|
||||
|
||||
/**
|
||||
@ -51,7 +52,17 @@ public class HSSFEventFactory {
|
||||
* @param fs a POIFS filesystem containing your workbook
|
||||
*/
|
||||
public void processWorkbookEvents(HSSFRequest req, POIFSFileSystem fs) throws IOException {
|
||||
InputStream in = fs.createDocumentInputStream("Workbook");
|
||||
processWorkbookEvents(req, fs.getRoot());
|
||||
}
|
||||
|
||||
/**
|
||||
* Processes a file into essentially record events.
|
||||
*
|
||||
* @param req an Instance of HSSFRequest which has your registered listeners
|
||||
* @param dir a POIFS directory node containing the "Workbook" document of your workbook
|
||||
*/
|
||||
public void processWorkbookEvents(HSSFRequest req, DirectoryNode dir) throws IOException {
|
||||
InputStream in = dir.createDocumentInputStream("Workbook");
|
||||
|
||||
processEvents(req, in);
|
||||
}
|
||||
@ -65,7 +76,19 @@ public class HSSFEventFactory {
|
||||
*/
|
||||
public short abortableProcessWorkbookEvents(HSSFRequest req, POIFSFileSystem fs)
|
||||
throws IOException, HSSFUserException {
|
||||
InputStream in = fs.createDocumentInputStream("Workbook");
|
||||
return abortableProcessWorkbookEvents(req, fs.getRoot());
|
||||
}
|
||||
|
||||
/**
|
||||
* Processes a file into essentially record events.
|
||||
*
|
||||
* @param req an Instance of HSSFRequest which has your registered listeners
|
||||
* @param dir a POIFS directory node containing the "Workbook" document of your workbook
|
||||
* @return numeric user-specified result code.
|
||||
*/
|
||||
public short abortableProcessWorkbookEvents(HSSFRequest req, DirectoryNode dir)
|
||||
throws IOException, HSSFUserException {
|
||||
InputStream in = dir.createDocumentInputStream("Workbook");
|
||||
return abortableProcessEvents(req, in);
|
||||
}
|
||||
|
||||
|
@ -46,6 +46,7 @@ import org.apache.poi.hssf.record.SSTRecord;
|
||||
import org.apache.poi.hssf.record.StringRecord;
|
||||
import org.apache.poi.hssf.usermodel.HSSFDateUtil;
|
||||
import org.apache.poi.hssf.usermodel.HSSFWorkbook;
|
||||
import org.apache.poi.poifs.filesystem.DirectoryNode;
|
||||
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
|
||||
|
||||
/**
|
||||
@ -65,14 +66,27 @@ import org.apache.poi.poifs.filesystem.POIFSFileSystem;
|
||||
* http://svn.apache.org/repos/asf/poi/trunk/src/examples/src/org/apache/poi/hssf/eventusermodel/examples/XLS2CSVmra.java</link>
|
||||
*/
|
||||
public class EventBasedExcelExtractor extends POIOLE2TextExtractor {
|
||||
private DirectoryNode _dir;
|
||||
private POIFSFileSystem _fs;
|
||||
boolean _includeSheetNames = true;
|
||||
boolean _formulasNotResults = false;
|
||||
|
||||
public EventBasedExcelExtractor(POIFSFileSystem fs) {
|
||||
public EventBasedExcelExtractor(DirectoryNode dir, POIFSFileSystem fs) {
|
||||
super(null);
|
||||
_dir = dir;
|
||||
_fs = fs;
|
||||
}
|
||||
public EventBasedExcelExtractor(POIFSFileSystem fs) {
|
||||
this(fs.getRoot(), fs);
|
||||
}
|
||||
|
||||
/**
|
||||
* Return the underlying POIFS FileSystem of
|
||||
* this document.
|
||||
*/
|
||||
public POIFSFileSystem getFileSystem() {
|
||||
return _fs;
|
||||
}
|
||||
|
||||
/**
|
||||
* Would return the document information metadata for the document,
|
||||
@ -134,7 +148,7 @@ public class EventBasedExcelExtractor extends POIOLE2TextExtractor {
|
||||
HSSFRequest request = new HSSFRequest();
|
||||
request.addListenerForAllRecords(ft);
|
||||
|
||||
factory.processWorkbookEvents(request, _fs);
|
||||
factory.processWorkbookEvents(request, _dir);
|
||||
|
||||
return tl;
|
||||
}
|
||||
|
@ -36,6 +36,7 @@ import org.apache.poi.hslf.extractor.PowerPointExtractor;
|
||||
import org.apache.poi.hsmf.MAPIMessage;
|
||||
import org.apache.poi.hsmf.datatypes.AttachmentChunks;
|
||||
import org.apache.poi.hsmf.extractor.OutlookTextExtactor;
|
||||
import org.apache.poi.hssf.extractor.EventBasedExcelExtractor;
|
||||
import org.apache.poi.hssf.extractor.ExcelExtractor;
|
||||
import org.apache.poi.hwpf.extractor.WordExtractor;
|
||||
import org.apache.poi.openxml4j.exceptions.InvalidFormatException;
|
||||
@ -63,6 +64,59 @@ public class ExtractorFactory {
|
||||
public static final String CORE_DOCUMENT_REL =
|
||||
"http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument";
|
||||
|
||||
|
||||
/** Should this thread prefer event based over usermodel based extractors? */
|
||||
private static final ThreadLocal<Boolean> threadPreferEventExtractors = new ThreadLocal<Boolean>() {
|
||||
protected Boolean initialValue() { return Boolean.FALSE; }
|
||||
};
|
||||
/** Should all threads prefer event based over usermodel based extractors? */
|
||||
private static Boolean allPreferEventExtractors;
|
||||
|
||||
/**
|
||||
* Should this thread prefer event based over usermodel based extractors?
|
||||
* (usermodel extractors tend to be more accurate, but use more memory)
|
||||
* Default is false.
|
||||
*/
|
||||
public static boolean getThreadPrefersEventExtractors() {
|
||||
return threadPreferEventExtractors.get();
|
||||
}
|
||||
/**
|
||||
* Should all threads prefer event based over usermodel based extractors?
|
||||
* (usermodel extractors tend to be more accurate, but use more memory)
|
||||
* Default is to use the thread level setting, which defaults to false.
|
||||
*/
|
||||
public static Boolean getAllThreadsPreferEventExtractors() {
|
||||
return allPreferEventExtractors;
|
||||
}
|
||||
|
||||
/**
|
||||
* Should this thread prefer event based over usermodel based extractors?
|
||||
* Will only be used if the All Threads setting is null.
|
||||
*/
|
||||
public static void setThreadPrefersEventExtractors(boolean preferEventExtractors) {
|
||||
threadPreferEventExtractors.set(preferEventExtractors);
|
||||
}
|
||||
/**
|
||||
* Should all threads prefer event based over usermodel based extractors?
|
||||
* If set, will take precedence over the Thread level setting.
|
||||
*/
|
||||
public static void setAllThreadsPreferEventExtractors(Boolean preferEventExtractors) {
|
||||
allPreferEventExtractors = preferEventExtractors;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Should this thread use event based extractors if available?
|
||||
* Checks the all-threads one first, then thread specific.
|
||||
*/
|
||||
protected static boolean getPreferEventExtractor() {
|
||||
if(allPreferEventExtractors != null) {
|
||||
return allPreferEventExtractors;
|
||||
}
|
||||
return threadPreferEventExtractors.get();
|
||||
}
|
||||
|
||||
|
||||
public static POITextExtractor createExtractor(File f) throws IOException, InvalidFormatException, OpenXML4JException, XmlException {
|
||||
InputStream inp = new PushbackInputStream(
|
||||
new FileInputStream(f), 8);
|
||||
@ -106,7 +160,12 @@ public class ExtractorFactory {
|
||||
corePart.getContentType().equals(XSSFRelation.MACRO_ADDIN_WORKBOOK.getContentType()) ||
|
||||
corePart.getContentType().equals(XSSFRelation.TEMPLATE_WORKBOOK.getContentType()) ||
|
||||
corePart.getContentType().equals(XSSFRelation.MACROS_WORKBOOK.getContentType())) {
|
||||
if(getPreferEventExtractor()) {
|
||||
// TODO
|
||||
return new XSSFExcelExtractor(pkg);
|
||||
} else {
|
||||
return new XSSFExcelExtractor(pkg);
|
||||
}
|
||||
}
|
||||
|
||||
if(corePart.getContentType().equals(XWPFRelation.DOCUMENT.getContentType()) ||
|
||||
@ -148,8 +207,12 @@ public class ExtractorFactory {
|
||||
Entry entry = entries.next();
|
||||
|
||||
if(entry.getName().equals("Workbook")) {
|
||||
if(getPreferEventExtractor()) {
|
||||
return new EventBasedExcelExtractor(poifsDir, fs);
|
||||
} else {
|
||||
return new ExcelExtractor(poifsDir, fs);
|
||||
}
|
||||
}
|
||||
if(entry.getName().equals("WordDocument")) {
|
||||
return new WordExtractor(poifsDir, fs);
|
||||
}
|
||||
|
@ -27,6 +27,7 @@ import org.apache.poi.hdgf.extractor.VisioTextExtractor;
|
||||
import org.apache.poi.hpbf.extractor.PublisherTextExtractor;
|
||||
import org.apache.poi.hslf.extractor.PowerPointExtractor;
|
||||
import org.apache.poi.hsmf.extractor.OutlookTextExtactor;
|
||||
import org.apache.poi.hssf.extractor.EventBasedExcelExtractor;
|
||||
import org.apache.poi.hssf.extractor.ExcelExtractor;
|
||||
import org.apache.poi.hwpf.extractor.WordExtractor;
|
||||
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
|
||||
@ -391,6 +392,72 @@ public class TestExtractorFactory extends TestCase {
|
||||
}
|
||||
}
|
||||
|
||||
public void testPreferEventBased() throws Exception {
|
||||
assertEquals(false, ExtractorFactory.getPreferEventExtractor());
|
||||
assertEquals(false, ExtractorFactory.getThreadPrefersEventExtractors());
|
||||
assertEquals(null, ExtractorFactory.getAllThreadsPreferEventExtractors());
|
||||
|
||||
ExtractorFactory.setThreadPrefersEventExtractors(true);
|
||||
|
||||
assertEquals(true, ExtractorFactory.getPreferEventExtractor());
|
||||
assertEquals(true, ExtractorFactory.getThreadPrefersEventExtractors());
|
||||
assertEquals(null, ExtractorFactory.getAllThreadsPreferEventExtractors());
|
||||
|
||||
ExtractorFactory.setAllThreadsPreferEventExtractors(false);
|
||||
|
||||
assertEquals(false, ExtractorFactory.getPreferEventExtractor());
|
||||
assertEquals(true, ExtractorFactory.getThreadPrefersEventExtractors());
|
||||
assertEquals(Boolean.FALSE, ExtractorFactory.getAllThreadsPreferEventExtractors());
|
||||
|
||||
ExtractorFactory.setAllThreadsPreferEventExtractors(null);
|
||||
|
||||
assertEquals(true, ExtractorFactory.getPreferEventExtractor());
|
||||
assertEquals(true, ExtractorFactory.getThreadPrefersEventExtractors());
|
||||
assertEquals(null, ExtractorFactory.getAllThreadsPreferEventExtractors());
|
||||
|
||||
|
||||
// Check we get the right extractors now
|
||||
assertTrue(
|
||||
ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(xls)))
|
||||
instanceof EventBasedExcelExtractor
|
||||
);
|
||||
assertTrue(
|
||||
ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(xls))).getText().length() > 200
|
||||
);
|
||||
|
||||
assertTrue(
|
||||
ExtractorFactory.createExtractor(OPCPackage.open(xlsx.toString()))
|
||||
instanceof XSSFExcelExtractor // TODO
|
||||
);
|
||||
assertTrue(
|
||||
ExtractorFactory.createExtractor(OPCPackage.open(xlsx.toString())).getText().length() > 200
|
||||
);
|
||||
|
||||
|
||||
// Put back to normal
|
||||
ExtractorFactory.setThreadPrefersEventExtractors(false);
|
||||
assertEquals(false, ExtractorFactory.getPreferEventExtractor());
|
||||
assertEquals(false, ExtractorFactory.getThreadPrefersEventExtractors());
|
||||
assertEquals(null, ExtractorFactory.getAllThreadsPreferEventExtractors());
|
||||
|
||||
// And back
|
||||
assertTrue(
|
||||
ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(xls)))
|
||||
instanceof ExcelExtractor
|
||||
);
|
||||
assertTrue(
|
||||
ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(xls))).getText().length() > 200
|
||||
);
|
||||
|
||||
assertTrue(
|
||||
ExtractorFactory.createExtractor(OPCPackage.open(xlsx.toString()))
|
||||
instanceof XSSFExcelExtractor
|
||||
);
|
||||
assertTrue(
|
||||
ExtractorFactory.createExtractor(OPCPackage.open(xlsx.toString())).getText().length() > 200
|
||||
);
|
||||
}
|
||||
|
||||
/**
|
||||
* Test embedded docs text extraction. For now, only
|
||||
* does poifs embedded, but will do ooxml ones
|
||||
|
Loading…
Reference in New Issue
Block a user