Most of the support suggested by Phil Varner on the list - ExtractorFactory can now be told to prefer Event Based extractors (currently Excel only) on a per-thread or overall basis
git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@902927 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
7854649016
commit
6e2c32e1c5
@ -34,6 +34,7 @@
|
|||||||
|
|
||||||
<changes>
|
<changes>
|
||||||
<release version="3.7-SNAPSHOT" date="2010-??-??">
|
<release version="3.7-SNAPSHOT" date="2010-??-??">
|
||||||
|
<action dev="POI-DEVELOPERS" type="add">ExtractorFactory can now be told to prefer Event Based extractors (currently Excel only) on a per-thread or overall basis</action>
|
||||||
<action dev="POI-DEVELOPERS" type="fix">48544 - avoid failures in XLSX2CSV when shared string table is missing</action>
|
<action dev="POI-DEVELOPERS" type="fix">48544 - avoid failures in XLSX2CSV when shared string table is missing</action>
|
||||||
<action dev="POI-DEVELOPERS" type="fix">48571 - properly close all IO streams created in OPCPackage</action>
|
<action dev="POI-DEVELOPERS" type="fix">48571 - properly close all IO streams created in OPCPackage</action>
|
||||||
<action dev="POI-DEVELOPERS" type="fix">48572 - always copy all declared inner classes and interfaces when generating poi-ooxml-schemas</action>
|
<action dev="POI-DEVELOPERS" type="fix">48572 - always copy all declared inner classes and interfaces when generating poi-ooxml-schemas</action>
|
||||||
|
@ -22,6 +22,7 @@ import java.io.IOException;
|
|||||||
|
|
||||||
import org.apache.poi.hssf.eventusermodel.HSSFUserException;
|
import org.apache.poi.hssf.eventusermodel.HSSFUserException;
|
||||||
import org.apache.poi.hssf.record.*;
|
import org.apache.poi.hssf.record.*;
|
||||||
|
import org.apache.poi.poifs.filesystem.DirectoryNode;
|
||||||
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
|
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@ -51,11 +52,33 @@ public class HSSFEventFactory {
|
|||||||
* @param fs a POIFS filesystem containing your workbook
|
* @param fs a POIFS filesystem containing your workbook
|
||||||
*/
|
*/
|
||||||
public void processWorkbookEvents(HSSFRequest req, POIFSFileSystem fs) throws IOException {
|
public void processWorkbookEvents(HSSFRequest req, POIFSFileSystem fs) throws IOException {
|
||||||
InputStream in = fs.createDocumentInputStream("Workbook");
|
processWorkbookEvents(req, fs.getRoot());
|
||||||
|
|
||||||
processEvents(req, in);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Processes a file into essentially record events.
|
||||||
|
*
|
||||||
|
* @param req an Instance of HSSFRequest which has your registered listeners
|
||||||
|
* @param dir a POIFS directory node containing your workbook
|
||||||
|
*/
|
||||||
|
public void processWorkbookEvents(HSSFRequest req, DirectoryNode dir) throws IOException {
|
||||||
|
InputStream in = dir.createDocumentInputStream("Workbook");
|
||||||
|
|
||||||
|
processEvents(req, in);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Processes a file into essentially record events.
|
||||||
|
*
|
||||||
|
* @param req an Instance of HSSFRequest which has your registered listeners
|
||||||
|
* @param fs a POIFS filesystem containing your workbook
|
||||||
|
* @return numeric user-specified result code.
|
||||||
|
*/
|
||||||
|
public short abortableProcessWorkbookEvents(HSSFRequest req, POIFSFileSystem fs)
|
||||||
|
throws IOException, HSSFUserException {
|
||||||
|
return abortableProcessWorkbookEvents(req, fs.getRoot());
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Processes a file into essentially record events.
|
* Processes a file into essentially record events.
|
||||||
*
|
*
|
||||||
@ -63,9 +86,9 @@ public class HSSFEventFactory {
|
|||||||
* @param fs a POIFS filesystem containing your workbook
|
* @param fs a POIFS filesystem containing your workbook
|
||||||
* @return numeric user-specified result code.
|
* @return numeric user-specified result code.
|
||||||
*/
|
*/
|
||||||
public short abortableProcessWorkbookEvents(HSSFRequest req, POIFSFileSystem fs)
|
public short abortableProcessWorkbookEvents(HSSFRequest req, DirectoryNode dir)
|
||||||
throws IOException, HSSFUserException {
|
throws IOException, HSSFUserException {
|
||||||
InputStream in = fs.createDocumentInputStream("Workbook");
|
InputStream in = dir.createDocumentInputStream("Workbook");
|
||||||
return abortableProcessEvents(req, in);
|
return abortableProcessEvents(req, in);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -46,6 +46,7 @@ import org.apache.poi.hssf.record.SSTRecord;
|
|||||||
import org.apache.poi.hssf.record.StringRecord;
|
import org.apache.poi.hssf.record.StringRecord;
|
||||||
import org.apache.poi.hssf.usermodel.HSSFDateUtil;
|
import org.apache.poi.hssf.usermodel.HSSFDateUtil;
|
||||||
import org.apache.poi.hssf.usermodel.HSSFWorkbook;
|
import org.apache.poi.hssf.usermodel.HSSFWorkbook;
|
||||||
|
import org.apache.poi.poifs.filesystem.DirectoryNode;
|
||||||
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
|
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@ -65,15 +66,28 @@ import org.apache.poi.poifs.filesystem.POIFSFileSystem;
|
|||||||
* http://svn.apache.org/repos/asf/poi/trunk/src/examples/src/org/apache/poi/hssf/eventusermodel/examples/XLS2CSVmra.java</link>
|
* http://svn.apache.org/repos/asf/poi/trunk/src/examples/src/org/apache/poi/hssf/eventusermodel/examples/XLS2CSVmra.java</link>
|
||||||
*/
|
*/
|
||||||
public class EventBasedExcelExtractor extends POIOLE2TextExtractor {
|
public class EventBasedExcelExtractor extends POIOLE2TextExtractor {
|
||||||
|
private DirectoryNode _dir;
|
||||||
private POIFSFileSystem _fs;
|
private POIFSFileSystem _fs;
|
||||||
boolean _includeSheetNames = true;
|
boolean _includeSheetNames = true;
|
||||||
boolean _formulasNotResults = false;
|
boolean _formulasNotResults = false;
|
||||||
|
|
||||||
public EventBasedExcelExtractor(POIFSFileSystem fs) {
|
public EventBasedExcelExtractor(DirectoryNode dir, POIFSFileSystem fs) {
|
||||||
super(null);
|
super(null);
|
||||||
|
_dir = dir;
|
||||||
_fs = fs;
|
_fs = fs;
|
||||||
}
|
}
|
||||||
|
public EventBasedExcelExtractor(POIFSFileSystem fs) {
|
||||||
|
this(fs.getRoot(), fs);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Return the underlying POIFS FileSystem of
|
||||||
|
* this document.
|
||||||
|
*/
|
||||||
|
public POIFSFileSystem getFileSystem() {
|
||||||
|
return _fs;
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Would return the document information metadata for the document,
|
* Would return the document information metadata for the document,
|
||||||
* if we supported it
|
* if we supported it
|
||||||
@ -134,7 +148,7 @@ public class EventBasedExcelExtractor extends POIOLE2TextExtractor {
|
|||||||
HSSFRequest request = new HSSFRequest();
|
HSSFRequest request = new HSSFRequest();
|
||||||
request.addListenerForAllRecords(ft);
|
request.addListenerForAllRecords(ft);
|
||||||
|
|
||||||
factory.processWorkbookEvents(request, _fs);
|
factory.processWorkbookEvents(request, _dir);
|
||||||
|
|
||||||
return tl;
|
return tl;
|
||||||
}
|
}
|
||||||
|
@ -36,6 +36,7 @@ import org.apache.poi.hslf.extractor.PowerPointExtractor;
|
|||||||
import org.apache.poi.hsmf.MAPIMessage;
|
import org.apache.poi.hsmf.MAPIMessage;
|
||||||
import org.apache.poi.hsmf.datatypes.AttachmentChunks;
|
import org.apache.poi.hsmf.datatypes.AttachmentChunks;
|
||||||
import org.apache.poi.hsmf.extractor.OutlookTextExtactor;
|
import org.apache.poi.hsmf.extractor.OutlookTextExtactor;
|
||||||
|
import org.apache.poi.hssf.extractor.EventBasedExcelExtractor;
|
||||||
import org.apache.poi.hssf.extractor.ExcelExtractor;
|
import org.apache.poi.hssf.extractor.ExcelExtractor;
|
||||||
import org.apache.poi.hwpf.extractor.WordExtractor;
|
import org.apache.poi.hwpf.extractor.WordExtractor;
|
||||||
import org.apache.poi.openxml4j.exceptions.InvalidFormatException;
|
import org.apache.poi.openxml4j.exceptions.InvalidFormatException;
|
||||||
@ -63,6 +64,59 @@ public class ExtractorFactory {
|
|||||||
public static final String CORE_DOCUMENT_REL =
|
public static final String CORE_DOCUMENT_REL =
|
||||||
"http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument";
|
"http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument";
|
||||||
|
|
||||||
|
|
||||||
|
/** Should this thread prefer event based over usermodel based extractors? */
|
||||||
|
private static final ThreadLocal<Boolean> threadPreferEventExtractors = new ThreadLocal<Boolean>() {
|
||||||
|
protected Boolean initialValue() { return Boolean.FALSE; }
|
||||||
|
};
|
||||||
|
/** Should all threads prefer event based over usermodel based extractors? */
|
||||||
|
private static Boolean allPreferEventExtractors;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Should this thread prefer event based over usermodel based extractors?
|
||||||
|
* (usermodel extractors tend to be more accurate, but use more memory)
|
||||||
|
* Default is false.
|
||||||
|
*/
|
||||||
|
public static boolean getThreadPrefersEventExtractors() {
|
||||||
|
return threadPreferEventExtractors.get();
|
||||||
|
}
|
||||||
|
/**
|
||||||
|
* Should all threads prefer event based over usermodel based extractors?
|
||||||
|
* (usermodel extractors tend to be more accurate, but use more memory)
|
||||||
|
* Default is to use the thread level setting, which defaults to false.
|
||||||
|
*/
|
||||||
|
public static Boolean getAllThreadsPreferEventExtractors() {
|
||||||
|
return allPreferEventExtractors;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Should this thread prefer event based over usermodel based extractors?
|
||||||
|
* Will only be used if the All Threads setting is null.
|
||||||
|
*/
|
||||||
|
public static void setThreadPrefersEventExtractors(boolean preferEventExtractors) {
|
||||||
|
threadPreferEventExtractors.set(preferEventExtractors);
|
||||||
|
}
|
||||||
|
/**
|
||||||
|
* Should all threads prefer event based over usermodel based extractors?
|
||||||
|
* If set, will take preference over the Thread level setting.
|
||||||
|
*/
|
||||||
|
public static void setAllThreadsPreferEventExtractors(Boolean preferEventExtractors) {
|
||||||
|
allPreferEventExtractors = preferEventExtractors;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Should this thread use event based extractors if available?
|
||||||
|
* Checks the all-threads one first, then thread specific.
|
||||||
|
*/
|
||||||
|
protected static boolean getPreferEventExtractor() {
|
||||||
|
if(allPreferEventExtractors != null) {
|
||||||
|
return allPreferEventExtractors;
|
||||||
|
}
|
||||||
|
return threadPreferEventExtractors.get();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
public static POITextExtractor createExtractor(File f) throws IOException, InvalidFormatException, OpenXML4JException, XmlException {
|
public static POITextExtractor createExtractor(File f) throws IOException, InvalidFormatException, OpenXML4JException, XmlException {
|
||||||
InputStream inp = new PushbackInputStream(
|
InputStream inp = new PushbackInputStream(
|
||||||
new FileInputStream(f), 8);
|
new FileInputStream(f), 8);
|
||||||
@ -106,7 +160,12 @@ public class ExtractorFactory {
|
|||||||
corePart.getContentType().equals(XSSFRelation.MACRO_ADDIN_WORKBOOK.getContentType()) ||
|
corePart.getContentType().equals(XSSFRelation.MACRO_ADDIN_WORKBOOK.getContentType()) ||
|
||||||
corePart.getContentType().equals(XSSFRelation.TEMPLATE_WORKBOOK.getContentType()) ||
|
corePart.getContentType().equals(XSSFRelation.TEMPLATE_WORKBOOK.getContentType()) ||
|
||||||
corePart.getContentType().equals(XSSFRelation.MACROS_WORKBOOK.getContentType())) {
|
corePart.getContentType().equals(XSSFRelation.MACROS_WORKBOOK.getContentType())) {
|
||||||
return new XSSFExcelExtractor(pkg);
|
if(getPreferEventExtractor()) {
|
||||||
|
// TODO
|
||||||
|
return new XSSFExcelExtractor(pkg);
|
||||||
|
} else {
|
||||||
|
return new XSSFExcelExtractor(pkg);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if(corePart.getContentType().equals(XWPFRelation.DOCUMENT.getContentType()) ||
|
if(corePart.getContentType().equals(XWPFRelation.DOCUMENT.getContentType()) ||
|
||||||
@ -148,7 +207,11 @@ public class ExtractorFactory {
|
|||||||
Entry entry = entries.next();
|
Entry entry = entries.next();
|
||||||
|
|
||||||
if(entry.getName().equals("Workbook")) {
|
if(entry.getName().equals("Workbook")) {
|
||||||
return new ExcelExtractor(poifsDir, fs);
|
if(getPreferEventExtractor()) {
|
||||||
|
return new EventBasedExcelExtractor(poifsDir, fs);
|
||||||
|
} else {
|
||||||
|
return new ExcelExtractor(poifsDir, fs);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
if(entry.getName().equals("WordDocument")) {
|
if(entry.getName().equals("WordDocument")) {
|
||||||
return new WordExtractor(poifsDir, fs);
|
return new WordExtractor(poifsDir, fs);
|
||||||
|
@ -27,6 +27,7 @@ import org.apache.poi.hdgf.extractor.VisioTextExtractor;
|
|||||||
import org.apache.poi.hpbf.extractor.PublisherTextExtractor;
|
import org.apache.poi.hpbf.extractor.PublisherTextExtractor;
|
||||||
import org.apache.poi.hslf.extractor.PowerPointExtractor;
|
import org.apache.poi.hslf.extractor.PowerPointExtractor;
|
||||||
import org.apache.poi.hsmf.extractor.OutlookTextExtactor;
|
import org.apache.poi.hsmf.extractor.OutlookTextExtactor;
|
||||||
|
import org.apache.poi.hssf.extractor.EventBasedExcelExtractor;
|
||||||
import org.apache.poi.hssf.extractor.ExcelExtractor;
|
import org.apache.poi.hssf.extractor.ExcelExtractor;
|
||||||
import org.apache.poi.hwpf.extractor.WordExtractor;
|
import org.apache.poi.hwpf.extractor.WordExtractor;
|
||||||
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
|
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
|
||||||
@ -390,6 +391,72 @@ public class TestExtractorFactory extends TestCase {
|
|||||||
// Good
|
// Good
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public void testPreferEventBased() throws Exception {
|
||||||
|
assertEquals(false, ExtractorFactory.getPreferEventExtractor());
|
||||||
|
assertEquals(false, ExtractorFactory.getThreadPrefersEventExtractors());
|
||||||
|
assertEquals(null, ExtractorFactory.getAllThreadsPreferEventExtractors());
|
||||||
|
|
||||||
|
ExtractorFactory.setThreadPrefersEventExtractors(true);
|
||||||
|
|
||||||
|
assertEquals(true, ExtractorFactory.getPreferEventExtractor());
|
||||||
|
assertEquals(true, ExtractorFactory.getThreadPrefersEventExtractors());
|
||||||
|
assertEquals(null, ExtractorFactory.getAllThreadsPreferEventExtractors());
|
||||||
|
|
||||||
|
ExtractorFactory.setAllThreadsPreferEventExtractors(false);
|
||||||
|
|
||||||
|
assertEquals(false, ExtractorFactory.getPreferEventExtractor());
|
||||||
|
assertEquals(true, ExtractorFactory.getThreadPrefersEventExtractors());
|
||||||
|
assertEquals(Boolean.FALSE, ExtractorFactory.getAllThreadsPreferEventExtractors());
|
||||||
|
|
||||||
|
ExtractorFactory.setAllThreadsPreferEventExtractors(null);
|
||||||
|
|
||||||
|
assertEquals(true, ExtractorFactory.getPreferEventExtractor());
|
||||||
|
assertEquals(true, ExtractorFactory.getThreadPrefersEventExtractors());
|
||||||
|
assertEquals(null, ExtractorFactory.getAllThreadsPreferEventExtractors());
|
||||||
|
|
||||||
|
|
||||||
|
// Check we get the right extractors now
|
||||||
|
assertTrue(
|
||||||
|
ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(xls)))
|
||||||
|
instanceof EventBasedExcelExtractor
|
||||||
|
);
|
||||||
|
assertTrue(
|
||||||
|
ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(xls))).getText().length() > 200
|
||||||
|
);
|
||||||
|
|
||||||
|
assertTrue(
|
||||||
|
ExtractorFactory.createExtractor(OPCPackage.open(xlsx.toString()))
|
||||||
|
instanceof XSSFExcelExtractor // TODO
|
||||||
|
);
|
||||||
|
assertTrue(
|
||||||
|
ExtractorFactory.createExtractor(OPCPackage.open(xlsx.toString())).getText().length() > 200
|
||||||
|
);
|
||||||
|
|
||||||
|
|
||||||
|
// Put back to normal
|
||||||
|
ExtractorFactory.setThreadPrefersEventExtractors(false);
|
||||||
|
assertEquals(false, ExtractorFactory.getPreferEventExtractor());
|
||||||
|
assertEquals(false, ExtractorFactory.getThreadPrefersEventExtractors());
|
||||||
|
assertEquals(null, ExtractorFactory.getAllThreadsPreferEventExtractors());
|
||||||
|
|
||||||
|
// And back
|
||||||
|
assertTrue(
|
||||||
|
ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(xls)))
|
||||||
|
instanceof ExcelExtractor
|
||||||
|
);
|
||||||
|
assertTrue(
|
||||||
|
ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(xls))).getText().length() > 200
|
||||||
|
);
|
||||||
|
|
||||||
|
assertTrue(
|
||||||
|
ExtractorFactory.createExtractor(OPCPackage.open(xlsx.toString()))
|
||||||
|
instanceof XSSFExcelExtractor
|
||||||
|
);
|
||||||
|
assertTrue(
|
||||||
|
ExtractorFactory.createExtractor(OPCPackage.open(xlsx.toString())).getText().length() > 200
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Test embeded docs text extraction. For now, only
|
* Test embeded docs text extraction. For now, only
|
||||||
|
Loading…
Reference in New Issue
Block a user