Start moving logic over into the main and scratchpad jars for OLE2
git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@1752226 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
a5fb04f45f
commit
7179c813e6
@ -20,8 +20,10 @@ import static org.apache.poi.hssf.model.InternalWorkbook.WORKBOOK_DIR_ENTRY_NAME
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.lang.reflect.Method;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Iterator;
|
||||
import java.util.List;
|
||||
|
||||
import org.apache.poi.POIOLE2TextExtractor;
|
||||
import org.apache.poi.POITextExtractor;
|
||||
@ -33,6 +35,8 @@ import org.apache.poi.poifs.filesystem.Entry;
|
||||
import org.apache.poi.poifs.filesystem.NPOIFSFileSystem;
|
||||
import org.apache.poi.poifs.filesystem.OPOIFSFileSystem;
|
||||
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
|
||||
import org.apache.poi.util.POILogFactory;
|
||||
import org.apache.poi.util.POILogger;
|
||||
|
||||
/**
|
||||
* Figures out the correct POIOLE2TextExtractor for your supplied
|
||||
@ -48,6 +52,8 @@ import org.apache.poi.poifs.filesystem.POIFSFileSystem;
|
||||
*/
|
||||
@SuppressWarnings("WeakerAccess")
|
||||
public class OLE2ExtractorFactory {
|
||||
private static final POILogger LOGGER = POILogFactory.getLogger(OLE2ExtractorFactory.class);
|
||||
|
||||
/** Should this thread prefer event based over usermodel based extractors? */
|
||||
private static final ThreadLocal<Boolean> threadPreferEventExtractors = new ThreadLocal<Boolean>() {
|
||||
@Override
|
||||
@ -115,10 +121,37 @@ public class OLE2ExtractorFactory {
|
||||
return (POIOLE2TextExtractor)createExtractor(fs.getRoot());
|
||||
}
|
||||
|
||||
public static POITextExtractor createExtractor(InputStream input) {
|
||||
// TODO Something nasty with reflection...
|
||||
public static POITextExtractor createExtractor(InputStream input) throws IOException {
|
||||
Class<?> cls = getOOXMLClass();
|
||||
if (cls != null) {
|
||||
// TODO Reflection
|
||||
throw new IllegalArgumentException("TODO Reflection");
|
||||
} else {
|
||||
// Best hope it's OLE2....
|
||||
return createExtractor(new NPOIFSFileSystem(input));
|
||||
}
|
||||
}
|
||||
|
||||
private static Class<?> getOOXMLClass() {
|
||||
try {
|
||||
return OLE2ExtractorFactory.class.getClassLoader().loadClass(
|
||||
"org.apache.poi.extractor.ExtractorFactory"
|
||||
);
|
||||
} catch (ClassNotFoundException e) {
|
||||
LOGGER.log(POILogger.WARN, "POI OOXML jar missing");
|
||||
return null;
|
||||
}
|
||||
}
|
||||
private static Class<?> getScratchpadClass() {
|
||||
try {
|
||||
return OLE2ExtractorFactory.class.getClassLoader().loadClass(
|
||||
"org.apache.poi.extractor.OLE2ScrachpadExtractorFactory"
|
||||
);
|
||||
} catch (ClassNotFoundException e) {
|
||||
LOGGER.log(POILogger.ERROR, "POI Scratchpad jar missing");
|
||||
throw new IllegalStateException("POI Scratchpad jar missing, required for ExtractorFactory");
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Create the Extractor, if possible. Generally needs the Scratchpad jar.
|
||||
@ -139,7 +172,15 @@ public class OLE2ExtractorFactory {
|
||||
}
|
||||
}
|
||||
|
||||
// TODO Try to ask the Scratchpad
|
||||
// Ask Scratchpad, or fail trying
|
||||
Class<?> cls = getScratchpadClass();
|
||||
try {
|
||||
Method m = cls.getDeclaredMethod("createExtractor", DirectoryNode.class);
|
||||
POITextExtractor ext = (POITextExtractor)m.invoke(null, poifsDir);
|
||||
if (ext != null) return ext;
|
||||
} catch (Exception e) {
|
||||
throw new IllegalArgumentException("Error creating Scratchpad Extractor", e);
|
||||
}
|
||||
|
||||
throw new IllegalArgumentException("No supported documents found in the OLE2 stream");
|
||||
}
|
||||
@ -155,9 +196,9 @@ public class OLE2ExtractorFactory {
|
||||
throws IOException
|
||||
{
|
||||
// All the embedded directories we spotted
|
||||
ArrayList<Entry> dirs = new ArrayList<Entry>();
|
||||
List<Entry> dirs = new ArrayList<Entry>();
|
||||
// For anything else not directly held in as a POIFS directory
|
||||
ArrayList<InputStream> nonPOIFS = new ArrayList<InputStream>();
|
||||
List<InputStream> nonPOIFS = new ArrayList<InputStream>();
|
||||
|
||||
// Find all the embedded directories
|
||||
DirectoryEntry root = ext.getRoot();
|
||||
@ -175,7 +216,15 @@ public class OLE2ExtractorFactory {
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// TODO Ask scratchpad
|
||||
// Ask Scratchpad, or fail trying
|
||||
Class<?> cls = getScratchpadClass();
|
||||
try {
|
||||
Method m = cls.getDeclaredMethod(
|
||||
"identifyEmbeddedResources", POIOLE2TextExtractor.class, List.class, List.class);
|
||||
m.invoke(null, ext, dirs, nonPOIFS);
|
||||
} catch (Exception e) {
|
||||
throw new IllegalArgumentException("Error checking for Scratchpad embedded resources", e);
|
||||
}
|
||||
}
|
||||
|
||||
// Create the extractors
|
||||
@ -195,10 +244,10 @@ public class OLE2ExtractorFactory {
|
||||
} catch (IllegalArgumentException ie) {
|
||||
// Ignore, just means it didn't contain
|
||||
// a format we support as yet
|
||||
// TODO Should we log this?
|
||||
LOGGER.log(POILogger.WARN, ie);
|
||||
} catch (Exception xe) {
|
||||
// Ignore, invalid format
|
||||
// TODO Should we log this?
|
||||
LOGGER.log(POILogger.WARN, xe);
|
||||
}
|
||||
}
|
||||
return e.toArray(new POITextExtractor[e.size()]);
|
||||
|
@ -78,23 +78,13 @@ public class ExtractorFactory {
|
||||
protected static final String VISIO_DOCUMENT_REL = PackageRelationshipTypes.VISIO_CORE_DOCUMENT;
|
||||
protected static final String STRICT_DOCUMENT_REL = PackageRelationshipTypes.STRICT_CORE_DOCUMENT;
|
||||
|
||||
|
||||
/** Should this thread prefer event based over usermodel based extractors? */
|
||||
private static final ThreadLocal<Boolean> threadPreferEventExtractors = new ThreadLocal<Boolean>() {
|
||||
@Override
|
||||
protected Boolean initialValue() { return Boolean.FALSE; }
|
||||
};
|
||||
|
||||
/** Should all threads prefer event based over usermodel based extractors? */
|
||||
private static Boolean allPreferEventExtractors;
|
||||
|
||||
/**
|
||||
* Should this thread prefer event based over usermodel based extractors?
|
||||
* (usermodel extractors tend to be more accurate, but use more memory)
|
||||
* Default is false.
|
||||
*/
|
||||
public static boolean getThreadPrefersEventExtractors() {
|
||||
return threadPreferEventExtractors.get();
|
||||
return OLE2ExtractorFactory.getThreadPrefersEventExtractors();
|
||||
}
|
||||
|
||||
/**
|
||||
@ -103,7 +93,7 @@ public class ExtractorFactory {
|
||||
* Default is to use the thread level setting, which defaults to false.
|
||||
*/
|
||||
public static Boolean getAllThreadsPreferEventExtractors() {
|
||||
return allPreferEventExtractors;
|
||||
return OLE2ExtractorFactory.getAllThreadsPreferEventExtractors();
|
||||
}
|
||||
|
||||
/**
|
||||
@ -111,7 +101,7 @@ public class ExtractorFactory {
|
||||
* Will only be used if the All Threads setting is null.
|
||||
*/
|
||||
public static void setThreadPrefersEventExtractors(boolean preferEventExtractors) {
|
||||
threadPreferEventExtractors.set(preferEventExtractors);
|
||||
OLE2ExtractorFactory.setThreadPrefersEventExtractors(preferEventExtractors);
|
||||
}
|
||||
|
||||
/**
|
||||
@ -119,7 +109,7 @@ public class ExtractorFactory {
|
||||
* If set, will take preference over the Thread level setting.
|
||||
*/
|
||||
public static void setAllThreadsPreferEventExtractors(Boolean preferEventExtractors) {
|
||||
allPreferEventExtractors = preferEventExtractors;
|
||||
OLE2ExtractorFactory.setAllThreadsPreferEventExtractors(preferEventExtractors);
|
||||
}
|
||||
|
||||
/**
|
||||
@ -127,10 +117,7 @@ public class ExtractorFactory {
|
||||
* Checks the all-threads one first, then thread specific.
|
||||
*/
|
||||
protected static boolean getPreferEventExtractor() {
|
||||
if(allPreferEventExtractors != null) {
|
||||
return allPreferEventExtractors;
|
||||
}
|
||||
return threadPreferEventExtractors.get();
|
||||
return OLE2ExtractorFactory.getPreferEventExtractor();
|
||||
}
|
||||
|
||||
public static POITextExtractor createExtractor(File f) throws IOException, InvalidFormatException, OpenXML4JException, XmlException {
|
||||
@ -281,83 +268,28 @@ public class ExtractorFactory {
|
||||
}
|
||||
|
||||
public static POIOLE2TextExtractor createExtractor(POIFSFileSystem fs) throws IOException, OpenXML4JException, XmlException {
|
||||
// Only ever an OLE2 one from the root of the FS
|
||||
return (POIOLE2TextExtractor)createExtractor(fs.getRoot());
|
||||
return OLE2ExtractorFactory.createExtractor(fs);
|
||||
}
|
||||
public static POIOLE2TextExtractor createExtractor(NPOIFSFileSystem fs) throws IOException, OpenXML4JException, XmlException {
|
||||
// Only ever an OLE2 one from the root of the FS
|
||||
return (POIOLE2TextExtractor)createExtractor(fs.getRoot());
|
||||
return OLE2ExtractorFactory.createExtractor(fs);
|
||||
}
|
||||
public static POIOLE2TextExtractor createExtractor(OPOIFSFileSystem fs) throws IOException, OpenXML4JException, XmlException {
|
||||
// Only ever an OLE2 one from the root of the FS
|
||||
return (POIOLE2TextExtractor)createExtractor(fs.getRoot());
|
||||
return OLE2ExtractorFactory.createExtractor(fs);
|
||||
}
|
||||
|
||||
public static POITextExtractor createExtractor(DirectoryNode poifsDir) throws IOException,
|
||||
OpenXML4JException, XmlException
|
||||
{
|
||||
// Look for certain entries in the stream, to figure it
|
||||
// out from
|
||||
for (String workbookName : WORKBOOK_DIR_ENTRY_NAMES) {
|
||||
if (poifsDir.hasEntry(workbookName)) {
|
||||
if (getPreferEventExtractor()) {
|
||||
return new EventBasedExcelExtractor(poifsDir);
|
||||
}
|
||||
return new ExcelExtractor(poifsDir);
|
||||
}
|
||||
}
|
||||
if (poifsDir.hasEntry(OLD_WORKBOOK_DIR_ENTRY_NAME)) {
|
||||
throw new OldExcelFormatException("Old Excel Spreadsheet format (1-95) "
|
||||
+ "found. Please call OldExcelExtractor directly for basic text extraction");
|
||||
}
|
||||
|
||||
if (poifsDir.hasEntry("WordDocument")) {
|
||||
// Old or new style word document?
|
||||
try {
|
||||
return new WordExtractor(poifsDir);
|
||||
} catch (OldWordFileFormatException e) {
|
||||
return new Word6Extractor(poifsDir);
|
||||
}
|
||||
}
|
||||
|
||||
if (poifsDir.hasEntry("PowerPoint Document")) {
|
||||
return new PowerPointExtractor(poifsDir);
|
||||
}
|
||||
|
||||
if (poifsDir.hasEntry("VisioDocument")) {
|
||||
return new VisioTextExtractor(poifsDir);
|
||||
}
|
||||
|
||||
if (poifsDir.hasEntry("Quill")) {
|
||||
return new PublisherTextExtractor(poifsDir);
|
||||
}
|
||||
|
||||
final String[] outlookEntryNames = new String[] {
|
||||
// message bodies, saved as plain text (PtypString)
|
||||
// The first short (0x1000, 0x0047, 0x0037) refers to the Property ID (see [MS-OXPROPS].pdf)
|
||||
// the second short (0x001e, 0x001f, 0x0102) refers to the type of data stored in this entry
|
||||
// https://msdn.microsoft.com/en-us/library/cc433490(v=exchg.80).aspx
|
||||
// @see org.apache.poi.hsmf.Types.MAPIType
|
||||
"__substg1.0_1000001E", //PidTagBody ASCII
|
||||
"__substg1.0_1000001F", //PidTagBody Unicode
|
||||
"__substg1.0_0047001E", //PidTagMessageSubmissionId ASCII
|
||||
"__substg1.0_0047001F", //PidTagMessageSubmissionId Unicode
|
||||
"__substg1.0_0037001E", //PidTagSubject ASCII
|
||||
"__substg1.0_0037001F", //PidTagSubject Unicode
|
||||
};
|
||||
for (String entryName : outlookEntryNames) {
|
||||
if (poifsDir.hasEntry(entryName)) {
|
||||
return new OutlookTextExtactor(poifsDir);
|
||||
}
|
||||
}
|
||||
|
||||
// First, check for OOXML
|
||||
for (String entryName : poifsDir.getEntryNames()) {
|
||||
if (entryName.equals("Package")) {
|
||||
OPCPackage pkg = OPCPackage.open(poifsDir.createDocumentInputStream("Package"));
|
||||
return createExtractor(pkg);
|
||||
}
|
||||
}
|
||||
throw new IllegalArgumentException("No supported documents found in the OLE2 stream");
|
||||
|
||||
// If not, ask the OLE2 code to check, with Scratchpad if possible
|
||||
return OLE2ExtractorFactory.createExtractor(poifsDir);
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -150,6 +150,7 @@ public class TestExtractorFactory {
|
||||
|
||||
POITextExtractor extractor = ExtractorFactory.createExtractor(xlsx);
|
||||
assertTrue(
|
||||
extractor.getClass().getName(),
|
||||
extractor
|
||||
instanceof XSSFExcelExtractor
|
||||
);
|
||||
@ -163,6 +164,7 @@ public class TestExtractorFactory {
|
||||
|
||||
extractor = ExtractorFactory.createExtractor(xltx);
|
||||
assertTrue(
|
||||
extractor.getClass().getName(),
|
||||
extractor
|
||||
instanceof XSSFExcelExtractor
|
||||
);
|
||||
@ -340,6 +342,7 @@ public class TestExtractorFactory {
|
||||
|
||||
extractor = ExtractorFactory.createExtractor(new FileInputStream(xlsx));
|
||||
assertTrue(
|
||||
extractor.getClass().getName(),
|
||||
extractor
|
||||
instanceof XSSFExcelExtractor
|
||||
);
|
||||
@ -359,6 +362,7 @@ public class TestExtractorFactory {
|
||||
// Word
|
||||
extractor = ExtractorFactory.createExtractor(new FileInputStream(doc));
|
||||
assertTrue(
|
||||
extractor.getClass().getName(),
|
||||
extractor
|
||||
instanceof WordExtractor
|
||||
);
|
||||
@ -369,6 +373,7 @@ public class TestExtractorFactory {
|
||||
|
||||
extractor = ExtractorFactory.createExtractor(new FileInputStream(doc6));
|
||||
assertTrue(
|
||||
extractor.getClass().getName(),
|
||||
extractor
|
||||
instanceof Word6Extractor
|
||||
);
|
||||
@ -379,6 +384,7 @@ public class TestExtractorFactory {
|
||||
|
||||
extractor = ExtractorFactory.createExtractor(new FileInputStream(doc95));
|
||||
assertTrue(
|
||||
extractor.getClass().getName(),
|
||||
extractor
|
||||
instanceof Word6Extractor
|
||||
);
|
||||
|
@ -20,8 +20,8 @@ import java.io.ByteArrayInputStream;
|
||||
import java.io.FileNotFoundException;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Iterator;
|
||||
import java.util.List;
|
||||
|
||||
import org.apache.poi.POIOLE2TextExtractor;
|
||||
import org.apache.poi.POITextExtractor;
|
||||
@ -108,7 +108,7 @@ public class OLE2ScrachpadExtractorFactory {
|
||||
* empty array. Otherwise, you'll get one open
|
||||
* {@link POITextExtractor} for each embedded file.
|
||||
*/
|
||||
public static void identifyEmbeddedResources(POIOLE2TextExtractor ext, ArrayList<Entry> dirs, ArrayList<InputStream> nonPOIFS) throws IOException {
|
||||
public static void identifyEmbeddedResources(POIOLE2TextExtractor ext, List<Entry> dirs, List<InputStream> nonPOIFS) throws IOException {
|
||||
// Find all the embedded directories
|
||||
DirectoryEntry root = ext.getRoot();
|
||||
if(root == null) {
|
||||
|
Loading…
Reference in New Issue
Block a user