Start moving logic over into the main and scratchpad jars for OLE2

git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@1752226 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Nick Burch 2016-07-11 22:47:02 +00:00
parent a5fb04f45f
commit 7179c813e6
4 changed files with 79 additions and 92 deletions

View File

@ -20,8 +20,10 @@ import static org.apache.poi.hssf.model.InternalWorkbook.WORKBOOK_DIR_ENTRY_NAME
import java.io.IOException;
import java.io.InputStream;
import java.lang.reflect.Method;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import org.apache.poi.POIOLE2TextExtractor;
import org.apache.poi.POITextExtractor;
@ -33,6 +35,8 @@ import org.apache.poi.poifs.filesystem.Entry;
import org.apache.poi.poifs.filesystem.NPOIFSFileSystem;
import org.apache.poi.poifs.filesystem.OPOIFSFileSystem;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
import org.apache.poi.util.POILogFactory;
import org.apache.poi.util.POILogger;
/**
* Figures out the correct POIOLE2TextExtractor for your supplied
@ -48,6 +52,8 @@ import org.apache.poi.poifs.filesystem.POIFSFileSystem;
*/
@SuppressWarnings("WeakerAccess")
public class OLE2ExtractorFactory {
private static final POILogger LOGGER = POILogFactory.getLogger(OLE2ExtractorFactory.class);
/** Should this thread prefer event based over usermodel based extractors? */
private static final ThreadLocal<Boolean> threadPreferEventExtractors = new ThreadLocal<Boolean>() {
@Override
@ -115,11 +121,38 @@ public class OLE2ExtractorFactory {
return (POIOLE2TextExtractor)createExtractor(fs.getRoot());
}
public static POITextExtractor createExtractor(InputStream input) {
// TODO Something nasty with reflection...
return null;
/**
 * Creates a text extractor for the document held in the given stream.
 *
 * <p>If the OOXML jar is on the classpath, the work is handed to its
 * <code>ExtractorFactory</code> via reflection. Otherwise the stream is
 * assumed to be OLE2 and is opened as an {@link NPOIFSFileSystem}.</p>
 *
 * @param input the stream to read the document from
 * @return the extractor produced for the stream
 * @throws IOException if reading the stream fails
 * @throws IllegalArgumentException if the OOXML factory could not be invoked,
 *         or it failed for a reason other than an {@link IOException}
 */
public static POITextExtractor createExtractor(InputStream input) throws IOException {
    Class<?> cls = getOOXMLClass();
    if (cls == null) {
        // Best hope it's OLE2....
        return createExtractor(new NPOIFSFileSystem(input));
    }

    // NOTE(review): assumes the OOXML ExtractorFactory declares a static
    // createExtractor(InputStream) overload - confirm against that class.
    try {
        Method m = cls.getDeclaredMethod("createExtractor", InputStream.class);
        return (POITextExtractor) m.invoke(null, input);
    } catch (Exception e) {
        // A reflective call wraps the real failure; surface I/O problems
        // as themselves so this method's declared contract still holds.
        if (e.getCause() instanceof IOException) {
            throw (IOException) e.getCause();
        }
        throw new IllegalArgumentException("Error creating Extractor for InputStream", e);
    }
}
/**
 * Looks up the OOXML-aware ExtractorFactory by name, so that this class
 * carries no compile-time reference to it.
 *
 * @return the factory class, or <code>null</code> (after logging a warning)
 *         when it cannot be found by this class's class loader
 */
private static Class<?> getOOXMLClass() {
    final String ooxmlFactoryName = "org.apache.poi.extractor.ExtractorFactory";
    Class<?> found = null;
    try {
        found = OLE2ExtractorFactory.class.getClassLoader().loadClass(ooxmlFactoryName);
    } catch (ClassNotFoundException notOnClasspath) {
        // The class is optional: report it and let the caller handle null
        LOGGER.log(POILogger.WARN, "POI OOXML jar missing");
    }
    return found;
}
/**
 * Looks up the Scratchpad extractor factory by name, so that this class
 * carries no compile-time reference to it.
 *
 * @return the Scratchpad factory class, never <code>null</code>
 * @throws IllegalStateException if the class cannot be found; the original
 *         {@link ClassNotFoundException} is attached as the cause
 */
private static Class<?> getScratchpadClass() {
    try {
        return OLE2ExtractorFactory.class.getClassLoader().loadClass(
                "org.apache.poi.extractor.OLE2ScrachpadExtractorFactory"
        );
    } catch (ClassNotFoundException e) {
        LOGGER.log(POILogger.ERROR, "POI Scratchpad jar missing");
        // Keep the lookup failure as the cause so the stack trace shows
        // which class name could not be resolved.
        throw new IllegalStateException("POI Scratchpad jar missing, required for ExtractorFactory", e);
    }
}
/**
* Create the Extractor, if possible. Generally needs the Scratchpad jar.
* Note that this won't check for embedded OOXML resources either, use
@ -138,8 +171,16 @@ public class OLE2ExtractorFactory {
return new ExcelExtractor(poifsDir);
}
}
// TODO Try to ask the Scratchpad
// Ask Scratchpad, or fail trying
Class<?> cls = getScratchpadClass();
try {
Method m = cls.getDeclaredMethod("createExtractor", DirectoryNode.class);
POITextExtractor ext = (POITextExtractor)m.invoke(null, poifsDir);
if (ext != null) return ext;
} catch (Exception e) {
throw new IllegalArgumentException("Error creating Scratchpad Extractor", e);
}
throw new IllegalArgumentException("No supported documents found in the OLE2 stream");
}
@ -155,9 +196,9 @@ public class OLE2ExtractorFactory {
throws IOException
{
// All the embedded directories we spotted
ArrayList<Entry> dirs = new ArrayList<Entry>();
List<Entry> dirs = new ArrayList<Entry>();
// For anything else not directly held in as a POIFS directory
ArrayList<InputStream> nonPOIFS = new ArrayList<InputStream>();
List<InputStream> nonPOIFS = new ArrayList<InputStream>();
// Find all the embedded directories
DirectoryEntry root = ext.getRoot();
@ -175,7 +216,15 @@ public class OLE2ExtractorFactory {
}
}
} else {
// TODO Ask scratchpad
// Ask Scratchpad, or fail trying
Class<?> cls = getScratchpadClass();
try {
Method m = cls.getDeclaredMethod(
"identifyEmbeddedResources", POIOLE2TextExtractor.class, List.class, List.class);
m.invoke(null, ext, dirs, nonPOIFS);
} catch (Exception e) {
throw new IllegalArgumentException("Error checking for Scratchpad embedded resources", e);
}
}
// Create the extractors
@ -195,10 +244,10 @@ public class OLE2ExtractorFactory {
} catch (IllegalArgumentException ie) {
// Ignore, just means it didn't contain
// a format we support as yet
// TODO Should we log this?
LOGGER.log(POILogger.WARN, ie);
} catch (Exception xe) {
// Ignore, invalid format
// TODO Should we log this?
LOGGER.log(POILogger.WARN, xe);
}
}
return e.toArray(new POITextExtractor[e.size()]);

View File

@ -78,23 +78,13 @@ public class ExtractorFactory {
protected static final String VISIO_DOCUMENT_REL = PackageRelationshipTypes.VISIO_CORE_DOCUMENT;
protected static final String STRICT_DOCUMENT_REL = PackageRelationshipTypes.STRICT_CORE_DOCUMENT;
/** Should this thread prefer event based over usermodel based extractors? */
private static final ThreadLocal<Boolean> threadPreferEventExtractors = new ThreadLocal<Boolean>() {
@Override
protected Boolean initialValue() { return Boolean.FALSE; }
};
/** Should all threads prefer event based over usermodel based extractors? */
private static Boolean allPreferEventExtractors;
/**
* Should this thread prefer event based over usermodel based extractors?
* (usermodel extractors tend to be more accurate, but use more memory)
* Default is false.
*/
public static boolean getThreadPrefersEventExtractors() {
return threadPreferEventExtractors.get();
return OLE2ExtractorFactory.getThreadPrefersEventExtractors();
}
/**
@ -103,7 +93,7 @@ public class ExtractorFactory {
* Default is to use the thread level setting, which defaults to false.
*/
public static Boolean getAllThreadsPreferEventExtractors() {
return allPreferEventExtractors;
return OLE2ExtractorFactory.getAllThreadsPreferEventExtractors();
}
/**
@ -111,7 +101,7 @@ public class ExtractorFactory {
* Will only be used if the All Threads setting is null.
*/
public static void setThreadPrefersEventExtractors(boolean preferEventExtractors) {
threadPreferEventExtractors.set(preferEventExtractors);
OLE2ExtractorFactory.setThreadPrefersEventExtractors(preferEventExtractors);
}
/**
@ -119,7 +109,7 @@ public class ExtractorFactory {
* If set, will take preference over the Thread level setting.
*/
public static void setAllThreadsPreferEventExtractors(Boolean preferEventExtractors) {
allPreferEventExtractors = preferEventExtractors;
OLE2ExtractorFactory.setAllThreadsPreferEventExtractors(preferEventExtractors);
}
/**
@ -127,10 +117,7 @@ public class ExtractorFactory {
* Checks the all-threads one first, then thread specific.
*/
protected static boolean getPreferEventExtractor() {
if(allPreferEventExtractors != null) {
return allPreferEventExtractors;
}
return threadPreferEventExtractors.get();
return OLE2ExtractorFactory.getPreferEventExtractor();
}
public static POITextExtractor createExtractor(File f) throws IOException, InvalidFormatException, OpenXML4JException, XmlException {
@ -281,83 +268,28 @@ public class ExtractorFactory {
}
public static POIOLE2TextExtractor createExtractor(POIFSFileSystem fs) throws IOException, OpenXML4JException, XmlException {
// Only ever an OLE2 one from the root of the FS
return (POIOLE2TextExtractor)createExtractor(fs.getRoot());
return OLE2ExtractorFactory.createExtractor(fs);
}
public static POIOLE2TextExtractor createExtractor(NPOIFSFileSystem fs) throws IOException, OpenXML4JException, XmlException {
// Only ever an OLE2 one from the root of the FS
return (POIOLE2TextExtractor)createExtractor(fs.getRoot());
return OLE2ExtractorFactory.createExtractor(fs);
}
public static POIOLE2TextExtractor createExtractor(OPOIFSFileSystem fs) throws IOException, OpenXML4JException, XmlException {
// Only ever an OLE2 one from the root of the FS
return (POIOLE2TextExtractor)createExtractor(fs.getRoot());
return OLE2ExtractorFactory.createExtractor(fs);
}
public static POITextExtractor createExtractor(DirectoryNode poifsDir) throws IOException,
OpenXML4JException, XmlException
{
// Look for certain entries in the stream, to figure it
// out from
for (String workbookName : WORKBOOK_DIR_ENTRY_NAMES) {
if (poifsDir.hasEntry(workbookName)) {
if (getPreferEventExtractor()) {
return new EventBasedExcelExtractor(poifsDir);
}
return new ExcelExtractor(poifsDir);
}
}
if (poifsDir.hasEntry(OLD_WORKBOOK_DIR_ENTRY_NAME)) {
throw new OldExcelFormatException("Old Excel Spreadsheet format (1-95) "
+ "found. Please call OldExcelExtractor directly for basic text extraction");
}
if (poifsDir.hasEntry("WordDocument")) {
// Old or new style word document?
try {
return new WordExtractor(poifsDir);
} catch (OldWordFileFormatException e) {
return new Word6Extractor(poifsDir);
}
}
if (poifsDir.hasEntry("PowerPoint Document")) {
return new PowerPointExtractor(poifsDir);
}
if (poifsDir.hasEntry("VisioDocument")) {
return new VisioTextExtractor(poifsDir);
}
if (poifsDir.hasEntry("Quill")) {
return new PublisherTextExtractor(poifsDir);
}
final String[] outlookEntryNames = new String[] {
// message bodies, saved as plain text (PtypString)
// The first short (0x1000, 0x0047, 0x0037) refer to the Property ID (see [MS-OXPROPS].pdf)
// the second short (0x001e, 0x001f, 0x0102) refer to the type of data stored in this entry
// https://msdn.microsoft.com/en-us/library/cc433490(v=exchg.80).aspx
// @see org.apache.poi.hsmf.Types.MAPIType
"__substg1.0_1000001E", //PidTagBody ASCII
"__substg1.0_1000001F", //PidTagBody Unicode
"__substg1.0_0047001E", //PidTagMessageSubmissionId ASCII
"__substg1.0_0047001F", //PidTagMessageSubmissionId Unicode
"__substg1.0_0037001E", //PidTagSubject ASCII
"__substg1.0_0037001F", //PidTagSubject Unicode
};
for (String entryName : outlookEntryNames) {
if (poifsDir.hasEntry(entryName)) {
return new OutlookTextExtactor(poifsDir);
}
}
// First, check for OOXML
for (String entryName : poifsDir.getEntryNames()) {
if (entryName.equals("Package")) {
OPCPackage pkg = OPCPackage.open(poifsDir.createDocumentInputStream("Package"));
return createExtractor(pkg);
}
}
throw new IllegalArgumentException("No supported documents found in the OLE2 stream");
// If not, ask the OLE2 code to check, with Scratchpad if possible
return OLE2ExtractorFactory.createExtractor(poifsDir);
}
/**

View File

@ -150,6 +150,7 @@ public class TestExtractorFactory {
POITextExtractor extractor = ExtractorFactory.createExtractor(xlsx);
assertTrue(
extractor.getClass().getName(),
extractor
instanceof XSSFExcelExtractor
);
@ -163,6 +164,7 @@ public class TestExtractorFactory {
extractor = ExtractorFactory.createExtractor(xltx);
assertTrue(
extractor.getClass().getName(),
extractor
instanceof XSSFExcelExtractor
);
@ -340,6 +342,7 @@ public class TestExtractorFactory {
extractor = ExtractorFactory.createExtractor(new FileInputStream(xlsx));
assertTrue(
extractor.getClass().getName(),
extractor
instanceof XSSFExcelExtractor
);
@ -359,6 +362,7 @@ public class TestExtractorFactory {
// Word
extractor = ExtractorFactory.createExtractor(new FileInputStream(doc));
assertTrue(
extractor.getClass().getName(),
extractor
instanceof WordExtractor
);
@ -369,6 +373,7 @@ public class TestExtractorFactory {
extractor = ExtractorFactory.createExtractor(new FileInputStream(doc6));
assertTrue(
extractor.getClass().getName(),
extractor
instanceof Word6Extractor
);
@ -379,6 +384,7 @@ public class TestExtractorFactory {
extractor = ExtractorFactory.createExtractor(new FileInputStream(doc95));
assertTrue(
extractor.getClass().getName(),
extractor
instanceof Word6Extractor
);

View File

@ -20,8 +20,8 @@ import java.io.ByteArrayInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import org.apache.poi.POIOLE2TextExtractor;
import org.apache.poi.POITextExtractor;
@ -108,7 +108,7 @@ public class OLE2ScrachpadExtractorFactory {
* empty array. Otherwise, you'll get one open
* {@link POITextExtractor} for each embedded file.
*/
public static void identifyEmbeddedResources(POIOLE2TextExtractor ext, ArrayList<Entry> dirs, ArrayList<InputStream> nonPOIFS) throws IOException {
public static void identifyEmbeddedResources(POIOLE2TextExtractor ext, List<Entry> dirs, List<InputStream> nonPOIFS) throws IOException {
// Find all the embedded directories
DirectoryEntry root = ext.getRoot();
if(root == null) {