diff --git a/src/java/org/apache/poi/extractor/OLE2ExtractorFactory.java b/src/java/org/apache/poi/extractor/OLE2ExtractorFactory.java index 0db450aaf..9facd3979 100644 --- a/src/java/org/apache/poi/extractor/OLE2ExtractorFactory.java +++ b/src/java/org/apache/poi/extractor/OLE2ExtractorFactory.java @@ -20,8 +20,10 @@ import static org.apache.poi.hssf.model.InternalWorkbook.WORKBOOK_DIR_ENTRY_NAME import java.io.IOException; import java.io.InputStream; +import java.lang.reflect.Method; import java.util.ArrayList; import java.util.Iterator; +import java.util.List; import org.apache.poi.POIOLE2TextExtractor; import org.apache.poi.POITextExtractor; @@ -33,6 +35,8 @@ import org.apache.poi.poifs.filesystem.Entry; import org.apache.poi.poifs.filesystem.NPOIFSFileSystem; import org.apache.poi.poifs.filesystem.OPOIFSFileSystem; import org.apache.poi.poifs.filesystem.POIFSFileSystem; +import org.apache.poi.util.POILogFactory; +import org.apache.poi.util.POILogger; /** * Figures out the correct POIOLE2TextExtractor for your supplied @@ -48,6 +52,8 @@ import org.apache.poi.poifs.filesystem.POIFSFileSystem; */ @SuppressWarnings("WeakerAccess") public class OLE2ExtractorFactory { + private static final POILogger LOGGER = POILogFactory.getLogger(OLE2ExtractorFactory.class); + /** Should this thread prefer event based over usermodel based extractors? */ private static final ThreadLocal threadPreferEventExtractors = new ThreadLocal() { @Override @@ -115,11 +121,38 @@ public class OLE2ExtractorFactory { return (POIOLE2TextExtractor)createExtractor(fs.getRoot()); } - public static POITextExtractor createExtractor(InputStream input) { - // TODO Something nasty with reflection... - return null; + public static POITextExtractor createExtractor(InputStream input) throws IOException { + Class cls = getOOXMLClass(); + if (cls != null) { + // TODO Reflection + throw new IllegalArgumentException("TODO Reflection"); + } else { + // Best hope it's OLE2.... + return createExtractor(new NPOIFSFileSystem(input)); + } } + private static Class getOOXMLClass() { + try { + return OLE2ExtractorFactory.class.getClassLoader().loadClass( + "org.apache.poi.extractor.ExtractorFactory" + ); + } catch (ClassNotFoundException e) { + LOGGER.log(POILogger.WARN, "POI OOXML jar missing"); + return null; + } + } + private static Class getScratchpadClass() { + try { + return OLE2ExtractorFactory.class.getClassLoader().loadClass( + "org.apache.poi.extractor.OLE2ScrachpadExtractorFactory" + ); + } catch (ClassNotFoundException e) { + LOGGER.log(POILogger.ERROR, "POI Scratchpad jar missing"); + throw new IllegalStateException("POI Scratchpad jar missing, required for ExtractorFactory"); + } + } + /** * Create the Extractor, if possible. Generally needs the Scratchpad jar. * Note that this won't check for embedded OOXML resources either, use @@ -138,8 +171,16 @@ public class OLE2ExtractorFactory { return new ExcelExtractor(poifsDir); } } - - // TODO Try to ask the Scratchpad + + // Ask Scratchpad, or fail trying + Class cls = getScratchpadClass(); + try { + Method m = cls.getDeclaredMethod("createExtractor", DirectoryNode.class); + POITextExtractor ext = (POITextExtractor)m.invoke(null, poifsDir); + if (ext != null) return ext; + } catch (Exception e) { + throw new IllegalArgumentException("Error creating Scratchpad Extractor", e); + } throw new IllegalArgumentException("No supported documents found in the OLE2 stream"); } @@ -155,9 +196,9 @@ public class OLE2ExtractorFactory { throws IOException { // All the embedded directories we spotted - ArrayList dirs = new ArrayList(); + List dirs = new ArrayList(); // For anything else not directly held in as a POIFS directory - ArrayList nonPOIFS = new ArrayList(); + List nonPOIFS = new ArrayList(); // Find all the embedded directories DirectoryEntry root = ext.getRoot(); @@ -175,7 +216,15 @@ public class OLE2ExtractorFactory { } } } else { - // TODO Ask scratchpad + // Ask Scratchpad, or fail trying + Class cls = getScratchpadClass(); + try { + Method m = cls.getDeclaredMethod( + "identifyEmbeddedResources", POIOLE2TextExtractor.class, List.class, List.class); + m.invoke(null, ext, dirs, nonPOIFS); + } catch (Exception e) { + throw new IllegalArgumentException("Error checking for Scratchpad embedded resources", e); + } } // Create the extractors @@ -195,10 +244,10 @@ public class OLE2ExtractorFactory { } catch (IllegalArgumentException ie) { // Ignore, just means it didn't contain // a format we support as yet - // TODO Should we log this? + LOGGER.log(POILogger.WARN, ie); } catch (Exception xe) { // Ignore, invalid format - // TODO Should we log this? + LOGGER.log(POILogger.WARN, xe); } } return e.toArray(new POITextExtractor[e.size()]); diff --git a/src/ooxml/java/org/apache/poi/extractor/ExtractorFactory.java b/src/ooxml/java/org/apache/poi/extractor/ExtractorFactory.java index 4ba8d8f2f..830a4d82d 100644 --- a/src/ooxml/java/org/apache/poi/extractor/ExtractorFactory.java +++ b/src/ooxml/java/org/apache/poi/extractor/ExtractorFactory.java @@ -78,23 +78,13 @@ public class ExtractorFactory { protected static final String VISIO_DOCUMENT_REL = PackageRelationshipTypes.VISIO_CORE_DOCUMENT; protected static final String STRICT_DOCUMENT_REL = PackageRelationshipTypes.STRICT_CORE_DOCUMENT; - - /** Should this thread prefer event based over usermodel based extractors? */ - private static final ThreadLocal threadPreferEventExtractors = new ThreadLocal() { - @Override - protected Boolean initialValue() { return Boolean.FALSE; } - }; - - /** Should all threads prefer event based over usermodel based extractors? */ - private static Boolean allPreferEventExtractors; - /** * Should this thread prefer event based over usermodel based extractors? * (usermodel extractors tend to be more accurate, but use more memory) * Default is false. */ public static boolean getThreadPrefersEventExtractors() { - return threadPreferEventExtractors.get(); + return OLE2ExtractorFactory.getThreadPrefersEventExtractors(); } /** @@ -103,7 +93,7 @@ public class ExtractorFactory { * Default is to use the thread level setting, which defaults to false. */ public static Boolean getAllThreadsPreferEventExtractors() { - return allPreferEventExtractors; + return OLE2ExtractorFactory.getAllThreadsPreferEventExtractors(); } /** @@ -111,7 +101,7 @@ public class ExtractorFactory { * Will only be used if the All Threads setting is null. */ public static void setThreadPrefersEventExtractors(boolean preferEventExtractors) { - threadPreferEventExtractors.set(preferEventExtractors); + OLE2ExtractorFactory.setThreadPrefersEventExtractors(preferEventExtractors); } /** @@ -119,7 +109,7 @@ public class ExtractorFactory { * If set, will take preference over the Thread level setting. */ public static void setAllThreadsPreferEventExtractors(Boolean preferEventExtractors) { - allPreferEventExtractors = preferEventExtractors; + OLE2ExtractorFactory.setAllThreadsPreferEventExtractors(preferEventExtractors); } /** @@ -127,10 +117,7 @@ public class ExtractorFactory { * Checks the all-threads one first, then thread specific. */ protected static boolean getPreferEventExtractor() { - if(allPreferEventExtractors != null) { - return allPreferEventExtractors; - } - return threadPreferEventExtractors.get(); + return OLE2ExtractorFactory.getPreferEventExtractor(); } public static POITextExtractor createExtractor(File f) throws IOException, InvalidFormatException, OpenXML4JException, XmlException { @@ -281,83 +268,28 @@ public class ExtractorFactory { } public static POIOLE2TextExtractor createExtractor(POIFSFileSystem fs) throws IOException, OpenXML4JException, XmlException { - // Only ever an OLE2 one from the root of the FS - return (POIOLE2TextExtractor)createExtractor(fs.getRoot()); + return OLE2ExtractorFactory.createExtractor(fs); } public static POIOLE2TextExtractor createExtractor(NPOIFSFileSystem fs) throws IOException, OpenXML4JException, XmlException { - // Only ever an OLE2 one from the root of the FS - return (POIOLE2TextExtractor)createExtractor(fs.getRoot()); + return OLE2ExtractorFactory.createExtractor(fs); } public static POIOLE2TextExtractor createExtractor(OPOIFSFileSystem fs) throws IOException, OpenXML4JException, XmlException { - // Only ever an OLE2 one from the root of the FS - return (POIOLE2TextExtractor)createExtractor(fs.getRoot()); + return OLE2ExtractorFactory.createExtractor(fs); } public static POITextExtractor createExtractor(DirectoryNode poifsDir) throws IOException, OpenXML4JException, XmlException { - // Look for certain entries in the stream, to figure it - // out from - for (String workbookName : WORKBOOK_DIR_ENTRY_NAMES) { - if (poifsDir.hasEntry(workbookName)) { - if (getPreferEventExtractor()) { - return new EventBasedExcelExtractor(poifsDir); - } - return new ExcelExtractor(poifsDir); - } - } - if (poifsDir.hasEntry(OLD_WORKBOOK_DIR_ENTRY_NAME)) { - throw new OldExcelFormatException("Old Excel Spreadsheet format (1-95) " - + "found. Please call OldExcelExtractor directly for basic text extraction"); - } - - if (poifsDir.hasEntry("WordDocument")) { - // Old or new style word document? - try { - return new WordExtractor(poifsDir); - } catch (OldWordFileFormatException e) { - return new Word6Extractor(poifsDir); - } - } - - if (poifsDir.hasEntry("PowerPoint Document")) { - return new PowerPointExtractor(poifsDir); - } - - if (poifsDir.hasEntry("VisioDocument")) { - return new VisioTextExtractor(poifsDir); - } - - if (poifsDir.hasEntry("Quill")) { - return new PublisherTextExtractor(poifsDir); - } - - final String[] outlookEntryNames = new String[] { - // message bodies, saved as plain text (PtypString) - // The first short (0x1000, 0x0047, 0x0037) refer to the Property ID (see [MS-OXPROPS].pdf) - // the second short (0x001e, 0x001f, 0x0102) refer to the type of data stored in this entry - // https://msdn.microsoft.com/endatatypes.Ex-us/library/cc433490(v=exchg.80).aspx - // @see org.apache.poi.hsmf.Types.MAPIType - "__substg1.0_1000001E", //PidTagBody ASCII - "__substg1.0_1000001F", //PidTagBody Unicode - "__substg1.0_0047001E", //PidTagMessageSubmissionId ASCII - "__substg1.0_0047001F", //PidTagMessageSubmissionId Unicode - "__substg1.0_0037001E", //PidTagSubject ASCII - "__substg1.0_0037001F", //PidTagSubject Unicode - }; - for (String entryName : outlookEntryNames) { - if (poifsDir.hasEntry(entryName)) { - return new OutlookTextExtactor(poifsDir); - } - } - + // First, check for OOXML for (String entryName : poifsDir.getEntryNames()) { if (entryName.equals("Package")) { OPCPackage pkg = OPCPackage.open(poifsDir.createDocumentInputStream("Package")); return createExtractor(pkg); } } - throw new IllegalArgumentException("No supported documents found in the OLE2 stream"); + + // If not, ask the OLE2 code to check, with Scratchpad if possible + return OLE2ExtractorFactory.createExtractor(poifsDir); } /** diff --git a/src/ooxml/testcases/org/apache/poi/extractor/TestExtractorFactory.java b/src/ooxml/testcases/org/apache/poi/extractor/TestExtractorFactory.java index 467bb31fd..0aa022332 100644 --- a/src/ooxml/testcases/org/apache/poi/extractor/TestExtractorFactory.java +++ b/src/ooxml/testcases/org/apache/poi/extractor/TestExtractorFactory.java @@ -150,6 +150,7 @@ public class TestExtractorFactory { POITextExtractor extractor = ExtractorFactory.createExtractor(xlsx); assertTrue( + extractor.getClass().getName(), extractor instanceof XSSFExcelExtractor ); @@ -163,6 +164,7 @@ public class TestExtractorFactory { extractor = ExtractorFactory.createExtractor(xltx); assertTrue( + extractor.getClass().getName(), extractor instanceof XSSFExcelExtractor ); @@ -340,6 +342,7 @@ public class TestExtractorFactory { extractor = ExtractorFactory.createExtractor(new FileInputStream(xlsx)); assertTrue( + extractor.getClass().getName(), extractor instanceof XSSFExcelExtractor ); @@ -359,6 +362,7 @@ public class TestExtractorFactory { // Word extractor = ExtractorFactory.createExtractor(new FileInputStream(doc)); assertTrue( + extractor.getClass().getName(), extractor instanceof WordExtractor ); @@ -369,6 +373,7 @@ public class TestExtractorFactory { extractor = ExtractorFactory.createExtractor(new FileInputStream(doc6)); assertTrue( + extractor.getClass().getName(), extractor instanceof Word6Extractor ); @@ -379,6 +384,7 @@ public class TestExtractorFactory { extractor = ExtractorFactory.createExtractor(new FileInputStream(doc95)); assertTrue( + extractor.getClass().getName(), extractor instanceof Word6Extractor ); diff --git a/src/scratchpad/src/org/apache/poi/extractor/OLE2ScrachpadExtractorFactory.java b/src/scratchpad/src/org/apache/poi/extractor/OLE2ScrachpadExtractorFactory.java index 90ede6236..b1e52f4d7 100644 --- a/src/scratchpad/src/org/apache/poi/extractor/OLE2ScrachpadExtractorFactory.java +++ b/src/scratchpad/src/org/apache/poi/extractor/OLE2ScrachpadExtractorFactory.java @@ -20,8 +20,8 @@ import java.io.ByteArrayInputStream; import java.io.FileNotFoundException; import java.io.IOException; import java.io.InputStream; -import java.util.ArrayList; import java.util.Iterator; +import java.util.List; import org.apache.poi.POIOLE2TextExtractor; import org.apache.poi.POITextExtractor; @@ -108,7 +108,7 @@ public class OLE2ScrachpadExtractorFactory { * empty array. Otherwise, you'll get one open * {@link POITextExtractor} for each embedded file. */ - public static void identifyEmbeddedResources(POIOLE2TextExtractor ext, ArrayList dirs, ArrayList nonPOIFS) throws IOException { + public static void identifyEmbeddedResources(POIOLE2TextExtractor ext, List dirs, List nonPOIFS) throws IOException { // Find all the embedded directories DirectoryEntry root = ext.getRoot(); if(root == null) {