From ed7940dee58fec9918e876e8334402de675b6ae1 Mon Sep 17 00:00:00 2001 From: Nick Burch Date: Mon, 11 Jul 2016 22:20:51 +0000 Subject: [PATCH] Start to pull out some of the OLE2 logic, so we can then split some Scratchpad parts out git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@1752223 13f79535-47bb-0310-9956-ffa450edef68 --- .../poi/extractor/OLE2ExtractorFactory.java | 208 ++++++++++++++++++ 1 file changed, 208 insertions(+) create mode 100644 src/java/org/apache/poi/extractor/OLE2ExtractorFactory.java diff --git a/src/java/org/apache/poi/extractor/OLE2ExtractorFactory.java b/src/java/org/apache/poi/extractor/OLE2ExtractorFactory.java new file mode 100644 index 000000000..ea78385b6 --- /dev/null +++ b/src/java/org/apache/poi/extractor/OLE2ExtractorFactory.java @@ -0,0 +1,208 @@ +/* ==================================================================== + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+==================================================================== */ +package org.apache.poi.extractor; + +import static org.apache.poi.hssf.model.InternalWorkbook.WORKBOOK_DIR_ENTRY_NAMES; + +import java.io.IOException; +import java.io.InputStream; +import java.util.ArrayList; +import java.util.Iterator; + +import org.apache.poi.POIOLE2TextExtractor; +import org.apache.poi.POITextExtractor; +import org.apache.poi.hssf.extractor.EventBasedExcelExtractor; +import org.apache.poi.hssf.extractor.ExcelExtractor; +import org.apache.poi.openxml4j.exceptions.OpenXML4JException; +import org.apache.poi.poifs.filesystem.DirectoryEntry; +import org.apache.poi.poifs.filesystem.DirectoryNode; +import org.apache.poi.poifs.filesystem.Entry; +import org.apache.poi.poifs.filesystem.NPOIFSFileSystem; +import org.apache.poi.poifs.filesystem.OPOIFSFileSystem; +import org.apache.poi.poifs.filesystem.POIFSFileSystem; +import org.apache.xmlbeans.XmlException; + +/** + * Figures out the correct POIOLE2TextExtractor for your supplied + * document, and returns it. + * + *

<p>Note 1 - will fail for many file formats if the POI Scratchpad jar is
+ *  not present on the runtime classpath</p>
+ * <p>Note 2 - for text extractor creation across all formats, use
+ *  {@link org.apache.poi.extractor.ExtractorFactory} contained within
+ *  the OOXML jar.</p>
+ * <p>Note 3 - rather than using this, for most cases you would be better
+ *  off switching to Apache Tika instead!</p>

+ */
+@SuppressWarnings("WeakerAccess")
+public class OLE2ExtractorFactory {
+    /** Should this thread prefer event based over usermodel based extractors? */
+    private static final ThreadLocal<Boolean> threadPreferEventExtractors = new ThreadLocal<Boolean>() {
+        @Override
+        protected Boolean initialValue() { return Boolean.FALSE; }
+    };
+
+    /**
+     * Should all threads prefer event based over usermodel based extractors?
+     * {@code null} means "not set", in which case the thread level setting
+     * is used. Declared volatile because it is written by one thread and
+     * read by others.
+     */
+    private static volatile Boolean allPreferEventExtractors;
+
+    /**
+     * Should this thread prefer event based over usermodel based extractors?
+     * (usermodel extractors tend to be more accurate, but use more memory)
+     * Default is false.
+     *
+     * @return the thread level preference
+     */
+    public static boolean getThreadPrefersEventExtractors() {
+        return threadPreferEventExtractors.get();
+    }
+
+    /**
+     * Should all threads prefer event based over usermodel based extractors?
+     * (usermodel extractors tend to be more accurate, but use more memory)
+     * Default is to use the thread level setting, which defaults to false.
+     *
+     * @return the all-threads preference, or {@code null} if it has not been set
+     */
+    public static Boolean getAllThreadsPreferEventExtractors() {
+        return allPreferEventExtractors;
+    }
+
+    /**
+     * Should this thread prefer event based over usermodel based extractors?
+     * Will only be used if the All Threads setting is null.
+     *
+     * @param preferEventExtractors the new thread level preference
+     */
+    public static void setThreadPrefersEventExtractors(boolean preferEventExtractors) {
+        threadPreferEventExtractors.set(preferEventExtractors);
+    }
+
+    /**
+     * Should all threads prefer event based over usermodel based extractors?
+     * If set, will take preference over the Thread level setting.
+     *
+     * @param preferEventExtractors the new all-threads preference, or
+     *  {@code null} to fall back to the thread level setting
+     */
+    public static void setAllThreadsPreferEventExtractors(Boolean preferEventExtractors) {
+        allPreferEventExtractors = preferEventExtractors;
+    }
+
+    /**
+     * Should this thread use event based extractors if available?
+     * Checks the all-threads one first, then thread specific.
+     *
+     * @return the all-threads preference if it is set, otherwise the
+     *  thread level preference
+     */
+    protected static boolean getPreferEventExtractor() {
+        // Read the shared setting exactly once, so that a concurrent reset
+        // to null can't cause a NullPointerException when unboxing
+        final Boolean allThreads = allPreferEventExtractors;
+        if (allThreads != null) {
+            return allThreads;
+        }
+        return threadPreferEventExtractors.get();
+    }
+
+    /**
+     * Creates an extractor from the root directory of the given filesystem.
+     *
+     * @param fs the filesystem to create the extractor from
+     * @return the extractor for the root of the filesystem
+     * @throws IllegalArgumentException if no supported document is found
+     */
+    public static POIOLE2TextExtractor createExtractor(POIFSFileSystem fs) throws IOException, OpenXML4JException, XmlException {
+        // Only ever an OLE2 one from the root of the FS
+        return (POIOLE2TextExtractor)createExtractor(fs.getRoot());
+    }
+
+    /**
+     * Creates an extractor from the root directory of the given filesystem.
+     *
+     * @param fs the filesystem to create the extractor from
+     * @return the extractor for the root of the filesystem
+     * @throws IllegalArgumentException if no supported document is found
+     */
+    public static POIOLE2TextExtractor createExtractor(NPOIFSFileSystem fs) throws IOException, OpenXML4JException, XmlException {
+        // Only ever an OLE2 one from the root of the FS
+        return (POIOLE2TextExtractor)createExtractor(fs.getRoot());
+    }
+
+    /**
+     * Creates an extractor from the root directory of the given filesystem.
+     *
+     * @param fs the filesystem to create the extractor from
+     * @return the extractor for the root of the filesystem
+     * @throws IllegalArgumentException if no supported document is found
+     */
+    public static POIOLE2TextExtractor createExtractor(OPOIFSFileSystem fs) throws IOException, OpenXML4JException, XmlException {
+        // Only ever an OLE2 one from the root of the FS
+        return (POIOLE2TextExtractor)createExtractor(fs.getRoot());
+    }
+
+    /**
+     * Not implemented yet - currently always returns {@code null}.
+     *
+     * @param input the stream to create the extractor from (currently unused)
+     * @return currently always {@code null}
+     */
+    public static POITextExtractor createExtractor(InputStream input) {
+        // TODO Something nasty with reflection...
+        return null;
+    }
+
+    /**
+     * Create the Extractor, if possible. Generally needs the Scratchpad jar.
+     * Note that this won't check for embedded OOXML resources either, use
+     * {@link org.apache.poi.extractor.ExtractorFactory} for that.
+     *
+     * @param poifsDir the directory to look for a supported document in
+     * @return an {@link EventBasedExcelExtractor} if event based extractors
+     *  are preferred, otherwise an {@link ExcelExtractor}
+     * @throws IllegalArgumentException if the directory holds none of the
+     *  known workbook entries
+     */
+    public static POITextExtractor createExtractor(DirectoryNode poifsDir)
+            throws IOException, OpenXML4JException, XmlException
+    {
+        // Look for certain entries in the stream, to figure it
+        // out from
+        for (String workbookName : WORKBOOK_DIR_ENTRY_NAMES) {
+            if (poifsDir.hasEntry(workbookName)) {
+                if (getPreferEventExtractor()) {
+                    return new EventBasedExcelExtractor(poifsDir);
+                }
+                return new ExcelExtractor(poifsDir);
+            }
+        }
+
+        // TODO Try to ask the Scratchpad
+
+        throw new IllegalArgumentException("No supported documents found in the OLE2 stream");
+    }
+
+    /**
+     * Returns an array of text extractors, one for each of
+     * the embedded documents in the file (if there are any).
+     * If there are no embedded documents, you'll get back an
+     * empty array. Otherwise, you'll get one open
+     * {@link POITextExtractor} for each embedded file.
+     *
+     * @param ext the extractor whose embedded documents are wanted
+     * @return the extractors for the embedded documents, never {@code null}
+     * @throws IllegalStateException if the extractor has no root directory
+     * @throws IllegalArgumentException if an embedded directory doesn't
+     *  hold a supported document
+     */
+    public static POITextExtractor[] getEmbededDocsTextExtractors(POIOLE2TextExtractor ext)
+            throws IOException, OpenXML4JException, XmlException
+    {
+        // All the embedded directories we spotted
+        ArrayList<DirectoryNode> dirs = new ArrayList<DirectoryNode>();
+        // For anything else not directly held in as a POIFS directory
+        ArrayList<InputStream> nonPOIFS = new ArrayList<InputStream>();
+
+        // Find all the embedded directories
+        DirectoryEntry root = ext.getRoot();
+        if (root == null) {
+            throw new IllegalStateException("The extractor didn't know which POIFS it came from!");
+        }
+
+        if (ext instanceof ExcelExtractor) {
+            // These are in MBD... under the root
+            Iterator<Entry> it = root.getEntries();
+            while (it.hasNext()) {
+                Entry entry = it.next();
+                // Only keep real directories, so that a stray non-directory
+                // entry named MBD... can't cause a ClassCastException
+                if (entry.getName().startsWith("MBD") && entry instanceof DirectoryNode) {
+                    dirs.add((DirectoryNode) entry);
+                }
+            }
+        } else {
+            // TODO Ask scratchpad
+        }
+
+        // Create the extractors
+        if (dirs.isEmpty() && nonPOIFS.isEmpty()) {
+            return new POITextExtractor[0];
+        }
+
+        ArrayList<POITextExtractor> e = new ArrayList<POITextExtractor>();
+        for (DirectoryNode dir : dirs) {
+            e.add(createExtractor(dir));
+        }
+        for (InputStream nonPOIF : nonPOIFS) {
+            try {
+                e.add(createExtractor(nonPOIF));
+            } catch (IllegalArgumentException ignored) {
+                // Ignore, just means it didn't contain
+                // a format we support as yet
+                // TODO Should we log this?
+            } catch (Exception ignored) {
+                // Ignore, invalid format
+                // TODO Should we log this?
+            }
+        }
+        return e.toArray(new POITextExtractor[e.size()]);
+    }
+}