Start to pull out some of the OLE2 logic, so we can then split some Scratchpad parts out
git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@1752223 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
f94e7e761b
commit
ed7940dee5
208
src/java/org/apache/poi/extractor/OLE2ExtractorFactory.java
Normal file
208
src/java/org/apache/poi/extractor/OLE2ExtractorFactory.java
Normal file
@ -0,0 +1,208 @@
|
|||||||
|
/* ====================================================================
|
||||||
|
Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
contributor license agreements. See the NOTICE file distributed with
|
||||||
|
this work for additional information regarding copyright ownership.
|
||||||
|
The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
(the "License"); you may not use this file except in compliance with
|
||||||
|
the License. You may obtain a copy of the License at
|
||||||
|
|
||||||
|
http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
|
||||||
|
Unless required by applicable law or agreed to in writing, software
|
||||||
|
distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
See the License for the specific language governing permissions and
|
||||||
|
limitations under the License.
|
||||||
|
==================================================================== */
|
||||||
|
package org.apache.poi.extractor;
|
||||||
|
|
||||||
|
import static org.apache.poi.hssf.model.InternalWorkbook.WORKBOOK_DIR_ENTRY_NAMES;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.io.InputStream;
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.Iterator;
|
||||||
|
|
||||||
|
import org.apache.poi.POIOLE2TextExtractor;
|
||||||
|
import org.apache.poi.POITextExtractor;
|
||||||
|
import org.apache.poi.hssf.extractor.EventBasedExcelExtractor;
|
||||||
|
import org.apache.poi.hssf.extractor.ExcelExtractor;
|
||||||
|
import org.apache.poi.openxml4j.exceptions.OpenXML4JException;
|
||||||
|
import org.apache.poi.poifs.filesystem.DirectoryEntry;
|
||||||
|
import org.apache.poi.poifs.filesystem.DirectoryNode;
|
||||||
|
import org.apache.poi.poifs.filesystem.Entry;
|
||||||
|
import org.apache.poi.poifs.filesystem.NPOIFSFileSystem;
|
||||||
|
import org.apache.poi.poifs.filesystem.OPOIFSFileSystem;
|
||||||
|
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
|
||||||
|
import org.apache.xmlbeans.XmlException;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Figures out the correct POIOLE2TextExtractor for your supplied
|
||||||
|
* document, and returns it.
|
||||||
|
*
|
||||||
|
* <p>Note 1 - will fail for many file formats if the POI Scratchpad jar is
|
||||||
|
* not present on the runtime classpath</p>
|
||||||
|
* <p>Note 2 - for text extractor creation across all formats, use
|
||||||
|
* {@link org.apache.poi.extractor.ExtractorFactory} contained within
|
||||||
|
* the OOXML jar.</p>
|
||||||
|
* <p>Note 3 - rather than using this, for most cases you would be better
|
||||||
|
* off switching to <a href="http://tika.apache.org">Apache Tika</a> instead!</p>
|
||||||
|
*/
|
||||||
|
@SuppressWarnings("WeakerAccess")
|
||||||
|
public class OLE2ExtractorFactory {
|
||||||
|
/** Should this thread prefer event based over usermodel based extractors? */
|
||||||
|
private static final ThreadLocal<Boolean> threadPreferEventExtractors = new ThreadLocal<Boolean>() {
|
||||||
|
@Override
|
||||||
|
protected Boolean initialValue() { return Boolean.FALSE; }
|
||||||
|
};
|
||||||
|
|
||||||
|
/** Should all threads prefer event based over usermodel based extractors? */
|
||||||
|
private static Boolean allPreferEventExtractors;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Should this thread prefer event based over usermodel based extractors?
|
||||||
|
* (usermodel extractors tend to be more accurate, but use more memory)
|
||||||
|
* Default is false.
|
||||||
|
*/
|
||||||
|
public static boolean getThreadPrefersEventExtractors() {
|
||||||
|
return threadPreferEventExtractors.get();
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Should all threads prefer event based over usermodel based extractors?
|
||||||
|
* (usermodel extractors tend to be more accurate, but use more memory)
|
||||||
|
* Default is to use the thread level setting, which defaults to false.
|
||||||
|
*/
|
||||||
|
public static Boolean getAllThreadsPreferEventExtractors() {
|
||||||
|
return allPreferEventExtractors;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Should this thread prefer event based over usermodel based extractors?
|
||||||
|
* Will only be used if the All Threads setting is null.
|
||||||
|
*/
|
||||||
|
public static void setThreadPrefersEventExtractors(boolean preferEventExtractors) {
|
||||||
|
threadPreferEventExtractors.set(preferEventExtractors);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Should all threads prefer event based over usermodel based extractors?
|
||||||
|
* If set, will take preference over the Thread level setting.
|
||||||
|
*/
|
||||||
|
public static void setAllThreadsPreferEventExtractors(Boolean preferEventExtractors) {
|
||||||
|
allPreferEventExtractors = preferEventExtractors;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Should this thread use event based extractors is available?
|
||||||
|
* Checks the all-threads one first, then thread specific.
|
||||||
|
*/
|
||||||
|
protected static boolean getPreferEventExtractor() {
|
||||||
|
if(allPreferEventExtractors != null) {
|
||||||
|
return allPreferEventExtractors;
|
||||||
|
}
|
||||||
|
return threadPreferEventExtractors.get();
|
||||||
|
}
|
||||||
|
|
||||||
|
public static POIOLE2TextExtractor createExtractor(POIFSFileSystem fs) throws IOException, OpenXML4JException, XmlException {
|
||||||
|
// Only ever an OLE2 one from the root of the FS
|
||||||
|
return (POIOLE2TextExtractor)createExtractor(fs.getRoot());
|
||||||
|
}
|
||||||
|
public static POIOLE2TextExtractor createExtractor(NPOIFSFileSystem fs) throws IOException, OpenXML4JException, XmlException {
|
||||||
|
// Only ever an OLE2 one from the root of the FS
|
||||||
|
return (POIOLE2TextExtractor)createExtractor(fs.getRoot());
|
||||||
|
}
|
||||||
|
public static POIOLE2TextExtractor createExtractor(OPOIFSFileSystem fs) throws IOException, OpenXML4JException, XmlException {
|
||||||
|
// Only ever an OLE2 one from the root of the FS
|
||||||
|
return (POIOLE2TextExtractor)createExtractor(fs.getRoot());
|
||||||
|
}
|
||||||
|
|
||||||
|
public static POITextExtractor createExtractor(InputStream input) {
|
||||||
|
// TODO Something nasty with reflection...
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Create the Extractor, if possible. Generally needs the Scratchpad jar.
|
||||||
|
* Note that this won't check for embedded OOXML resources either, use
|
||||||
|
* {@link org.apache.poi.extractor.ExtractorFactory} for that.
|
||||||
|
*/
|
||||||
|
public static POITextExtractor createExtractor(DirectoryNode poifsDir)
|
||||||
|
throws IOException, OpenXML4JException, XmlException
|
||||||
|
{
|
||||||
|
// Look for certain entries in the stream, to figure it
|
||||||
|
// out from
|
||||||
|
for (String workbookName : WORKBOOK_DIR_ENTRY_NAMES) {
|
||||||
|
if (poifsDir.hasEntry(workbookName)) {
|
||||||
|
if (getPreferEventExtractor()) {
|
||||||
|
return new EventBasedExcelExtractor(poifsDir);
|
||||||
|
}
|
||||||
|
return new ExcelExtractor(poifsDir);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// TODO Try to ask the Scratchpad
|
||||||
|
|
||||||
|
throw new IllegalArgumentException("No supported documents found in the OLE2 stream");
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Returns an array of text extractors, one for each of
|
||||||
|
* the embedded documents in the file (if there are any).
|
||||||
|
* If there are no embedded documents, you'll get back an
|
||||||
|
* empty array. Otherwise, you'll get one open
|
||||||
|
* {@link POITextExtractor} for each embedded file.
|
||||||
|
*/
|
||||||
|
public static POITextExtractor[] getEmbededDocsTextExtractors(POIOLE2TextExtractor ext)
|
||||||
|
throws IOException, OpenXML4JException, XmlException
|
||||||
|
{
|
||||||
|
// All the embedded directories we spotted
|
||||||
|
ArrayList<Entry> dirs = new ArrayList<Entry>();
|
||||||
|
// For anything else not directly held in as a POIFS directory
|
||||||
|
ArrayList<InputStream> nonPOIFS = new ArrayList<InputStream>();
|
||||||
|
|
||||||
|
// Find all the embedded directories
|
||||||
|
DirectoryEntry root = ext.getRoot();
|
||||||
|
if(root == null) {
|
||||||
|
throw new IllegalStateException("The extractor didn't know which POIFS it came from!");
|
||||||
|
}
|
||||||
|
|
||||||
|
if(ext instanceof ExcelExtractor) {
|
||||||
|
// These are in MBD... under the root
|
||||||
|
Iterator<Entry> it = root.getEntries();
|
||||||
|
while(it.hasNext()) {
|
||||||
|
Entry entry = it.next();
|
||||||
|
if(entry.getName().startsWith("MBD")) {
|
||||||
|
dirs.add(entry);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
// TODO Ask scratchpad
|
||||||
|
}
|
||||||
|
|
||||||
|
// Create the extractors
|
||||||
|
if(dirs.size() == 0 && nonPOIFS.size() == 0){
|
||||||
|
return new POITextExtractor[0];
|
||||||
|
}
|
||||||
|
|
||||||
|
ArrayList<POITextExtractor> e = new ArrayList<POITextExtractor>();
|
||||||
|
for (Entry dir : dirs) {
|
||||||
|
e.add(createExtractor(
|
||||||
|
(DirectoryNode) dir
|
||||||
|
));
|
||||||
|
}
|
||||||
|
for (InputStream nonPOIF : nonPOIFS) {
|
||||||
|
try {
|
||||||
|
e.add(createExtractor(nonPOIF));
|
||||||
|
} catch (IllegalArgumentException ie) {
|
||||||
|
// Ignore, just means it didn't contain
|
||||||
|
// a format we support as yet
|
||||||
|
// TODO Should we log this?
|
||||||
|
} catch (Exception xe) {
|
||||||
|
// Ignore, invalid format
|
||||||
|
// TODO Should we log this?
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return e.toArray(new POITextExtractor[e.size()]);
|
||||||
|
}
|
||||||
|
}
|
Loading…
Reference in New Issue
Block a user