whitespace

git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@1753028 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Javen O'Neal 2016-07-17 08:26:51 +00:00
parent e0c50807c4
commit 1cf76af9e8

View File

@ -72,180 +72,180 @@ import org.apache.xmlbeans.XmlException;
*/ */
@SuppressWarnings("WeakerAccess") @SuppressWarnings("WeakerAccess")
public class ExtractorFactory { public class ExtractorFactory {
public static final String CORE_DOCUMENT_REL = PackageRelationshipTypes.CORE_DOCUMENT; public static final String CORE_DOCUMENT_REL = PackageRelationshipTypes.CORE_DOCUMENT;
protected static final String VISIO_DOCUMENT_REL = PackageRelationshipTypes.VISIO_CORE_DOCUMENT; protected static final String VISIO_DOCUMENT_REL = PackageRelationshipTypes.VISIO_CORE_DOCUMENT;
protected static final String STRICT_DOCUMENT_REL = PackageRelationshipTypes.STRICT_CORE_DOCUMENT; protected static final String STRICT_DOCUMENT_REL = PackageRelationshipTypes.STRICT_CORE_DOCUMENT;
/** /**
* Should this thread prefer event based over usermodel based extractors? * Should this thread prefer event based over usermodel based extractors?
* (usermodel extractors tend to be more accurate, but use more memory) * (usermodel extractors tend to be more accurate, but use more memory)
* Default is false. * Default is false.
*/ */
public static boolean getThreadPrefersEventExtractors() { public static boolean getThreadPrefersEventExtractors() {
return OLE2ExtractorFactory.getThreadPrefersEventExtractors(); return OLE2ExtractorFactory.getThreadPrefersEventExtractors();
} }
/** /**
* Should all threads prefer event based over usermodel based extractors? * Should all threads prefer event based over usermodel based extractors?
* (usermodel extractors tend to be more accurate, but use more memory) * (usermodel extractors tend to be more accurate, but use more memory)
* Default is to use the thread level setting, which defaults to false. * Default is to use the thread level setting, which defaults to false.
*/ */
public static Boolean getAllThreadsPreferEventExtractors() { public static Boolean getAllThreadsPreferEventExtractors() {
return OLE2ExtractorFactory.getAllThreadsPreferEventExtractors(); return OLE2ExtractorFactory.getAllThreadsPreferEventExtractors();
} }
/** /**
* Should this thread prefer event based over usermodel based extractors? * Should this thread prefer event based over usermodel based extractors?
* Will only be used if the All Threads setting is null. * Will only be used if the All Threads setting is null.
*/ */
public static void setThreadPrefersEventExtractors(boolean preferEventExtractors) { public static void setThreadPrefersEventExtractors(boolean preferEventExtractors) {
OLE2ExtractorFactory.setThreadPrefersEventExtractors(preferEventExtractors); OLE2ExtractorFactory.setThreadPrefersEventExtractors(preferEventExtractors);
} }
/** /**
* Should all threads prefer event based over usermodel based extractors? * Should all threads prefer event based over usermodel based extractors?
* If set, will take preference over the Thread level setting. * If set, will take preference over the Thread level setting.
*/ */
public static void setAllThreadsPreferEventExtractors(Boolean preferEventExtractors) { public static void setAllThreadsPreferEventExtractors(Boolean preferEventExtractors) {
OLE2ExtractorFactory.setAllThreadsPreferEventExtractors(preferEventExtractors); OLE2ExtractorFactory.setAllThreadsPreferEventExtractors(preferEventExtractors);
} }
/** /**
* Should this thread use event based extractors if available? * Should this thread use event based extractors if available?
* Checks the all-threads one first, then thread specific. * Checks the all-threads one first, then thread specific.
*/ */
protected static boolean getPreferEventExtractor() { protected static boolean getPreferEventExtractor() {
return OLE2ExtractorFactory.getPreferEventExtractor(); return OLE2ExtractorFactory.getPreferEventExtractor();
} }
public static POITextExtractor createExtractor(File f) throws IOException, InvalidFormatException, OpenXML4JException, XmlException { public static POITextExtractor createExtractor(File f) throws IOException, InvalidFormatException, OpenXML4JException, XmlException {
NPOIFSFileSystem fs = null; NPOIFSFileSystem fs = null;
try { try {
fs = new NPOIFSFileSystem(f); fs = new NPOIFSFileSystem(f);
POIOLE2TextExtractor extractor = createExtractor(fs); POIOLE2TextExtractor extractor = createExtractor(fs);
extractor.setFilesystem(fs); extractor.setFilesystem(fs);
return extractor; return extractor;
} catch (OfficeXmlFileException e) { } catch (OfficeXmlFileException e) {
// ensure file-handle release // ensure file-handle release
IOUtils.closeQuietly(fs); IOUtils.closeQuietly(fs);
return createExtractor(OPCPackage.open(f.toString(), PackageAccess.READ)); return createExtractor(OPCPackage.open(f.toString(), PackageAccess.READ));
} catch (NotOLE2FileException ne) { } catch (NotOLE2FileException ne) {
// ensure file-handle release // ensure file-handle release
IOUtils.closeQuietly(fs); IOUtils.closeQuietly(fs);
throw new IllegalArgumentException("Your File was neither an OLE2 file, nor an OOXML file"); throw new IllegalArgumentException("Your File was neither an OLE2 file, nor an OOXML file");
} catch (OpenXML4JException e) {
// ensure file-handle release
IOUtils.closeQuietly(fs);
throw e; } catch (OpenXML4JException e) {
} catch (XmlException e) { // ensure file-handle release
// ensure file-handle release IOUtils.closeQuietly(fs);
IOUtils.closeQuietly(fs); throw e;
throw e; } catch (XmlException e) {
} catch (IOException e) { // ensure file-handle release
// ensure file-handle release IOUtils.closeQuietly(fs);
IOUtils.closeQuietly(fs); throw e;
} catch (IOException e) {
// ensure file-handle release
IOUtils.closeQuietly(fs);
throw e;
throw e;
} catch (RuntimeException e) { } catch (RuntimeException e) {
// ensure file-handle release // ensure file-handle release
IOUtils.closeQuietly(fs); IOUtils.closeQuietly(fs);
throw e;
}
}
throw e; public static POITextExtractor createExtractor(InputStream inp) throws IOException, InvalidFormatException, OpenXML4JException, XmlException {
} // Figure out the kind of stream
// If clearly doesn't do mark/reset, wrap up
if (! inp.markSupported()) {
inp = new PushbackInputStream(inp, 8);
}
if (NPOIFSFileSystem.hasPOIFSHeader(inp)) {
return createExtractor(new NPOIFSFileSystem(inp));
}
if (DocumentFactoryHelper.hasOOXMLHeader(inp)) {
return createExtractor(OPCPackage.open(inp));
}
throw new IllegalArgumentException("Your InputStream was neither an OLE2 stream, nor an OOXML stream");
} }
public static POITextExtractor createExtractor(InputStream inp) throws IOException, InvalidFormatException, OpenXML4JException, XmlException { /**
// Figure out the kind of stream * Tries to determine the actual type of file and produces a matching text-extractor for it.
// If clearly doesn't do mark/reset, wrap up *
if(! inp.markSupported()) { * @param pkg An {@link OPCPackage}.
inp = new PushbackInputStream(inp, 8); * @return A {@link POIXMLTextExtractor} for the given file.
} * @throws IOException If an error occurs while reading the file
* @throws OpenXML4JException If an error parsing the OpenXML file format is found.
if(NPOIFSFileSystem.hasPOIFSHeader(inp)) { * @throws XmlException If an XML parsing error occurs.
return createExtractor(new NPOIFSFileSystem(inp)); * @throws IllegalArgumentException If no matching file type could be found.
} */
if(DocumentFactoryHelper.hasOOXMLHeader(inp)) { public static POIXMLTextExtractor createExtractor(OPCPackage pkg) throws IOException, OpenXML4JException, XmlException {
return createExtractor(OPCPackage.open(inp));
}
throw new IllegalArgumentException("Your InputStream was neither an OLE2 stream, nor an OOXML stream");
}
/**
* Tries to determine the actual type of file and produces a matching text-extractor for it.
*
* @param pkg An {@link OPCPackage}.
* @return A {@link POIXMLTextExtractor} for the given file.
* @throws IOException If an error occurs while reading the file
* @throws OpenXML4JException If an error parsing the OpenXML file format is found.
* @throws XmlException If an XML parsing error occurs.
* @throws IllegalArgumentException If no matching file type could be found.
*/
public static POIXMLTextExtractor createExtractor(OPCPackage pkg) throws IOException, OpenXML4JException, XmlException {
try { try {
// Check for the normal Office core document // Check for the normal Office core document
PackageRelationshipCollection core = PackageRelationshipCollection core;
pkg.getRelationshipsByType(CORE_DOCUMENT_REL); core = pkg.getRelationshipsByType(CORE_DOCUMENT_REL);
// If nothing was found, try some of the other OOXML-based core types // If nothing was found, try some of the other OOXML-based core types
if (core.size() == 0) { if (core.size() == 0) {
// Could it be an OOXML-Strict one? // Could it be an OOXML-Strict one?
core = pkg.getRelationshipsByType(STRICT_DOCUMENT_REL); core = pkg.getRelationshipsByType(STRICT_DOCUMENT_REL);
} }
if (core.size() == 0) { if (core.size() == 0) {
// Could it be a visio one? // Could it be a visio one?
core = pkg.getRelationshipsByType(VISIO_DOCUMENT_REL); core = pkg.getRelationshipsByType(VISIO_DOCUMENT_REL);
if (core.size() == 1) if (core.size() == 1)
return new XDGFVisioExtractor(pkg); return new XDGFVisioExtractor(pkg);
} }
// Should just be a single core document, complain if not // Should just be a single core document, complain if not
if (core.size() != 1) { if (core.size() != 1) {
throw new IllegalArgumentException("Invalid OOXML Package received - expected 1 core document, found " + core.size()); throw new IllegalArgumentException("Invalid OOXML Package received - expected 1 core document, found " + core.size());
} }
// Grab the core document part, and try to identify from that // Grab the core document part, and try to identify from that
PackagePart corePart = pkg.getPart(core.getRelationship(0)); PackagePart corePart = pkg.getPart(core.getRelationship(0));
// Is it XSSF? // Is it XSSF?
for(XSSFRelation rel : XSSFExcelExtractor.SUPPORTED_TYPES) { for (XSSFRelation rel : XSSFExcelExtractor.SUPPORTED_TYPES) {
if(corePart.getContentType().equals(rel.getContentType())) { if (corePart.getContentType().equals(rel.getContentType())) {
if(getPreferEventExtractor()) { if (getPreferEventExtractor()) {
return new XSSFEventBasedExcelExtractor(pkg); return new XSSFEventBasedExcelExtractor(pkg);
} }
return new XSSFExcelExtractor(pkg);
return new XSSFExcelExtractor(pkg); }
} }
}
// Is it XWPF?
// Is it XWPF? for (XWPFRelation rel : XWPFWordExtractor.SUPPORTED_TYPES) {
for(XWPFRelation rel : XWPFWordExtractor.SUPPORTED_TYPES) { if (corePart.getContentType().equals(rel.getContentType())) {
if(corePart.getContentType().equals(rel.getContentType())) { return new XWPFWordExtractor(pkg);
return new XWPFWordExtractor(pkg); }
} }
}
// Is it XSLF?
// Is it XSLF? for (XSLFRelation rel : XSLFPowerPointExtractor.SUPPORTED_TYPES) {
for(XSLFRelation rel : XSLFPowerPointExtractor.SUPPORTED_TYPES) { if (corePart.getContentType().equals(rel.getContentType())) {
if(corePart.getContentType().equals(rel.getContentType())) { return new XSLFPowerPointExtractor(pkg);
return new XSLFPowerPointExtractor(pkg); }
} }
}
// special handling for SlideShow-Theme-files,
// special handling for SlideShow-Theme-files, if (XSLFRelation.THEME_MANAGER.getContentType().equals(corePart.getContentType())) {
if(XSLFRelation.THEME_MANAGER.getContentType().equals(corePart.getContentType())) { return new XSLFPowerPointExtractor(new XSLFSlideShow(pkg));
return new XSLFPowerPointExtractor(new XSLFSlideShow(pkg)); }
}
throw new IllegalArgumentException("No supported documents found in the OOXML package (found "+corePart.getContentType()+")");
throw new IllegalArgumentException("No supported documents found in the OOXML package (found "+corePart.getContentType()+")");
} catch (IOException e) { } catch (IOException e) {
// ensure that we close the package again if there is an error opening it, however // ensure that we close the package again if there is an error opening it, however
// we need to revert the package to not re-write the file via close(), which is very likely not wanted for a TextExtractor! // we need to revert the package to not re-write the file via close(), which is very likely not wanted for a TextExtractor!
pkg.revert(); pkg.revert();
throw e; throw e;
} catch (OpenXML4JException e) { } catch (OpenXML4JException e) {
// ensure that we close the package again if there is an error opening it, however // ensure that we close the package again if there is an error opening it, however
// we need to revert the package to not re-write the file via close(), which is very likely not wanted for a TextExtractor! // we need to revert the package to not re-write the file via close(), which is very likely not wanted for a TextExtractor!
@ -256,27 +256,25 @@ public class ExtractorFactory {
// we need to revert the package to not re-write the file via close(), which is very likely not wanted for a TextExtractor! // we need to revert the package to not re-write the file via close(), which is very likely not wanted for a TextExtractor!
pkg.revert(); pkg.revert();
throw e; throw e;
} catch (RuntimeException e) { } catch (RuntimeException e) {
// ensure that we close the package again if there is an error opening it, however // ensure that we close the package again if there is an error opening it, however
// we need to revert the package to not re-write the file via close(), which is very likely not wanted for a TextExtractor! // we need to revert the package to not re-write the file via close(), which is very likely not wanted for a TextExtractor!
pkg.revert(); pkg.revert();
throw e;
throw e; }
} }
}
public static POIOLE2TextExtractor createExtractor(POIFSFileSystem fs) throws IOException, OpenXML4JException, XmlException { public static POIOLE2TextExtractor createExtractor(POIFSFileSystem fs) throws IOException, OpenXML4JException, XmlException {
return OLE2ExtractorFactory.createExtractor(fs); return OLE2ExtractorFactory.createExtractor(fs);
} }
public static POIOLE2TextExtractor createExtractor(NPOIFSFileSystem fs) throws IOException, OpenXML4JException, XmlException { public static POIOLE2TextExtractor createExtractor(NPOIFSFileSystem fs) throws IOException, OpenXML4JException, XmlException {
return OLE2ExtractorFactory.createExtractor(fs); return OLE2ExtractorFactory.createExtractor(fs);
} }
public static POIOLE2TextExtractor createExtractor(OPOIFSFileSystem fs) throws IOException, OpenXML4JException, XmlException { public static POIOLE2TextExtractor createExtractor(OPOIFSFileSystem fs) throws IOException, OpenXML4JException, XmlException {
return OLE2ExtractorFactory.createExtractor(fs); return OLE2ExtractorFactory.createExtractor(fs);
} }
public static POITextExtractor createExtractor(DirectoryNode poifsDir) throws IOException, public static POITextExtractor createExtractor(DirectoryNode poifsDir) throws IOException, OpenXML4JException, XmlException
OpenXML4JException, XmlException
{ {
// First, check for OOXML // First, check for OOXML
for (String entryName : poifsDir.getEntryNames()) { for (String entryName : poifsDir.getEntryNames()) {
@ -285,104 +283,102 @@ public class ExtractorFactory {
return createExtractor(pkg); return createExtractor(pkg);
} }
} }
// If not, ask the OLE2 code to check, with Scratchpad if possible // If not, ask the OLE2 code to check, with Scratchpad if possible
return OLE2ExtractorFactory.createExtractor(poifsDir); return OLE2ExtractorFactory.createExtractor(poifsDir);
} }
/** /**
* Returns an array of text extractors, one for each of * Returns an array of text extractors, one for each of
* the embedded documents in the file (if there are any). * the embedded documents in the file (if there are any).
* If there are no embedded documents, you'll get back an * If there are no embedded documents, you'll get back an
* empty array. Otherwise, you'll get one open * empty array. Otherwise, you'll get one open
* {@link POITextExtractor} for each embedded file. * {@link POITextExtractor} for each embedded file.
*/ */
public static POITextExtractor[] getEmbededDocsTextExtractors(POIOLE2TextExtractor ext) throws IOException, OpenXML4JException, XmlException { public static POITextExtractor[] getEmbededDocsTextExtractors(POIOLE2TextExtractor ext) throws IOException, OpenXML4JException, XmlException {
// All the embedded directories we spotted // All the embedded directories we spotted
ArrayList<Entry> dirs = new ArrayList<Entry>(); ArrayList<Entry> dirs = new ArrayList<Entry>();
// For anything else not directly held in as a POIFS directory // For anything else not directly held in as a POIFS directory
ArrayList<InputStream> nonPOIFS = new ArrayList<InputStream>(); ArrayList<InputStream> nonPOIFS = new ArrayList<InputStream>();
// Find all the embedded directories // Find all the embedded directories
DirectoryEntry root = ext.getRoot(); DirectoryEntry root = ext.getRoot();
if(root == null) { if (root == null) {
throw new IllegalStateException("The extractor didn't know which POIFS it came from!"); throw new IllegalStateException("The extractor didn't know which POIFS it came from!");
} }
if(ext instanceof ExcelExtractor) { if (ext instanceof ExcelExtractor) {
// These are in MBD... under the root // These are in MBD... under the root
Iterator<Entry> it = root.getEntries(); Iterator<Entry> it = root.getEntries();
while(it.hasNext()) { while (it.hasNext()) {
Entry entry = it.next(); Entry entry = it.next();
if(entry.getName().startsWith("MBD")) { if (entry.getName().startsWith("MBD")) {
dirs.add(entry); dirs.add(entry);
} }
} }
} else if(ext instanceof WordExtractor) { } else if (ext instanceof WordExtractor) {
// These are in ObjectPool -> _... under the root // These are in ObjectPool -> _... under the root
try { try {
DirectoryEntry op = (DirectoryEntry) DirectoryEntry op = (DirectoryEntry) root.getEntry("ObjectPool");
root.getEntry("ObjectPool"); Iterator<Entry> it = op.getEntries();
Iterator<Entry> it = op.getEntries(); while (it.hasNext()) {
while(it.hasNext()) { Entry entry = it.next();
Entry entry = it.next(); if (entry.getName().startsWith("_")) {
if(entry.getName().startsWith("_")) { dirs.add(entry);
dirs.add(entry); }
} }
} } catch (FileNotFoundException e) {
} catch(FileNotFoundException e) {
// ignored here // ignored here
} }
//} else if(ext instanceof PowerPointExtractor) { //} else if(ext instanceof PowerPointExtractor) {
// Tricky, not stored directly in poifs // Tricky, not stored directly in poifs
// TODO // TODO
} else if(ext instanceof OutlookTextExtactor) { } else if (ext instanceof OutlookTextExtactor) {
// Stored in the Attachment blocks // Stored in the Attachment blocks
MAPIMessage msg = ((OutlookTextExtactor)ext).getMAPIMessage(); MAPIMessage msg = ((OutlookTextExtactor)ext).getMAPIMessage();
for(AttachmentChunks attachment : msg.getAttachmentFiles()) { for (AttachmentChunks attachment : msg.getAttachmentFiles()) {
if(attachment.attachData != null) { if (attachment.attachData != null) {
byte[] data = attachment.attachData.getValue(); byte[] data = attachment.attachData.getValue();
nonPOIFS.add( new ByteArrayInputStream(data) ); nonPOIFS.add( new ByteArrayInputStream(data) );
} else if(attachment.attachmentDirectory != null) { } else if (attachment.attachmentDirectory != null) {
dirs.add(attachment.attachmentDirectory.getDirectory()); dirs.add(attachment.attachmentDirectory.getDirectory());
} }
} }
} }
// Create the extractors // Create the extractors
if(dirs.size() == 0 && nonPOIFS.size() == 0){ if (dirs.size() == 0 && nonPOIFS.size() == 0){
return new POITextExtractor[0]; return new POITextExtractor[0];
} }
ArrayList<POITextExtractor> e = new ArrayList<POITextExtractor>(); ArrayList<POITextExtractor> e = new ArrayList<POITextExtractor>();
for (Entry dir : dirs) { for (Entry dir : dirs) {
e.add(createExtractor( e.add(createExtractor((DirectoryNode) dir));
(DirectoryNode) dir
));
} }
for (InputStream nonPOIF : nonPOIFS) { for (InputStream nonPOIF : nonPOIFS) {
try { try {
e.add(createExtractor(nonPOIF)); e.add(createExtractor(nonPOIF));
} catch (IllegalArgumentException ie) { } catch (IllegalArgumentException ie) {
// Ignore, just means it didn't contain // Ignore, just means it didn't contain
// a format we support as yet // a format we support as yet
} catch (XmlException xe) { } catch (XmlException xe) {
throw new IOException(xe.getMessage()); throw new IOException(xe.getMessage());
} catch (OpenXML4JException oe) { } catch (OpenXML4JException oe) {
throw new IOException(oe.getMessage()); throw new IOException(oe.getMessage());
} }
} }
return e.toArray(new POITextExtractor[e.size()]); return e.toArray(new POITextExtractor[e.size()]);
} }
/** /**
* Returns an array of text extractors, one for each of * Returns an array of text extractors, one for each of
* the embedded documents in the file (if there are any). * the embedded documents in the file (if there are any).
* If there are no embedded documents, you'll get back an * If there are no embedded documents, you'll get back an
* empty array. Otherwise, you'll get one open * empty array. Otherwise, you'll get one open
* {@link POITextExtractor} for each embedded file. * {@link POITextExtractor} for each embedded file.
*/ */
public static POITextExtractor[] getEmbededDocsTextExtractors(@SuppressWarnings("UnusedParameters") POIXMLTextExtractor ext) { @SuppressWarnings("UnusedParameters")
throw new IllegalStateException("Not yet supported"); public static POITextExtractor[] getEmbededDocsTextExtractors(POIXMLTextExtractor ext) {
} throw new IllegalStateException("Not yet supported");
}
} }