diff --git a/src/ooxml/java/org/apache/poi/extractor/ExtractorFactory.java b/src/ooxml/java/org/apache/poi/extractor/ExtractorFactory.java index 05cad8af9..6cffd39fd 100644 --- a/src/ooxml/java/org/apache/poi/extractor/ExtractorFactory.java +++ b/src/ooxml/java/org/apache/poi/extractor/ExtractorFactory.java @@ -72,180 +72,180 @@ import org.apache.xmlbeans.XmlException; */ @SuppressWarnings("WeakerAccess") public class ExtractorFactory { - public static final String CORE_DOCUMENT_REL = PackageRelationshipTypes.CORE_DOCUMENT; - protected static final String VISIO_DOCUMENT_REL = PackageRelationshipTypes.VISIO_CORE_DOCUMENT; - protected static final String STRICT_DOCUMENT_REL = PackageRelationshipTypes.STRICT_CORE_DOCUMENT; + public static final String CORE_DOCUMENT_REL = PackageRelationshipTypes.CORE_DOCUMENT; + protected static final String VISIO_DOCUMENT_REL = PackageRelationshipTypes.VISIO_CORE_DOCUMENT; + protected static final String STRICT_DOCUMENT_REL = PackageRelationshipTypes.STRICT_CORE_DOCUMENT; - /** - * Should this thread prefer event based over usermodel based extractors? - * (usermodel extractors tend to be more accurate, but use more memory) - * Default is false. - */ - public static boolean getThreadPrefersEventExtractors() { - return OLE2ExtractorFactory.getThreadPrefersEventExtractors(); - } + /** + * Should this thread prefer event based over usermodel based extractors? + * (usermodel extractors tend to be more accurate, but use more memory) + * Default is false. + */ + public static boolean getThreadPrefersEventExtractors() { + return OLE2ExtractorFactory.getThreadPrefersEventExtractors(); + } - /** - * Should all threads prefer event based over usermodel based extractors? - * (usermodel extractors tend to be more accurate, but use more memory) - * Default is to use the thread level setting, which defaults to false. - */ - public static Boolean getAllThreadsPreferEventExtractors() { - return OLE2ExtractorFactory.getAllThreadsPreferEventExtractors(); - } + /** + * Should all threads prefer event based over usermodel based extractors? + * (usermodel extractors tend to be more accurate, but use more memory) + * Default is to use the thread level setting, which defaults to false. + */ + public static Boolean getAllThreadsPreferEventExtractors() { + return OLE2ExtractorFactory.getAllThreadsPreferEventExtractors(); + } - /** - * Should this thread prefer event based over usermodel based extractors? - * Will only be used if the All Threads setting is null. - */ - public static void setThreadPrefersEventExtractors(boolean preferEventExtractors) { - OLE2ExtractorFactory.setThreadPrefersEventExtractors(preferEventExtractors); - } + /** + * Should this thread prefer event based over usermodel based extractors? + * Will only be used if the All Threads setting is null. + */ + public static void setThreadPrefersEventExtractors(boolean preferEventExtractors) { + OLE2ExtractorFactory.setThreadPrefersEventExtractors(preferEventExtractors); + } - /** - * Should all threads prefer event based over usermodel based extractors? - * If set, will take preference over the Thread level setting. - */ - public static void setAllThreadsPreferEventExtractors(Boolean preferEventExtractors) { - OLE2ExtractorFactory.setAllThreadsPreferEventExtractors(preferEventExtractors); - } + /** + * Should all threads prefer event based over usermodel based extractors? + * If set, will take preference over the Thread level setting. + */ + public static void setAllThreadsPreferEventExtractors(Boolean preferEventExtractors) { + OLE2ExtractorFactory.setAllThreadsPreferEventExtractors(preferEventExtractors); + } - /** - * Should this thread use event based extractors is available? - * Checks the all-threads one first, then thread specific. - */ - protected static boolean getPreferEventExtractor() { - return OLE2ExtractorFactory.getPreferEventExtractor(); - } + /** + * Should this thread use event based extractors is available? + * Checks the all-threads one first, then thread specific. + */ + protected static boolean getPreferEventExtractor() { + return OLE2ExtractorFactory.getPreferEventExtractor(); + } - public static POITextExtractor createExtractor(File f) throws IOException, InvalidFormatException, OpenXML4JException, XmlException { - NPOIFSFileSystem fs = null; + public static POITextExtractor createExtractor(File f) throws IOException, InvalidFormatException, OpenXML4JException, XmlException { + NPOIFSFileSystem fs = null; try { fs = new NPOIFSFileSystem(f); POIOLE2TextExtractor extractor = createExtractor(fs); extractor.setFilesystem(fs); return extractor; + } catch (OfficeXmlFileException e) { // ensure file-handle release - IOUtils.closeQuietly(fs); - + IOUtils.closeQuietly(fs); return createExtractor(OPCPackage.open(f.toString(), PackageAccess.READ)); + } catch (NotOLE2FileException ne) { // ensure file-handle release - IOUtils.closeQuietly(fs); - + IOUtils.closeQuietly(fs); throw new IllegalArgumentException("Your File was neither an OLE2 file, nor an OOXML file"); - } catch (OpenXML4JException e) { - // ensure file-handle release - IOUtils.closeQuietly(fs); - throw e; - } catch (XmlException e) { - // ensure file-handle release - IOUtils.closeQuietly(fs); + } catch (OpenXML4JException e) { + // ensure file-handle release + IOUtils.closeQuietly(fs); + throw e; - throw e; - } catch (IOException e) { - // ensure file-handle release - IOUtils.closeQuietly(fs); + } catch (XmlException e) { + // ensure file-handle release + IOUtils.closeQuietly(fs); + throw e; + + } catch (IOException e) { + // ensure file-handle release + IOUtils.closeQuietly(fs); + throw e; - throw e; } catch (RuntimeException e) { - // ensure file-handle release - IOUtils.closeQuietly(fs); + // ensure file-handle release + IOUtils.closeQuietly(fs); + throw e; + } + } - throw e; - } + public static POITextExtractor createExtractor(InputStream inp) throws IOException, InvalidFormatException, OpenXML4JException, XmlException { + // Figure out the kind of stream + // If clearly doesn't do mark/reset, wrap up + if (! inp.markSupported()) { + inp = new PushbackInputStream(inp, 8); + } + + if (NPOIFSFileSystem.hasPOIFSHeader(inp)) { + return createExtractor(new NPOIFSFileSystem(inp)); + } + if (DocumentFactoryHelper.hasOOXMLHeader(inp)) { + return createExtractor(OPCPackage.open(inp)); + } + throw new IllegalArgumentException("Your InputStream was neither an OLE2 stream, nor an OOXML stream"); } - public static POITextExtractor createExtractor(InputStream inp) throws IOException, InvalidFormatException, OpenXML4JException, XmlException { - // Figure out the kind of stream - // If clearly doesn't do mark/reset, wrap up - if(! inp.markSupported()) { - inp = new PushbackInputStream(inp, 8); - } - - if(NPOIFSFileSystem.hasPOIFSHeader(inp)) { - return createExtractor(new NPOIFSFileSystem(inp)); - } - if(DocumentFactoryHelper.hasOOXMLHeader(inp)) { - return createExtractor(OPCPackage.open(inp)); - } - throw new IllegalArgumentException("Your InputStream was neither an OLE2 stream, nor an OOXML stream"); - } - - /** - * Tries to determine the actual type of file and produces a matching text-extractor for it. - * - * @param pkg An {@link OPCPackage}. - * @return A {@link POIXMLTextExtractor} for the given file. - * @throws IOException If an error occurs while reading the file - * @throws OpenXML4JException If an error parsing the OpenXML file format is found. - * @throws XmlException If an XML parsing error occurs. - * @throws IllegalArgumentException If no matching file type could be found. - */ - public static POIXMLTextExtractor createExtractor(OPCPackage pkg) throws IOException, OpenXML4JException, XmlException { + /** + * Tries to determine the actual type of file and produces a matching text-extractor for it. + * + * @param pkg An {@link OPCPackage}. + * @return A {@link POIXMLTextExtractor} for the given file. + * @throws IOException If an error occurs while reading the file + * @throws OpenXML4JException If an error parsing the OpenXML file format is found. + * @throws XmlException If an XML parsing error occurs. + * @throws IllegalArgumentException If no matching file type could be found. + */ + public static POIXMLTextExtractor createExtractor(OPCPackage pkg) throws IOException, OpenXML4JException, XmlException { try { - // Check for the normal Office core document - PackageRelationshipCollection core = - pkg.getRelationshipsByType(CORE_DOCUMENT_REL); - - // If nothing was found, try some of the other OOXML-based core types - if (core.size() == 0) { - // Could it be an OOXML-Strict one? - core = pkg.getRelationshipsByType(STRICT_DOCUMENT_REL); - } - if (core.size() == 0) { - // Could it be a visio one? - core = pkg.getRelationshipsByType(VISIO_DOCUMENT_REL); - if (core.size() == 1) - return new XDGFVisioExtractor(pkg); - } - - // Should just be a single core document, complain if not - if (core.size() != 1) { - throw new IllegalArgumentException("Invalid OOXML Package received - expected 1 core document, found " + core.size()); - } - - // Grab the core document part, and try to identify from that - PackagePart corePart = pkg.getPart(core.getRelationship(0)); - - // Is it XSSF? - for(XSSFRelation rel : XSSFExcelExtractor.SUPPORTED_TYPES) { - if(corePart.getContentType().equals(rel.getContentType())) { - if(getPreferEventExtractor()) { - return new XSSFEventBasedExcelExtractor(pkg); - } - - return new XSSFExcelExtractor(pkg); - } - } - - // Is it XWPF? - for(XWPFRelation rel : XWPFWordExtractor.SUPPORTED_TYPES) { - if(corePart.getContentType().equals(rel.getContentType())) { - return new XWPFWordExtractor(pkg); - } - } - - // Is it XSLF? - for(XSLFRelation rel : XSLFPowerPointExtractor.SUPPORTED_TYPES) { - if(corePart.getContentType().equals(rel.getContentType())) { - return new XSLFPowerPointExtractor(pkg); - } - } - - // special handling for SlideShow-Theme-files, - if(XSLFRelation.THEME_MANAGER.getContentType().equals(corePart.getContentType())) { - return new XSLFPowerPointExtractor(new XSLFSlideShow(pkg)); - } - - throw new IllegalArgumentException("No supported documents found in the OOXML package (found "+corePart.getContentType()+")"); - } catch (IOException e) { - // ensure that we close the package again if there is an error opening it, however - // we need to revert the package to not re-write the file via close(), which is very likely not wanted for a TextExtractor! - pkg.revert(); - throw e; + // Check for the normal Office core document + PackageRelationshipCollection core; + core = pkg.getRelationshipsByType(CORE_DOCUMENT_REL); + + // If nothing was found, try some of the other OOXML-based core types + if (core.size() == 0) { + // Could it be an OOXML-Strict one? + core = pkg.getRelationshipsByType(STRICT_DOCUMENT_REL); + } + if (core.size() == 0) { + // Could it be a visio one? + core = pkg.getRelationshipsByType(VISIO_DOCUMENT_REL); + if (core.size() == 1) + return new XDGFVisioExtractor(pkg); + } + + // Should just be a single core document, complain if not + if (core.size() != 1) { + throw new IllegalArgumentException("Invalid OOXML Package received - expected 1 core document, found " + core.size()); + } + + // Grab the core document part, and try to identify from that + PackagePart corePart = pkg.getPart(core.getRelationship(0)); + + // Is it XSSF? + for (XSSFRelation rel : XSSFExcelExtractor.SUPPORTED_TYPES) { + if (corePart.getContentType().equals(rel.getContentType())) { + if (getPreferEventExtractor()) { + return new XSSFEventBasedExcelExtractor(pkg); + } + return new XSSFExcelExtractor(pkg); + } + } + + // Is it XWPF? + for (XWPFRelation rel : XWPFWordExtractor.SUPPORTED_TYPES) { + if (corePart.getContentType().equals(rel.getContentType())) { + return new XWPFWordExtractor(pkg); + } + } + + // Is it XSLF? + for (XSLFRelation rel : XSLFPowerPointExtractor.SUPPORTED_TYPES) { + if (corePart.getContentType().equals(rel.getContentType())) { + return new XSLFPowerPointExtractor(pkg); + } + } + + // special handling for SlideShow-Theme-files, + if (XSLFRelation.THEME_MANAGER.getContentType().equals(corePart.getContentType())) { + return new XSLFPowerPointExtractor(new XSLFSlideShow(pkg)); + } + + throw new IllegalArgumentException("No supported documents found in the OOXML package (found "+corePart.getContentType()+")"); + + } catch (IOException e) { + // ensure that we close the package again if there is an error opening it, however + // we need to revert the package to not re-write the file via close(), which is very likely not wanted for a TextExtractor! + pkg.revert(); + throw e; } catch (OpenXML4JException e) { // ensure that we close the package again if there is an error opening it, however // we need to revert the package to not re-write the file via close(), which is very likely not wanted for a TextExtractor! @@ -256,27 +256,25 @@ public class ExtractorFactory { // we need to revert the package to not re-write the file via close(), which is very likely not wanted for a TextExtractor! pkg.revert(); throw e; - } catch (RuntimeException e) { - // ensure that we close the package again if there is an error opening it, however - // we need to revert the package to not re-write the file via close(), which is very likely not wanted for a TextExtractor! - pkg.revert(); - - throw e; - } - } + } catch (RuntimeException e) { + // ensure that we close the package again if there is an error opening it, however + // we need to revert the package to not re-write the file via close(), which is very likely not wanted for a TextExtractor! + pkg.revert(); + throw e; + } + } - public static POIOLE2TextExtractor createExtractor(POIFSFileSystem fs) throws IOException, OpenXML4JException, XmlException { - return OLE2ExtractorFactory.createExtractor(fs); - } + public static POIOLE2TextExtractor createExtractor(POIFSFileSystem fs) throws IOException, OpenXML4JException, XmlException { + return OLE2ExtractorFactory.createExtractor(fs); + } public static POIOLE2TextExtractor createExtractor(NPOIFSFileSystem fs) throws IOException, OpenXML4JException, XmlException { return OLE2ExtractorFactory.createExtractor(fs); - } + } public static POIOLE2TextExtractor createExtractor(OPOIFSFileSystem fs) throws IOException, OpenXML4JException, XmlException { return OLE2ExtractorFactory.createExtractor(fs); - } + } - public static POITextExtractor createExtractor(DirectoryNode poifsDir) throws IOException, - OpenXML4JException, XmlException + public static POITextExtractor createExtractor(DirectoryNode poifsDir) throws IOException, OpenXML4JException, XmlException { // First, check for OOXML for (String entryName : poifsDir.getEntryNames()) { @@ -285,104 +283,102 @@ public class ExtractorFactory { return createExtractor(pkg); } } - + // If not, ask the OLE2 code to check, with Scratchpad if possible return OLE2ExtractorFactory.createExtractor(poifsDir); } - /** - * Returns an array of text extractors, one for each of - * the embedded documents in the file (if there are any). - * If there are no embedded documents, you'll get back an - * empty array. Otherwise, you'll get one open - * {@link POITextExtractor} for each embedded file. - */ - public static POITextExtractor[] getEmbededDocsTextExtractors(POIOLE2TextExtractor ext) throws IOException, OpenXML4JException, XmlException { - // All the embedded directories we spotted - ArrayList dirs = new ArrayList(); - // For anything else not directly held in as a POIFS directory - ArrayList nonPOIFS = new ArrayList(); + /** + * Returns an array of text extractors, one for each of + * the embedded documents in the file (if there are any). + * If there are no embedded documents, you'll get back an + * empty array. Otherwise, you'll get one open + * {@link POITextExtractor} for each embedded file. + */ + public static POITextExtractor[] getEmbededDocsTextExtractors(POIOLE2TextExtractor ext) throws IOException, OpenXML4JException, XmlException { + // All the embedded directories we spotted + ArrayList dirs = new ArrayList(); + // For anything else not directly held in as a POIFS directory + ArrayList nonPOIFS = new ArrayList(); - // Find all the embedded directories - DirectoryEntry root = ext.getRoot(); - if(root == null) { - throw new IllegalStateException("The extractor didn't know which POIFS it came from!"); - } + // Find all the embedded directories + DirectoryEntry root = ext.getRoot(); + if (root == null) { + throw new IllegalStateException("The extractor didn't know which POIFS it came from!"); + } - if(ext instanceof ExcelExtractor) { - // These are in MBD... under the root - Iterator it = root.getEntries(); - while(it.hasNext()) { - Entry entry = it.next(); - if(entry.getName().startsWith("MBD")) { - dirs.add(entry); - } - } - } else if(ext instanceof WordExtractor) { - // These are in ObjectPool -> _... under the root - try { - DirectoryEntry op = (DirectoryEntry) - root.getEntry("ObjectPool"); - Iterator it = op.getEntries(); - while(it.hasNext()) { - Entry entry = it.next(); - if(entry.getName().startsWith("_")) { - dirs.add(entry); - } - } - } catch(FileNotFoundException e) { + if (ext instanceof ExcelExtractor) { + // These are in MBD... under the root + Iterator it = root.getEntries(); + while (it.hasNext()) { + Entry entry = it.next(); + if (entry.getName().startsWith("MBD")) { + dirs.add(entry); + } + } + } else if (ext instanceof WordExtractor) { + // These are in ObjectPool -> _... under the root + try { + DirectoryEntry op = (DirectoryEntry) root.getEntry("ObjectPool"); + Iterator it = op.getEntries(); + while (it.hasNext()) { + Entry entry = it.next(); + if (entry.getName().startsWith("_")) { + dirs.add(entry); + } + } + } catch (FileNotFoundException e) { // ignored here } - //} else if(ext instanceof PowerPointExtractor) { - // Tricky, not stored directly in poifs - // TODO - } else if(ext instanceof OutlookTextExtactor) { - // Stored in the Attachment blocks - MAPIMessage msg = ((OutlookTextExtactor)ext).getMAPIMessage(); - for(AttachmentChunks attachment : msg.getAttachmentFiles()) { - if(attachment.attachData != null) { - byte[] data = attachment.attachData.getValue(); - nonPOIFS.add( new ByteArrayInputStream(data) ); - } else if(attachment.attachmentDirectory != null) { - dirs.add(attachment.attachmentDirectory.getDirectory()); - } - } - } + //} else if(ext instanceof PowerPointExtractor) { + // Tricky, not stored directly in poifs + // TODO + } else if (ext instanceof OutlookTextExtactor) { + // Stored in the Attachment blocks + MAPIMessage msg = ((OutlookTextExtactor)ext).getMAPIMessage(); + for (AttachmentChunks attachment : msg.getAttachmentFiles()) { + if (attachment.attachData != null) { + byte[] data = attachment.attachData.getValue(); + nonPOIFS.add( new ByteArrayInputStream(data) ); + } else if (attachment.attachmentDirectory != null) { + dirs.add(attachment.attachmentDirectory.getDirectory()); + } + } + } - // Create the extractors - if(dirs.size() == 0 && nonPOIFS.size() == 0){ - return new POITextExtractor[0]; - } + // Create the extractors + if (dirs.size() == 0 && nonPOIFS.size() == 0){ + return new POITextExtractor[0]; + } - ArrayList e = new ArrayList(); + ArrayList e = new ArrayList(); for (Entry dir : dirs) { - e.add(createExtractor( - (DirectoryNode) dir - )); + e.add(createExtractor((DirectoryNode) dir)); } for (InputStream nonPOIF : nonPOIFS) { try { - e.add(createExtractor(nonPOIF)); + e.add(createExtractor(nonPOIF)); } catch (IllegalArgumentException ie) { // Ignore, just means it didn't contain // a format we support as yet } catch (XmlException xe) { - throw new IOException(xe.getMessage()); + throw new IOException(xe.getMessage()); } catch (OpenXML4JException oe) { - throw new IOException(oe.getMessage()); + throw new IOException(oe.getMessage()); } } - return e.toArray(new POITextExtractor[e.size()]); - } + return e.toArray(new POITextExtractor[e.size()]); + } - /** - * Returns an array of text extractors, one for each of - * the embedded documents in the file (if there are any). - * If there are no embedded documents, you'll get back an - * empty array. Otherwise, you'll get one open - * {@link POITextExtractor} for each embedded file. - */ - public static POITextExtractor[] getEmbededDocsTextExtractors(@SuppressWarnings("UnusedParameters") POIXMLTextExtractor ext) { - throw new IllegalStateException("Not yet supported"); - } + /** + * Returns an array of text extractors, one for each of + * the embedded documents in the file (if there are any). + * If there are no embedded documents, you'll get back an + * empty array. Otherwise, you'll get one open + * {@link POITextExtractor} for each embedded file. + */ + @SuppressWarnings("UnusedParameters") + public static POITextExtractor[] getEmbededDocsTextExtractors(POIXMLTextExtractor ext) { + throw new IllegalStateException("Not yet supported"); + } }