diff --git a/src/java/org/apache/poi/util/IOUtils.java b/src/java/org/apache/poi/util/IOUtils.java index eef58b30c..befebf444 100644 --- a/src/java/org/apache/poi/util/IOUtils.java +++ b/src/java/org/apache/poi/util/IOUtils.java @@ -79,9 +79,9 @@ public final class IOUtils { ByteArrayOutputStream baos = new ByteArrayOutputStream(length == Integer.MAX_VALUE ? 4096 : length); byte[] buffer = new byte[4096]; - int totalBytes = 0, readBytes = 0; + int totalBytes = 0, readBytes; do { - readBytes = stream.read(buffer, 0, Math.min(buffer.length, length-totalBytes)); + readBytes = stream.read(buffer, 0, Math.min(buffer.length, length-totalBytes)); totalBytes += Math.max(readBytes,0); if (readBytes > 0) { baos.write(buffer, 0, readBytes); @@ -218,6 +218,11 @@ public final class IOUtils { * resource to close */ public static void closeQuietly( final Closeable closeable ) { + // no need to log a NullPointerException here + if(closeable == null) { + return; + } + try { closeable.close(); } catch ( Exception exc ) { diff --git a/src/ooxml/java/org/apache/poi/extractor/ExtractorFactory.java b/src/ooxml/java/org/apache/poi/extractor/ExtractorFactory.java index 3bc5dfaad..38cb5a833 100644 --- a/src/ooxml/java/org/apache/poi/extractor/ExtractorFactory.java +++ b/src/ooxml/java/org/apache/poi/extractor/ExtractorFactory.java @@ -30,7 +30,6 @@ import java.util.Iterator; import org.apache.poi.POIOLE2TextExtractor; import org.apache.poi.POITextExtractor; -import org.apache.poi.POIXMLDocument; import org.apache.poi.POIXMLTextExtractor; import org.apache.poi.hdgf.extractor.VisioTextExtractor; import org.apache.poi.hpbf.extractor.PublisherTextExtractor; @@ -52,6 +51,7 @@ import org.apache.poi.openxml4j.opc.PackagePart; import org.apache.poi.openxml4j.opc.PackageRelationshipCollection; import org.apache.poi.openxml4j.opc.PackageRelationshipTypes; import org.apache.poi.poifs.filesystem.*; +import org.apache.poi.util.IOUtils; import org.apache.poi.xdgf.extractor.XDGFVisioExtractor; import org.apache.poi.xslf.extractor.XSLFPowerPointExtractor; import org.apache.poi.xslf.usermodel.XSLFRelation; @@ -67,6 +67,7 @@ import org.apache.xmlbeans.XmlException; * Figures out the correct POITextExtractor for your supplied * document, and returns it. */ +@SuppressWarnings("WeakerAccess") public class ExtractorFactory { public static final String CORE_DOCUMENT_REL = PackageRelationshipTypes.CORE_DOCUMENT; protected static final String VISIO_DOCUMENT_REL = PackageRelationshipTypes.VISIO_CORE_DOCUMENT; @@ -136,39 +137,33 @@ public class ExtractorFactory { return extractor; } catch (OfficeXmlFileException e) { // ensure file-handle release - if(fs != null) { - fs.close(); - } + IOUtils.closeQuietly(fs); + return createExtractor(OPCPackage.open(f.toString(), PackageAccess.READ)); } catch (NotOLE2FileException ne) { // ensure file-handle release - if(fs != null) { - fs.close(); - } + IOUtils.closeQuietly(fs); + throw new IllegalArgumentException("Your File was neither an OLE2 file, nor an OOXML file"); } catch (OpenXML4JException e) { // ensure file-handle release - if(fs != null) { - fs.close(); - } + IOUtils.closeQuietly(fs); + throw e; } catch (XmlException e) { // ensure file-handle release - if(fs != null) { - fs.close(); - } + IOUtils.closeQuietly(fs); + throw e; } catch (IOException e) { // ensure file-handle release - if(fs != null) { - fs.close(); - } + IOUtils.closeQuietly(fs); + throw e; } catch (RuntimeException e) { // ensure file-handle release - if(fs != null) { - fs.close(); - } + IOUtils.closeQuietly(fs); + throw e; } } @@ -280,21 +275,21 @@ public class ExtractorFactory { } } - public static POIOLE2TextExtractor createExtractor(POIFSFileSystem fs) throws IOException, InvalidFormatException, OpenXML4JException, XmlException { + public static POIOLE2TextExtractor createExtractor(POIFSFileSystem fs) throws IOException, OpenXML4JException, XmlException { // Only ever an OLE2 one from the root of the FS return (POIOLE2TextExtractor)createExtractor(fs.getRoot()); } - public static POIOLE2TextExtractor createExtractor(NPOIFSFileSystem fs) throws IOException, InvalidFormatException, OpenXML4JException, XmlException { + public static POIOLE2TextExtractor createExtractor(NPOIFSFileSystem fs) throws IOException, OpenXML4JException, XmlException { // Only ever an OLE2 one from the root of the FS return (POIOLE2TextExtractor)createExtractor(fs.getRoot()); } - public static POIOLE2TextExtractor createExtractor(OPOIFSFileSystem fs) throws IOException, InvalidFormatException, OpenXML4JException, XmlException { + public static POIOLE2TextExtractor createExtractor(OPOIFSFileSystem fs) throws IOException, OpenXML4JException, XmlException { // Only ever an OLE2 one from the root of the FS return (POIOLE2TextExtractor)createExtractor(fs.getRoot()); } public static POITextExtractor createExtractor(DirectoryNode poifsDir) throws IOException, - InvalidFormatException, OpenXML4JException, XmlException + OpenXML4JException, XmlException { // Look for certain entries in the stream, to figure it // out from @@ -359,7 +354,7 @@ public class ExtractorFactory { * empty array. Otherwise, you'll get one open * {@link POITextExtractor} for each embedded file. */ - public static POITextExtractor[] getEmbededDocsTextExtractors(POIOLE2TextExtractor ext) throws IOException, InvalidFormatException, OpenXML4JException, XmlException { + public static POITextExtractor[] getEmbededDocsTextExtractors(POIOLE2TextExtractor ext) throws IOException, OpenXML4JException, XmlException { // All the embded directories we spotted ArrayList dirs = new ArrayList(); // For anything else not directly held in as a POIFS directory @@ -392,7 +387,9 @@ public class ExtractorFactory { dirs.add(entry); } } - } catch(FileNotFoundException e) {} + } catch(FileNotFoundException e) { + // ignored here + } } else if(ext instanceof PowerPointExtractor) { // Tricky, not stored directly in poifs // TODO @@ -415,23 +412,23 @@ public class ExtractorFactory { } ArrayList e = new ArrayList(); - for(int i=0; i 200 + ); + + // Word + assertTrue( + ExtractorFactory.createExtractor(new OPOIFSFileSystem(new FileInputStream(doc))) + instanceof WordExtractor + ); + assertTrue( + ExtractorFactory.createExtractor(new OPOIFSFileSystem(new FileInputStream(doc))).getText().length() > 120 + ); + + assertTrue( + ExtractorFactory.createExtractor(new OPOIFSFileSystem(new FileInputStream(doc6))) + instanceof Word6Extractor + ); + assertTrue( + ExtractorFactory.createExtractor(new OPOIFSFileSystem(new FileInputStream(doc6))).getText().length() > 20 + ); + + assertTrue( + ExtractorFactory.createExtractor(new OPOIFSFileSystem(new FileInputStream(doc95))) + instanceof Word6Extractor + ); + assertTrue( + ExtractorFactory.createExtractor(new OPOIFSFileSystem(new FileInputStream(doc95))).getText().length() > 120 + ); + + // PowerPoint + assertTrue( + ExtractorFactory.createExtractor(new OPOIFSFileSystem(new FileInputStream(ppt))) + instanceof PowerPointExtractor + ); + assertTrue( + ExtractorFactory.createExtractor(new OPOIFSFileSystem(new FileInputStream(ppt))).getText().length() > 120 + ); + + // Visio + assertTrue( + ExtractorFactory.createExtractor(new OPOIFSFileSystem(new FileInputStream(vsd))) + instanceof VisioTextExtractor + ); + assertTrue( + ExtractorFactory.createExtractor(new OPOIFSFileSystem(new FileInputStream(vsd))).getText().length() > 50 + ); + + // Publisher + assertTrue( + ExtractorFactory.createExtractor(new OPOIFSFileSystem(new FileInputStream(pub))) + instanceof PublisherTextExtractor + ); + assertTrue( + ExtractorFactory.createExtractor(new OPOIFSFileSystem(new FileInputStream(pub))).getText().length() > 50 + ); + + // Outlook msg + assertTrue( + ExtractorFactory.createExtractor(new OPOIFSFileSystem(new FileInputStream(msg))) + instanceof OutlookTextExtactor + ); + assertTrue( + ExtractorFactory.createExtractor(new OPOIFSFileSystem(new FileInputStream(msg))).getText().length() > 50 + ); + + // Text + try { + ExtractorFactory.createExtractor(new OPOIFSFileSystem(new FileInputStream(txt))); + fail(); + } catch(IOException e) { + // Good + } + } + @Test public void testPackage() throws Exception { // Excel @@ -721,13 +805,13 @@ public class TestExtractorFactory { assertEquals(6, embeds.length); int numWord = 0, numXls = 0, numPpt = 0, numMsg = 0, numWordX; - for(int i=0; i 20); + for (POITextExtractor embed : embeds) { + assertTrue(embed.getText().length() > 20); - if(embeds[i] instanceof PowerPointExtractor) numPpt++; - else if(embeds[i] instanceof ExcelExtractor) numXls++; - else if(embeds[i] instanceof WordExtractor) numWord++; - else if(embeds[i] instanceof OutlookTextExtactor) numMsg++; + if (embed instanceof PowerPointExtractor) numPpt++; + else if (embed instanceof ExcelExtractor) numXls++; + else if (embed instanceof WordExtractor) numWord++; + else if (embed instanceof OutlookTextExtactor) numMsg++; } assertEquals(2, numPpt); assertEquals(2, numXls); @@ -742,12 +826,12 @@ public class TestExtractorFactory { numWord = 0; numXls = 0; numPpt = 0; numMsg = 0; assertEquals(4, embeds.length); - for(int i=0; i 20); - if(embeds[i] instanceof PowerPointExtractor) numPpt++; - else if(embeds[i] instanceof ExcelExtractor) numXls++; - else if(embeds[i] instanceof WordExtractor) numWord++; - else if(embeds[i] instanceof OutlookTextExtactor) numMsg++; + for (POITextExtractor embed : embeds) { + assertTrue(embed.getText().length() > 20); + if (embed instanceof PowerPointExtractor) numPpt++; + else if (embed instanceof ExcelExtractor) numXls++; + else if (embed instanceof WordExtractor) numWord++; + else if (embed instanceof OutlookTextExtactor) numMsg++; } assertEquals(1, numPpt); assertEquals(2, numXls); @@ -762,13 +846,13 @@ public class TestExtractorFactory { numWord = 0; numXls = 0; numPpt = 0; numMsg = 0; numWordX = 0; assertEquals(3, embeds.length); - for(int i=0; i 20); - if(embeds[i] instanceof PowerPointExtractor) numPpt++; - else if(embeds[i] instanceof ExcelExtractor) numXls++; - else if(embeds[i] instanceof WordExtractor) numWord++; - else if(embeds[i] instanceof OutlookTextExtactor) numMsg++; - else if(embeds[i] instanceof XWPFWordExtractor) numWordX++; + for (POITextExtractor embed : embeds) { + assertTrue(embed.getText().length() > 20); + if (embed instanceof PowerPointExtractor) numPpt++; + else if (embed instanceof ExcelExtractor) numXls++; + else if (embed instanceof WordExtractor) numWord++; + else if (embed instanceof OutlookTextExtactor) numMsg++; + else if (embed instanceof XWPFWordExtractor) numWordX++; } assertEquals(1, numPpt); assertEquals(1, numXls); @@ -784,12 +868,12 @@ public class TestExtractorFactory { numWord = 0; numXls = 0; numPpt = 0; numMsg = 0; assertEquals(1, embeds.length); - for(int i=0; i 20); - if(embeds[i] instanceof PowerPointExtractor) numPpt++; - else if(embeds[i] instanceof ExcelExtractor) numXls++; - else if(embeds[i] instanceof WordExtractor) numWord++; - else if(embeds[i] instanceof OutlookTextExtactor) numMsg++; + for (POITextExtractor embed : embeds) { + assertTrue(embed.getText().length() > 20); + if (embed instanceof PowerPointExtractor) numPpt++; + else if (embed instanceof ExcelExtractor) numXls++; + else if (embed instanceof WordExtractor) numWord++; + else if (embed instanceof OutlookTextExtactor) numMsg++; } assertEquals(0, numPpt); assertEquals(0, numXls); @@ -804,12 +888,12 @@ public class TestExtractorFactory { numWord = 0; numXls = 0; numPpt = 0; numMsg = 0; assertEquals(1, embeds.length); - for(int i=0; i 20); - if(embeds[i] instanceof PowerPointExtractor) numPpt++; - else if(embeds[i] instanceof ExcelExtractor) numXls++; - else if(embeds[i] instanceof WordExtractor) numWord++; - else if(embeds[i] instanceof OutlookTextExtactor) numMsg++; + for (POITextExtractor embed : embeds) { + assertTrue(embed.getText().length() > 20); + if (embed instanceof PowerPointExtractor) numPpt++; + else if (embed instanceof ExcelExtractor) numXls++; + else if (embed instanceof WordExtractor) numWord++; + else if (embed instanceof OutlookTextExtactor) numMsg++; } assertEquals(0, numPpt); assertEquals(0, numXls); @@ -923,11 +1007,24 @@ public class TestExtractorFactory { * "No supported documents found in the OLE2 stream" */ @Test - public void a() throws Exception { + public void bug59074() throws Exception { try { ExtractorFactory.createExtractor( POIDataSamples.getSpreadSheetInstance().getFile("59074.xls")); fail("Old excel formats not supported via ExtractorFactory"); - } catch (OldExcelFormatException e) {} + } catch (OldExcelFormatException e) { + // expected here + } + } + + @Test + public void testGetEmbeddedFromXMLExtractor() { + try { + // currently not implemented + ExtractorFactory.getEmbededDocsTextExtractors((POIXMLTextExtractor)null); + fail("Unsupported currently"); + } catch (IllegalStateException e) { + // expected here + } } }