diff --git a/src/documentation/content/xdocs/status.xml b/src/documentation/content/xdocs/status.xml index 2925ace08..5ee7b31f2 100644 --- a/src/documentation/content/xdocs/status.xml +++ b/src/documentation/content/xdocs/status.xml @@ -34,6 +34,7 @@ + Add PublisherTextExtractor support to ExtractorFactory Add XSLF support for text extraction from tables Support attachments as embeded documents within the new OutlookTextExtractor Add a text extractor (OutlookTextExtractor) to HSMF for simpler extraction of text from .msg files diff --git a/src/ooxml/java/org/apache/poi/extractor/ExtractorFactory.java b/src/ooxml/java/org/apache/poi/extractor/ExtractorFactory.java index 55d8499f5..203c59657 100644 --- a/src/ooxml/java/org/apache/poi/extractor/ExtractorFactory.java +++ b/src/ooxml/java/org/apache/poi/extractor/ExtractorFactory.java @@ -31,6 +31,7 @@ import org.apache.poi.POITextExtractor; import org.apache.poi.POIXMLDocument; import org.apache.poi.POIXMLTextExtractor; import org.apache.poi.hdgf.extractor.VisioTextExtractor; +import org.apache.poi.hpbf.extractor.PublisherTextExtractor; import org.apache.poi.hslf.extractor.PowerPointExtractor; import org.apache.poi.hsmf.MAPIMessage; import org.apache.poi.hsmf.datatypes.AttachmentChunks; @@ -142,6 +143,9 @@ public class ExtractorFactory { if(entry.getName().equals("VisioDocument")) { return new VisioTextExtractor(poifsDir, fs); } + if(entry.getName().equals("Quill")) { + return new PublisherTextExtractor(poifsDir, fs); + } if( entry.getName().equals("__substg1.0_1000001E") || entry.getName().equals("__substg1.0_1000001F") || diff --git a/src/ooxml/testcases/org/apache/poi/extractor/TestExtractorFactory.java b/src/ooxml/testcases/org/apache/poi/extractor/TestExtractorFactory.java index d327c5582..c127427a2 100644 --- a/src/ooxml/testcases/org/apache/poi/extractor/TestExtractorFactory.java +++ b/src/ooxml/testcases/org/apache/poi/extractor/TestExtractorFactory.java @@ -24,6 +24,7 @@ import org.apache.poi.POIOLE2TextExtractor; import org.apache.poi.POITextExtractor; import org.apache.poi.POIDataSamples; import org.apache.poi.hdgf.extractor.VisioTextExtractor; +import org.apache.poi.hpbf.extractor.PublisherTextExtractor; import org.apache.poi.hslf.extractor.PowerPointExtractor; import org.apache.poi.hsmf.extractor.OutlookTextExtactor; import org.apache.poi.hssf.extractor.ExcelExtractor; @@ -62,6 +63,8 @@ public class TestExtractorFactory extends TestCase { private File msgEmb; private File vsd; + + private File pub; protected void setUp() throws Exception { super.setUp(); @@ -86,6 +89,9 @@ public class TestExtractorFactory extends TestCase { POIDataSamples dgTests = POIDataSamples.getDiagramInstance(); vsd = dgTests.getFile("Test_Visio-Some_Random_Text.vsd"); + POIDataSamples pubTests = POIDataSamples.getPublisherInstance(); + pub = pubTests.getFile("Simple.pub"); + POIDataSamples olTests = POIDataSamples.getHSMFInstance(); msg = olTests.getFile("quick.msg"); msgEmb = olTests.getFile("attachment_test_msg.msg"); @@ -169,6 +175,15 @@ public class TestExtractorFactory extends TestCase { ExtractorFactory.createExtractor(vsd).getText().length() > 50 ); + // Publisher + assertTrue( + ExtractorFactory.createExtractor(pub) + instanceof PublisherTextExtractor + ); + assertTrue( + ExtractorFactory.createExtractor(pub).getText().length() > 50 + ); + // Outlook msg assertTrue( ExtractorFactory.createExtractor(msg) @@ -248,6 +263,15 @@ public class TestExtractorFactory extends TestCase { ExtractorFactory.createExtractor(new FileInputStream(vsd)).getText().length() > 50 ); + // Publisher + assertTrue( + ExtractorFactory.createExtractor(new FileInputStream(pub)) + instanceof PublisherTextExtractor + ); + assertTrue( + ExtractorFactory.createExtractor(new FileInputStream(pub)).getText().length() > 50 + ); + // Outlook msg assertTrue( ExtractorFactory.createExtractor(new FileInputStream(msg)) @@ -302,6 +326,15 @@ public class TestExtractorFactory extends TestCase { assertTrue( ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(vsd))).getText().length() > 50 ); + + // Publisher + assertTrue( + ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(pub))) + instanceof PublisherTextExtractor + ); + assertTrue( + ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(pub))).getText().length() > 50 + ); // Outlook msg assertTrue( @@ -426,6 +459,7 @@ public class TestExtractorFactory extends TestCase { assertEquals(1, numWord); // TODO - PowerPoint + // TODO - Publisher // TODO - Visio } } diff --git a/src/scratchpad/src/org/apache/poi/hpbf/extractor/PublisherTextExtractor.java b/src/scratchpad/src/org/apache/poi/hpbf/extractor/PublisherTextExtractor.java index 8187ab124..5a1c1f43d 100644 --- a/src/scratchpad/src/org/apache/poi/hpbf/extractor/PublisherTextExtractor.java +++ b/src/scratchpad/src/org/apache/poi/hpbf/extractor/PublisherTextExtractor.java @@ -26,6 +26,7 @@ import org.apache.poi.hpbf.HPBFDocument; import org.apache.poi.hpbf.model.qcbits.QCBit; import org.apache.poi.hpbf.model.qcbits.QCTextBit; import org.apache.poi.hpbf.model.qcbits.QCPLCBit.Type12; +import org.apache.poi.poifs.filesystem.DirectoryNode; import org.apache.poi.poifs.filesystem.POIFSFileSystem; /** @@ -39,6 +40,9 @@ public final class PublisherTextExtractor extends POIOLE2TextExtractor { super(doc); this.doc = doc; } + public PublisherTextExtractor(DirectoryNode dir, POIFSFileSystem fs) throws IOException { + this(new HPBFDocument(dir, fs)); + } public PublisherTextExtractor(POIFSFileSystem fs) throws IOException { this(new HPBFDocument(fs)); }