Add PublisherTextExtractor support to ExtractorFactory

git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@897887 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Nick Burch 2010-01-11 14:55:43 +00:00
parent 55ae8cd33d
commit 6e97a360a3
4 changed files with 43 additions and 0 deletions

View File

@ -34,6 +34,7 @@
<changes> <changes>
<release version="3.7-SNAPSHOT" date="2010-??-??"> <release version="3.7-SNAPSHOT" date="2010-??-??">
<action dev="POI-DEVELOPERS" type="add">Add PublisherTextExtractor support to ExtractorFactory</action>
<action dev="POI-DEVELOPERS" type="add">Add XSLF support for text extraction from tables</action> <action dev="POI-DEVELOPERS" type="add">Add XSLF support for text extraction from tables</action>
<action dev="POI-DEVELOPERS" type="add">Support attachments as embedded documents within the new OutlookTextExtractor</action> <action dev="POI-DEVELOPERS" type="add">Support attachments as embedded documents within the new OutlookTextExtractor</action>
<action dev="POI-DEVELOPERS" type="add">Add a text extractor (OutlookTextExtractor) to HSMF for simpler extraction of text from .msg files</action> <action dev="POI-DEVELOPERS" type="add">Add a text extractor (OutlookTextExtractor) to HSMF for simpler extraction of text from .msg files</action>

View File

@ -31,6 +31,7 @@ import org.apache.poi.POITextExtractor;
import org.apache.poi.POIXMLDocument; import org.apache.poi.POIXMLDocument;
import org.apache.poi.POIXMLTextExtractor; import org.apache.poi.POIXMLTextExtractor;
import org.apache.poi.hdgf.extractor.VisioTextExtractor; import org.apache.poi.hdgf.extractor.VisioTextExtractor;
import org.apache.poi.hpbf.extractor.PublisherTextExtractor;
import org.apache.poi.hslf.extractor.PowerPointExtractor; import org.apache.poi.hslf.extractor.PowerPointExtractor;
import org.apache.poi.hsmf.MAPIMessage; import org.apache.poi.hsmf.MAPIMessage;
import org.apache.poi.hsmf.datatypes.AttachmentChunks; import org.apache.poi.hsmf.datatypes.AttachmentChunks;
@ -142,6 +143,9 @@ public class ExtractorFactory {
if(entry.getName().equals("VisioDocument")) { if(entry.getName().equals("VisioDocument")) {
return new VisioTextExtractor(poifsDir, fs); return new VisioTextExtractor(poifsDir, fs);
} }
if(entry.getName().equals("Quill")) {
return new PublisherTextExtractor(poifsDir, fs);
}
if( if(
entry.getName().equals("__substg1.0_1000001E") || entry.getName().equals("__substg1.0_1000001E") ||
entry.getName().equals("__substg1.0_1000001F") || entry.getName().equals("__substg1.0_1000001F") ||

View File

@ -24,6 +24,7 @@ import org.apache.poi.POIOLE2TextExtractor;
import org.apache.poi.POITextExtractor; import org.apache.poi.POITextExtractor;
import org.apache.poi.POIDataSamples; import org.apache.poi.POIDataSamples;
import org.apache.poi.hdgf.extractor.VisioTextExtractor; import org.apache.poi.hdgf.extractor.VisioTextExtractor;
import org.apache.poi.hpbf.extractor.PublisherTextExtractor;
import org.apache.poi.hslf.extractor.PowerPointExtractor; import org.apache.poi.hslf.extractor.PowerPointExtractor;
import org.apache.poi.hsmf.extractor.OutlookTextExtactor; import org.apache.poi.hsmf.extractor.OutlookTextExtactor;
import org.apache.poi.hssf.extractor.ExcelExtractor; import org.apache.poi.hssf.extractor.ExcelExtractor;
@ -62,6 +63,8 @@ public class TestExtractorFactory extends TestCase {
private File msgEmb; private File msgEmb;
private File vsd; private File vsd;
private File pub;
protected void setUp() throws Exception { protected void setUp() throws Exception {
super.setUp(); super.setUp();
@ -86,6 +89,9 @@ public class TestExtractorFactory extends TestCase {
POIDataSamples dgTests = POIDataSamples.getDiagramInstance(); POIDataSamples dgTests = POIDataSamples.getDiagramInstance();
vsd = dgTests.getFile("Test_Visio-Some_Random_Text.vsd"); vsd = dgTests.getFile("Test_Visio-Some_Random_Text.vsd");
POIDataSamples pubTests = POIDataSamples.getPublisherInstance();
pub = pubTests.getFile("Simple.pub");
POIDataSamples olTests = POIDataSamples.getHSMFInstance(); POIDataSamples olTests = POIDataSamples.getHSMFInstance();
msg = olTests.getFile("quick.msg"); msg = olTests.getFile("quick.msg");
msgEmb = olTests.getFile("attachment_test_msg.msg"); msgEmb = olTests.getFile("attachment_test_msg.msg");
@ -169,6 +175,15 @@ public class TestExtractorFactory extends TestCase {
ExtractorFactory.createExtractor(vsd).getText().length() > 50 ExtractorFactory.createExtractor(vsd).getText().length() > 50
); );
// Publisher
assertTrue(
ExtractorFactory.createExtractor(pub)
instanceof PublisherTextExtractor
);
assertTrue(
ExtractorFactory.createExtractor(pub).getText().length() > 50
);
// Outlook msg // Outlook msg
assertTrue( assertTrue(
ExtractorFactory.createExtractor(msg) ExtractorFactory.createExtractor(msg)
@ -248,6 +263,15 @@ public class TestExtractorFactory extends TestCase {
ExtractorFactory.createExtractor(new FileInputStream(vsd)).getText().length() > 50 ExtractorFactory.createExtractor(new FileInputStream(vsd)).getText().length() > 50
); );
// Publisher
assertTrue(
ExtractorFactory.createExtractor(new FileInputStream(pub))
instanceof PublisherTextExtractor
);
assertTrue(
ExtractorFactory.createExtractor(new FileInputStream(pub)).getText().length() > 50
);
// Outlook msg // Outlook msg
assertTrue( assertTrue(
ExtractorFactory.createExtractor(new FileInputStream(msg)) ExtractorFactory.createExtractor(new FileInputStream(msg))
@ -302,6 +326,15 @@ public class TestExtractorFactory extends TestCase {
assertTrue( assertTrue(
ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(vsd))).getText().length() > 50 ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(vsd))).getText().length() > 50
); );
// Publisher
assertTrue(
ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(pub)))
instanceof PublisherTextExtractor
);
assertTrue(
ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(pub))).getText().length() > 50
);
// Outlook msg // Outlook msg
assertTrue( assertTrue(
@ -426,6 +459,7 @@ public class TestExtractorFactory extends TestCase {
assertEquals(1, numWord); assertEquals(1, numWord);
// TODO - PowerPoint // TODO - PowerPoint
// TODO - Publisher
// TODO - Visio // TODO - Visio
} }
} }

View File

@ -26,6 +26,7 @@ import org.apache.poi.hpbf.HPBFDocument;
import org.apache.poi.hpbf.model.qcbits.QCBit; import org.apache.poi.hpbf.model.qcbits.QCBit;
import org.apache.poi.hpbf.model.qcbits.QCTextBit; import org.apache.poi.hpbf.model.qcbits.QCTextBit;
import org.apache.poi.hpbf.model.qcbits.QCPLCBit.Type12; import org.apache.poi.hpbf.model.qcbits.QCPLCBit.Type12;
import org.apache.poi.poifs.filesystem.DirectoryNode;
import org.apache.poi.poifs.filesystem.POIFSFileSystem; import org.apache.poi.poifs.filesystem.POIFSFileSystem;
/** /**
@ -39,6 +40,9 @@ public final class PublisherTextExtractor extends POIOLE2TextExtractor {
super(doc); super(doc);
this.doc = doc; this.doc = doc;
} }
/**
 * Creates a text extractor from a directory node and its filesystem.
 * Builds a new {@link HPBFDocument} from the two arguments and delegates
 * to the constructor that takes an {@code HPBFDocument}.
 *
 * @param dir directory node handed to {@code new HPBFDocument(dir, fs)}
 * @param fs filesystem handed to {@code new HPBFDocument(dir, fs)}
 * @throws IOException declared by this constructor; presumably raised when
 *         the document cannot be read (confirm against HPBFDocument)
 */
public PublisherTextExtractor(DirectoryNode dir, POIFSFileSystem fs) throws IOException {
this(new HPBFDocument(dir, fs));
}
public PublisherTextExtractor(POIFSFileSystem fs) throws IOException { public PublisherTextExtractor(POIFSFileSystem fs) throws IOException {
this(new HPBFDocument(fs)); this(new HPBFDocument(fs));
} }