Add PublisherTextExtractor support to ExtractorFactory
git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@897887 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
55ae8cd33d
commit
6e97a360a3
@ -34,6 +34,7 @@
|
|||||||
|
|
||||||
<changes>
|
<changes>
|
||||||
<release version="3.7-SNAPSHOT" date="2010-??-??">
|
<release version="3.7-SNAPSHOT" date="2010-??-??">
|
||||||
|
<action dev="POI-DEVELOPERS" type="add">Add PublisherTextExtractor support to ExtractorFactory</action>
|
||||||
<action dev="POI-DEVELOPERS" type="add">Add XSLF support for text extraction from tables</action>
|
<action dev="POI-DEVELOPERS" type="add">Add XSLF support for text extraction from tables</action>
|
||||||
<action dev="POI-DEVELOPERS" type="add">Support attachments as embeded documents within the new OutlookTextExtractor</action>
|
<action dev="POI-DEVELOPERS" type="add">Support attachments as embeded documents within the new OutlookTextExtractor</action>
|
||||||
<action dev="POI-DEVELOPERS" type="add">Add a text extractor (OutlookTextExtractor) to HSMF for simpler extraction of text from .msg files</action>
|
<action dev="POI-DEVELOPERS" type="add">Add a text extractor (OutlookTextExtractor) to HSMF for simpler extraction of text from .msg files</action>
|
||||||
|
@ -31,6 +31,7 @@ import org.apache.poi.POITextExtractor;
|
|||||||
import org.apache.poi.POIXMLDocument;
|
import org.apache.poi.POIXMLDocument;
|
||||||
import org.apache.poi.POIXMLTextExtractor;
|
import org.apache.poi.POIXMLTextExtractor;
|
||||||
import org.apache.poi.hdgf.extractor.VisioTextExtractor;
|
import org.apache.poi.hdgf.extractor.VisioTextExtractor;
|
||||||
|
import org.apache.poi.hpbf.extractor.PublisherTextExtractor;
|
||||||
import org.apache.poi.hslf.extractor.PowerPointExtractor;
|
import org.apache.poi.hslf.extractor.PowerPointExtractor;
|
||||||
import org.apache.poi.hsmf.MAPIMessage;
|
import org.apache.poi.hsmf.MAPIMessage;
|
||||||
import org.apache.poi.hsmf.datatypes.AttachmentChunks;
|
import org.apache.poi.hsmf.datatypes.AttachmentChunks;
|
||||||
@ -142,6 +143,9 @@ public class ExtractorFactory {
|
|||||||
if(entry.getName().equals("VisioDocument")) {
|
if(entry.getName().equals("VisioDocument")) {
|
||||||
return new VisioTextExtractor(poifsDir, fs);
|
return new VisioTextExtractor(poifsDir, fs);
|
||||||
}
|
}
|
||||||
|
if(entry.getName().equals("Quill")) {
|
||||||
|
return new PublisherTextExtractor(poifsDir, fs);
|
||||||
|
}
|
||||||
if(
|
if(
|
||||||
entry.getName().equals("__substg1.0_1000001E") ||
|
entry.getName().equals("__substg1.0_1000001E") ||
|
||||||
entry.getName().equals("__substg1.0_1000001F") ||
|
entry.getName().equals("__substg1.0_1000001F") ||
|
||||||
|
@ -24,6 +24,7 @@ import org.apache.poi.POIOLE2TextExtractor;
|
|||||||
import org.apache.poi.POITextExtractor;
|
import org.apache.poi.POITextExtractor;
|
||||||
import org.apache.poi.POIDataSamples;
|
import org.apache.poi.POIDataSamples;
|
||||||
import org.apache.poi.hdgf.extractor.VisioTextExtractor;
|
import org.apache.poi.hdgf.extractor.VisioTextExtractor;
|
||||||
|
import org.apache.poi.hpbf.extractor.PublisherTextExtractor;
|
||||||
import org.apache.poi.hslf.extractor.PowerPointExtractor;
|
import org.apache.poi.hslf.extractor.PowerPointExtractor;
|
||||||
import org.apache.poi.hsmf.extractor.OutlookTextExtactor;
|
import org.apache.poi.hsmf.extractor.OutlookTextExtactor;
|
||||||
import org.apache.poi.hssf.extractor.ExcelExtractor;
|
import org.apache.poi.hssf.extractor.ExcelExtractor;
|
||||||
@ -62,6 +63,8 @@ public class TestExtractorFactory extends TestCase {
|
|||||||
private File msgEmb;
|
private File msgEmb;
|
||||||
|
|
||||||
private File vsd;
|
private File vsd;
|
||||||
|
|
||||||
|
private File pub;
|
||||||
|
|
||||||
protected void setUp() throws Exception {
|
protected void setUp() throws Exception {
|
||||||
super.setUp();
|
super.setUp();
|
||||||
@ -86,6 +89,9 @@ public class TestExtractorFactory extends TestCase {
|
|||||||
POIDataSamples dgTests = POIDataSamples.getDiagramInstance();
|
POIDataSamples dgTests = POIDataSamples.getDiagramInstance();
|
||||||
vsd = dgTests.getFile("Test_Visio-Some_Random_Text.vsd");
|
vsd = dgTests.getFile("Test_Visio-Some_Random_Text.vsd");
|
||||||
|
|
||||||
|
POIDataSamples pubTests = POIDataSamples.getPublisherInstance();
|
||||||
|
pub = pubTests.getFile("Simple.pub");
|
||||||
|
|
||||||
POIDataSamples olTests = POIDataSamples.getHSMFInstance();
|
POIDataSamples olTests = POIDataSamples.getHSMFInstance();
|
||||||
msg = olTests.getFile("quick.msg");
|
msg = olTests.getFile("quick.msg");
|
||||||
msgEmb = olTests.getFile("attachment_test_msg.msg");
|
msgEmb = olTests.getFile("attachment_test_msg.msg");
|
||||||
@ -169,6 +175,15 @@ public class TestExtractorFactory extends TestCase {
|
|||||||
ExtractorFactory.createExtractor(vsd).getText().length() > 50
|
ExtractorFactory.createExtractor(vsd).getText().length() > 50
|
||||||
);
|
);
|
||||||
|
|
||||||
|
// Publisher
|
||||||
|
assertTrue(
|
||||||
|
ExtractorFactory.createExtractor(pub)
|
||||||
|
instanceof PublisherTextExtractor
|
||||||
|
);
|
||||||
|
assertTrue(
|
||||||
|
ExtractorFactory.createExtractor(pub).getText().length() > 50
|
||||||
|
);
|
||||||
|
|
||||||
// Outlook msg
|
// Outlook msg
|
||||||
assertTrue(
|
assertTrue(
|
||||||
ExtractorFactory.createExtractor(msg)
|
ExtractorFactory.createExtractor(msg)
|
||||||
@ -248,6 +263,15 @@ public class TestExtractorFactory extends TestCase {
|
|||||||
ExtractorFactory.createExtractor(new FileInputStream(vsd)).getText().length() > 50
|
ExtractorFactory.createExtractor(new FileInputStream(vsd)).getText().length() > 50
|
||||||
);
|
);
|
||||||
|
|
||||||
|
// Publisher
|
||||||
|
assertTrue(
|
||||||
|
ExtractorFactory.createExtractor(new FileInputStream(pub))
|
||||||
|
instanceof PublisherTextExtractor
|
||||||
|
);
|
||||||
|
assertTrue(
|
||||||
|
ExtractorFactory.createExtractor(new FileInputStream(pub)).getText().length() > 50
|
||||||
|
);
|
||||||
|
|
||||||
// Outlook msg
|
// Outlook msg
|
||||||
assertTrue(
|
assertTrue(
|
||||||
ExtractorFactory.createExtractor(new FileInputStream(msg))
|
ExtractorFactory.createExtractor(new FileInputStream(msg))
|
||||||
@ -302,6 +326,15 @@ public class TestExtractorFactory extends TestCase {
|
|||||||
assertTrue(
|
assertTrue(
|
||||||
ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(vsd))).getText().length() > 50
|
ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(vsd))).getText().length() > 50
|
||||||
);
|
);
|
||||||
|
|
||||||
|
// Publisher
|
||||||
|
assertTrue(
|
||||||
|
ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(pub)))
|
||||||
|
instanceof PublisherTextExtractor
|
||||||
|
);
|
||||||
|
assertTrue(
|
||||||
|
ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(pub))).getText().length() > 50
|
||||||
|
);
|
||||||
|
|
||||||
// Outlook msg
|
// Outlook msg
|
||||||
assertTrue(
|
assertTrue(
|
||||||
@ -426,6 +459,7 @@ public class TestExtractorFactory extends TestCase {
|
|||||||
assertEquals(1, numWord);
|
assertEquals(1, numWord);
|
||||||
|
|
||||||
// TODO - PowerPoint
|
// TODO - PowerPoint
|
||||||
|
// TODO - Publisher
|
||||||
// TODO - Visio
|
// TODO - Visio
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -26,6 +26,7 @@ import org.apache.poi.hpbf.HPBFDocument;
|
|||||||
import org.apache.poi.hpbf.model.qcbits.QCBit;
|
import org.apache.poi.hpbf.model.qcbits.QCBit;
|
||||||
import org.apache.poi.hpbf.model.qcbits.QCTextBit;
|
import org.apache.poi.hpbf.model.qcbits.QCTextBit;
|
||||||
import org.apache.poi.hpbf.model.qcbits.QCPLCBit.Type12;
|
import org.apache.poi.hpbf.model.qcbits.QCPLCBit.Type12;
|
||||||
|
import org.apache.poi.poifs.filesystem.DirectoryNode;
|
||||||
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
|
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@ -39,6 +40,9 @@ public final class PublisherTextExtractor extends POIOLE2TextExtractor {
|
|||||||
super(doc);
|
super(doc);
|
||||||
this.doc = doc;
|
this.doc = doc;
|
||||||
}
|
}
|
||||||
|
public PublisherTextExtractor(DirectoryNode dir, POIFSFileSystem fs) throws IOException {
|
||||||
|
this(new HPBFDocument(dir, fs));
|
||||||
|
}
|
||||||
public PublisherTextExtractor(POIFSFileSystem fs) throws IOException {
|
public PublisherTextExtractor(POIFSFileSystem fs) throws IOException {
|
||||||
this(new HPBFDocument(fs));
|
this(new HPBFDocument(fs));
|
||||||
}
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user