Wire up the new HSMFTextExtactor to the ExtractorFactory

git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@897246 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Nick Burch 2010-01-08 16:14:27 +00:00
parent bd2f63c721
commit cefe4e1d28
4 changed files with 220 additions and 167 deletions

View File

@ -31,6 +31,7 @@ import org.apache.poi.POIXMLDocument;
import org.apache.poi.POIXMLTextExtractor;
import org.apache.poi.hdgf.extractor.VisioTextExtractor;
import org.apache.poi.hslf.extractor.PowerPointExtractor;
import org.apache.poi.hsmf.extractor.HSMFTextExtactor;
import org.apache.poi.hssf.extractor.ExcelExtractor;
import org.apache.poi.hwpf.extractor.WordExtractor;
import org.apache.poi.openxml4j.exceptions.InvalidFormatException;
@ -138,6 +139,11 @@ public class ExtractorFactory {
if(entry.getName().equals("VisioDocument")) {
return new VisioTextExtractor(poifsDir, fs);
}
if(entry.getName().equals("__substg1.0_1000001E") ||
entry.getName().equals("__substg1.0_0047001E") ||
entry.getName().equals("__substg1.0_0037001E")) {
return new HSMFTextExtactor(poifsDir, fs);
}
}
throw new IllegalArgumentException("No supported documents found in the OLE2 stream");
}

View File

@ -25,6 +25,7 @@ import org.apache.poi.POITextExtractor;
import org.apache.poi.POIDataSamples;
import org.apache.poi.hdgf.extractor.VisioTextExtractor;
import org.apache.poi.hslf.extractor.PowerPointExtractor;
import org.apache.poi.hsmf.extractor.HSMFTextExtactor;
import org.apache.poi.hssf.extractor.ExcelExtractor;
import org.apache.poi.hwpf.extractor.WordExtractor;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
@ -57,6 +58,7 @@ public class TestExtractorFactory extends TestCase {
private File ppt;
private File pptx;
private File msg;
private File vsd;
protected void setUp() throws Exception {
@ -81,6 +83,9 @@ public class TestExtractorFactory extends TestCase {
POIDataSamples dgTests = POIDataSamples.getDiagramInstance();
vsd = dgTests.getFile("Test_Visio-Some_Random_Text.vsd");
POIDataSamples olTests = POIDataSamples.getHSMFInstance();
msg = olTests.getFile("quick.msg");
}
public void testFile() throws Exception {
@ -161,6 +166,15 @@ public class TestExtractorFactory extends TestCase {
ExtractorFactory.createExtractor(vsd).getText().length() > 50
);
// Outlook msg
assertTrue(
ExtractorFactory.createExtractor(msg)
instanceof HSMFTextExtactor
);
assertTrue(
ExtractorFactory.createExtractor(msg).getText().length() > 50
);
// Text
try {
ExtractorFactory.createExtractor(txt);
@ -231,6 +245,15 @@ public class TestExtractorFactory extends TestCase {
ExtractorFactory.createExtractor(new FileInputStream(vsd)).getText().length() > 50
);
// Outlook msg
assertTrue(
ExtractorFactory.createExtractor(new FileInputStream(msg))
instanceof HSMFTextExtactor
);
assertTrue(
ExtractorFactory.createExtractor(new FileInputStream(msg)).getText().length() > 50
);
// Text
try {
ExtractorFactory.createExtractor(new FileInputStream(txt));
@ -277,6 +300,15 @@ public class TestExtractorFactory extends TestCase {
ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(vsd))).getText().length() > 50
);
// Outlook msg
assertTrue(
ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(msg)))
instanceof HSMFTextExtactor
);
assertTrue(
ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(msg))).getText().length() > 50
);
// Text
try {
ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(txt)));
@ -375,5 +407,6 @@ public class TestExtractorFactory extends TestCase {
// TODO - PowerPoint
// TODO - Visio
// TODO - Outlook
}
}

View File

@ -34,6 +34,7 @@ import org.apache.poi.hsmf.datatypes.RecipientChunks;
import org.apache.poi.hsmf.datatypes.StringChunk;
import org.apache.poi.hsmf.exceptions.ChunkNotFoundException;
import org.apache.poi.hsmf.parsers.POIFSChunkParser;
import org.apache.poi.poifs.filesystem.DirectoryNode;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
/**
@ -78,15 +79,24 @@ public class MAPIMessage extends POIDocument {
this(new POIFSFileSystem(in));
}
/**
* Constructor for reading MSG Files from an input stream.
* Constructor for reading MSG Files from a POIFS filesystem
* @param fs the POIFS filesystem to read the MSG file from
* @throws IOException
*/
public MAPIMessage(POIFSFileSystem fs) throws IOException {
super(fs);
this(fs.getRoot(), fs);
}
/**
* Constructor for reading MSG Files from a certain
* point within a POIFS filesystem
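* @param poifsDir the directory node within the filesystem at which the message starts
* @param fs the POIFS filesystem being read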
* @throws IOException
*/
public MAPIMessage(DirectoryNode poifsDir, POIFSFileSystem fs) throws IOException {
super(poifsDir, fs);
// Grab all the chunks
ChunkGroup[] chunkGroups = POIFSChunkParser.parse(fs);
ChunkGroup[] chunkGroups = POIFSChunkParser.parse(poifsDir);
// Grab interesting bits
ArrayList<AttachmentChunks> attachments = new ArrayList<AttachmentChunks>();

View File

@ -23,12 +23,16 @@ import java.text.SimpleDateFormat;
import org.apache.poi.POIOLE2TextExtractor;
import org.apache.poi.hsmf.MAPIMessage;
import org.apache.poi.hsmf.exceptions.ChunkNotFoundException;
import org.apache.poi.poifs.filesystem.DirectoryNode;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
public class HSMFTextExtactor extends POIOLE2TextExtractor {
/**
 * Creates a text extractor for the given message.
 * The message is handed straight to the POIOLE2TextExtractor
 * superclass constructor.
 * @param msg the MAPI message to extract text from
 */
public HSMFTextExtactor(MAPIMessage msg) {
super(msg);
}
/**
 * Creates a text extractor by building a MAPIMessage from the
 * given directory node and filesystem, then delegating to the
 * MAPIMessage based constructor.
 * @param poifsDir the directory node passed to the MAPIMessage constructor
 * @param fs the POIFS filesystem passed to the MAPIMessage constructor
 * @throws IOException propagated from the MAPIMessage constructor
 */
public HSMFTextExtactor(DirectoryNode poifsDir, POIFSFileSystem fs) throws IOException {
this(new MAPIMessage(poifsDir, fs));
}
/**
 * Creates a text extractor by building a MAPIMessage from the
 * given filesystem, then delegating to the MAPIMessage based
 * constructor.
 * @param fs the POIFS filesystem passed to the MAPIMessage constructor
 * @throws IOException propagated from the MAPIMessage constructor
 */
public HSMFTextExtactor(POIFSFileSystem fs) throws IOException {
this(new MAPIMessage(fs));
}