Wire up the new HSMFTextExtactor to the ExtractorFactory

git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@897246 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Nick Burch 2010-01-08 16:14:27 +00:00
parent bd2f63c721
commit cefe4e1d28
4 changed files with 220 additions and 167 deletions

View File

@ -31,6 +31,7 @@ import org.apache.poi.POIXMLDocument;
import org.apache.poi.POIXMLTextExtractor; import org.apache.poi.POIXMLTextExtractor;
import org.apache.poi.hdgf.extractor.VisioTextExtractor; import org.apache.poi.hdgf.extractor.VisioTextExtractor;
import org.apache.poi.hslf.extractor.PowerPointExtractor; import org.apache.poi.hslf.extractor.PowerPointExtractor;
import org.apache.poi.hsmf.extractor.HSMFTextExtactor;
import org.apache.poi.hssf.extractor.ExcelExtractor; import org.apache.poi.hssf.extractor.ExcelExtractor;
import org.apache.poi.hwpf.extractor.WordExtractor; import org.apache.poi.hwpf.extractor.WordExtractor;
import org.apache.poi.openxml4j.exceptions.InvalidFormatException; import org.apache.poi.openxml4j.exceptions.InvalidFormatException;
@ -138,6 +139,11 @@ public class ExtractorFactory {
if(entry.getName().equals("VisioDocument")) { if(entry.getName().equals("VisioDocument")) {
return new VisioTextExtractor(poifsDir, fs); return new VisioTextExtractor(poifsDir, fs);
} }
if(entry.getName().equals("__substg1.0_1000001E") ||
entry.getName().equals("__substg1.0_0047001E") ||
entry.getName().equals("__substg1.0_0037001E")) {
return new HSMFTextExtactor(poifsDir, fs);
}
} }
throw new IllegalArgumentException("No supported documents found in the OLE2 stream"); throw new IllegalArgumentException("No supported documents found in the OLE2 stream");
} }

View File

@ -25,6 +25,7 @@ import org.apache.poi.POITextExtractor;
import org.apache.poi.POIDataSamples; import org.apache.poi.POIDataSamples;
import org.apache.poi.hdgf.extractor.VisioTextExtractor; import org.apache.poi.hdgf.extractor.VisioTextExtractor;
import org.apache.poi.hslf.extractor.PowerPointExtractor; import org.apache.poi.hslf.extractor.PowerPointExtractor;
import org.apache.poi.hsmf.extractor.HSMFTextExtactor;
import org.apache.poi.hssf.extractor.ExcelExtractor; import org.apache.poi.hssf.extractor.ExcelExtractor;
import org.apache.poi.hwpf.extractor.WordExtractor; import org.apache.poi.hwpf.extractor.WordExtractor;
import org.apache.poi.poifs.filesystem.POIFSFileSystem; import org.apache.poi.poifs.filesystem.POIFSFileSystem;
@ -42,132 +43,145 @@ import org.apache.poi.openxml4j.opc.OPCPackage;
*/ */
public class TestExtractorFactory extends TestCase { public class TestExtractorFactory extends TestCase {
private File txt; private File txt;
private File xls;
private File xlsx;
private File xltx;
private File xlsEmb;
private File doc; private File xls;
private File docx; private File xlsx;
private File dotx; private File xltx;
private File docEmb; private File xlsEmb;
private File ppt; private File doc;
private File pptx; private File docx;
private File dotx;
private File vsd; private File docEmb;
protected void setUp() throws Exception { private File ppt;
super.setUp(); private File pptx;
POIDataSamples ssTests = POIDataSamples.getSpreadSheetInstance();
xls = ssTests.getFile("SampleSS.xls");
xlsx = ssTests.getFile("SampleSS.xlsx");
xltx = ssTests.getFile("test.xltx");
xlsEmb = ssTests.getFile("excel_with_embeded.xls");
POIDataSamples wpTests = POIDataSamples.getDocumentInstance(); private File msg;
doc = wpTests.getFile("SampleDoc.doc"); private File vsd;
docx = wpTests.getFile("SampleDoc.docx");
dotx = wpTests.getFile("test.dotx");
docEmb = wpTests.getFile("word_with_embeded.doc");
POIDataSamples slTests = POIDataSamples.getSlideShowInstance(); protected void setUp() throws Exception {
ppt = slTests.getFile("SampleShow.ppt"); super.setUp();
pptx = slTests.getFile("SampleShow.pptx");
txt = slTests.getFile("SampleShow.txt");
POIDataSamples dgTests = POIDataSamples.getDiagramInstance(); POIDataSamples ssTests = POIDataSamples.getSpreadSheetInstance();
vsd = dgTests.getFile("Test_Visio-Some_Random_Text.vsd"); xls = ssTests.getFile("SampleSS.xls");
} xlsx = ssTests.getFile("SampleSS.xlsx");
xltx = ssTests.getFile("test.xltx");
xlsEmb = ssTests.getFile("excel_with_embeded.xls");
public void testFile() throws Exception { POIDataSamples wpTests = POIDataSamples.getDocumentInstance();
// Excel doc = wpTests.getFile("SampleDoc.doc");
assertTrue( docx = wpTests.getFile("SampleDoc.docx");
ExtractorFactory.createExtractor(xls) dotx = wpTests.getFile("test.dotx");
instanceof ExcelExtractor docEmb = wpTests.getFile("word_with_embeded.doc");
);
assertTrue(
ExtractorFactory.createExtractor(xls).getText().length() > 200
);
assertTrue(
ExtractorFactory.createExtractor(xlsx)
instanceof XSSFExcelExtractor
);
assertTrue(
ExtractorFactory.createExtractor(xlsx).getText().length() > 200
);
assertTrue( POIDataSamples slTests = POIDataSamples.getSlideShowInstance();
ExtractorFactory.createExtractor(xltx) ppt = slTests.getFile("SampleShow.ppt");
instanceof XSSFExcelExtractor pptx = slTests.getFile("SampleShow.pptx");
); txt = slTests.getFile("SampleShow.txt");
assertTrue(
ExtractorFactory.createExtractor(xltx).getText().contains("test")
);
POIDataSamples dgTests = POIDataSamples.getDiagramInstance();
// Word vsd = dgTests.getFile("Test_Visio-Some_Random_Text.vsd");
assertTrue(
ExtractorFactory.createExtractor(doc) POIDataSamples olTests = POIDataSamples.getHSMFInstance();
instanceof WordExtractor msg = olTests.getFile("quick.msg");
); }
assertTrue(
ExtractorFactory.createExtractor(doc).getText().length() > 120
);
assertTrue(
ExtractorFactory.createExtractor(docx)
instanceof XWPFWordExtractor
);
assertTrue(
ExtractorFactory.createExtractor(docx).getText().length() > 120
);
assertTrue( public void testFile() throws Exception {
ExtractorFactory.createExtractor(dotx) // Excel
instanceof XWPFWordExtractor assertTrue(
); ExtractorFactory.createExtractor(xls)
assertTrue( instanceof ExcelExtractor
ExtractorFactory.createExtractor(dotx).getText().contains("Test") );
); assertTrue(
ExtractorFactory.createExtractor(xls).getText().length() > 200
);
// PowerPoint assertTrue(
assertTrue( ExtractorFactory.createExtractor(xlsx)
ExtractorFactory.createExtractor(ppt) instanceof XSSFExcelExtractor
instanceof PowerPointExtractor );
); assertTrue(
assertTrue( ExtractorFactory.createExtractor(xlsx).getText().length() > 200
ExtractorFactory.createExtractor(ppt).getText().length() > 120 );
);
assertTrue(
assertTrue( ExtractorFactory.createExtractor(xltx)
ExtractorFactory.createExtractor(pptx) instanceof XSSFExcelExtractor
instanceof XSLFPowerPointExtractor );
); assertTrue(
assertTrue( ExtractorFactory.createExtractor(xltx).getText().contains("test")
ExtractorFactory.createExtractor(pptx).getText().length() > 120 );
);
// Visio // Word
assertTrue( assertTrue(
ExtractorFactory.createExtractor(vsd) ExtractorFactory.createExtractor(doc)
instanceof VisioTextExtractor instanceof WordExtractor
); );
assertTrue( assertTrue(
ExtractorFactory.createExtractor(vsd).getText().length() > 50 ExtractorFactory.createExtractor(doc).getText().length() > 120
); );
// Text assertTrue(
try { ExtractorFactory.createExtractor(docx)
ExtractorFactory.createExtractor(txt); instanceof XWPFWordExtractor
fail(); );
} catch(IllegalArgumentException e) { assertTrue(
// Good ExtractorFactory.createExtractor(docx).getText().length() > 120
} );
assertTrue(
ExtractorFactory.createExtractor(dotx)
instanceof XWPFWordExtractor
);
assertTrue(
ExtractorFactory.createExtractor(dotx).getText().contains("Test")
);
// PowerPoint
assertTrue(
ExtractorFactory.createExtractor(ppt)
instanceof PowerPointExtractor
);
assertTrue(
ExtractorFactory.createExtractor(ppt).getText().length() > 120
);
assertTrue(
ExtractorFactory.createExtractor(pptx)
instanceof XSLFPowerPointExtractor
);
assertTrue(
ExtractorFactory.createExtractor(pptx).getText().length() > 120
);
// Visio
assertTrue(
ExtractorFactory.createExtractor(vsd)
instanceof VisioTextExtractor
);
assertTrue(
ExtractorFactory.createExtractor(vsd).getText().length() > 50
);
// Outlook msg
assertTrue(
ExtractorFactory.createExtractor(msg)
instanceof HSMFTextExtactor
);
assertTrue(
ExtractorFactory.createExtractor(msg).getText().length() > 50
);
// Text
try {
ExtractorFactory.createExtractor(txt);
fail();
} catch(IllegalArgumentException e) {
// Good
}
} }
public void testInputStream() throws Exception { public void testInputStream() throws Exception {
@ -231,6 +245,15 @@ public class TestExtractorFactory extends TestCase {
ExtractorFactory.createExtractor(new FileInputStream(vsd)).getText().length() > 50 ExtractorFactory.createExtractor(new FileInputStream(vsd)).getText().length() > 50
); );
// Outlook msg
assertTrue(
ExtractorFactory.createExtractor(new FileInputStream(msg))
instanceof HSMFTextExtactor
);
assertTrue(
ExtractorFactory.createExtractor(new FileInputStream(msg)).getText().length() > 50
);
// Text // Text
try { try {
ExtractorFactory.createExtractor(new FileInputStream(txt)); ExtractorFactory.createExtractor(new FileInputStream(txt));
@ -277,6 +300,15 @@ public class TestExtractorFactory extends TestCase {
ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(vsd))).getText().length() > 50 ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(vsd))).getText().length() > 50
); );
// Outlook msg
assertTrue(
ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(msg)))
instanceof HSMFTextExtactor
);
assertTrue(
ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(msg))).getText().length() > 50
);
// Text // Text
try { try {
ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(txt))); ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(txt)));
@ -323,57 +355,58 @@ public class TestExtractorFactory extends TestCase {
} }
} }
/** /**
* Test embeded docs text extraction. For now, only * Test embeded docs text extraction. For now, only
* does poifs embeded, but will do ooxml ones * does poifs embeded, but will do ooxml ones
* at some point. * at some point.
*/ */
public void testEmbeded() throws Exception { public void testEmbeded() throws Exception {
POIOLE2TextExtractor ext; POIOLE2TextExtractor ext;
POITextExtractor[] embeds; POITextExtractor[] embeds;
// No embedings // No embedings
ext = (POIOLE2TextExtractor) ext = (POIOLE2TextExtractor)
ExtractorFactory.createExtractor(xls); ExtractorFactory.createExtractor(xls);
embeds = ExtractorFactory.getEmbededDocsTextExtractors(ext); embeds = ExtractorFactory.getEmbededDocsTextExtractors(ext);
assertEquals(0, embeds.length); assertEquals(0, embeds.length);
// Excel
ext = (POIOLE2TextExtractor)
ExtractorFactory.createExtractor(xlsEmb);
embeds = ExtractorFactory.getEmbededDocsTextExtractors(ext);
assertEquals(6, embeds.length); // Excel
int numWord = 0, numXls = 0, numPpt = 0; ext = (POIOLE2TextExtractor)
for(int i=0; i<embeds.length; i++) { ExtractorFactory.createExtractor(xlsEmb);
assertTrue(embeds[i].getText().length() > 20); embeds = ExtractorFactory.getEmbededDocsTextExtractors(ext);
if(embeds[i] instanceof PowerPointExtractor) numPpt++; assertEquals(6, embeds.length);
else if(embeds[i] instanceof ExcelExtractor) numXls++; int numWord = 0, numXls = 0, numPpt = 0;
else if(embeds[i] instanceof WordExtractor) numWord++; for(int i=0; i<embeds.length; i++) {
} assertTrue(embeds[i].getText().length() > 20);
assertEquals(2, numPpt);
assertEquals(2, numXls);
assertEquals(2, numWord);
// Word if(embeds[i] instanceof PowerPointExtractor) numPpt++;
ext = (POIOLE2TextExtractor) else if(embeds[i] instanceof ExcelExtractor) numXls++;
ExtractorFactory.createExtractor(docEmb); else if(embeds[i] instanceof WordExtractor) numWord++;
embeds = ExtractorFactory.getEmbededDocsTextExtractors(ext); }
assertEquals(2, numPpt);
numWord = 0; numXls = 0; numPpt = 0; assertEquals(2, numXls);
assertEquals(4, embeds.length); assertEquals(2, numWord);
for(int i=0; i<embeds.length; i++) {
assertTrue(embeds[i].getText().length() > 20);
if(embeds[i] instanceof PowerPointExtractor) numPpt++;
else if(embeds[i] instanceof ExcelExtractor) numXls++;
else if(embeds[i] instanceof WordExtractor) numWord++;
}
assertEquals(1, numPpt);
assertEquals(2, numXls);
assertEquals(1, numWord);
// TODO - PowerPoint // Word
// TODO - Visio ext = (POIOLE2TextExtractor)
} ExtractorFactory.createExtractor(docEmb);
embeds = ExtractorFactory.getEmbededDocsTextExtractors(ext);
numWord = 0; numXls = 0; numPpt = 0;
assertEquals(4, embeds.length);
for(int i=0; i<embeds.length; i++) {
assertTrue(embeds[i].getText().length() > 20);
if(embeds[i] instanceof PowerPointExtractor) numPpt++;
else if(embeds[i] instanceof ExcelExtractor) numXls++;
else if(embeds[i] instanceof WordExtractor) numWord++;
}
assertEquals(1, numPpt);
assertEquals(2, numXls);
assertEquals(1, numWord);
// TODO - PowerPoint
// TODO - Visio
// TODO - Outlook
}
} }

View File

@ -34,6 +34,7 @@ import org.apache.poi.hsmf.datatypes.RecipientChunks;
import org.apache.poi.hsmf.datatypes.StringChunk; import org.apache.poi.hsmf.datatypes.StringChunk;
import org.apache.poi.hsmf.exceptions.ChunkNotFoundException; import org.apache.poi.hsmf.exceptions.ChunkNotFoundException;
import org.apache.poi.hsmf.parsers.POIFSChunkParser; import org.apache.poi.hsmf.parsers.POIFSChunkParser;
import org.apache.poi.poifs.filesystem.DirectoryNode;
import org.apache.poi.poifs.filesystem.POIFSFileSystem; import org.apache.poi.poifs.filesystem.POIFSFileSystem;
/** /**
@ -78,15 +79,24 @@ public class MAPIMessage extends POIDocument {
this(new POIFSFileSystem(in)); this(new POIFSFileSystem(in));
} }
/** /**
* Constructor for reading MSG Files from an input stream. * Constructor for reading MSG Files from a POIFS filesystem
* @param in * @param in
* @throws IOException * @throws IOException
*/ */
public MAPIMessage(POIFSFileSystem fs) throws IOException { public MAPIMessage(POIFSFileSystem fs) throws IOException {
super(fs); this(fs.getRoot(), fs);
}
/**
* Constructor for reading MSG Files from a certain
* point within a POIFS filesystem
* @param in
* @throws IOException
*/
public MAPIMessage(DirectoryNode poifsDir, POIFSFileSystem fs) throws IOException {
super(poifsDir, fs);
// Grab all the chunks // Grab all the chunks
ChunkGroup[] chunkGroups = POIFSChunkParser.parse(fs); ChunkGroup[] chunkGroups = POIFSChunkParser.parse(poifsDir);
// Grab interesting bits // Grab interesting bits
ArrayList<AttachmentChunks> attachments = new ArrayList<AttachmentChunks>(); ArrayList<AttachmentChunks> attachments = new ArrayList<AttachmentChunks>();

View File

@ -23,12 +23,16 @@ import java.text.SimpleDateFormat;
import org.apache.poi.POIOLE2TextExtractor; import org.apache.poi.POIOLE2TextExtractor;
import org.apache.poi.hsmf.MAPIMessage; import org.apache.poi.hsmf.MAPIMessage;
import org.apache.poi.hsmf.exceptions.ChunkNotFoundException; import org.apache.poi.hsmf.exceptions.ChunkNotFoundException;
import org.apache.poi.poifs.filesystem.DirectoryNode;
import org.apache.poi.poifs.filesystem.POIFSFileSystem; import org.apache.poi.poifs.filesystem.POIFSFileSystem;
public class HSMFTextExtactor extends POIOLE2TextExtractor { public class HSMFTextExtactor extends POIOLE2TextExtractor {
public HSMFTextExtactor(MAPIMessage msg) { public HSMFTextExtactor(MAPIMessage msg) {
super(msg); super(msg);
} }
public HSMFTextExtactor(DirectoryNode poifsDir, POIFSFileSystem fs) throws IOException {
this(new MAPIMessage(poifsDir, fs));
}
public HSMFTextExtactor(POIFSFileSystem fs) throws IOException { public HSMFTextExtactor(POIFSFileSystem fs) throws IOException {
this(new MAPIMessage(fs)); this(new MAPIMessage(fs));
} }