Wire up the new HSMFTextExtactor to the ExtractorFactory
git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@897246 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
bd2f63c721
commit
cefe4e1d28
@ -31,6 +31,7 @@ import org.apache.poi.POIXMLDocument;
|
||||
import org.apache.poi.POIXMLTextExtractor;
|
||||
import org.apache.poi.hdgf.extractor.VisioTextExtractor;
|
||||
import org.apache.poi.hslf.extractor.PowerPointExtractor;
|
||||
import org.apache.poi.hsmf.extractor.HSMFTextExtactor;
|
||||
import org.apache.poi.hssf.extractor.ExcelExtractor;
|
||||
import org.apache.poi.hwpf.extractor.WordExtractor;
|
||||
import org.apache.poi.openxml4j.exceptions.InvalidFormatException;
|
||||
@ -138,6 +139,11 @@ public class ExtractorFactory {
|
||||
if(entry.getName().equals("VisioDocument")) {
|
||||
return new VisioTextExtractor(poifsDir, fs);
|
||||
}
|
||||
if(entry.getName().equals("__substg1.0_1000001E") ||
|
||||
entry.getName().equals("__substg1.0_0047001E") ||
|
||||
entry.getName().equals("__substg1.0_0037001E")) {
|
||||
return new HSMFTextExtactor(poifsDir, fs);
|
||||
}
|
||||
}
|
||||
throw new IllegalArgumentException("No supported documents found in the OLE2 stream");
|
||||
}
|
||||
|
@ -25,6 +25,7 @@ import org.apache.poi.POITextExtractor;
|
||||
import org.apache.poi.POIDataSamples;
|
||||
import org.apache.poi.hdgf.extractor.VisioTextExtractor;
|
||||
import org.apache.poi.hslf.extractor.PowerPointExtractor;
|
||||
import org.apache.poi.hsmf.extractor.HSMFTextExtactor;
|
||||
import org.apache.poi.hssf.extractor.ExcelExtractor;
|
||||
import org.apache.poi.hwpf.extractor.WordExtractor;
|
||||
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
|
||||
@ -42,132 +43,145 @@ import org.apache.poi.openxml4j.opc.OPCPackage;
|
||||
*/
|
||||
public class TestExtractorFactory extends TestCase {
|
||||
|
||||
private File txt;
|
||||
|
||||
private File xls;
|
||||
private File xlsx;
|
||||
private File xltx;
|
||||
private File xlsEmb;
|
||||
private File txt;
|
||||
|
||||
private File doc;
|
||||
private File docx;
|
||||
private File dotx;
|
||||
private File docEmb;
|
||||
private File xls;
|
||||
private File xlsx;
|
||||
private File xltx;
|
||||
private File xlsEmb;
|
||||
|
||||
private File ppt;
|
||||
private File pptx;
|
||||
|
||||
private File vsd;
|
||||
private File doc;
|
||||
private File docx;
|
||||
private File dotx;
|
||||
private File docEmb;
|
||||
|
||||
protected void setUp() throws Exception {
|
||||
super.setUp();
|
||||
|
||||
POIDataSamples ssTests = POIDataSamples.getSpreadSheetInstance();
|
||||
xls = ssTests.getFile("SampleSS.xls");
|
||||
xlsx = ssTests.getFile("SampleSS.xlsx");
|
||||
xltx = ssTests.getFile("test.xltx");
|
||||
xlsEmb = ssTests.getFile("excel_with_embeded.xls");
|
||||
private File ppt;
|
||||
private File pptx;
|
||||
|
||||
POIDataSamples wpTests = POIDataSamples.getDocumentInstance();
|
||||
doc = wpTests.getFile("SampleDoc.doc");
|
||||
docx = wpTests.getFile("SampleDoc.docx");
|
||||
dotx = wpTests.getFile("test.dotx");
|
||||
docEmb = wpTests.getFile("word_with_embeded.doc");
|
||||
private File msg;
|
||||
private File vsd;
|
||||
|
||||
POIDataSamples slTests = POIDataSamples.getSlideShowInstance();
|
||||
ppt = slTests.getFile("SampleShow.ppt");
|
||||
pptx = slTests.getFile("SampleShow.pptx");
|
||||
txt = slTests.getFile("SampleShow.txt");
|
||||
protected void setUp() throws Exception {
|
||||
super.setUp();
|
||||
|
||||
POIDataSamples dgTests = POIDataSamples.getDiagramInstance();
|
||||
vsd = dgTests.getFile("Test_Visio-Some_Random_Text.vsd");
|
||||
}
|
||||
POIDataSamples ssTests = POIDataSamples.getSpreadSheetInstance();
|
||||
xls = ssTests.getFile("SampleSS.xls");
|
||||
xlsx = ssTests.getFile("SampleSS.xlsx");
|
||||
xltx = ssTests.getFile("test.xltx");
|
||||
xlsEmb = ssTests.getFile("excel_with_embeded.xls");
|
||||
|
||||
public void testFile() throws Exception {
|
||||
// Excel
|
||||
assertTrue(
|
||||
ExtractorFactory.createExtractor(xls)
|
||||
instanceof ExcelExtractor
|
||||
);
|
||||
assertTrue(
|
||||
ExtractorFactory.createExtractor(xls).getText().length() > 200
|
||||
);
|
||||
|
||||
assertTrue(
|
||||
ExtractorFactory.createExtractor(xlsx)
|
||||
instanceof XSSFExcelExtractor
|
||||
);
|
||||
assertTrue(
|
||||
ExtractorFactory.createExtractor(xlsx).getText().length() > 200
|
||||
);
|
||||
POIDataSamples wpTests = POIDataSamples.getDocumentInstance();
|
||||
doc = wpTests.getFile("SampleDoc.doc");
|
||||
docx = wpTests.getFile("SampleDoc.docx");
|
||||
dotx = wpTests.getFile("test.dotx");
|
||||
docEmb = wpTests.getFile("word_with_embeded.doc");
|
||||
|
||||
assertTrue(
|
||||
ExtractorFactory.createExtractor(xltx)
|
||||
instanceof XSSFExcelExtractor
|
||||
);
|
||||
assertTrue(
|
||||
ExtractorFactory.createExtractor(xltx).getText().contains("test")
|
||||
);
|
||||
POIDataSamples slTests = POIDataSamples.getSlideShowInstance();
|
||||
ppt = slTests.getFile("SampleShow.ppt");
|
||||
pptx = slTests.getFile("SampleShow.pptx");
|
||||
txt = slTests.getFile("SampleShow.txt");
|
||||
|
||||
|
||||
// Word
|
||||
assertTrue(
|
||||
ExtractorFactory.createExtractor(doc)
|
||||
instanceof WordExtractor
|
||||
);
|
||||
assertTrue(
|
||||
ExtractorFactory.createExtractor(doc).getText().length() > 120
|
||||
);
|
||||
|
||||
assertTrue(
|
||||
ExtractorFactory.createExtractor(docx)
|
||||
instanceof XWPFWordExtractor
|
||||
);
|
||||
assertTrue(
|
||||
ExtractorFactory.createExtractor(docx).getText().length() > 120
|
||||
);
|
||||
POIDataSamples dgTests = POIDataSamples.getDiagramInstance();
|
||||
vsd = dgTests.getFile("Test_Visio-Some_Random_Text.vsd");
|
||||
|
||||
POIDataSamples olTests = POIDataSamples.getHSMFInstance();
|
||||
msg = olTests.getFile("quick.msg");
|
||||
}
|
||||
|
||||
assertTrue(
|
||||
ExtractorFactory.createExtractor(dotx)
|
||||
instanceof XWPFWordExtractor
|
||||
);
|
||||
assertTrue(
|
||||
ExtractorFactory.createExtractor(dotx).getText().contains("Test")
|
||||
);
|
||||
public void testFile() throws Exception {
|
||||
// Excel
|
||||
assertTrue(
|
||||
ExtractorFactory.createExtractor(xls)
|
||||
instanceof ExcelExtractor
|
||||
);
|
||||
assertTrue(
|
||||
ExtractorFactory.createExtractor(xls).getText().length() > 200
|
||||
);
|
||||
|
||||
// PowerPoint
|
||||
assertTrue(
|
||||
ExtractorFactory.createExtractor(ppt)
|
||||
instanceof PowerPointExtractor
|
||||
);
|
||||
assertTrue(
|
||||
ExtractorFactory.createExtractor(ppt).getText().length() > 120
|
||||
);
|
||||
|
||||
assertTrue(
|
||||
ExtractorFactory.createExtractor(pptx)
|
||||
instanceof XSLFPowerPointExtractor
|
||||
);
|
||||
assertTrue(
|
||||
ExtractorFactory.createExtractor(pptx).getText().length() > 120
|
||||
);
|
||||
|
||||
// Visio
|
||||
assertTrue(
|
||||
ExtractorFactory.createExtractor(vsd)
|
||||
instanceof VisioTextExtractor
|
||||
);
|
||||
assertTrue(
|
||||
ExtractorFactory.createExtractor(vsd).getText().length() > 50
|
||||
);
|
||||
|
||||
// Text
|
||||
try {
|
||||
ExtractorFactory.createExtractor(txt);
|
||||
fail();
|
||||
} catch(IllegalArgumentException e) {
|
||||
// Good
|
||||
}
|
||||
assertTrue(
|
||||
ExtractorFactory.createExtractor(xlsx)
|
||||
instanceof XSSFExcelExtractor
|
||||
);
|
||||
assertTrue(
|
||||
ExtractorFactory.createExtractor(xlsx).getText().length() > 200
|
||||
);
|
||||
|
||||
assertTrue(
|
||||
ExtractorFactory.createExtractor(xltx)
|
||||
instanceof XSSFExcelExtractor
|
||||
);
|
||||
assertTrue(
|
||||
ExtractorFactory.createExtractor(xltx).getText().contains("test")
|
||||
);
|
||||
|
||||
|
||||
// Word
|
||||
assertTrue(
|
||||
ExtractorFactory.createExtractor(doc)
|
||||
instanceof WordExtractor
|
||||
);
|
||||
assertTrue(
|
||||
ExtractorFactory.createExtractor(doc).getText().length() > 120
|
||||
);
|
||||
|
||||
assertTrue(
|
||||
ExtractorFactory.createExtractor(docx)
|
||||
instanceof XWPFWordExtractor
|
||||
);
|
||||
assertTrue(
|
||||
ExtractorFactory.createExtractor(docx).getText().length() > 120
|
||||
);
|
||||
|
||||
assertTrue(
|
||||
ExtractorFactory.createExtractor(dotx)
|
||||
instanceof XWPFWordExtractor
|
||||
);
|
||||
assertTrue(
|
||||
ExtractorFactory.createExtractor(dotx).getText().contains("Test")
|
||||
);
|
||||
|
||||
// PowerPoint
|
||||
assertTrue(
|
||||
ExtractorFactory.createExtractor(ppt)
|
||||
instanceof PowerPointExtractor
|
||||
);
|
||||
assertTrue(
|
||||
ExtractorFactory.createExtractor(ppt).getText().length() > 120
|
||||
);
|
||||
|
||||
assertTrue(
|
||||
ExtractorFactory.createExtractor(pptx)
|
||||
instanceof XSLFPowerPointExtractor
|
||||
);
|
||||
assertTrue(
|
||||
ExtractorFactory.createExtractor(pptx).getText().length() > 120
|
||||
);
|
||||
|
||||
// Visio
|
||||
assertTrue(
|
||||
ExtractorFactory.createExtractor(vsd)
|
||||
instanceof VisioTextExtractor
|
||||
);
|
||||
assertTrue(
|
||||
ExtractorFactory.createExtractor(vsd).getText().length() > 50
|
||||
);
|
||||
|
||||
// Outlook msg
|
||||
assertTrue(
|
||||
ExtractorFactory.createExtractor(msg)
|
||||
instanceof HSMFTextExtactor
|
||||
);
|
||||
assertTrue(
|
||||
ExtractorFactory.createExtractor(msg).getText().length() > 50
|
||||
);
|
||||
|
||||
// Text
|
||||
try {
|
||||
ExtractorFactory.createExtractor(txt);
|
||||
fail();
|
||||
} catch(IllegalArgumentException e) {
|
||||
// Good
|
||||
}
|
||||
}
|
||||
|
||||
public void testInputStream() throws Exception {
|
||||
@ -231,6 +245,15 @@ public class TestExtractorFactory extends TestCase {
|
||||
ExtractorFactory.createExtractor(new FileInputStream(vsd)).getText().length() > 50
|
||||
);
|
||||
|
||||
// Outlook msg
|
||||
assertTrue(
|
||||
ExtractorFactory.createExtractor(new FileInputStream(msg))
|
||||
instanceof HSMFTextExtactor
|
||||
);
|
||||
assertTrue(
|
||||
ExtractorFactory.createExtractor(new FileInputStream(msg)).getText().length() > 50
|
||||
);
|
||||
|
||||
// Text
|
||||
try {
|
||||
ExtractorFactory.createExtractor(new FileInputStream(txt));
|
||||
@ -277,6 +300,15 @@ public class TestExtractorFactory extends TestCase {
|
||||
ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(vsd))).getText().length() > 50
|
||||
);
|
||||
|
||||
// Outlook msg
|
||||
assertTrue(
|
||||
ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(msg)))
|
||||
instanceof HSMFTextExtactor
|
||||
);
|
||||
assertTrue(
|
||||
ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(msg))).getText().length() > 50
|
||||
);
|
||||
|
||||
// Text
|
||||
try {
|
||||
ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(txt)));
|
||||
@ -323,57 +355,58 @@ public class TestExtractorFactory extends TestCase {
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Test embedded docs text extraction. For now, only
|
||||
* does poifs embedded, but will do ooxml ones
|
||||
* at some point.
|
||||
*/
|
||||
public void testEmbeded() throws Exception {
|
||||
POIOLE2TextExtractor ext;
|
||||
POITextExtractor[] embeds;
|
||||
/**
|
||||
* Test embedded docs text extraction. For now, only
|
||||
* does poifs embedded, but will do ooxml ones
|
||||
* at some point.
|
||||
*/
|
||||
public void testEmbeded() throws Exception {
|
||||
POIOLE2TextExtractor ext;
|
||||
POITextExtractor[] embeds;
|
||||
|
||||
// No embeddings
|
||||
ext = (POIOLE2TextExtractor)
|
||||
ExtractorFactory.createExtractor(xls);
|
||||
embeds = ExtractorFactory.getEmbededDocsTextExtractors(ext);
|
||||
assertEquals(0, embeds.length);
|
||||
|
||||
// Excel
|
||||
ext = (POIOLE2TextExtractor)
|
||||
ExtractorFactory.createExtractor(xlsEmb);
|
||||
embeds = ExtractorFactory.getEmbededDocsTextExtractors(ext);
|
||||
// No embeddings
|
||||
ext = (POIOLE2TextExtractor)
|
||||
ExtractorFactory.createExtractor(xls);
|
||||
embeds = ExtractorFactory.getEmbededDocsTextExtractors(ext);
|
||||
assertEquals(0, embeds.length);
|
||||
|
||||
assertEquals(6, embeds.length);
|
||||
int numWord = 0, numXls = 0, numPpt = 0;
|
||||
for(int i=0; i<embeds.length; i++) {
|
||||
assertTrue(embeds[i].getText().length() > 20);
|
||||
// Excel
|
||||
ext = (POIOLE2TextExtractor)
|
||||
ExtractorFactory.createExtractor(xlsEmb);
|
||||
embeds = ExtractorFactory.getEmbededDocsTextExtractors(ext);
|
||||
|
||||
if(embeds[i] instanceof PowerPointExtractor) numPpt++;
|
||||
else if(embeds[i] instanceof ExcelExtractor) numXls++;
|
||||
else if(embeds[i] instanceof WordExtractor) numWord++;
|
||||
}
|
||||
assertEquals(2, numPpt);
|
||||
assertEquals(2, numXls);
|
||||
assertEquals(2, numWord);
|
||||
assertEquals(6, embeds.length);
|
||||
int numWord = 0, numXls = 0, numPpt = 0;
|
||||
for(int i=0; i<embeds.length; i++) {
|
||||
assertTrue(embeds[i].getText().length() > 20);
|
||||
|
||||
// Word
|
||||
ext = (POIOLE2TextExtractor)
|
||||
ExtractorFactory.createExtractor(docEmb);
|
||||
embeds = ExtractorFactory.getEmbededDocsTextExtractors(ext);
|
||||
|
||||
numWord = 0; numXls = 0; numPpt = 0;
|
||||
assertEquals(4, embeds.length);
|
||||
for(int i=0; i<embeds.length; i++) {
|
||||
assertTrue(embeds[i].getText().length() > 20);
|
||||
if(embeds[i] instanceof PowerPointExtractor) numPpt++;
|
||||
else if(embeds[i] instanceof ExcelExtractor) numXls++;
|
||||
else if(embeds[i] instanceof WordExtractor) numWord++;
|
||||
}
|
||||
assertEquals(1, numPpt);
|
||||
assertEquals(2, numXls);
|
||||
assertEquals(1, numWord);
|
||||
if(embeds[i] instanceof PowerPointExtractor) numPpt++;
|
||||
else if(embeds[i] instanceof ExcelExtractor) numXls++;
|
||||
else if(embeds[i] instanceof WordExtractor) numWord++;
|
||||
}
|
||||
assertEquals(2, numPpt);
|
||||
assertEquals(2, numXls);
|
||||
assertEquals(2, numWord);
|
||||
|
||||
// TODO - PowerPoint
|
||||
// TODO - Visio
|
||||
}
|
||||
// Word
|
||||
ext = (POIOLE2TextExtractor)
|
||||
ExtractorFactory.createExtractor(docEmb);
|
||||
embeds = ExtractorFactory.getEmbededDocsTextExtractors(ext);
|
||||
|
||||
numWord = 0; numXls = 0; numPpt = 0;
|
||||
assertEquals(4, embeds.length);
|
||||
for(int i=0; i<embeds.length; i++) {
|
||||
assertTrue(embeds[i].getText().length() > 20);
|
||||
if(embeds[i] instanceof PowerPointExtractor) numPpt++;
|
||||
else if(embeds[i] instanceof ExcelExtractor) numXls++;
|
||||
else if(embeds[i] instanceof WordExtractor) numWord++;
|
||||
}
|
||||
assertEquals(1, numPpt);
|
||||
assertEquals(2, numXls);
|
||||
assertEquals(1, numWord);
|
||||
|
||||
// TODO - PowerPoint
|
||||
// TODO - Visio
|
||||
// TODO - Outlook
|
||||
}
|
||||
}
|
||||
|
@ -34,6 +34,7 @@ import org.apache.poi.hsmf.datatypes.RecipientChunks;
|
||||
import org.apache.poi.hsmf.datatypes.StringChunk;
|
||||
import org.apache.poi.hsmf.exceptions.ChunkNotFoundException;
|
||||
import org.apache.poi.hsmf.parsers.POIFSChunkParser;
|
||||
import org.apache.poi.poifs.filesystem.DirectoryNode;
|
||||
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
|
||||
|
||||
/**
|
||||
@ -78,15 +79,24 @@ public class MAPIMessage extends POIDocument {
|
||||
this(new POIFSFileSystem(in));
|
||||
}
|
||||
/**
|
||||
* Constructor for reading MSG Files from an input stream.
|
||||
* Constructor for reading MSG Files from a POIFS filesystem
|
||||
* @param fs the POIFS filesystem to read the MSG file from
|
||||
* @throws IOException
|
||||
*/
|
||||
public MAPIMessage(POIFSFileSystem fs) throws IOException {
|
||||
super(fs);
|
||||
|
||||
this(fs.getRoot(), fs);
|
||||
}
|
||||
/**
|
||||
* Constructor for reading MSG Files from a certain
|
||||
* point within a POIFS filesystem
|
||||
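* @param poifsDir the directory within the POIFS filesystem to read the message from
* @param fs the POIFS filesystem that contains poifsDir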
|
||||
* @throws IOException
|
||||
*/
|
||||
public MAPIMessage(DirectoryNode poifsDir, POIFSFileSystem fs) throws IOException {
|
||||
super(poifsDir, fs);
|
||||
|
||||
// Grab all the chunks
|
||||
ChunkGroup[] chunkGroups = POIFSChunkParser.parse(fs);
|
||||
ChunkGroup[] chunkGroups = POIFSChunkParser.parse(poifsDir);
|
||||
|
||||
// Grab interesting bits
|
||||
ArrayList<AttachmentChunks> attachments = new ArrayList<AttachmentChunks>();
|
||||
|
@ -23,12 +23,16 @@ import java.text.SimpleDateFormat;
|
||||
import org.apache.poi.POIOLE2TextExtractor;
|
||||
import org.apache.poi.hsmf.MAPIMessage;
|
||||
import org.apache.poi.hsmf.exceptions.ChunkNotFoundException;
|
||||
import org.apache.poi.poifs.filesystem.DirectoryNode;
|
||||
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
|
||||
|
||||
public class HSMFTextExtactor extends POIOLE2TextExtractor {
|
||||
public HSMFTextExtactor(MAPIMessage msg) {
|
||||
super(msg);
|
||||
}
|
||||
public HSMFTextExtactor(DirectoryNode poifsDir, POIFSFileSystem fs) throws IOException {
|
||||
this(new MAPIMessage(poifsDir, fs));
|
||||
}
|
||||
public HSMFTextExtactor(POIFSFileSystem fs) throws IOException {
|
||||
this(new MAPIMessage(fs));
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user