Support nested Outlook files in ExtractorFactory
git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@982334 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
514e6be1fe
commit
8dcf35452d
@ -50,7 +50,6 @@ import org.apache.poi.poifs.filesystem.DirectoryEntry;
|
|||||||
import org.apache.poi.poifs.filesystem.DirectoryNode;
|
import org.apache.poi.poifs.filesystem.DirectoryNode;
|
||||||
import org.apache.poi.poifs.filesystem.Entry;
|
import org.apache.poi.poifs.filesystem.Entry;
|
||||||
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
|
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
|
||||||
import org.apache.poi.xslf.XSLFSlideShow;
|
|
||||||
import org.apache.poi.xslf.extractor.XSLFPowerPointExtractor;
|
import org.apache.poi.xslf.extractor.XSLFPowerPointExtractor;
|
||||||
import org.apache.poi.xslf.usermodel.XSLFRelation;
|
import org.apache.poi.xslf.usermodel.XSLFRelation;
|
||||||
import org.apache.poi.xssf.extractor.XSSFEventBasedExcelExtractor;
|
import org.apache.poi.xssf.extractor.XSSFEventBasedExcelExtractor;
|
||||||
@ -289,8 +288,10 @@ public class ExtractorFactory {
|
|||||||
MAPIMessage msg = ((OutlookTextExtactor)ext).getMAPIMessage();
|
MAPIMessage msg = ((OutlookTextExtactor)ext).getMAPIMessage();
|
||||||
for(AttachmentChunks attachment : msg.getAttachmentFiles()) {
|
for(AttachmentChunks attachment : msg.getAttachmentFiles()) {
|
||||||
if(attachment.attachData != null) {
|
if(attachment.attachData != null) {
|
||||||
byte[] data = attachment.attachData.getValue();
|
byte[] data = attachment.attachData.getValue();
|
||||||
nonPOIFS.add( new ByteArrayInputStream(data) );
|
nonPOIFS.add( new ByteArrayInputStream(data) );
|
||||||
|
} else if(attachment.attachmentDirectory != null) {
|
||||||
|
dirs.add(attachment.attachmentDirectory.getDirectory());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -66,6 +66,7 @@ public class TestExtractorFactory extends TestCase {
|
|||||||
|
|
||||||
private File msg;
|
private File msg;
|
||||||
private File msgEmb;
|
private File msgEmb;
|
||||||
|
private File msgEmbMsg;
|
||||||
|
|
||||||
private File vsd;
|
private File vsd;
|
||||||
|
|
||||||
@ -102,6 +103,7 @@ public class TestExtractorFactory extends TestCase {
|
|||||||
POIDataSamples olTests = POIDataSamples.getHSMFInstance();
|
POIDataSamples olTests = POIDataSamples.getHSMFInstance();
|
||||||
msg = olTests.getFile("quick.msg");
|
msg = olTests.getFile("quick.msg");
|
||||||
msgEmb = olTests.getFile("attachment_test_msg.msg");
|
msgEmb = olTests.getFile("attachment_test_msg.msg");
|
||||||
|
msgEmbMsg = olTests.getFile("attachment_msg_pdf.msg");
|
||||||
}
|
}
|
||||||
|
|
||||||
public void testFile() throws Exception {
|
public void testFile() throws Exception {
|
||||||
@ -534,51 +536,77 @@ public class TestExtractorFactory extends TestCase {
|
|||||||
embeds = ExtractorFactory.getEmbededDocsTextExtractors(ext);
|
embeds = ExtractorFactory.getEmbededDocsTextExtractors(ext);
|
||||||
|
|
||||||
assertEquals(6, embeds.length);
|
assertEquals(6, embeds.length);
|
||||||
int numWord = 0, numXls = 0, numPpt = 0;
|
int numWord = 0, numXls = 0, numPpt = 0, numMsg = 0;
|
||||||
for(int i=0; i<embeds.length; i++) {
|
for(int i=0; i<embeds.length; i++) {
|
||||||
assertTrue(embeds[i].getText().length() > 20);
|
assertTrue(embeds[i].getText().length() > 20);
|
||||||
|
|
||||||
if(embeds[i] instanceof PowerPointExtractor) numPpt++;
|
if(embeds[i] instanceof PowerPointExtractor) numPpt++;
|
||||||
else if(embeds[i] instanceof ExcelExtractor) numXls++;
|
else if(embeds[i] instanceof ExcelExtractor) numXls++;
|
||||||
else if(embeds[i] instanceof WordExtractor) numWord++;
|
else if(embeds[i] instanceof WordExtractor) numWord++;
|
||||||
|
else if(embeds[i] instanceof OutlookTextExtactor) numMsg++;
|
||||||
}
|
}
|
||||||
assertEquals(2, numPpt);
|
assertEquals(2, numPpt);
|
||||||
assertEquals(2, numXls);
|
assertEquals(2, numXls);
|
||||||
assertEquals(2, numWord);
|
assertEquals(2, numWord);
|
||||||
|
assertEquals(0, numMsg);
|
||||||
|
|
||||||
// Word
|
// Word
|
||||||
ext = (POIOLE2TextExtractor)
|
ext = (POIOLE2TextExtractor)
|
||||||
ExtractorFactory.createExtractor(docEmb);
|
ExtractorFactory.createExtractor(docEmb);
|
||||||
embeds = ExtractorFactory.getEmbededDocsTextExtractors(ext);
|
embeds = ExtractorFactory.getEmbededDocsTextExtractors(ext);
|
||||||
|
|
||||||
numWord = 0; numXls = 0; numPpt = 0;
|
numWord = 0; numXls = 0; numPpt = 0; numMsg = 0;
|
||||||
assertEquals(4, embeds.length);
|
assertEquals(4, embeds.length);
|
||||||
for(int i=0; i<embeds.length; i++) {
|
for(int i=0; i<embeds.length; i++) {
|
||||||
assertTrue(embeds[i].getText().length() > 20);
|
assertTrue(embeds[i].getText().length() > 20);
|
||||||
if(embeds[i] instanceof PowerPointExtractor) numPpt++;
|
if(embeds[i] instanceof PowerPointExtractor) numPpt++;
|
||||||
else if(embeds[i] instanceof ExcelExtractor) numXls++;
|
else if(embeds[i] instanceof ExcelExtractor) numXls++;
|
||||||
else if(embeds[i] instanceof WordExtractor) numWord++;
|
else if(embeds[i] instanceof WordExtractor) numWord++;
|
||||||
|
else if(embeds[i] instanceof OutlookTextExtactor) numMsg++;
|
||||||
}
|
}
|
||||||
assertEquals(1, numPpt);
|
assertEquals(1, numPpt);
|
||||||
assertEquals(2, numXls);
|
assertEquals(2, numXls);
|
||||||
assertEquals(1, numWord);
|
assertEquals(1, numWord);
|
||||||
|
assertEquals(0, numMsg);
|
||||||
|
|
||||||
// Outlook
|
// Outlook
|
||||||
ext = (OutlookTextExtactor)
|
ext = (OutlookTextExtactor)
|
||||||
ExtractorFactory.createExtractor(msgEmb);
|
ExtractorFactory.createExtractor(msgEmb);
|
||||||
embeds = ExtractorFactory.getEmbededDocsTextExtractors(ext);
|
embeds = ExtractorFactory.getEmbededDocsTextExtractors(ext);
|
||||||
|
|
||||||
numWord = 0; numXls = 0; numPpt = 0;
|
numWord = 0; numXls = 0; numPpt = 0; numMsg = 0;
|
||||||
assertEquals(1, embeds.length);
|
assertEquals(1, embeds.length);
|
||||||
for(int i=0; i<embeds.length; i++) {
|
for(int i=0; i<embeds.length; i++) {
|
||||||
assertTrue(embeds[i].getText().length() > 20);
|
assertTrue(embeds[i].getText().length() > 20);
|
||||||
if(embeds[i] instanceof PowerPointExtractor) numPpt++;
|
if(embeds[i] instanceof PowerPointExtractor) numPpt++;
|
||||||
else if(embeds[i] instanceof ExcelExtractor) numXls++;
|
else if(embeds[i] instanceof ExcelExtractor) numXls++;
|
||||||
else if(embeds[i] instanceof WordExtractor) numWord++;
|
else if(embeds[i] instanceof WordExtractor) numWord++;
|
||||||
|
else if(embeds[i] instanceof OutlookTextExtactor) numMsg++;
|
||||||
}
|
}
|
||||||
assertEquals(0, numPpt);
|
assertEquals(0, numPpt);
|
||||||
assertEquals(0, numXls);
|
assertEquals(0, numXls);
|
||||||
assertEquals(1, numWord);
|
assertEquals(1, numWord);
|
||||||
|
assertEquals(0, numMsg);
|
||||||
|
|
||||||
|
// Outlook with another outlook file in it
|
||||||
|
ext = (OutlookTextExtactor)
|
||||||
|
ExtractorFactory.createExtractor(msgEmbMsg);
|
||||||
|
embeds = ExtractorFactory.getEmbededDocsTextExtractors(ext);
|
||||||
|
|
||||||
|
numWord = 0; numXls = 0; numPpt = 0; numMsg = 0;
|
||||||
|
assertEquals(1, embeds.length);
|
||||||
|
for(int i=0; i<embeds.length; i++) {
|
||||||
|
assertTrue(embeds[i].getText().length() > 20);
|
||||||
|
if(embeds[i] instanceof PowerPointExtractor) numPpt++;
|
||||||
|
else if(embeds[i] instanceof ExcelExtractor) numXls++;
|
||||||
|
else if(embeds[i] instanceof WordExtractor) numWord++;
|
||||||
|
else if(embeds[i] instanceof OutlookTextExtactor) numMsg++;
|
||||||
|
}
|
||||||
|
assertEquals(0, numPpt);
|
||||||
|
assertEquals(0, numXls);
|
||||||
|
assertEquals(0, numWord);
|
||||||
|
assertEquals(1, numMsg);
|
||||||
|
|
||||||
|
|
||||||
// TODO - PowerPoint
|
// TODO - PowerPoint
|
||||||
// TODO - Publisher
|
// TODO - Publisher
|
||||||
|
Loading…
Reference in New Issue
Block a user