Add embedded (attachment) support to the Outlook text extractor
git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@897258 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
98cea49eb5
commit
5ad8301c2a
@ -34,7 +34,8 @@
|
||||
|
||||
<changes>
|
||||
<release version="3.7-SNAPSHOT" date="2010-??-??">
|
||||
<action dev="POI-DEVELOPERS" type="fix">Add a text extractor (OutlookTextExtractor) to HSMF for simpler extraction of text from .msg files</action>
|
||||
<action dev="POI-DEVELOPERS" type="add">Support attachments as embedded documents within the new OutlookTextExtractor</action>
|
||||
<action dev="POI-DEVELOPERS" type="add">Add a text extractor (OutlookTextExtractor) to HSMF for simpler extraction of text from .msg files</action>
|
||||
<action dev="POI-DEVELOPERS" type="fix">Some improvements to HSMF parsing of .msg files</action>
|
||||
<action dev="POI-DEVELOPERS" type="fix">Initialise the link type of HSSFHyperLink, so that getType() on it works</action>
|
||||
<action dev="POI-DEVELOPERS" type="fix">48425 - improved performance of DateUtil.isCellDateFormatted() </action>
|
||||
|
@ -16,6 +16,7 @@
|
||||
==================================================================== */
|
||||
package org.apache.poi.extractor;
|
||||
|
||||
import java.io.ByteArrayInputStream;
|
||||
import java.io.File;
|
||||
import java.io.FileInputStream;
|
||||
import java.io.FileNotFoundException;
|
||||
@ -31,6 +32,8 @@ import org.apache.poi.POIXMLDocument;
|
||||
import org.apache.poi.POIXMLTextExtractor;
|
||||
import org.apache.poi.hdgf.extractor.VisioTextExtractor;
|
||||
import org.apache.poi.hslf.extractor.PowerPointExtractor;
|
||||
import org.apache.poi.hsmf.MAPIMessage;
|
||||
import org.apache.poi.hsmf.datatypes.AttachmentChunks;
|
||||
import org.apache.poi.hsmf.extractor.OutlookTextExtactor;
|
||||
import org.apache.poi.hssf.extractor.ExcelExtractor;
|
||||
import org.apache.poi.hwpf.extractor.WordExtractor;
|
||||
@ -139,9 +142,14 @@ public class ExtractorFactory {
|
||||
if(entry.getName().equals("VisioDocument")) {
|
||||
return new VisioTextExtractor(poifsDir, fs);
|
||||
}
|
||||
if(entry.getName().equals("__substg1.0_1000001E") ||
|
||||
if(
|
||||
entry.getName().equals("__substg1.0_1000001E") ||
|
||||
entry.getName().equals("__substg1.0_1000001F") ||
|
||||
entry.getName().equals("__substg1.0_0047001E") ||
|
||||
entry.getName().equals("__substg1.0_0037001E")) {
|
||||
entry.getName().equals("__substg1.0_0047001F") ||
|
||||
entry.getName().equals("__substg1.0_0037001E") ||
|
||||
entry.getName().equals("__substg1.0_0037001F")
|
||||
) {
|
||||
return new OutlookTextExtactor(poifsDir, fs);
|
||||
}
|
||||
}
|
||||
@ -157,8 +165,12 @@ public class ExtractorFactory {
|
||||
* {@link POITextExtractor} for each embedded file.
|
||||
*/
|
||||
public static POITextExtractor[] getEmbededDocsTextExtractors(POIOLE2TextExtractor ext) throws IOException {
|
||||
// Find all the embeded directories
|
||||
// All the embedded directories we spotted
|
||||
ArrayList<Entry> dirs = new ArrayList<Entry>();
|
||||
// For anything else that is not held directly as a POIFS directory
|
||||
ArrayList<InputStream> nonPOIFS = new ArrayList<InputStream>();
|
||||
|
||||
// Find all the embedded directories
|
||||
POIFSFileSystem fs = ext.getFileSystem();
|
||||
if(fs == null) {
|
||||
throw new IllegalStateException("The extractor didn't know which POIFS it came from!");
|
||||
@ -189,20 +201,44 @@ public class ExtractorFactory {
|
||||
} else if(ext instanceof PowerPointExtractor) {
|
||||
// Tricky, not stored directly in poifs
|
||||
// TODO
|
||||
} else if(ext instanceof OutlookTextExtactor) {
|
||||
// Stored in the Attachment blocks
|
||||
MAPIMessage msg = ((OutlookTextExtactor)ext).getMAPIMessage();
|
||||
for(AttachmentChunks attachment : msg.getAttachmentFiles()) {
|
||||
if(attachment.attachData != null) {
|
||||
byte[] data = attachment.attachData.getValue();
|
||||
nonPOIFS.add( new ByteArrayInputStream(data) );
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Create the extractors
|
||||
if(dirs == null || dirs.size() == 0) {
|
||||
if(
|
||||
(dirs == null || dirs.size() == 0) &&
|
||||
(nonPOIFS == null || nonPOIFS.size() == 0)
|
||||
){
|
||||
return new POITextExtractor[0];
|
||||
}
|
||||
|
||||
POITextExtractor[] te = new POITextExtractor[dirs.size()];
|
||||
for(int i=0; i<te.length; i++) {
|
||||
te[i] = createExtractor(
|
||||
ArrayList<POITextExtractor> e = new ArrayList<POITextExtractor>();
|
||||
for(int i=0; i<dirs.size(); i++) {
|
||||
e.add( createExtractor(
|
||||
(DirectoryNode)dirs.get(i), ext.getFileSystem()
|
||||
);
|
||||
) );
|
||||
}
|
||||
return te;
|
||||
for(int i=0; i<nonPOIFS.size(); i++) {
|
||||
try {
|
||||
e.add( createExtractor(nonPOIFS.get(i)) );
|
||||
} catch(IllegalArgumentException ie) {
|
||||
// Ignore, just means it didn't contain
|
||||
// a format we support as yet
|
||||
} catch(XmlException xe) {
|
||||
throw new IOException(xe.getMessage());
|
||||
} catch(OpenXML4JException oe) {
|
||||
throw new IOException(oe.getMessage());
|
||||
}
|
||||
}
|
||||
return e.toArray(new POITextExtractor[e.size()]);
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -59,6 +59,8 @@ public class TestExtractorFactory extends TestCase {
|
||||
private File pptx;
|
||||
|
||||
private File msg;
|
||||
private File msgEmb;
|
||||
|
||||
private File vsd;
|
||||
|
||||
protected void setUp() throws Exception {
|
||||
@ -86,6 +88,7 @@ public class TestExtractorFactory extends TestCase {
|
||||
|
||||
POIDataSamples olTests = POIDataSamples.getHSMFInstance();
|
||||
msg = olTests.getFile("quick.msg");
|
||||
msgEmb = olTests.getFile("attachment_test_msg.msg");
|
||||
}
|
||||
|
||||
public void testFile() throws Exception {
|
||||
@ -404,9 +407,25 @@ public class TestExtractorFactory extends TestCase {
|
||||
assertEquals(1, numPpt);
|
||||
assertEquals(2, numXls);
|
||||
assertEquals(1, numWord);
|
||||
|
||||
// Outlook
|
||||
ext = (OutlookTextExtactor)
|
||||
ExtractorFactory.createExtractor(msgEmb);
|
||||
embeds = ExtractorFactory.getEmbededDocsTextExtractors(ext);
|
||||
|
||||
numWord = 0; numXls = 0; numPpt = 0;
|
||||
assertEquals(1, embeds.length);
|
||||
for(int i=0; i<embeds.length; i++) {
|
||||
assertTrue(embeds[i].getText().length() > 20);
|
||||
if(embeds[i] instanceof PowerPointExtractor) numPpt++;
|
||||
else if(embeds[i] instanceof ExcelExtractor) numXls++;
|
||||
else if(embeds[i] instanceof WordExtractor) numWord++;
|
||||
}
|
||||
assertEquals(0, numPpt);
|
||||
assertEquals(0, numXls);
|
||||
assertEquals(1, numWord);
|
||||
|
||||
// TODO - PowerPoint
|
||||
// TODO - Visio
|
||||
// TODO - Outlook
|
||||
}
|
||||
}
|
||||
|
@ -44,6 +44,13 @@ public class OutlookTextExtactor extends POIOLE2TextExtractor {
|
||||
this(new MAPIMessage(inp));
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the underlying MAPI message
|
||||
*/
|
||||
public MAPIMessage getMAPIMessage() {
|
||||
return (MAPIMessage)document;
|
||||
}
|
||||
|
||||
/**
|
||||
* Outputs something a little like a RFC822 email
|
||||
*/
|
||||
|
Loading…
Reference in New Issue
Block a user