Add embedded (attachment) support to the Outlook text extractor

git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@897258 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Nick Burch 2010-01-08 16:44:08 +00:00
parent 98cea49eb5
commit 5ad8301c2a
4 changed files with 74 additions and 11 deletions

View File

@ -34,7 +34,8 @@
<changes> <changes>
<release version="3.7-SNAPSHOT" date="2010-??-??"> <release version="3.7-SNAPSHOT" date="2010-??-??">
<action dev="POI-DEVELOPERS" type="fix">Add a text extractor (OutlookTextExtractor) to HSMF for simpler extraction of text from .msg files</action> <action dev="POI-DEVELOPERS" type="add">Support attachments as embeded documents within the new OutlookTextExtractor</action>
<action dev="POI-DEVELOPERS" type="add">Add a text extractor (OutlookTextExtractor) to HSMF for simpler extraction of text from .msg files</action>
<action dev="POI-DEVELOPERS" type="fix">Some improvements to HSMF parsing of .msg files</action> <action dev="POI-DEVELOPERS" type="fix">Some improvements to HSMF parsing of .msg files</action>
<action dev="POI-DEVELOPERS" type="fix">Initialise the link type of HSSFHyperLink, so that getType() on it works</action> <action dev="POI-DEVELOPERS" type="fix">Initialise the link type of HSSFHyperLink, so that getType() on it works</action>
<action dev="POI-DEVELOPERS" type="fix">48425 - improved performance of DateUtil.isCellDateFormatted() </action> <action dev="POI-DEVELOPERS" type="fix">48425 - improved performance of DateUtil.isCellDateFormatted() </action>

View File

@ -16,6 +16,7 @@
==================================================================== */ ==================================================================== */
package org.apache.poi.extractor; package org.apache.poi.extractor;
import java.io.ByteArrayInputStream;
import java.io.File; import java.io.File;
import java.io.FileInputStream; import java.io.FileInputStream;
import java.io.FileNotFoundException; import java.io.FileNotFoundException;
@ -31,6 +32,8 @@ import org.apache.poi.POIXMLDocument;
import org.apache.poi.POIXMLTextExtractor; import org.apache.poi.POIXMLTextExtractor;
import org.apache.poi.hdgf.extractor.VisioTextExtractor; import org.apache.poi.hdgf.extractor.VisioTextExtractor;
import org.apache.poi.hslf.extractor.PowerPointExtractor; import org.apache.poi.hslf.extractor.PowerPointExtractor;
import org.apache.poi.hsmf.MAPIMessage;
import org.apache.poi.hsmf.datatypes.AttachmentChunks;
import org.apache.poi.hsmf.extractor.OutlookTextExtactor; import org.apache.poi.hsmf.extractor.OutlookTextExtactor;
import org.apache.poi.hssf.extractor.ExcelExtractor; import org.apache.poi.hssf.extractor.ExcelExtractor;
import org.apache.poi.hwpf.extractor.WordExtractor; import org.apache.poi.hwpf.extractor.WordExtractor;
@ -139,9 +142,14 @@ public class ExtractorFactory {
if(entry.getName().equals("VisioDocument")) { if(entry.getName().equals("VisioDocument")) {
return new VisioTextExtractor(poifsDir, fs); return new VisioTextExtractor(poifsDir, fs);
} }
if(entry.getName().equals("__substg1.0_1000001E") || if(
entry.getName().equals("__substg1.0_1000001E") ||
entry.getName().equals("__substg1.0_1000001F") ||
entry.getName().equals("__substg1.0_0047001E") || entry.getName().equals("__substg1.0_0047001E") ||
entry.getName().equals("__substg1.0_0037001E")) { entry.getName().equals("__substg1.0_0047001F") ||
entry.getName().equals("__substg1.0_0037001E") ||
entry.getName().equals("__substg1.0_0037001F")
) {
return new OutlookTextExtactor(poifsDir, fs); return new OutlookTextExtactor(poifsDir, fs);
} }
} }
@ -157,8 +165,12 @@ public class ExtractorFactory {
* {@link POITextExtractor} for each embeded file. * {@link POITextExtractor} for each embeded file.
*/ */
public static POITextExtractor[] getEmbededDocsTextExtractors(POIOLE2TextExtractor ext) throws IOException { public static POITextExtractor[] getEmbededDocsTextExtractors(POIOLE2TextExtractor ext) throws IOException {
// Find all the embeded directories // All the embded directories we spotted
ArrayList<Entry> dirs = new ArrayList<Entry>(); ArrayList<Entry> dirs = new ArrayList<Entry>();
// For anything else not directly held in as a POIFS directory
ArrayList<InputStream> nonPOIFS = new ArrayList<InputStream>();
// Find all the embeded directories
POIFSFileSystem fs = ext.getFileSystem(); POIFSFileSystem fs = ext.getFileSystem();
if(fs == null) { if(fs == null) {
throw new IllegalStateException("The extractor didn't know which POIFS it came from!"); throw new IllegalStateException("The extractor didn't know which POIFS it came from!");
@ -189,20 +201,44 @@ public class ExtractorFactory {
} else if(ext instanceof PowerPointExtractor) { } else if(ext instanceof PowerPointExtractor) {
// Tricky, not stored directly in poifs // Tricky, not stored directly in poifs
// TODO // TODO
} else if(ext instanceof OutlookTextExtactor) {
// Stored in the Attachment blocks
MAPIMessage msg = ((OutlookTextExtactor)ext).getMAPIMessage();
for(AttachmentChunks attachment : msg.getAttachmentFiles()) {
if(attachment.attachData != null) {
byte[] data = attachment.attachData.getValue();
nonPOIFS.add( new ByteArrayInputStream(data) );
}
}
} }
// Create the extractors // Create the extractors
if(dirs == null || dirs.size() == 0) { if(
(dirs == null || dirs.size() == 0) &&
(nonPOIFS == null || nonPOIFS.size() == 0)
){
return new POITextExtractor[0]; return new POITextExtractor[0];
} }
POITextExtractor[] te = new POITextExtractor[dirs.size()]; ArrayList<POITextExtractor> e = new ArrayList<POITextExtractor>();
for(int i=0; i<te.length; i++) { for(int i=0; i<dirs.size(); i++) {
te[i] = createExtractor( e.add( createExtractor(
(DirectoryNode)dirs.get(i), ext.getFileSystem() (DirectoryNode)dirs.get(i), ext.getFileSystem()
); ) );
} }
return te; for(int i=0; i<nonPOIFS.size(); i++) {
try {
e.add( createExtractor(nonPOIFS.get(i)) );
} catch(IllegalArgumentException ie) {
// Ignore, just means it didn't contain
// a format we support as yet
} catch(XmlException xe) {
throw new IOException(xe.getMessage());
} catch(OpenXML4JException oe) {
throw new IOException(oe.getMessage());
}
}
return e.toArray(new POITextExtractor[e.size()]);
} }
/** /**

View File

@ -59,6 +59,8 @@ public class TestExtractorFactory extends TestCase {
private File pptx; private File pptx;
private File msg; private File msg;
private File msgEmb;
private File vsd; private File vsd;
protected void setUp() throws Exception { protected void setUp() throws Exception {
@ -86,6 +88,7 @@ public class TestExtractorFactory extends TestCase {
POIDataSamples olTests = POIDataSamples.getHSMFInstance(); POIDataSamples olTests = POIDataSamples.getHSMFInstance();
msg = olTests.getFile("quick.msg"); msg = olTests.getFile("quick.msg");
msgEmb = olTests.getFile("attachment_test_msg.msg");
} }
public void testFile() throws Exception { public void testFile() throws Exception {
@ -404,9 +407,25 @@ public class TestExtractorFactory extends TestCase {
assertEquals(1, numPpt); assertEquals(1, numPpt);
assertEquals(2, numXls); assertEquals(2, numXls);
assertEquals(1, numWord); assertEquals(1, numWord);
// Outlook
ext = (OutlookTextExtactor)
ExtractorFactory.createExtractor(msgEmb);
embeds = ExtractorFactory.getEmbededDocsTextExtractors(ext);
numWord = 0; numXls = 0; numPpt = 0;
assertEquals(1, embeds.length);
for(int i=0; i<embeds.length; i++) {
assertTrue(embeds[i].getText().length() > 20);
if(embeds[i] instanceof PowerPointExtractor) numPpt++;
else if(embeds[i] instanceof ExcelExtractor) numXls++;
else if(embeds[i] instanceof WordExtractor) numWord++;
}
assertEquals(0, numPpt);
assertEquals(0, numXls);
assertEquals(1, numWord);
// TODO - PowerPoint // TODO - PowerPoint
// TODO - Visio // TODO - Visio
// TODO - Outlook
} }
} }

View File

@ -44,6 +44,13 @@ public class OutlookTextExtactor extends POIOLE2TextExtractor {
this(new MAPIMessage(inp)); this(new MAPIMessage(inp));
} }
/**
 * Gives callers access to the {@link MAPIMessage} held in this
 * extractor's {@code document} field.
 *
 * @return the {@code document} field, cast to {@link MAPIMessage}
 */
public MAPIMessage getMAPIMessage() {
    MAPIMessage message = (MAPIMessage)document;
    return message;
}
/** /**
* Outputs something a little like a RFC822 email * Outputs something a little like a RFC822 email
*/ */