Add embedded (attachment) support to the Outlook text extractor
git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@897258 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
98cea49eb5
commit
5ad8301c2a
@ -34,7 +34,8 @@
|
|||||||
|
|
||||||
<changes>
|
<changes>
|
||||||
<release version="3.7-SNAPSHOT" date="2010-??-??">
|
<release version="3.7-SNAPSHOT" date="2010-??-??">
|
||||||
<action dev="POI-DEVELOPERS" type="fix">Add a text extractor (OutlookTextExtractor) to HSMF for simpler extraction of text from .msg files</action>
|
<action dev="POI-DEVELOPERS" type="add">Support attachments as embedded documents within the new OutlookTextExtractor</action>
|
||||||
|
<action dev="POI-DEVELOPERS" type="add">Add a text extractor (OutlookTextExtractor) to HSMF for simpler extraction of text from .msg files</action>
|
||||||
<action dev="POI-DEVELOPERS" type="fix">Some improvements to HSMF parsing of .msg files</action>
|
<action dev="POI-DEVELOPERS" type="fix">Some improvements to HSMF parsing of .msg files</action>
|
||||||
<action dev="POI-DEVELOPERS" type="fix">Initialise the link type of HSSFHyperLink, so that getType() on it works</action>
|
<action dev="POI-DEVELOPERS" type="fix">Initialise the link type of HSSFHyperLink, so that getType() on it works</action>
|
||||||
<action dev="POI-DEVELOPERS" type="fix">48425 - improved performance of DateUtil.isCellDateFormatted() </action>
|
<action dev="POI-DEVELOPERS" type="fix">48425 - improved performance of DateUtil.isCellDateFormatted() </action>
|
||||||
|
@ -16,6 +16,7 @@
|
|||||||
==================================================================== */
|
==================================================================== */
|
||||||
package org.apache.poi.extractor;
|
package org.apache.poi.extractor;
|
||||||
|
|
||||||
|
import java.io.ByteArrayInputStream;
|
||||||
import java.io.File;
|
import java.io.File;
|
||||||
import java.io.FileInputStream;
|
import java.io.FileInputStream;
|
||||||
import java.io.FileNotFoundException;
|
import java.io.FileNotFoundException;
|
||||||
@ -31,6 +32,8 @@ import org.apache.poi.POIXMLDocument;
|
|||||||
import org.apache.poi.POIXMLTextExtractor;
|
import org.apache.poi.POIXMLTextExtractor;
|
||||||
import org.apache.poi.hdgf.extractor.VisioTextExtractor;
|
import org.apache.poi.hdgf.extractor.VisioTextExtractor;
|
||||||
import org.apache.poi.hslf.extractor.PowerPointExtractor;
|
import org.apache.poi.hslf.extractor.PowerPointExtractor;
|
||||||
|
import org.apache.poi.hsmf.MAPIMessage;
|
||||||
|
import org.apache.poi.hsmf.datatypes.AttachmentChunks;
|
||||||
import org.apache.poi.hsmf.extractor.OutlookTextExtactor;
|
import org.apache.poi.hsmf.extractor.OutlookTextExtactor;
|
||||||
import org.apache.poi.hssf.extractor.ExcelExtractor;
|
import org.apache.poi.hssf.extractor.ExcelExtractor;
|
||||||
import org.apache.poi.hwpf.extractor.WordExtractor;
|
import org.apache.poi.hwpf.extractor.WordExtractor;
|
||||||
@ -139,9 +142,14 @@ public class ExtractorFactory {
|
|||||||
if(entry.getName().equals("VisioDocument")) {
|
if(entry.getName().equals("VisioDocument")) {
|
||||||
return new VisioTextExtractor(poifsDir, fs);
|
return new VisioTextExtractor(poifsDir, fs);
|
||||||
}
|
}
|
||||||
if(entry.getName().equals("__substg1.0_1000001E") ||
|
if(
|
||||||
|
entry.getName().equals("__substg1.0_1000001E") ||
|
||||||
|
entry.getName().equals("__substg1.0_1000001F") ||
|
||||||
entry.getName().equals("__substg1.0_0047001E") ||
|
entry.getName().equals("__substg1.0_0047001E") ||
|
||||||
entry.getName().equals("__substg1.0_0037001E")) {
|
entry.getName().equals("__substg1.0_0047001F") ||
|
||||||
|
entry.getName().equals("__substg1.0_0037001E") ||
|
||||||
|
entry.getName().equals("__substg1.0_0037001F")
|
||||||
|
) {
|
||||||
return new OutlookTextExtactor(poifsDir, fs);
|
return new OutlookTextExtactor(poifsDir, fs);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -157,8 +165,12 @@ public class ExtractorFactory {
|
|||||||
* {@link POITextExtractor} for each embeded file.
|
* {@link POITextExtractor} for each embeded file.
|
||||||
*/
|
*/
|
||||||
public static POITextExtractor[] getEmbededDocsTextExtractors(POIOLE2TextExtractor ext) throws IOException {
|
public static POITextExtractor[] getEmbededDocsTextExtractors(POIOLE2TextExtractor ext) throws IOException {
|
||||||
// Find all the embeded directories
|
// All the embedded directories we spotted
|
||||||
ArrayList<Entry> dirs = new ArrayList<Entry>();
|
ArrayList<Entry> dirs = new ArrayList<Entry>();
|
||||||
|
// For anything else that is not held directly as a POIFS directory
|
||||||
|
ArrayList<InputStream> nonPOIFS = new ArrayList<InputStream>();
|
||||||
|
|
||||||
|
// Find all the embeded directories
|
||||||
POIFSFileSystem fs = ext.getFileSystem();
|
POIFSFileSystem fs = ext.getFileSystem();
|
||||||
if(fs == null) {
|
if(fs == null) {
|
||||||
throw new IllegalStateException("The extractor didn't know which POIFS it came from!");
|
throw new IllegalStateException("The extractor didn't know which POIFS it came from!");
|
||||||
@ -189,20 +201,44 @@ public class ExtractorFactory {
|
|||||||
} else if(ext instanceof PowerPointExtractor) {
|
} else if(ext instanceof PowerPointExtractor) {
|
||||||
// Tricky, not stored directly in poifs
|
// Tricky, not stored directly in poifs
|
||||||
// TODO
|
// TODO
|
||||||
|
} else if(ext instanceof OutlookTextExtactor) {
|
||||||
|
// Stored in the Attachment blocks
|
||||||
|
MAPIMessage msg = ((OutlookTextExtactor)ext).getMAPIMessage();
|
||||||
|
for(AttachmentChunks attachment : msg.getAttachmentFiles()) {
|
||||||
|
if(attachment.attachData != null) {
|
||||||
|
byte[] data = attachment.attachData.getValue();
|
||||||
|
nonPOIFS.add( new ByteArrayInputStream(data) );
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Create the extractors
|
// Create the extractors
|
||||||
if(dirs == null || dirs.size() == 0) {
|
if(
|
||||||
|
(dirs == null || dirs.size() == 0) &&
|
||||||
|
(nonPOIFS == null || nonPOIFS.size() == 0)
|
||||||
|
){
|
||||||
return new POITextExtractor[0];
|
return new POITextExtractor[0];
|
||||||
}
|
}
|
||||||
|
|
||||||
POITextExtractor[] te = new POITextExtractor[dirs.size()];
|
ArrayList<POITextExtractor> e = new ArrayList<POITextExtractor>();
|
||||||
for(int i=0; i<te.length; i++) {
|
for(int i=0; i<dirs.size(); i++) {
|
||||||
te[i] = createExtractor(
|
e.add( createExtractor(
|
||||||
(DirectoryNode)dirs.get(i), ext.getFileSystem()
|
(DirectoryNode)dirs.get(i), ext.getFileSystem()
|
||||||
);
|
) );
|
||||||
}
|
}
|
||||||
return te;
|
for(int i=0; i<nonPOIFS.size(); i++) {
|
||||||
|
try {
|
||||||
|
e.add( createExtractor(nonPOIFS.get(i)) );
|
||||||
|
} catch(IllegalArgumentException ie) {
|
||||||
|
// Ignore, just means it didn't contain
|
||||||
|
// a format we support as yet
|
||||||
|
} catch(XmlException xe) {
|
||||||
|
throw new IOException(xe.getMessage());
|
||||||
|
} catch(OpenXML4JException oe) {
|
||||||
|
throw new IOException(oe.getMessage());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return e.toArray(new POITextExtractor[e.size()]);
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -59,6 +59,8 @@ public class TestExtractorFactory extends TestCase {
|
|||||||
private File pptx;
|
private File pptx;
|
||||||
|
|
||||||
private File msg;
|
private File msg;
|
||||||
|
private File msgEmb;
|
||||||
|
|
||||||
private File vsd;
|
private File vsd;
|
||||||
|
|
||||||
protected void setUp() throws Exception {
|
protected void setUp() throws Exception {
|
||||||
@ -86,6 +88,7 @@ public class TestExtractorFactory extends TestCase {
|
|||||||
|
|
||||||
POIDataSamples olTests = POIDataSamples.getHSMFInstance();
|
POIDataSamples olTests = POIDataSamples.getHSMFInstance();
|
||||||
msg = olTests.getFile("quick.msg");
|
msg = olTests.getFile("quick.msg");
|
||||||
|
msgEmb = olTests.getFile("attachment_test_msg.msg");
|
||||||
}
|
}
|
||||||
|
|
||||||
public void testFile() throws Exception {
|
public void testFile() throws Exception {
|
||||||
@ -404,9 +407,25 @@ public class TestExtractorFactory extends TestCase {
|
|||||||
assertEquals(1, numPpt);
|
assertEquals(1, numPpt);
|
||||||
assertEquals(2, numXls);
|
assertEquals(2, numXls);
|
||||||
assertEquals(1, numWord);
|
assertEquals(1, numWord);
|
||||||
|
|
||||||
|
// Outlook
|
||||||
|
ext = (OutlookTextExtactor)
|
||||||
|
ExtractorFactory.createExtractor(msgEmb);
|
||||||
|
embeds = ExtractorFactory.getEmbededDocsTextExtractors(ext);
|
||||||
|
|
||||||
|
numWord = 0; numXls = 0; numPpt = 0;
|
||||||
|
assertEquals(1, embeds.length);
|
||||||
|
for(int i=0; i<embeds.length; i++) {
|
||||||
|
assertTrue(embeds[i].getText().length() > 20);
|
||||||
|
if(embeds[i] instanceof PowerPointExtractor) numPpt++;
|
||||||
|
else if(embeds[i] instanceof ExcelExtractor) numXls++;
|
||||||
|
else if(embeds[i] instanceof WordExtractor) numWord++;
|
||||||
|
}
|
||||||
|
assertEquals(0, numPpt);
|
||||||
|
assertEquals(0, numXls);
|
||||||
|
assertEquals(1, numWord);
|
||||||
|
|
||||||
// TODO - PowerPoint
|
// TODO - PowerPoint
|
||||||
// TODO - Visio
|
// TODO - Visio
|
||||||
// TODO - Outlook
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -44,6 +44,13 @@ public class OutlookTextExtactor extends POIOLE2TextExtractor {
|
|||||||
this(new MAPIMessage(inp));
|
this(new MAPIMessage(inp));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Returns the underlying MAPI message
|
||||||
|
*/
|
||||||
|
public MAPIMessage getMAPIMessage() {
|
||||||
|
return (MAPIMessage)document;
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Outputs something a little like a RFC822 email
|
* Outputs something a little like a RFC822 email
|
||||||
*/
|
*/
|
||||||
|
Loading…
Reference in New Issue
Block a user