From 5ad8301c2ab54710f8f39db70ce060f90c54f11d Mon Sep 17 00:00:00 2001 From: Nick Burch Date: Fri, 8 Jan 2010 16:44:08 +0000 Subject: [PATCH] Add embeded (attachment) support to the outlook text extractor git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@897258 13f79535-47bb-0310-9956-ffa450edef68 --- src/documentation/content/xdocs/status.xml | 3 +- .../poi/extractor/ExtractorFactory.java | 54 +++++++++++++++---- .../poi/extractor/TestExtractorFactory.java | 21 +++++++- .../hsmf/extractor/OutlookTextExtactor.java | 7 +++ 4 files changed, 74 insertions(+), 11 deletions(-) diff --git a/src/documentation/content/xdocs/status.xml b/src/documentation/content/xdocs/status.xml index 15b375081..83042ae6b 100644 --- a/src/documentation/content/xdocs/status.xml +++ b/src/documentation/content/xdocs/status.xml @@ -34,7 +34,8 @@ - Add a text extractor (OutlookTextExtractor) to HSMF for simpler extraction of text from .msg files + Support attachments as embeded documents within the new OutlookTextExtractor + Add a text extractor (OutlookTextExtractor) to HSMF for simpler extraction of text from .msg files Some improvements to HSMF parsing of .msg files Initialise the link type of HSSFHyperLink, so that getType() on it works 48425 - improved performance of DateUtil.isCellDateFormatted() diff --git a/src/ooxml/java/org/apache/poi/extractor/ExtractorFactory.java b/src/ooxml/java/org/apache/poi/extractor/ExtractorFactory.java index 7657635e7..55d8499f5 100644 --- a/src/ooxml/java/org/apache/poi/extractor/ExtractorFactory.java +++ b/src/ooxml/java/org/apache/poi/extractor/ExtractorFactory.java @@ -16,6 +16,7 @@ ==================================================================== */ package org.apache.poi.extractor; +import java.io.ByteArrayInputStream; import java.io.File; import java.io.FileInputStream; import java.io.FileNotFoundException; @@ -31,6 +32,8 @@ import org.apache.poi.POIXMLDocument; import org.apache.poi.POIXMLTextExtractor; import org.apache.poi.hdgf.extractor.VisioTextExtractor; import org.apache.poi.hslf.extractor.PowerPointExtractor; +import org.apache.poi.hsmf.MAPIMessage; +import org.apache.poi.hsmf.datatypes.AttachmentChunks; import org.apache.poi.hsmf.extractor.OutlookTextExtactor; import org.apache.poi.hssf.extractor.ExcelExtractor; import org.apache.poi.hwpf.extractor.WordExtractor; @@ -139,9 +142,14 @@ public class ExtractorFactory { if(entry.getName().equals("VisioDocument")) { return new VisioTextExtractor(poifsDir, fs); } - if(entry.getName().equals("__substg1.0_1000001E") || + if( + entry.getName().equals("__substg1.0_1000001E") || + entry.getName().equals("__substg1.0_1000001F") || entry.getName().equals("__substg1.0_0047001E") || - entry.getName().equals("__substg1.0_0037001E")) { + entry.getName().equals("__substg1.0_0047001F") || + entry.getName().equals("__substg1.0_0037001E") || + entry.getName().equals("__substg1.0_0037001F") + ) { return new OutlookTextExtactor(poifsDir, fs); } } @@ -157,8 +165,12 @@ public class ExtractorFactory { * {@link POITextExtractor} for each embeded file. */ public static POITextExtractor[] getEmbededDocsTextExtractors(POIOLE2TextExtractor ext) throws IOException { - // Find all the embeded directories + // All the embded directories we spotted ArrayList dirs = new ArrayList(); + // For anything else not directly held in as a POIFS directory + ArrayList nonPOIFS = new ArrayList(); + + // Find all the embeded directories POIFSFileSystem fs = ext.getFileSystem(); if(fs == null) { throw new IllegalStateException("The extractor didn't know which POIFS it came from!"); @@ -189,20 +201,44 @@ public class ExtractorFactory { } else if(ext instanceof PowerPointExtractor) { // Tricky, not stored directly in poifs // TODO + } else if(ext instanceof OutlookTextExtactor) { + // Stored in the Attachment blocks + MAPIMessage msg = ((OutlookTextExtactor)ext).getMAPIMessage(); + for(AttachmentChunks attachment : msg.getAttachmentFiles()) { + if(attachment.attachData != null) { + byte[] data = attachment.attachData.getValue(); + nonPOIFS.add( new ByteArrayInputStream(data) ); + } + } } // Create the extractors - if(dirs == null || dirs.size() == 0) { + if( + (dirs == null || dirs.size() == 0) && + (nonPOIFS == null || nonPOIFS.size() == 0) + ){ return new POITextExtractor[0]; } - POITextExtractor[] te = new POITextExtractor[dirs.size()]; - for(int i=0; i e = new ArrayList(); + for(int i=0; i 20); + if(embeds[i] instanceof PowerPointExtractor) numPpt++; + else if(embeds[i] instanceof ExcelExtractor) numXls++; + else if(embeds[i] instanceof WordExtractor) numWord++; + } + assertEquals(0, numPpt); + assertEquals(0, numXls); + assertEquals(1, numWord); // TODO - PowerPoint // TODO - Visio - // TODO - Outlook } } diff --git a/src/scratchpad/src/org/apache/poi/hsmf/extractor/OutlookTextExtactor.java b/src/scratchpad/src/org/apache/poi/hsmf/extractor/OutlookTextExtactor.java index 54a2ddda6..a6ada5bb9 100644 --- a/src/scratchpad/src/org/apache/poi/hsmf/extractor/OutlookTextExtactor.java +++ b/src/scratchpad/src/org/apache/poi/hsmf/extractor/OutlookTextExtactor.java @@ -44,6 +44,13 @@ public class OutlookTextExtactor extends POIOLE2TextExtractor { this(new MAPIMessage(inp)); } + /** + * Returns the underlying MAPI message + */ + public MAPIMessage getMAPIMessage() { + return (MAPIMessage)document; + } + /** * Outputs something a little like a RFC822 email */