Add a text extractor to HSMF for simpler extraction of text from .msg files
git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@897242 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
a6e7575999
commit
bd2f63c721
@ -34,6 +34,7 @@
|
||||
|
||||
<changes>
|
||||
<release version="3.7-SNAPSHOT" date="2010-??-??">
|
||||
<action dev="POI-DEVELOPERS" type="fix">Add a text extractor to HSMF for simpler extraction of text from .msg files</action>
|
||||
<action dev="POI-DEVELOPERS" type="fix">Some improvements to HSMF parsing of .msg files</action>
|
||||
<action dev="POI-DEVELOPERS" type="fix">Initialise the link type of HSSFHyperLink, so that getType() on it works</action>
|
||||
<action dev="POI-DEVELOPERS" type="fix">48425 - improved performance of DateUtil.isCellDateFormatted() </action>
|
||||
|
@ -21,9 +21,11 @@ import java.io.File;
|
||||
import java.io.FileInputStream;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.io.OutputStream;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Calendar;
|
||||
|
||||
import org.apache.poi.POIDocument;
|
||||
import org.apache.poi.hsmf.datatypes.AttachmentChunks;
|
||||
import org.apache.poi.hsmf.datatypes.ChunkGroup;
|
||||
import org.apache.poi.hsmf.datatypes.Chunks;
|
||||
@ -42,9 +44,7 @@ import org.apache.poi.poifs.filesystem.POIFSFileSystem;
|
||||
*
|
||||
* [MS-OXCMSG]: Message and Attachment Object Protocol Specification
|
||||
*/
|
||||
public class MAPIMessage {
|
||||
private POIFSFileSystem fs;
|
||||
|
||||
public class MAPIMessage extends POIDocument {
|
||||
private Chunks mainChunks;
|
||||
private NameIdChunks nameIdChunks;
|
||||
private RecipientChunks recipientChunks;
|
||||
@ -55,7 +55,8 @@ public class MAPIMessage {
|
||||
*
|
||||
*/
|
||||
public MAPIMessage() {
|
||||
//TODO make writing possible
|
||||
// TODO - make writing possible
|
||||
super(new POIFSFileSystem());
|
||||
}
|
||||
|
||||
|
||||
@ -82,10 +83,10 @@ public class MAPIMessage {
|
||||
* @throws IOException
|
||||
*/
|
||||
public MAPIMessage(POIFSFileSystem fs) throws IOException {
|
||||
this.fs = fs;
|
||||
super(fs);
|
||||
|
||||
// Grab all the chunks
|
||||
ChunkGroup[] chunkGroups = POIFSChunkParser.parse(this.fs);
|
||||
ChunkGroup[] chunkGroups = POIFSChunkParser.parse(fs);
|
||||
|
||||
// Grab interesting bits
|
||||
ArrayList<AttachmentChunks> attachments = new ArrayList<AttachmentChunks>();
|
||||
@ -249,4 +250,12 @@ public class MAPIMessage {
|
||||
public AttachmentChunks[] getAttachmentFiles() {
|
||||
return attachmentChunks;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Note - not yet supported, sorry.
|
||||
*/
|
||||
public void write(OutputStream out) throws IOException {
|
||||
throw new UnsupportedOperationException("Writing isn't yet supported for HSMF, sorry");
|
||||
}
|
||||
}
|
||||
|
@ -0,0 +1,74 @@
|
||||
/* ====================================================================
|
||||
Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
contributor license agreements. See the NOTICE file distributed with
|
||||
this work for additional information regarding copyright ownership.
|
||||
The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
(the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
==================================================================== */
|
||||
package org.apache.poi.hsmf.extractor;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.text.SimpleDateFormat;
|
||||
|
||||
import org.apache.poi.POIOLE2TextExtractor;
|
||||
import org.apache.poi.hsmf.MAPIMessage;
|
||||
import org.apache.poi.hsmf.exceptions.ChunkNotFoundException;
|
||||
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
|
||||
|
||||
public class HSMFTextExtactor extends POIOLE2TextExtractor {
|
||||
public HSMFTextExtactor(MAPIMessage msg) {
|
||||
super(msg);
|
||||
}
|
||||
public HSMFTextExtactor(POIFSFileSystem fs) throws IOException {
|
||||
this(new MAPIMessage(fs));
|
||||
}
|
||||
public HSMFTextExtactor(InputStream inp) throws IOException {
|
||||
this(new MAPIMessage(inp));
|
||||
}
|
||||
|
||||
/**
|
||||
* Outputs something a little like a RFC822 email
|
||||
*/
|
||||
public String getText() {
|
||||
MAPIMessage msg = (MAPIMessage)document;
|
||||
StringBuffer s = new StringBuffer();
|
||||
|
||||
try {
|
||||
s.append("From: " + msg.getDisplayFrom() + "\n");
|
||||
} catch(ChunkNotFoundException e) {}
|
||||
try {
|
||||
s.append("To: " + msg.getDisplayTo() + "\n");
|
||||
} catch(ChunkNotFoundException e) {}
|
||||
try {
|
||||
if(msg.getDisplayCC().length() > 0)
|
||||
s.append("CC: " + msg.getDisplayCC() + "\n");
|
||||
} catch(ChunkNotFoundException e) {}
|
||||
try {
|
||||
if(msg.getDisplayBCC().length() > 0)
|
||||
s.append("BCC: " + msg.getDisplayBCC() + "\n");
|
||||
} catch(ChunkNotFoundException e) {}
|
||||
try {
|
||||
SimpleDateFormat f = new SimpleDateFormat("E, d MMM yyyy HH:mm:ss");
|
||||
s.append("Date: " + f.format(msg.getMessageDate().getTime()) + "\n");
|
||||
} catch(ChunkNotFoundException e) {}
|
||||
try {
|
||||
s.append("Subject: " + msg.getSubject() + "\n");
|
||||
} catch(ChunkNotFoundException e) {}
|
||||
try {
|
||||
s.append("\n" + msg.getTextBody() + "\n");
|
||||
} catch(ChunkNotFoundException e) {}
|
||||
|
||||
return s.toString();
|
||||
}
|
||||
|
||||
}
|
@ -0,0 +1,95 @@
|
||||
/* ====================================================================
|
||||
Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
contributor license agreements. See the NOTICE file distributed with
|
||||
this work for additional information regarding copyright ownership.
|
||||
The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
(the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
==================================================================== */
|
||||
|
||||
package org.apache.poi.hsmf.extractor;
|
||||
|
||||
import java.io.FileInputStream;
|
||||
import java.io.IOException;
|
||||
|
||||
import junit.framework.TestCase;
|
||||
|
||||
import org.apache.poi.POIDataSamples;
|
||||
import org.apache.poi.hsmf.MAPIMessage;
|
||||
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
|
||||
|
||||
/**
|
||||
* Tests to verify that the text extractor works
|
||||
*/
|
||||
public final class TestHSMFTextExtractor extends TestCase {
|
||||
private POIDataSamples samples;
|
||||
|
||||
public TestHSMFTextExtractor() throws IOException {
|
||||
samples = POIDataSamples.getHSMFInstance();
|
||||
}
|
||||
|
||||
private void assertContains(String haystack, String needle) {
|
||||
if(haystack.indexOf(needle) > -1) {
|
||||
return;
|
||||
}
|
||||
fail("'" + needle + "' wasn't found in '" + haystack + "'");
|
||||
}
|
||||
|
||||
public void testQuick() throws Exception {
|
||||
POIFSFileSystem simple = new POIFSFileSystem(
|
||||
new FileInputStream(samples.getFile("quick.msg"))
|
||||
);
|
||||
MAPIMessage msg = new MAPIMessage(simple);
|
||||
|
||||
HSMFTextExtactor ext = new HSMFTextExtactor(msg);
|
||||
String text = ext.getText();
|
||||
|
||||
assertContains(text, "From: Kevin Roast\n");
|
||||
assertContains(text, "To: Kevin Roast\n");
|
||||
assertEquals(-1, text.indexOf("CC:"));
|
||||
assertEquals(-1, text.indexOf("BCC:"));
|
||||
assertContains(text, "Subject: Test the content transformer\n");
|
||||
assertContains(text, "Date: Thu, 14 Jun 2007 09:42:55\n");
|
||||
assertContains(text, "The quick brown fox jumps over the lazy dog");
|
||||
}
|
||||
|
||||
public void testSimple() throws Exception {
|
||||
MAPIMessage msg = new MAPIMessage(new POIFSFileSystem(
|
||||
new FileInputStream(samples.getFile("simple_test_msg.msg"))
|
||||
));
|
||||
|
||||
HSMFTextExtactor ext = new HSMFTextExtactor(msg);
|
||||
String text = ext.getText();
|
||||
|
||||
assertContains(text, "From: Travis Ferguson\n");
|
||||
assertContains(text, "To: travis@overwrittenstack.com\n");
|
||||
assertEquals(-1, text.indexOf("CC:"));
|
||||
assertEquals(-1, text.indexOf("BCC:"));
|
||||
assertContains(text, "Subject: test message\n");
|
||||
assertEquals(-1, text.indexOf("Date:"));
|
||||
assertContains(text, "This is a test message.");
|
||||
}
|
||||
|
||||
public void testConstructors() throws Exception {
|
||||
String inp = (new HSMFTextExtactor(new FileInputStream(
|
||||
samples.getFile("simple_test_msg.msg")
|
||||
)).getText());
|
||||
String poifs = (new HSMFTextExtactor(new POIFSFileSystem(new FileInputStream(
|
||||
samples.getFile("simple_test_msg.msg")
|
||||
))).getText());
|
||||
String mapi = (new HSMFTextExtactor(new MAPIMessage(new FileInputStream(
|
||||
samples.getFile("simple_test_msg.msg")
|
||||
))).getText());
|
||||
|
||||
assertEquals(inp, poifs);
|
||||
assertEquals(inp, mapi);
|
||||
}
|
||||
}
|
Loading…
Reference in New Issue
Block a user