208 lines
7.1 KiB
Java
208 lines
7.1 KiB
Java
/* ====================================================================
   Licensed to the Apache Software Foundation (ASF) under one or more
   contributor license agreements. See the NOTICE file distributed with
   this work for additional information regarding copyright ownership.
   The ASF licenses this file to You under the Apache License, Version 2.0
   (the "License"); you may not use this file except in compliance with
   the License. You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
==================================================================== */
|
|
package org.apache.poi.hsmf.extractor;
|
|
|
|
import static org.apache.poi.util.StringUtil.startsWithIgnoreCase;
|
|
|
|
import java.io.File;
|
|
import java.io.IOException;
|
|
import java.io.InputStream;
|
|
import java.text.SimpleDateFormat;
|
|
import java.util.Locale;
|
|
|
|
import org.apache.poi.POIOLE2TextExtractor;
|
|
import org.apache.poi.hsmf.MAPIMessage;
|
|
import org.apache.poi.hsmf.datatypes.AttachmentChunks;
|
|
import org.apache.poi.hsmf.datatypes.StringChunk;
|
|
import org.apache.poi.hsmf.exceptions.ChunkNotFoundException;
|
|
import org.apache.poi.poifs.filesystem.DirectoryNode;
|
|
import org.apache.poi.poifs.filesystem.NPOIFSFileSystem;
|
|
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
|
|
import org.apache.poi.util.LocaleUtil;
|
|
import org.apache.poi.util.StringUtil.StringsIterator;
|
|
|
|
/**
|
|
* A text extractor for HSMF (Outlook) .msg files.
|
|
* Outputs in a format somewhat like a plain text email.
|
|
*/
|
|
public class OutlookTextExtactor extends POIOLE2TextExtractor {
|
|
public OutlookTextExtactor(MAPIMessage msg) {
|
|
super(msg);
|
|
}
|
|
/**
|
|
* Use {@link #OutlookTextExtactor(DirectoryNode)} instead
|
|
*/
|
|
@Deprecated
|
|
public OutlookTextExtactor(DirectoryNode poifsDir, POIFSFileSystem fs) throws IOException {
|
|
this(new MAPIMessage(poifsDir, fs));
|
|
}
|
|
public OutlookTextExtactor(DirectoryNode poifsDir) throws IOException {
|
|
this(new MAPIMessage(poifsDir));
|
|
}
|
|
public OutlookTextExtactor(NPOIFSFileSystem fs) throws IOException {
|
|
this(new MAPIMessage(fs));
|
|
}
|
|
public OutlookTextExtactor(InputStream inp) throws IOException {
|
|
this(new MAPIMessage(inp));
|
|
}
|
|
|
|
public static void main(String[] args) throws Exception {
|
|
for(String filename : args) {
|
|
NPOIFSFileSystem poifs = null;
|
|
OutlookTextExtactor extractor = null;
|
|
try {
|
|
poifs = new NPOIFSFileSystem(new File(filename));
|
|
extractor = new OutlookTextExtactor(poifs);
|
|
System.out.println( extractor.getText() );
|
|
} finally {
|
|
if (extractor != null) extractor.close();
|
|
if (poifs != null) poifs.close();
|
|
}
|
|
}
|
|
}
|
|
|
|
/**
|
|
* Returns the underlying MAPI message
|
|
*/
|
|
public MAPIMessage getMAPIMessage() {
|
|
return (MAPIMessage)document;
|
|
}
|
|
|
|
/**
|
|
* Outputs something a little like a RFC822 email
|
|
*/
|
|
public String getText() {
|
|
MAPIMessage msg = (MAPIMessage)document;
|
|
StringBuffer s = new StringBuffer();
|
|
|
|
// See if we can get a suitable encoding for any
|
|
// non unicode text in the file
|
|
msg.guess7BitEncoding();
|
|
|
|
// Off we go
|
|
StringsIterator emails;
|
|
try {
|
|
emails = new StringsIterator(
|
|
msg.getRecipientEmailAddressList()
|
|
);
|
|
} catch(ChunkNotFoundException e) {
|
|
emails = new StringsIterator(new String[0]);
|
|
}
|
|
|
|
try {
|
|
s.append("From: " + msg.getDisplayFrom() + "\n");
|
|
} catch(ChunkNotFoundException e) {}
|
|
|
|
// For To, CC and BCC, try to match the names
|
|
// up with their email addresses. Relies on the
|
|
// Recipient Chunks being in the same order as
|
|
// people in To + CC + BCC.
|
|
try {
|
|
handleEmails(s, "To", msg.getDisplayTo(), emails);
|
|
} catch(ChunkNotFoundException e) {}
|
|
try {
|
|
handleEmails(s, "CC", msg.getDisplayCC(), emails);
|
|
} catch(ChunkNotFoundException e) {}
|
|
try {
|
|
handleEmails(s, "BCC", msg.getDisplayBCC(), emails);
|
|
} catch(ChunkNotFoundException e) {}
|
|
|
|
// Date - try two ways to find it
|
|
try {
|
|
// First try via the proper chunk
|
|
SimpleDateFormat f = new SimpleDateFormat("E, d MMM yyyy HH:mm:ss Z", Locale.ROOT);
|
|
f.setTimeZone(LocaleUtil.getUserTimeZone());
|
|
s.append("Date: " + f.format(msg.getMessageDate().getTime()) + "\n");
|
|
} catch(ChunkNotFoundException e) {
|
|
try {
|
|
// Failing that try via the raw headers
|
|
String[] headers = msg.getHeaders();
|
|
for(String header: headers) {
|
|
if(startsWithIgnoreCase(header, "date:")) {
|
|
s.append(
|
|
"Date:" +
|
|
header.substring(header.indexOf(':')+1) +
|
|
"\n"
|
|
);
|
|
break;
|
|
}
|
|
}
|
|
} catch(ChunkNotFoundException he) {
|
|
// We can't find the date, sorry...
|
|
}
|
|
}
|
|
|
|
try {
|
|
s.append("Subject: " + msg.getSubject() + "\n");
|
|
} catch(ChunkNotFoundException e) {}
|
|
|
|
// Display attachment names
|
|
// To get the attachments, use ExtractorFactory
|
|
for(AttachmentChunks att : msg.getAttachmentFiles()) {
|
|
StringChunk name = att.getAttachLongFileName();
|
|
if (name == null) name = att.getAttachFileName();
|
|
String attName = name == null ? null : name.getValue();
|
|
|
|
if(att.getAttachMimeTag() != null &&
|
|
att.getAttachMimeTag().getValue() != null) {
|
|
attName = att.getAttachMimeTag().getValue() + " = " + attName;
|
|
}
|
|
s.append("Attachment: " + attName + "\n");
|
|
}
|
|
|
|
try {
|
|
s.append("\n" + msg.getTextBody() + "\n");
|
|
} catch(ChunkNotFoundException e) {}
|
|
|
|
return s.toString();
|
|
}
|
|
|
|
/**
|
|
* Takes a Display focused string, eg "Nick; Jim" and an iterator
|
|
* of emails, and does its best to return something like
|
|
* "Nick <nick@example.com>; Jim <jim@example.com>"
|
|
*/
|
|
protected void handleEmails(StringBuffer s, String type, String displayText, StringsIterator emails) {
|
|
if(displayText == null || displayText.length() == 0) {
|
|
return;
|
|
}
|
|
|
|
String[] names = displayText.split(";\\s*");
|
|
boolean first = true;
|
|
|
|
s.append(type + ": ");
|
|
for(String name : names) {
|
|
if(first) {
|
|
first = false;
|
|
} else {
|
|
s.append("; ");
|
|
}
|
|
|
|
s.append(name);
|
|
if(emails.hasNext()) {
|
|
String email = emails.next();
|
|
// Append the email address in <>, assuming
|
|
// the name wasn't already the email address
|
|
if(! email.equals(name)) {
|
|
s.append( " <" + email + ">");
|
|
}
|
|
}
|
|
}
|
|
s.append("\n");
|
|
}
|
|
}
|