Parse the HSMF headers chunk if present, and use it to find Dates in text extraction if needed
git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@951034 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
835efd63de
commit
65d7431a9f
@ -34,6 +34,7 @@
|
|||||||
|
|
||||||
<changes>
|
<changes>
|
||||||
<release version="3.7-SNAPSHOT" date="2010-??-??">
|
<release version="3.7-SNAPSHOT" date="2010-??-??">
|
||||||
|
<action dev="POI-DEVELOPERS" type="add">Parse the HSMF headers chunk if present, and use it to find Dates in text extraction if needed</action>
|
||||||
<action dev="POI-DEVELOPERS" type="fix">48494 - detect and support time formats like HH:MM;HH:MM</action>
|
<action dev="POI-DEVELOPERS" type="fix">48494 - detect and support time formats like HH:MM;HH:MM</action>
|
||||||
<action dev="POI-DEVELOPERS" type="fix">48494 - have ExcelExtractor make use of HSSFDataFormatter, so that numbers and dates come out closer to how Excel would render them</action>
|
<action dev="POI-DEVELOPERS" type="fix">48494 - have ExcelExtractor make use of HSSFDataFormatter, so that numbers and dates come out closer to how Excel would render them</action>
|
||||||
<action dev="POI-DEVELOPERS" type="fix">48494 - have EventBasedExcelExtractor make use of HSSFDataFormatter, so that numbers and dates come out closer to how Excel would render them</action>
|
<action dev="POI-DEVELOPERS" type="fix">48494 - have EventBasedExcelExtractor make use of HSSFDataFormatter, so that numbers and dates come out closer to how Excel would render them</action>
|
||||||
|
@ -283,6 +283,17 @@ public class MAPIMessage extends POIDocument {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Gets the message headers as an array with one entry per line.
* The header text is read from the message headers chunk and split on
* LF or CRLF line breaks; no further parsing of the lines is done here.
*
* @return the header lines, or null if no header text could be read from the chunk
* @throws ChunkNotFoundException declared by this method; the tests added in this
*         change expect it for messages that have no headers chunk
|
||||||
|
*/
|
||||||
|
public String[] getHeaders() throws ChunkNotFoundException {
|
||||||
|
String headers = getStringFromChunk(mainChunks.messageHeaders);
|
||||||
|
if(headers == null) {
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
return headers.split("\\r?\\n");
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Gets the conversation topic of the parsed Outlook Message.
|
* Gets the conversation topic of the parsed Outlook Message.
|
||||||
* This is the part of the subject line that is after the RE: and FWD:
|
* This is the part of the subject line that is after the RE: and FWD:
|
||||||
|
@ -35,6 +35,7 @@ public final class Chunks implements ChunkGroup {
|
|||||||
// 0x0050 -> 0x006F seem to be routing info or similar
|
// 0x0050 -> 0x006F seem to be routing info or similar
|
||||||
public static final int CONVERSATION_TOPIC = 0x0070;
|
public static final int CONVERSATION_TOPIC = 0x0070;
|
||||||
public static final int SENT_BY_SERVER_TYPE = 0x0075;
|
public static final int SENT_BY_SERVER_TYPE = 0x0075;
|
||||||
|
public static final int MESSAGE_HEADERS = 0x007D;
|
||||||
// RECEIVEDEMAIL = 76
|
// RECEIVEDEMAIL = 76
|
||||||
public static final int DISPLAY_TO = 0x0E04;
|
public static final int DISPLAY_TO = 0x0E04;
|
||||||
public static final int DISPLAY_FROM = 0x0C1A;
|
public static final int DISPLAY_FROM = 0x0C1A;
|
||||||
@ -66,6 +67,8 @@ public final class Chunks implements ChunkGroup {
|
|||||||
public StringChunk conversationTopic;
|
public StringChunk conversationTopic;
|
||||||
/** Type of server that the message originated from (SMTP, etc). */
|
/** Type of server that the message originated from (SMTP, etc). */
|
||||||
public StringChunk sentByServerType;
|
public StringChunk sentByServerType;
|
||||||
|
/** The email headers */
|
||||||
|
public StringChunk messageHeaders;
|
||||||
/** TODO */
|
/** TODO */
|
||||||
public MessageSubmissionChunk submissionChunk;
|
public MessageSubmissionChunk submissionChunk;
|
||||||
/** TODO */
|
/** TODO */
|
||||||
@ -104,6 +107,9 @@ public final class Chunks implements ChunkGroup {
|
|||||||
case SENT_BY_SERVER_TYPE:
|
case SENT_BY_SERVER_TYPE:
|
||||||
sentByServerType = (StringChunk)chunk;
|
sentByServerType = (StringChunk)chunk;
|
||||||
break;
|
break;
|
||||||
|
case MESSAGE_HEADERS:
|
||||||
|
messageHeaders = (StringChunk)chunk;
|
||||||
|
break;
|
||||||
case DISPLAY_TO:
|
case DISPLAY_TO:
|
||||||
displayToChunk = (StringChunk)chunk;
|
displayToChunk = (StringChunk)chunk;
|
||||||
break;
|
break;
|
||||||
|
@ -87,10 +87,30 @@ public class OutlookTextExtactor extends POIOLE2TextExtractor {
|
|||||||
handleEmails(s, "BCC", msg.getDisplayBCC(), emails);
|
handleEmails(s, "BCC", msg.getDisplayBCC(), emails);
|
||||||
} catch(ChunkNotFoundException e) {}
|
} catch(ChunkNotFoundException e) {}
|
||||||
|
|
||||||
|
// Date - try two ways to find it
|
||||||
try {
|
try {
|
||||||
|
// First try via the proper chunk
|
||||||
SimpleDateFormat f = new SimpleDateFormat("E, d MMM yyyy HH:mm:ss");
|
SimpleDateFormat f = new SimpleDateFormat("E, d MMM yyyy HH:mm:ss");
|
||||||
s.append("Date: " + f.format(msg.getMessageDate().getTime()) + "\n");
|
s.append("Date: " + f.format(msg.getMessageDate().getTime()) + "\n");
|
||||||
} catch(ChunkNotFoundException e) {}
|
} catch(ChunkNotFoundException e) {
|
||||||
|
try {
|
||||||
|
// Failing that try via the raw headers
|
||||||
|
String[] headers = msg.getHeaders();
|
||||||
|
for(String header: headers) {
|
||||||
|
if(header.toLowerCase().startsWith("date:")) {
|
||||||
|
s.append(
|
||||||
|
"Date:" +
|
||||||
|
header.substring(header.indexOf(':')+1) +
|
||||||
|
"\n"
|
||||||
|
);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} catch(ChunkNotFoundException he) {
|
||||||
|
// We can't find the date, sorry...
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
try {
|
try {
|
||||||
s.append("Subject: " + msg.getSubject() + "\n");
|
s.append("Subject: " + msg.getSubject() + "\n");
|
||||||
} catch(ChunkNotFoundException e) {}
|
} catch(ChunkNotFoundException e) {}
|
||||||
|
@ -75,6 +75,35 @@ public final class TestBasics extends TestCase {
|
|||||||
assertEquals("test pi\u00e8ce jointe 1", attachments.getSubject());
|
assertEquals("test pi\u00e8ce jointe 1", attachments.getSubject());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Tests getHeaders(): messages with headers return one array entry per
* header line, and messages without headers throw ChunkNotFoundException.
|
||||||
|
*/
|
||||||
|
public void testHeaders() throws Exception {
|
||||||
|
// Simple email first: check the line count plus the first, second and last entries
|
||||||
|
assertEquals(26, simple.getHeaders().length);
|
||||||
|
assertTrue(simple.getHeaders()[0].startsWith("Return-path:"));
|
||||||
|
assertTrue(simple.getHeaders()[1].equals("Envelope-to: travis@overwrittenstack.com"));
|
||||||
|
assertTrue(simple.getHeaders()[25].startsWith("X-Antivirus-Scanner: Clean"));
|
||||||
|
|
||||||
|
// Quick doesn't have them, so the call is expected to throw
|
||||||
|
try {
|
||||||
|
quick.getHeaders();
|
||||||
|
fail();
|
||||||
|
} catch(ChunkNotFoundException e) {}
|
||||||
|
|
||||||
|
// Attachments doesn't have them either, so the call is expected to throw
|
||||||
|
try {
|
||||||
|
attachments.getHeaders();
|
||||||
|
fail();
|
||||||
|
} catch(ChunkNotFoundException e) {}
|
||||||
|
|
||||||
|
// Outlook30 has some; its first line is a banner rather than a header field
|
||||||
|
assertEquals(33, outlook30.getHeaders().length);
|
||||||
|
assertTrue(outlook30.getHeaders()[0].startsWith("Microsoft Mail Internet Headers"));
|
||||||
|
assertTrue(outlook30.getHeaders()[1].startsWith("x-mimeole:"));
|
||||||
|
assertTrue(outlook30.getHeaders()[32].startsWith("\t\"Williams")); // Looks like a continuation line returned as its own entry; may need better parsing in future
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Test attachments
|
* Test attachments
|
||||||
*/
|
*/
|
||||||
|
@ -84,7 +84,7 @@ public final class TestOutlookTextExtractor extends TestCase {
|
|||||||
assertEquals(-1, text.indexOf("CC:"));
|
assertEquals(-1, text.indexOf("CC:"));
|
||||||
assertEquals(-1, text.indexOf("BCC:"));
|
assertEquals(-1, text.indexOf("BCC:"));
|
||||||
assertContains(text, "Subject: test message\n");
|
assertContains(text, "Subject: test message\n");
|
||||||
assertEquals(-1, text.indexOf("Date:"));
|
assertContains(text, "Date: Fri, 6 Jul 2007 01:27:17 -0400\n");
|
||||||
assertContains(text, "This is a test message.");
|
assertContains(text, "This is a test message.");
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -171,7 +171,7 @@ public final class TestOutlookTextExtractor extends TestCase {
|
|||||||
"nick.burch@alfresco.com; 'Roy Wetherall' <roy.wetherall@alfresco.com>\n");
|
"nick.burch@alfresco.com; 'Roy Wetherall' <roy.wetherall@alfresco.com>\n");
|
||||||
assertEquals(-1, text.indexOf("BCC:"));
|
assertEquals(-1, text.indexOf("BCC:"));
|
||||||
assertContains(text, "Subject: This is a test message please ignore\n");
|
assertContains(text, "Subject: This is a test message please ignore\n");
|
||||||
assertEquals(-1, text.indexOf("Date:"));
|
assertContains(text, "Date: Mon, 11 Jan 2010 16:25:07 +0000 (GMT)\n");
|
||||||
assertContains(text, "The quick brown fox jumps over the lazy dog");
|
assertContains(text, "The quick brown fox jumps over the lazy dog");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user