Improve HSMF encoding guessing for 7 bit fields, and allow HSMF access to the HTML body contents in MAPIMessage

git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@1087726 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Nick Burch 2011-04-01 14:51:45 +00:00
parent 54a27f05db
commit 43d5e715d3
6 changed files with 115 additions and 29 deletions

View File

@ -33,6 +33,10 @@
</developers> </developers>
<changes> <changes>
<release version="3.8-beta3" date="2011-??-??">
<action dev="poi-developers" type="add">Improve HSMF encoding guessing for 7 bit fields in MAPIMessage</action>
<action dev="poi-developers" type="add">Allow HSMF access to the HTML body contents in MAPIMessage</action>
</release>
<release version="3.8-beta2" date="2011-??-??"> <release version="3.8-beta2" date="2011-??-??">
<action dev="poi-developers" type="add">Implement the load method on MemoryPackagePart</action> <action dev="poi-developers" type="add">Implement the load method on MemoryPackagePart</action>
<action dev="poi-developers" type="add">50967 - Support for continued ExtSSTRecords</action> <action dev="poi-developers" type="add">50967 - Support for continued ExtSSTRecords</action>

View File

@ -176,6 +176,16 @@ public class MAPIMessage extends POIDocument {
return getStringFromChunk(mainChunks.textBodyChunk); return getStringFromChunk(mainChunks.textBodyChunk);
} }
/**
 * Gets the HTML body of this Outlook Message, if this email
 * contains an HTML version.
 * <p>Resolves the message's HTML body chunk
 * ({@code mainChunks.htmlBodyChunk}) to a string via
 * {@code getStringFromChunk}.</p>
 * <p>NOTE(review): the method name is spelled "Hmtl" rather than
 * "Html"; the spelling is part of the public API and is left as-is.</p>
 * @return The string representation of the HTML version of the body.
 * @throws ChunkNotFoundException presumably when the message has no
 *  HTML body chunk; confirm against getStringFromChunk
 */
public String getHmtlBody() throws ChunkNotFoundException {
return getStringFromChunk(mainChunks.htmlBodyChunk);
}
/** /**
* Gets the subject line of the Outlook Message * Gets the subject line of the Outlook Message
* @throws ChunkNotFoundException * @throws ChunkNotFoundException
@ -331,7 +341,42 @@ public class MAPIMessage extends POIDocument {
if(m.matches()) { if(m.matches()) {
// Found it! Tell all the string chunks // Found it! Tell all the string chunks
String charset = m.group(1); String charset = m.group(1);
set7BitEncoding(charset);
return;
}
}
}
} catch(ChunkNotFoundException e) {}
// Nothing suitable in the headers, try HTML
try {
String html = getHmtlBody();
// Look for a content type in the meta headers
Pattern p = Pattern.compile(
"<META\\s+HTTP-EQUIV=\"Content-Type\"\\s+CONTENT=\"text/html;\\s+charset=(.*?)\""
);
Matcher m = p.matcher(html);
if(m.find()) {
// Found it! Tell all the string chunks
String charset = m.group(1);
set7BitEncoding(charset);
return;
}
} catch(ChunkNotFoundException e) {}
}
/**
* Many messages store their strings as unicode, which is
* nice and easy. Some use one-byte encodings for their
* strings, but don't easily store the encoding anywhere
* in the file!
* If you know what the encoding is of your file, you can
* use this method to set the 7 bit encoding for all
* the non unicode strings in the file.
* @see #guess7BitEncoding()
*/
public void set7BitEncoding(String charset) {
for(Chunk c : mainChunks.getAll()) { for(Chunk c : mainChunks.getAll()) {
if(c instanceof StringChunk) { if(c instanceof StringChunk) {
((StringChunk)c).set7BitEncoding(charset); ((StringChunk)c).set7BitEncoding(charset);
@ -350,10 +395,6 @@ public class MAPIMessage extends POIDocument {
} }
} }
} }
}
}
} catch(ChunkNotFoundException e) {}
}
/** /**
* Returns all the headers, one entry per line * Returns all the headers, one entry per line

View File

@ -37,6 +37,8 @@ public final class Chunks implements ChunkGroup {
public StringChunk messageClass; public StringChunk messageClass;
/** BODY Chunk, for plain/text messages */ /** BODY Chunk, for plain/text messages */
public StringChunk textBodyChunk; public StringChunk textBodyChunk;
/** BODY Html Chunk, for html messages */
public StringChunk htmlBodyChunk;
/** Subject link chunk, in plain/text */ /** Subject link chunk, in plain/text */
public StringChunk subjectChunk; public StringChunk subjectChunk;
/** Value that is in the TO field (not actually the addresses as they are stored in recip directory nodes */ /** Value that is in the TO field (not actually the addresses as they are stored in recip directory nodes */
@ -117,6 +119,10 @@ public final class Chunks implements ChunkGroup {
else if(chunk.getChunkId() == MAPIProperty.BODY.id) { else if(chunk.getChunkId() == MAPIProperty.BODY.id) {
textBodyChunk = (StringChunk)chunk; textBodyChunk = (StringChunk)chunk;
} }
else if(chunk.getChunkId() == MAPIProperty.BODY_HTML.id &&
chunk instanceof StringChunk) {
htmlBodyChunk = (StringChunk)chunk;
}
// And add to the main list // And add to the main list
allChunks.add(chunk); allChunks.add(chunk);

View File

@ -30,8 +30,11 @@ import org.apache.poi.util.StringUtil;
* A Chunk made up of a single string. * A Chunk made up of a single string.
*/ */
public class StringChunk extends Chunk { public class StringChunk extends Chunk {
private static final String DEFAULT_ENCODING = "CP1252";
private String encoding7Bit = DEFAULT_ENCODING;
private String value; private String value;
private String encoding7Bit = "CP1252"; /** Only kept around for 7 bit strings */
private byte[] rawValue;
/** /**
* Creates a String Chunk. * Creates a String Chunk.
@ -66,15 +69,23 @@ public class StringChunk extends Chunk {
*/ */
public void set7BitEncoding(String encoding) { public void set7BitEncoding(String encoding) {
this.encoding7Bit = encoding; this.encoding7Bit = encoding;
// Re-read the String if we're a 7 bit one
if(type == Types.ASCII_STRING) {
parseString(rawValue);
}
} }
public void readValue(InputStream value) throws IOException { public void readValue(InputStream value) throws IOException {
String tmpValue;
byte[] data = IOUtils.toByteArray(value); byte[] data = IOUtils.toByteArray(value);
parseString(data);
}
private void parseString(byte[] data) {
String tmpValue;
switch(type) { switch(type) {
case Types.ASCII_STRING: case Types.ASCII_STRING:
tmpValue = parseAs7BitData(data, encoding7Bit); tmpValue = parseAs7BitData(data, encoding7Bit);
this.rawValue = data;
break; break;
case Types.UNICODE_STRING: case Types.UNICODE_STRING:
tmpValue = StringUtil.getFromUnicodeLE(data); tmpValue = StringUtil.getFromUnicodeLE(data);
@ -121,7 +132,7 @@ public class StringChunk extends Chunk {
* and returns the string that that yields. * and returns the string that that yields.
*/ */
protected static String parseAs7BitData(byte[] data) { protected static String parseAs7BitData(byte[] data) {
return parseAs7BitData(data, "CP1252"); return parseAs7BitData(data, DEFAULT_ENCODING);
} }
/** /**
* Parses as non-unicode, supposedly 7 bit data * Parses as non-unicode, supposedly 7 bit data

View File

@ -35,6 +35,7 @@ public final class TestBasics extends TestCase {
private MAPIMessage attachments; private MAPIMessage attachments;
private MAPIMessage noRecipientAddress; private MAPIMessage noRecipientAddress;
private MAPIMessage cyrillic; private MAPIMessage cyrillic;
private MAPIMessage chinese;
/** /**
* Initialize this test, load up the blank.msg mapi message. * Initialize this test, load up the blank.msg mapi message.
@ -48,6 +49,7 @@ public final class TestBasics extends TestCase {
attachments = new MAPIMessage(samples.openResourceAsStream("attachment_test_msg.msg")); attachments = new MAPIMessage(samples.openResourceAsStream("attachment_test_msg.msg"));
noRecipientAddress = new MAPIMessage(samples.openResourceAsStream("no_recipient_address.msg")); noRecipientAddress = new MAPIMessage(samples.openResourceAsStream("no_recipient_address.msg"));
cyrillic = new MAPIMessage(samples.openResourceAsStream("cyrillic_message.msg")); cyrillic = new MAPIMessage(samples.openResourceAsStream("cyrillic_message.msg"));
chinese = new MAPIMessage(samples.openResourceAsStream("chinese-traditional.msg"));
} }
/** /**
@ -195,5 +197,27 @@ public final class TestBasics extends TestCase {
assertEquals("Cp1251", cyrillic.getRecipientDetailsChunks()[0].recipientDisplayNameChunk.get7BitEncoding()); assertEquals("Cp1251", cyrillic.getRecipientDetailsChunks()[0].recipientDisplayNameChunk.get7BitEncoding());
assertEquals("Cp1251", cyrillic.getRecipientDetailsChunks()[1].recipientDisplayNameChunk.get7BitEncoding()); assertEquals("Cp1251", cyrillic.getRecipientDetailsChunks()[1].recipientDisplayNameChunk.get7BitEncoding());
// Override it, check it's taken
cyrillic.set7BitEncoding("UTF-8");
assertEquals("UTF-8", cyrillic.getRecipientDetailsChunks()[0].recipientDisplayNameChunk.get7BitEncoding());
assertEquals("UTF-8", cyrillic.getRecipientDetailsChunks()[1].recipientDisplayNameChunk.get7BitEncoding());
// Check with a file that has no headers
try {
chinese.getHeaders();
fail("File doesn't have headers!");
} catch(ChunkNotFoundException e) {}
String html = chinese.getHmtlBody();
assertTrue("Charset not found:\n" + html, html.contains("text/html; charset=big5"));
// Defaults to CP1252
assertEquals("CP1252", chinese.getRecipientDetailsChunks()[0].recipientDisplayNameChunk.get7BitEncoding());
// But after guessing goes to the correct one, Big 5
chinese.guess7BitEncoding();
assertEquals("big5", chinese.getRecipientDetailsChunks()[0].recipientDisplayNameChunk.get7BitEncoding());
} }
} }

Binary file not shown.