Improve HSMF encoding guessing for 7 bit fields, and allow HSMF access to the HTML body contents in MAPIMessage
git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@1087726 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
54a27f05db
commit
43d5e715d3
@ -33,6 +33,10 @@
|
|||||||
</developers>
|
</developers>
|
||||||
|
|
||||||
<changes>
|
<changes>
|
||||||
|
<release version="3.8-beta3" date="2011-??-??">
|
||||||
|
<action dev="poi-developers" type="add">Improve HSMF encoding guessing for 7 bit fields in MAPIMessage</action>
|
||||||
|
<action dev="poi-developers" type="add">Allow HSMF access to the HTML body contents in MAPIMessage</action>
|
||||||
|
</release>
|
||||||
<release version="3.8-beta2" date="2011-??-??">
|
<release version="3.8-beta2" date="2011-??-??">
|
||||||
<action dev="poi-developers" type="add">Implement the load method on MemoryPackagePart</action>
|
<action dev="poi-developers" type="add">Implement the load method on MemoryPackagePart</action>
|
||||||
<action dev="poi-developers" type="add">50967 - Support for continued ExtSSTRecords</action>
|
<action dev="poi-developers" type="add">50967 - Support for continued ExtSSTRecords</action>
|
||||||
|
@ -176,6 +176,16 @@ public class MAPIMessage extends POIDocument {
|
|||||||
return getStringFromChunk(mainChunks.textBodyChunk);
|
return getStringFromChunk(mainChunks.textBodyChunk);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Gets the html body of this Outlook Message, if this email
|
||||||
|
* contains an HTML version.
|
||||||
|
* @return The string representation of the 'html' version of the body, if available.
|
||||||
|
* @throws ChunkNotFoundException
|
||||||
|
*/
|
||||||
|
public String getHmtlBody() throws ChunkNotFoundException {
|
||||||
|
return getStringFromChunk(mainChunks.htmlBodyChunk);
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Gets the subject line of the Outlook Message
|
* Gets the subject line of the Outlook Message
|
||||||
* @throws ChunkNotFoundException
|
* @throws ChunkNotFoundException
|
||||||
@ -331,28 +341,59 @@ public class MAPIMessage extends POIDocument {
|
|||||||
if(m.matches()) {
|
if(m.matches()) {
|
||||||
// Found it! Tell all the string chunks
|
// Found it! Tell all the string chunks
|
||||||
String charset = m.group(1);
|
String charset = m.group(1);
|
||||||
|
set7BitEncoding(charset);
|
||||||
for(Chunk c : mainChunks.getAll()) {
|
return;
|
||||||
if(c instanceof StringChunk) {
|
|
||||||
((StringChunk)c).set7BitEncoding(charset);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
for(Chunk c : nameIdChunks.getAll()) {
|
|
||||||
if(c instanceof StringChunk) {
|
|
||||||
((StringChunk)c).set7BitEncoding(charset);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
for(RecipientChunks rc : recipientChunks) {
|
|
||||||
for(Chunk c : rc.getAll()) {
|
|
||||||
if(c instanceof StringChunk) {
|
|
||||||
((StringChunk)c).set7BitEncoding(charset);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
} catch(ChunkNotFoundException e) {}
|
} catch(ChunkNotFoundException e) {}
|
||||||
|
|
||||||
|
// Nothing suitable in the headers, try HTML
|
||||||
|
try {
|
||||||
|
String html = getHmtlBody();
|
||||||
|
|
||||||
|
// Look for a content type in the meta headers
|
||||||
|
Pattern p = Pattern.compile(
|
||||||
|
"<META\\s+HTTP-EQUIV=\"Content-Type\"\\s+CONTENT=\"text/html;\\s+charset=(.*?)\""
|
||||||
|
);
|
||||||
|
Matcher m = p.matcher(html);
|
||||||
|
if(m.find()) {
|
||||||
|
// Found it! Tell all the string chunks
|
||||||
|
String charset = m.group(1);
|
||||||
|
set7BitEncoding(charset);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
} catch(ChunkNotFoundException e) {}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Many messages store their strings as unicode, which is
|
||||||
|
* nice and easy. Some use one-byte encodings for their
|
||||||
|
* strings, but don't easily store the encoding anywhere
|
||||||
|
* in the file!
|
||||||
|
* If you know what the encoding is of your file, you can
|
||||||
|
* use this method to set the 7 bit encoding for all
|
||||||
|
* the non unicode strings in the file.
|
||||||
|
* @see #guess7BitEncoding()
|
||||||
|
*/
|
||||||
|
public void set7BitEncoding(String charset) {
|
||||||
|
for(Chunk c : mainChunks.getAll()) {
|
||||||
|
if(c instanceof StringChunk) {
|
||||||
|
((StringChunk)c).set7BitEncoding(charset);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
for(Chunk c : nameIdChunks.getAll()) {
|
||||||
|
if(c instanceof StringChunk) {
|
||||||
|
((StringChunk)c).set7BitEncoding(charset);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
for(RecipientChunks rc : recipientChunks) {
|
||||||
|
for(Chunk c : rc.getAll()) {
|
||||||
|
if(c instanceof StringChunk) {
|
||||||
|
((StringChunk)c).set7BitEncoding(charset);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -37,6 +37,8 @@ public final class Chunks implements ChunkGroup {
|
|||||||
public StringChunk messageClass;
|
public StringChunk messageClass;
|
||||||
/** BODY Chunk, for plain/text messages */
|
/** BODY Chunk, for plain/text messages */
|
||||||
public StringChunk textBodyChunk;
|
public StringChunk textBodyChunk;
|
||||||
|
/** BODY Html Chunk, for html messages */
|
||||||
|
public StringChunk htmlBodyChunk;
|
||||||
/** Subject line chunk, in plain/text */
|
/** Subject line chunk, in plain/text */
|
||||||
public StringChunk subjectChunk;
|
public StringChunk subjectChunk;
|
||||||
/** Value that is in the TO field (not actually the addresses, as they are stored in recip directory nodes) */
|
/** Value that is in the TO field (not actually the addresses, as they are stored in recip directory nodes) */
|
||||||
@ -117,6 +119,10 @@ public final class Chunks implements ChunkGroup {
|
|||||||
else if(chunk.getChunkId() == MAPIProperty.BODY.id) {
|
else if(chunk.getChunkId() == MAPIProperty.BODY.id) {
|
||||||
textBodyChunk = (StringChunk)chunk;
|
textBodyChunk = (StringChunk)chunk;
|
||||||
}
|
}
|
||||||
|
else if(chunk.getChunkId() == MAPIProperty.BODY_HTML.id &&
|
||||||
|
chunk instanceof StringChunk) {
|
||||||
|
htmlBodyChunk = (StringChunk)chunk;
|
||||||
|
}
|
||||||
|
|
||||||
// And add to the main list
|
// And add to the main list
|
||||||
allChunks.add(chunk);
|
allChunks.add(chunk);
|
||||||
|
@ -30,8 +30,11 @@ import org.apache.poi.util.StringUtil;
|
|||||||
* A Chunk made up of a single string.
|
* A Chunk made up of a single string.
|
||||||
*/
|
*/
|
||||||
public class StringChunk extends Chunk {
|
public class StringChunk extends Chunk {
|
||||||
private String value;
|
private static final String DEFAULT_ENCODING = "CP1252";
|
||||||
private String encoding7Bit = "CP1252";
|
private String encoding7Bit = DEFAULT_ENCODING;
|
||||||
|
private String value;
|
||||||
|
/** Only kept around for 7 bit strings */
|
||||||
|
private byte[] rawValue;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Creates a String Chunk.
|
* Creates a String Chunk.
|
||||||
@ -56,7 +59,7 @@ public class StringChunk extends Chunk {
|
|||||||
public String get7BitEncoding() {
|
public String get7BitEncoding() {
|
||||||
return encoding7Bit;
|
return encoding7Bit;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Sets the Encoding that will be used to
|
* Sets the Encoding that will be used to
|
||||||
* decode any "7 bit" (non unicode) data.
|
* decode any "7 bit" (non unicode) data.
|
||||||
@ -66,25 +69,33 @@ public class StringChunk extends Chunk {
|
|||||||
*/
|
*/
|
||||||
public void set7BitEncoding(String encoding) {
|
public void set7BitEncoding(String encoding) {
|
||||||
this.encoding7Bit = encoding;
|
this.encoding7Bit = encoding;
|
||||||
|
|
||||||
|
// Re-read the String if we're a 7 bit one
|
||||||
|
if(type == Types.ASCII_STRING) {
|
||||||
|
parseString(rawValue);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
public void readValue(InputStream value) throws IOException {
|
public void readValue(InputStream value) throws IOException {
|
||||||
String tmpValue;
|
byte[] data = IOUtils.toByteArray(value);
|
||||||
byte[] data = IOUtils.toByteArray(value);
|
parseString(data);
|
||||||
|
}
|
||||||
|
private void parseString(byte[] data) {
|
||||||
|
String tmpValue;
|
||||||
switch(type) {
|
switch(type) {
|
||||||
case Types.ASCII_STRING:
|
case Types.ASCII_STRING:
|
||||||
tmpValue = parseAs7BitData(data, encoding7Bit);
|
tmpValue = parseAs7BitData(data, encoding7Bit);
|
||||||
break;
|
this.rawValue = data;
|
||||||
|
break;
|
||||||
case Types.UNICODE_STRING:
|
case Types.UNICODE_STRING:
|
||||||
tmpValue = StringUtil.getFromUnicodeLE(data);
|
tmpValue = StringUtil.getFromUnicodeLE(data);
|
||||||
break;
|
break;
|
||||||
default:
|
default:
|
||||||
throw new IllegalArgumentException("Invalid type " + type + " for String Chunk");
|
throw new IllegalArgumentException("Invalid type " + type + " for String Chunk");
|
||||||
}
|
}
|
||||||
|
|
||||||
// Clean up
|
// Clean up
|
||||||
this.value = tmpValue.replace("\0", "");
|
this.value = tmpValue.replace("\0", "");
|
||||||
}
|
}
|
||||||
|
|
||||||
public void writeValue(OutputStream out) throws IOException {
|
public void writeValue(OutputStream out) throws IOException {
|
||||||
@ -121,7 +132,7 @@ public class StringChunk extends Chunk {
|
|||||||
* and returns the string that that yields.
|
* and returns the string that that yields.
|
||||||
*/
|
*/
|
||||||
protected static String parseAs7BitData(byte[] data) {
|
protected static String parseAs7BitData(byte[] data) {
|
||||||
return parseAs7BitData(data, "CP1252");
|
return parseAs7BitData(data, DEFAULT_ENCODING);
|
||||||
}
|
}
|
||||||
/**
|
/**
|
||||||
* Parses as non-unicode, supposedly 7 bit data
|
* Parses as non-unicode, supposedly 7 bit data
|
||||||
|
@ -35,6 +35,7 @@ public final class TestBasics extends TestCase {
|
|||||||
private MAPIMessage attachments;
|
private MAPIMessage attachments;
|
||||||
private MAPIMessage noRecipientAddress;
|
private MAPIMessage noRecipientAddress;
|
||||||
private MAPIMessage cyrillic;
|
private MAPIMessage cyrillic;
|
||||||
|
private MAPIMessage chinese;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Initialize this test, load up the blank.msg mapi message.
|
* Initialize this test, load up the blank.msg mapi message.
|
||||||
@ -48,6 +49,7 @@ public final class TestBasics extends TestCase {
|
|||||||
attachments = new MAPIMessage(samples.openResourceAsStream("attachment_test_msg.msg"));
|
attachments = new MAPIMessage(samples.openResourceAsStream("attachment_test_msg.msg"));
|
||||||
noRecipientAddress = new MAPIMessage(samples.openResourceAsStream("no_recipient_address.msg"));
|
noRecipientAddress = new MAPIMessage(samples.openResourceAsStream("no_recipient_address.msg"));
|
||||||
cyrillic = new MAPIMessage(samples.openResourceAsStream("cyrillic_message.msg"));
|
cyrillic = new MAPIMessage(samples.openResourceAsStream("cyrillic_message.msg"));
|
||||||
|
chinese = new MAPIMessage(samples.openResourceAsStream("chinese-traditional.msg"));
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@ -195,5 +197,27 @@ public final class TestBasics extends TestCase {
|
|||||||
|
|
||||||
assertEquals("Cp1251", cyrillic.getRecipientDetailsChunks()[0].recipientDisplayNameChunk.get7BitEncoding());
|
assertEquals("Cp1251", cyrillic.getRecipientDetailsChunks()[0].recipientDisplayNameChunk.get7BitEncoding());
|
||||||
assertEquals("Cp1251", cyrillic.getRecipientDetailsChunks()[1].recipientDisplayNameChunk.get7BitEncoding());
|
assertEquals("Cp1251", cyrillic.getRecipientDetailsChunks()[1].recipientDisplayNameChunk.get7BitEncoding());
|
||||||
|
|
||||||
|
// Override it, check it's taken
|
||||||
|
cyrillic.set7BitEncoding("UTF-8");
|
||||||
|
assertEquals("UTF-8", cyrillic.getRecipientDetailsChunks()[0].recipientDisplayNameChunk.get7BitEncoding());
|
||||||
|
assertEquals("UTF-8", cyrillic.getRecipientDetailsChunks()[1].recipientDisplayNameChunk.get7BitEncoding());
|
||||||
|
|
||||||
|
|
||||||
|
// Check with a file that has no headers
|
||||||
|
try {
|
||||||
|
chinese.getHeaders();
|
||||||
|
fail("File doesn't have headers!");
|
||||||
|
} catch(ChunkNotFoundException e) {}
|
||||||
|
|
||||||
|
String html = chinese.getHmtlBody();
|
||||||
|
assertTrue("Charset not found:\n" + html, html.contains("text/html; charset=big5"));
|
||||||
|
|
||||||
|
// Defaults to CP1252
|
||||||
|
assertEquals("CP1252", chinese.getRecipientDetailsChunks()[0].recipientDisplayNameChunk.get7BitEncoding());
|
||||||
|
|
||||||
|
// But after guessing goes to the correct one, Big 5
|
||||||
|
chinese.guess7BitEncoding();
|
||||||
|
assertEquals("big5", chinese.getRecipientDetailsChunks()[0].recipientDisplayNameChunk.get7BitEncoding());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
BIN
test-data/hsmf/chinese-traditional.msg
Normal file
BIN
test-data/hsmf/chinese-traditional.msg
Normal file
Binary file not shown.
Loading…
Reference in New Issue
Block a user