From ce0fea767fa2a36890f9e07e8165011b48837cba Mon Sep 17 00:00:00 2001 From: Nick Burch Date: Tue, 3 Aug 2010 16:06:21 +0000 Subject: [PATCH] Fix bug #49441 - Allow overriding and guessing of HSMF non-unicode string encodings git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@981947 13f79535-47bb-0310-9956-ffa450edef68 --- src/documentation/content/xdocs/status.xml | 1 + .../src/org/apache/poi/hsmf/MAPIMessage.java | 55 ++++++++++++++++++- .../poi/hsmf/datatypes/StringChunk.java | 39 +++++++++++-- .../org/apache/poi/hsmf/TestBasics.java | 19 +++++++ 4 files changed, 106 insertions(+), 8 deletions(-) diff --git a/src/documentation/content/xdocs/status.xml b/src/documentation/content/xdocs/status.xml index 3c6ba2742..840486d61 100644 --- a/src/documentation/content/xdocs/status.xml +++ b/src/documentation/content/xdocs/status.xml @@ -34,6 +34,7 @@ + 49441 - Allow overriding and guessing of HSMF non-unicode string encodings 49689 - Allow the setting of user style names on newly created HSSF cell styles Make it easier to tell which content types each POIXMLTextExtractor handles 49649 - Added clone support for UserSView* and Feat* families of records diff --git a/src/scratchpad/src/org/apache/poi/hsmf/MAPIMessage.java b/src/scratchpad/src/org/apache/poi/hsmf/MAPIMessage.java index e04d299b1..3db17180f 100644 --- a/src/scratchpad/src/org/apache/poi/hsmf/MAPIMessage.java +++ b/src/scratchpad/src/org/apache/poi/hsmf/MAPIMessage.java @@ -25,10 +25,13 @@ import java.io.OutputStream; import java.util.ArrayList; import java.util.Arrays; import java.util.Calendar; +import java.util.regex.Matcher; +import java.util.regex.Pattern; import org.apache.poi.POIDocument; import org.apache.poi.hsmf.datatypes.AttachmentChunks; import org.apache.poi.hsmf.datatypes.AttachmentChunks.AttachmentChunksSorter; +import org.apache.poi.hsmf.datatypes.Chunk; import org.apache.poi.hsmf.datatypes.ChunkGroup; import org.apache.poi.hsmf.datatypes.Chunks; import org.apache.poi.hsmf.datatypes.NameIdChunks; @@ -286,10 +289,58 @@ public class MAPIMessage extends POIDocument { return names; } - /** - * + * Many messages store their strings as unicode, which is + * nice and easy. Some use one-byte encodings for their + * strings, but don't easily store the encoding anywhere + * in the file! + * This method looks at the headers for the message, and + * tries to use these to guess the correct encoding for + * your file. + * Bug #49441 has more on why this is needed + */ + public void guess7BitEncoding() { + try { + String[] headers = getHeaders(); + if(headers == null || headers.length == 0) { + return; + } + + // Look for a content type with a charset + Pattern p = Pattern.compile("Content-Type:.*?charset=[\"']?(.*?)[\"']?"); + for(String header : headers) { + if(header.startsWith("Content-Type")) { + Matcher m = p.matcher(header); + if(m.matches()) { + // Found it! Tell all the string chunks + String charset = m.group(1); + + for(Chunk c : mainChunks.getAll()) { + if(c instanceof StringChunk) { + ((StringChunk)c).set7BitEncoding(charset); + } + } + for(Chunk c : nameIdChunks.getAll()) { + if(c instanceof StringChunk) { + ((StringChunk)c).set7BitEncoding(charset); + } + } + for(RecipientChunks rc : recipientChunks) { + for(Chunk c : rc.getAll()) { + if(c instanceof StringChunk) { + ((StringChunk)c).set7BitEncoding(charset); + } + } + } + } + } + } + } catch(ChunkNotFoundException e) {} + } + + /** + * Returns all the headers, one entry per line */ public String[] getHeaders() throws ChunkNotFoundException { String headers = getStringFromChunk(mainChunks.messageHeaders); diff --git a/src/scratchpad/src/org/apache/poi/hsmf/datatypes/StringChunk.java b/src/scratchpad/src/org/apache/poi/hsmf/datatypes/StringChunk.java index b735058dc..057bb07aa 100644 --- a/src/scratchpad/src/org/apache/poi/hsmf/datatypes/StringChunk.java +++ b/src/scratchpad/src/org/apache/poi/hsmf/datatypes/StringChunk.java @@ -30,8 +30,8 @@ import org.apache.poi.util.StringUtil; * A Chunk made up of a single string. */ public class StringChunk extends Chunk { - private String value; + private String encoding7Bit = "CP1252"; /** * Creates a String Chunk. @@ -48,13 +48,33 @@ public class StringChunk extends Chunk { super(chunkId, type); } + /** + * Returns the Encoding that will be used to + * decode any "7 bit" (non unicode) data. + * Most files default to CP1252 + */ + public String get7BitEncoding() { + return encoding7Bit; + } + + /** + * Sets the Encoding that will be used to + * decode any "7 bit" (non unicode) data. + * This doesn't appear to be stored anywhere + * specific in the file, so you may need + * to guess by looking at headers etc + */ + public void set7BitEncoding(String encoding) { + this.encoding7Bit = encoding; + } + public void readValue(InputStream value) throws IOException { String tmpValue; byte[] data = IOUtils.toByteArray(value); switch(type) { case Types.ASCII_STRING: - tmpValue = parseAs7BitData(data); + tmpValue = parseAs7BitData(data, encoding7Bit); break; case Types.UNICODE_STRING: tmpValue = StringUtil.getFromUnicodeLE(data); @@ -73,9 +93,9 @@ public class StringChunk extends Chunk { switch(type) { case Types.ASCII_STRING: try { - data = value.getBytes("CP1252"); + data = value.getBytes(encoding7Bit); } catch (UnsupportedEncodingException e) { - throw new RuntimeException("Core encoding not found, JVM broken?", e); + throw new RuntimeException("Encoding not found - " + encoding7Bit, e); } break; case Types.UNICODE_STRING: @@ -101,10 +121,17 @@ public class StringChunk extends Chunk { * and returns the string that that yields. */ protected static String parseAs7BitData(byte[] data) { + return parseAs7BitData(data, "CP1252"); + } + /** + * Parses as non-unicode, supposedly 7 bit data + * and returns the string that that yields. + */ + protected static String parseAs7BitData(byte[] data, String encoding) { try { - return new String(data, "CP1252"); + return new String(data, encoding); } catch (UnsupportedEncodingException e) { - throw new RuntimeException("Core encoding not found, JVM broken?", e); + throw new RuntimeException("Encoding not found - " + encoding, e); } } } diff --git a/src/scratchpad/testcases/org/apache/poi/hsmf/TestBasics.java b/src/scratchpad/testcases/org/apache/poi/hsmf/TestBasics.java index 18a316491..bb7c87262 100644 --- a/src/scratchpad/testcases/org/apache/poi/hsmf/TestBasics.java +++ b/src/scratchpad/testcases/org/apache/poi/hsmf/TestBasics.java @@ -34,6 +34,7 @@ public final class TestBasics extends TestCase { private MAPIMessage outlook30; private MAPIMessage attachments; private MAPIMessage noRecipientAddress; + private MAPIMessage cyrillic; /** * Initialize this test, load up the blank.msg mapi message. @@ -46,6 +47,7 @@ public final class TestBasics extends TestCase { outlook30 = new MAPIMessage(samples.openResourceAsStream("outlook_30_msg.msg")); attachments = new MAPIMessage(samples.openResourceAsStream("attachment_test_msg.msg")); noRecipientAddress = new MAPIMessage(samples.openResourceAsStream("no_recipient_address.msg")); + cyrillic = new MAPIMessage(samples.openResourceAsStream("cyrillic_message.msg")); } /** @@ -177,4 +179,21 @@ public final class TestBasics extends TestCase { noRecipientAddress.setReturnNullOnMissingChunk(false); } + + /** + * We default to CP1252, but can sometimes do better + * if needed. + * This file is really CP1251, according to the person + * who submitted it in bug #49441 + */ + public void testEncoding() throws Exception { + assertEquals(2, cyrillic.getRecipientDetailsChunks().length); + assertEquals("CP1252", cyrillic.getRecipientDetailsChunks()[0].recipientDisplayNameChunk.get7BitEncoding()); + assertEquals("CP1252", cyrillic.getRecipientDetailsChunks()[1].recipientDisplayNameChunk.get7BitEncoding()); + + cyrillic.guess7BitEncoding(); + + assertEquals("Cp1251", cyrillic.getRecipientDetailsChunks()[0].recipientDisplayNameChunk.get7BitEncoding()); + assertEquals("Cp1251", cyrillic.getRecipientDetailsChunks()[1].recipientDisplayNameChunk.get7BitEncoding()); + } }