From dd1116a1e3277ee99182ac95955d32175beba780 Mon Sep 17 00:00:00 2001 From: Nick Burch Date: Wed, 26 Jun 2013 18:46:37 +0000 Subject: [PATCH] Have MAPIMessage try the codepage properties to get the ascii encoding, before falling back to the existing logic around content type parsing git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@1497038 13f79535-47bb-0310-9956-ffa450edef68 --- .../src/org/apache/poi/hsmf/MAPIMessage.java | 48 +++++++++++++++---- .../org/apache/poi/hsmf/TestBasics.java | 4 +- 2 files changed, 40 insertions(+), 12 deletions(-) diff --git a/src/scratchpad/src/org/apache/poi/hsmf/MAPIMessage.java b/src/scratchpad/src/org/apache/poi/hsmf/MAPIMessage.java index dba19bc86..6cc7b6fbb 100644 --- a/src/scratchpad/src/org/apache/poi/hsmf/MAPIMessage.java +++ b/src/scratchpad/src/org/apache/poi/hsmf/MAPIMessage.java @@ -22,6 +22,7 @@ import java.io.FileInputStream; import java.io.IOException; import java.io.InputStream; import java.io.OutputStream; +import java.io.UnsupportedEncodingException; import java.util.ArrayList; import java.util.Arrays; import java.util.Calendar; @@ -40,6 +41,7 @@ import org.apache.poi.hsmf.datatypes.Chunks; import org.apache.poi.hsmf.datatypes.MAPIProperty; import org.apache.poi.hsmf.datatypes.NameIdChunks; import org.apache.poi.hsmf.datatypes.PropertyValue; +import org.apache.poi.hsmf.datatypes.PropertyValue.LongPropertyValue; import org.apache.poi.hsmf.datatypes.PropertyValue.TimePropertyValue; import org.apache.poi.hsmf.datatypes.RecipientChunks; import org.apache.poi.hsmf.datatypes.RecipientChunks.RecipientChunksSorter; @@ -50,6 +52,9 @@ import org.apache.poi.hsmf.parsers.POIFSChunkParser; import org.apache.poi.poifs.filesystem.DirectoryNode; import org.apache.poi.poifs.filesystem.NPOIFSFileSystem; import org.apache.poi.poifs.filesystem.POIFSFileSystem; +import org.apache.poi.util.CodePageUtil; +import org.apache.poi.util.POILogFactory; +import org.apache.poi.util.POILogger; /** * Reads an Outlook MSG File in and 
provides hooks into its data structure. @@ -60,6 +65,9 @@ import org.apache.poi.poifs.filesystem.POIFSFileSystem; * [MS-OXCMSG]: Message and Attachment Object Protocol Specification */ public class MAPIMessage extends POIDocument { + /** For logging problems we spot with the file */ + private POILogger logger = POILogFactory.getLogger(MAPIMessage.class); + private Chunks mainChunks; private NameIdChunks nameIdChunks; private RecipientChunks[] recipientChunks; @@ -356,19 +364,39 @@ public class MAPIMessage extends POIDocument { } /** - * Many messages store their strings as unicode, which is + * Tries to identify the correct encoding for 7-bit (non-unicode) + * strings in the file. + *

<p>Many messages store their strings as unicode, which is * nice and easy. Some use one-byte encodings for their - * strings, but don't easily store the encoding anywhere - * in the file! - * This method looks at the headers for the message, and - * tries to use these to guess the correct encoding for - * your file. - * Bug #49441 has more on why this is needed - * - * TODO Try to also use PR_MESSAGE_CODEPAGE and PR_INTERNET_CPID - * Would need to refactor some of the codepage support in HPSF first + * strings, but don't always store the encoding anywhere + * helpful in the file.</p>

+ *

<p>This method checks for codepage properties, and failing that + * looks at the headers for the message, and uses these to + * guess the correct encoding for your file.</p>

+ *

<p>Bug #49441 has more on why this is needed</p>

*/ public void guess7BitEncoding() { + // First choice is a codepage property + for (MAPIProperty prop : new MAPIProperty[] { + MAPIProperty.MESSAGE_CODEPAGE, + MAPIProperty.INTERNET_CPID + }) { + List val = mainChunks.getProperties().get(prop); + if (val != null && val.size() > 0) { + int codepage = ((LongPropertyValue)val.get(0)).getValue(); + try { + String encoding = CodePageUtil.codepageToEncoding(codepage, true); + set7BitEncoding(encoding); + return; + } catch(UnsupportedEncodingException e) { + logger.log(POILogger.WARN, "Invalid codepage ID ", codepage, + " set for the message via ", prop, ", ignoring"); + } + } + } + + + // Second choice is a charset on a content type header try { String[] headers = getHeaders(); if(headers != null && headers.length > 0) { diff --git a/src/scratchpad/testcases/org/apache/poi/hsmf/TestBasics.java b/src/scratchpad/testcases/org/apache/poi/hsmf/TestBasics.java index c250c0e41..a6e468767 100644 --- a/src/scratchpad/testcases/org/apache/poi/hsmf/TestBasics.java +++ b/src/scratchpad/testcases/org/apache/poi/hsmf/TestBasics.java @@ -229,8 +229,8 @@ public final class TestBasics extends TestCase { // Defaults to CP1251 assertEquals("CP1252", chinese.getRecipientDetailsChunks()[0].recipientDisplayNameChunk.get7BitEncoding()); - // But after guessing goes to the correct one, Big 5 + // But after guessing goes to the correct one, cp950 (Windows Traditional Chinese) chinese.guess7BitEncoding(); - assertEquals("big5", chinese.getRecipientDetailsChunks()[0].recipientDisplayNameChunk.get7BitEncoding()); + assertEquals("cp950", chinese.getRecipientDetailsChunks()[0].recipientDisplayNameChunk.get7BitEncoding()); } }