diff --git a/src/scratchpad/src/org/apache/poi/hwpf/HWPFOldDocument.java b/src/scratchpad/src/org/apache/poi/hwpf/HWPFOldDocument.java index 370577288..cfc010206 100644 --- a/src/scratchpad/src/org/apache/poi/hwpf/HWPFOldDocument.java +++ b/src/scratchpad/src/org/apache/poi/hwpf/HWPFOldDocument.java @@ -44,6 +44,8 @@ import org.apache.poi.poifs.filesystem.POIFSFileSystem; import org.apache.poi.util.CodePageUtil; import org.apache.poi.util.LittleEndian; import org.apache.poi.util.NotImplemented; +import org.apache.poi.util.POILogFactory; +import org.apache.poi.util.POILogger; import org.apache.poi.util.StringUtil; /** @@ -52,6 +54,9 @@ import org.apache.poi.util.StringUtil; */ public class HWPFOldDocument extends HWPFDocumentCore { + private static final POILogger logger = POILogFactory + .getLogger( HWPFOldDocument.class ); + private final static Charset DEFAULT_CHARSET = StringUtil.WIN_1252; private OldTextPieceTable tpt; @@ -110,6 +115,7 @@ public class HWPFOldDocument extends HWPFDocumentCore { //if there was a problem with the guessed charset and the length of the //textpiece, back off to win1252. This is effectively what we used to do. tp = buildTextPiece(StringUtil.WIN_1252); + logger.log(POILogger.WARN, "Error with "+guessedCharset +". Backing off to Windows-1252"); } tpt.add(tp); @@ -181,9 +187,9 @@ public class HWPFOldDocument extends HWPFDocumentCore { /** - * Take the first codepage that is not default, ansi or symbol. - * Ideally, we'd want to track fonts with runs, but we don't yet - * know how to do that. + * Try to get the code page from various areas of the document. + * Start with the DocumentSummaryInformation, back off to the section info, + * finally try the charset information from the font table. * * Consider throwing an exception if > 1 unique codepage that is not default, symbol or ansi * appears here. @@ -198,26 +204,30 @@ public class HWPFOldDocument extends HWPFDocumentCore { CustomProperties customProperties = summaryInformation.getCustomProperties(); if (customProperties != null) { int codePage = customProperties.getCodepage(); - try { - return Charset.forName(CodePageUtil.codepageToEncoding(codePage)); - } catch (UnsupportedEncodingException e) { - //swallow + if (codePage > -1) { + try { + return Charset.forName(CodePageUtil.codepageToEncoding(codePage)); + } catch (UnsupportedEncodingException e) { + //swallow + } } } - //for now, try to get first valid code page in a valid section + //If that didn't work, for now, try to get first valid code page in a valid section for (Section section : summaryInformation.getSections()) { if (section.getOffset() < 0) { continue; } int codePage = section.getCodepage(); - try { - return Charset.forName(CodePageUtil.codepageToEncoding(codePage)); - } catch (UnsupportedEncodingException e) { - //swallow + if (codePage > -1) { + try { + return Charset.forName(CodePageUtil.codepageToEncoding(codePage)); + } catch (UnsupportedEncodingException e) { + //swallow + } } } } - //if that still doesn't work, pick the first non-default non symbol charset + //if that still doesn't work, pick the first non-default, non-symbol charset for (OldFfn oldFfn : fontTable.getFontNames()) { HwmfFont.WmfCharset wmfCharset = HwmfFont.WmfCharset.valueOf(oldFfn.getChs()& 0xff); if (wmfCharset != null && @@ -227,6 +237,8 @@ public class HWPFOldDocument extends HWPFDocumentCore { return wmfCharset.getCharset(); } } + logger.log(POILogger.WARN, "Couldn't find a defined charset; backing off to cp1252"); + //if all else fails return DEFAULT_CHARSET; } @@ -282,8 +294,9 @@ public class HWPFOldDocument extends HWPFDocumentCore { } /** - * As a rough heuristic (total hack), read through the font table - * and take the first non-default, non-ansi, non-symbol + * As a rough heuristic (total hack), read through the HPSF, + * then read through the font table, and take the first + * non-default, non-ansi, non-symbol * font's charset and return that. * * Once we figure out how to link a font to a text piece, we should