From 9412d467f3bd7dfb2f2da8588db6585d7783376c Mon Sep 17 00:00:00 2001 From: Tim Allison Date: Tue, 11 Apr 2017 17:07:04 +0000 Subject: [PATCH] bug 50955 - incorporate info from the DocumentSummaryInformation for guessing the encoding. Back off to the old method if DocSummInfo is not available. Thanks to Andreas Beeker for recommending this direction. git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@1791002 13f79535-47bb-0310-9956-ffa450edef68 --- .../org/apache/poi/hwpf/HWPFOldDocument.java | 31 ++++++++++++++++++- 1 file changed, 30 insertions(+), 1 deletion(-) diff --git a/src/scratchpad/src/org/apache/poi/hwpf/HWPFOldDocument.java b/src/scratchpad/src/org/apache/poi/hwpf/HWPFOldDocument.java index 3ec11023e..370577288 100644 --- a/src/scratchpad/src/org/apache/poi/hwpf/HWPFOldDocument.java +++ b/src/scratchpad/src/org/apache/poi/hwpf/HWPFOldDocument.java @@ -19,8 +19,12 @@ package org.apache.poi.hwpf; import java.io.File; import java.io.IOException; import java.io.OutputStream; +import java.io.UnsupportedEncodingException; import java.nio.charset.Charset; +import org.apache.poi.hpsf.CustomProperties; +import org.apache.poi.hpsf.DocumentSummaryInformation; +import org.apache.poi.hpsf.Section; import org.apache.poi.hwmf.record.HwmfFont; import org.apache.poi.hwpf.model.ComplexFileTable; import org.apache.poi.hwpf.model.FontTable; @@ -188,7 +192,32 @@ public class HWPFOldDocument extends HWPFDocumentCore { * @return The detected Charset from the old font table */ private Charset guessCodePage(OldFontTable fontTable) { - + //try to get it out of the overall document summary information + DocumentSummaryInformation summaryInformation = getDocumentSummaryInformation(); + if (summaryInformation != null) { + CustomProperties customProperties = summaryInformation.getCustomProperties(); + if (customProperties != null) { + int codePage = customProperties.getCodepage(); + try { + return Charset.forName(CodePageUtil.codepageToEncoding(codePage)); + } catch (UnsupportedEncodingException e) { + //swallow + } + } + //for now, try to get first valid code page in a valid section + for (Section section : summaryInformation.getSections()) { + if (section.getOffset() < 0) { + continue; + } + int codePage = section.getCodepage(); + try { + return Charset.forName(CodePageUtil.codepageToEncoding(codePage)); + } catch (UnsupportedEncodingException e) { + //swallow + } + } + } + //if that still doesn't work, pick the first non-default non symbol charset for (OldFfn oldFfn : fontTable.getFontNames()) { HwmfFont.WmfCharset wmfCharset = HwmfFont.WmfCharset.valueOf(oldFfn.getChs()& 0xff); if (wmfCharset != null &&