Bug 50955 - incorporate info from the DocumentSummaryInformation when
guessing the encoding. Fall back to the old method if the
DocumentSummaryInformation is not available. Thanks to Andreas Beeker for
recommending this direction.

git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@1791002 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Tim Allison 2017-04-11 17:07:04 +00:00
parent 44d536e4ce
commit 9412d467f3
1 changed files with 30 additions and 1 deletions

View File

@ -19,8 +19,12 @@ package org.apache.poi.hwpf;
import java.io.File;
import java.io.IOException;
import java.io.OutputStream;
import java.io.UnsupportedEncodingException;
import java.nio.charset.Charset;
import org.apache.poi.hpsf.CustomProperties;
import org.apache.poi.hpsf.DocumentSummaryInformation;
import org.apache.poi.hpsf.Section;
import org.apache.poi.hwmf.record.HwmfFont;
import org.apache.poi.hwpf.model.ComplexFileTable;
import org.apache.poi.hwpf.model.FontTable;
@ -188,7 +192,32 @@ public class HWPFOldDocument extends HWPFDocumentCore {
* @return The detected Charset from the old font table
*/
private Charset guessCodePage(OldFontTable fontTable) {
//try to get it out of the overall document summary information
DocumentSummaryInformation summaryInformation = getDocumentSummaryInformation();
if (summaryInformation != null) {
CustomProperties customProperties = summaryInformation.getCustomProperties();
if (customProperties != null) {
int codePage = customProperties.getCodepage();
try {
return Charset.forName(CodePageUtil.codepageToEncoding(codePage));
} catch (UnsupportedEncodingException e) {
//swallow
}
}
//for now, try to get first valid code page in a valid section
for (Section section : summaryInformation.getSections()) {
if (section.getOffset() < 0) {
continue;
}
int codePage = section.getCodepage();
try {
return Charset.forName(CodePageUtil.codepageToEncoding(codePage));
} catch (UnsupportedEncodingException e) {
//swallow
}
}
}
//if that still doesn't work, pick the first non-default non symbol charset
for (OldFfn oldFfn : fontTable.getFontNames()) {
HwmfFont.WmfCharset wmfCharset = HwmfFont.WmfCharset.valueOf(oldFfn.getChs()& 0xff);
if (wmfCharset != null &&