From 44d536e4ce24b9c30d5d1b8cedb715c31b5682cd Mon Sep 17 00:00:00 2001 From: Tim Allison Date: Tue, 11 Apr 2017 01:30:02 +0000 Subject: [PATCH] bug 50955 - try originally guessed codepoint, backoff to 1252 if that fails git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@1790904 13f79535-47bb-0310-9956-ffa450edef68 --- .../org/apache/poi/hwpf/HWPFOldDocument.java | 55 +++++++++++++------ 1 file changed, 37 insertions(+), 18 deletions(-) diff --git a/src/scratchpad/src/org/apache/poi/hwpf/HWPFOldDocument.java b/src/scratchpad/src/org/apache/poi/hwpf/HWPFOldDocument.java index e5a185afe..3ec11023e 100644 --- a/src/scratchpad/src/org/apache/poi/hwpf/HWPFOldDocument.java +++ b/src/scratchpad/src/org/apache/poi/hwpf/HWPFOldDocument.java @@ -96,25 +96,17 @@ public class HWPFOldDocument extends HWPFDocumentCore { } else { // TODO Discover if these older documents can ever hold Unicode Strings? // (We think not, because they seem to lack a Piece table) - // TODO Build the Piece Descriptor properly - // (We have to fake it, as they don't seem to have a proper Piece table) - PieceDescriptor pd = new PieceDescriptor(new byte[] {0,0, 0,0,0,127, 0,0}, 0, guessedCharset); - pd.setFilePosition(_fib.getFibBase().getFcMin()); - - // Generate a single Text Piece Table, with a single Text Piece - // which covers all the (8 bit only) text in the file - tpt = new OldTextPieceTable(); - byte[] textData = new byte[_fib.getFibBase().getFcMac()-_fib.getFibBase().getFcMin()]; - System.arraycopy(_mainStream, _fib.getFibBase().getFcMin(), textData, 0, textData.length); - - int numChars = textData.length; - if (CodePageUtil.DOUBLE_BYTE_CHARSETS.contains(guessedCharset)) { - numChars /= 2; + // + // What we have here is a wretched hack. We need to figure out + // how to get the correct charset for the doc. + TextPiece tp = null; + try { + tp = buildTextPiece(guessedCharset); + } catch (IllegalStateException e) { + //if there was a problem with the guessed charset and the length of the + //textpiece, back off to win1252. This is effectively what we used to do. + tp = buildTextPiece(StringUtil.WIN_1252); } - - TextPiece tp = new TextPiece( - 0, numChars, textData, pd - ); tpt.add(tp); } @@ -156,6 +148,33 @@ public class HWPFOldDocument extends HWPFDocumentCore { } } + /** + * + * @param guessedCharset charset that we think this is + * @return a new text piece + * @throws IllegalStateException if the length isn't correct + */ + private TextPiece buildTextPiece(Charset guessedCharset) throws IllegalStateException { + PieceDescriptor pd = new PieceDescriptor(new byte[] {0,0, 0,0,0,127, 0,0}, 0, guessedCharset); + pd.setFilePosition(_fib.getFibBase().getFcMin()); + + // Generate a single Text Piece Table, with a single Text Piece + // which covers all the (8 bit only) text in the file + tpt = new OldTextPieceTable(); + byte[] textData = new byte[_fib.getFibBase().getFcMac()-_fib.getFibBase().getFcMin()]; + System.arraycopy(_mainStream, _fib.getFibBase().getFcMin(), textData, 0, textData.length); + + int numChars = textData.length; + if (CodePageUtil.DOUBLE_BYTE_CHARSETS.contains(guessedCharset)) { + numChars /= 2; + } + + return new TextPiece( + 0, numChars, textData, pd + ); + + } + /** * Take the first codepage that is not default, ansi or symbol.