bug 50955 - try originally guessed codepage, back off to 1252 if that fails

git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@1790904 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Tim Allison 2017-04-11 01:30:02 +00:00
parent 1f41c31984
commit 44d536e4ce

View File

@ -96,25 +96,17 @@ public class HWPFOldDocument extends HWPFDocumentCore {
} else { } else {
// TODO Discover if these older documents can ever hold Unicode Strings? // TODO Discover if these older documents can ever hold Unicode Strings?
// (We think not, because they seem to lack a Piece table) // (We think not, because they seem to lack a Piece table)
// TODO Build the Piece Descriptor properly //
// (We have to fake it, as they don't seem to have a proper Piece table) // What we have here is a wretched hack. We need to figure out
PieceDescriptor pd = new PieceDescriptor(new byte[] {0,0, 0,0,0,127, 0,0}, 0, guessedCharset); // how to get the correct charset for the doc.
pd.setFilePosition(_fib.getFibBase().getFcMin()); TextPiece tp = null;
try {
// Generate a single Text Piece Table, with a single Text Piece tp = buildTextPiece(guessedCharset);
// which covers all the (8 bit only) text in the file } catch (IllegalStateException e) {
tpt = new OldTextPieceTable(); //if there was a problem with the guessed charset and the length of the
byte[] textData = new byte[_fib.getFibBase().getFcMac()-_fib.getFibBase().getFcMin()]; //textpiece, back off to win1252. This is effectively what we used to do.
System.arraycopy(_mainStream, _fib.getFibBase().getFcMin(), textData, 0, textData.length); tp = buildTextPiece(StringUtil.WIN_1252);
int numChars = textData.length;
if (CodePageUtil.DOUBLE_BYTE_CHARSETS.contains(guessedCharset)) {
numChars /= 2;
} }
TextPiece tp = new TextPiece(
0, numChars, textData, pd
);
tpt.add(tp); tpt.add(tp);
} }
@ -156,6 +148,33 @@ public class HWPFOldDocument extends HWPFDocumentCore {
} }
} }
/**
 * Builds the single text piece spanning the bytes of the main stream
 * between the FIB's fcMin and fcMac offsets, decoded with the given charset.
 * <p>
 * Side effect: the {@code tpt} field is replaced with a fresh, empty
 * {@link OldTextPieceTable} on every call; the returned piece is NOT
 * added to it here.
 *
 * @param guessedCharset charset that we think this document's text is in
 * @return a new text piece starting at character 0
 * @throws IllegalStateException if the length isn't correct (raised while
 *         constructing the text piece; callers may retry with another charset)
 */
private TextPiece buildTextPiece(Charset guessedCharset) throws IllegalStateException {
    // These old documents don't seem to carry a real piece table, so the
    // descriptor is faked from a fixed byte pattern.
    final byte[] fakedDescriptor = new byte[] {0,0, 0,0,0,127, 0,0};
    PieceDescriptor descriptor = new PieceDescriptor(fakedDescriptor, 0, guessedCharset);
    descriptor.setFilePosition(_fib.getFibBase().getFcMin());

    // One table, destined to hold one piece covering all of the
    // (8 bit only) text in the file.
    tpt = new OldTextPieceTable();

    // Copy out the raw text bytes [fcMin, fcMac) from the main stream.
    final int byteCount = _fib.getFibBase().getFcMac() - _fib.getFibBase().getFcMin();
    byte[] rawText = new byte[byteCount];
    System.arraycopy(_mainStream, _fib.getFibBase().getFcMin(), rawText, 0, rawText.length);

    // Charsets listed as double-byte count two bytes per character.
    final int charCount = CodePageUtil.DOUBLE_BYTE_CHARSETS.contains(guessedCharset)
            ? rawText.length / 2
            : rawText.length;

    return new TextPiece(0, charCount, rawText, descriptor);
}
/** /**
* Take the first codepage that is not default, ansi or symbol. * Take the first codepage that is not default, ansi or symbol.