diff --git a/src/scratchpad/src/org/apache/poi/hwpf/model/OldTextPiece.java b/src/scratchpad/src/org/apache/poi/hwpf/model/OldTextPiece.java index c82635bc3..87cccfa76 100644 --- a/src/scratchpad/src/org/apache/poi/hwpf/model/OldTextPiece.java +++ b/src/scratchpad/src/org/apache/poi/hwpf/model/OldTextPiece.java @@ -18,6 +18,7 @@ package org.apache.poi.hwpf.model; +import org.apache.poi.util.CodePageUtil; import org.apache.poi.util.Internal; import org.apache.poi.util.NotImplemented; @@ -40,11 +41,19 @@ public class OldTextPiece extends TextPiece { public OldTextPiece(int start, int end, byte[] text, PieceDescriptor pd) { super(start, end, text, pd); this.rawBytes = text; - if (end < start) { - throw new IllegalStateException("Told we're of negative size! start=" + start + " end=" + end); - } } + @Override + protected void validateLengths(int start, int end, int length, PieceDescriptor pd) { + //things are still wonky with Big5 char/byte length mapping + //sometimes working w/ Java 8 but not w/ Java 7! + //for now, if we're dealing w/ Big5 don't bother checking + if (pd.getCharset() != null && + CodePageUtil.VARIABLE_BYTE_CHARSETS.contains(pd.getCharset())) { + return; + } + super.validateLengths(start, end, length, pd); + } /** * @return nothing, ever. Always throws an UnsupportedOperationException * @throws UnsupportedOperationException @@ -56,6 +65,7 @@ public class OldTextPiece extends TextPiece { } + @Override public StringBuilder getStringBuilder() { return (StringBuilder) _buf; } diff --git a/src/scratchpad/src/org/apache/poi/hwpf/model/TextPiece.java b/src/scratchpad/src/org/apache/poi/hwpf/model/TextPiece.java index 2a63bda16..b383cbcfb 100644 --- a/src/scratchpad/src/org/apache/poi/hwpf/model/TextPiece.java +++ b/src/scratchpad/src/org/apache/poi/hwpf/model/TextPiece.java @@ -60,14 +60,17 @@ public class TextPiece extends PropertyNode { // Validate int textLength = ((CharSequence) _buf).length(); - if (end - start != textLength) { - throw new IllegalStateException("Told we're for characters " + start + " -> " + end + ", but actually covers " + textLength + " characters!"); - } + validateLengths(start, end, textLength, pd); if (end < start) { throw new IllegalStateException("Told we're of negative size! start=" + start + " end=" + end); } } + protected void validateLengths(int start, int end, int textLength, PieceDescriptor pd) { + if (end - start != textLength) { + throw new IllegalStateException("Told we're for characters " + start + " -> " + end + ", but actually covers " + textLength + " characters!"); + } + } /** * Create the StringBuilder from the text and unicode flag */ diff --git a/src/scratchpad/testcases/org/apache/poi/hwpf/usermodel/TestHWPFOldDocument.java b/src/scratchpad/testcases/org/apache/poi/hwpf/usermodel/TestHWPFOldDocument.java index bfe22605a..bf355959a 100644 --- a/src/scratchpad/testcases/org/apache/poi/hwpf/usermodel/TestHWPFOldDocument.java +++ b/src/scratchpad/testcases/org/apache/poi/hwpf/usermodel/TestHWPFOldDocument.java @@ -226,6 +226,26 @@ public final class TestHWPFOldDocument extends HWPFTestCase { assertContains(txt, "also maintain");//this is at a critical juncture assertContains(txt, "which are available for");//this too + /* + The bytes for the following test: + 170 : 78 : x + 171 : 0 : + 172 : d : + 173 : 35 : 5 + 174 : 39 : 9 + 175 : 0 : + 176 : 2d : - + 177 : 0 : + 178 : 35 : 5 + 179 : 0 : + 180 : 35 : 5 + + Note that we are skipping over the value "5" at offset 173. + This is an apparently invalid sequence in MS's encoding scheme + + When I open the document in MSWord, I also see "\r9-55" + */ + assertContains(txt, "\n9-55 xxxxx block5"); //TODO: figure out why these two aren't passing // assertContains(txt, "\u2019\u0078 block2");//make sure smart quote is extracted correctly // assertContains(txt, "We are able to");//not sure if we can get this easily?