diff --git a/src/java/org/apache/poi/hssf/record/DimensionsRecord.java b/src/java/org/apache/poi/hssf/record/DimensionsRecord.java
index e326b5cb6..1525d58c6 100644
--- a/src/java/org/apache/poi/hssf/record/DimensionsRecord.java
+++ b/src/java/org/apache/poi/hssf/record/DimensionsRecord.java
@@ -20,6 +20,8 @@
 package org.apache.poi.hssf.record;
 
 import org.apache.poi.util.LittleEndianOutput;
+import org.apache.poi.util.POILogFactory;
+import org.apache.poi.util.POILogger;
 
 /**
  * Title: Dimensions Record
@@ -32,6 +34,9 @@ import org.apache.poi.util.LittleEndianOutput;
 
  */
 public final class DimensionsRecord extends StandardRecord implements Cloneable {
+
+    private static final POILogger logger = POILogFactory.getLogger(DimensionsRecord.class);
+
     public final static short sid = 0x200;
     private int field_1_first_row;
     private int field_2_last_row; // plus 1
@@ -50,6 +55,11 @@ public final class DimensionsRecord extends StandardRecord implements Cloneable
         field_3_first_col = in.readShort();
         field_4_last_col = in.readShort();
         field_5_zero = in.readShort();
+        //POI-61045 -- in practice, there can be an extra 2 bytes
+        if (in.available() == 2) {
+            logger.log(POILogger.INFO, "DimensionsRecord has extra 2 bytes.");
+            in.readShort();
+        }
     }
 
     /**
diff --git a/src/java/org/apache/poi/hssf/record/FormatRecord.java b/src/java/org/apache/poi/hssf/record/FormatRecord.java
index 955c52c22..575f709fb 100644
--- a/src/java/org/apache/poi/hssf/record/FormatRecord.java
+++ b/src/java/org/apache/poi/hssf/record/FormatRecord.java
@@ -18,7 +18,10 @@
 package org.apache.poi.hssf.record;
 
 import org.apache.poi.util.HexDump;
+import org.apache.poi.util.LittleEndianConsts;
 import org.apache.poi.util.LittleEndianOutput;
+import org.apache.poi.util.POILogFactory;
+import org.apache.poi.util.POILogger;
 import org.apache.poi.util.StringUtil;
 
 /**
@@ -28,6 +31,9 @@ import org.apache.poi.util.StringUtil;
  * REFERENCE: PG 317 Microsoft Excel 97 Developer's Kit (ISBN: 1-57231-498-2)
  */
 public final class FormatRecord extends StandardRecord implements Cloneable {
+
+    private static final POILogger logger = POILogFactory.getLogger(FormatRecord.class);
+
     public final static short sid = 0x041E;
 
     private final int field_1_index_code;
@@ -52,9 +58,9 @@ public final class FormatRecord extends StandardRecord implements Cloneable {
         field_3_hasMultibyte = (in.readByte() & 0x01) != 0;
 
         if (field_3_hasMultibyte) {
-            field_4_formatstring = in.readUnicodeLEString(field_3_unicode_len);
+            field_4_formatstring = readStringCommon(in, field_3_unicode_len, false);
         } else {
-            field_4_formatstring = in.readCompressedUnicode(field_3_unicode_len);
+            field_4_formatstring = readStringCommon(in, field_3_unicode_len, true);
         }
     }
 
@@ -113,4 +119,57 @@ public final class FormatRecord extends StandardRecord implements Cloneable {
     public FormatRecord clone() {
         return new FormatRecord(this);
     }
+
+    /**
+     * Reads the format string, tolerating a declared length that disagrees
+     * with the data actually present in the record.
+     * <p>
+     * Custom copy of ris.readUnicodeLEString to allow for extra bytes at the end.
+     *
+     * @param ris record stream to read the characters from
+     * @param requestedLength character count declared by the record; it is only range-checked, the number of characters read is derived from the bytes remaining
+     * @param pIsCompressedEncoding true to read one unsigned byte per character, false to read one short per character
+     * @return the string built from the characters that were read
+     * @throws IllegalArgumentException if requestedLength is negative or greater than 0x100000
+     */
+    private static String readStringCommon(RecordInputStream ris, int requestedLength, boolean pIsCompressedEncoding) {
+        // Sanity check to detect garbage string lengths
+        if (requestedLength < 0 || requestedLength > 0x100000) { // 0x100000 = 1,048,576 (~1 million) chars
+            throw new IllegalArgumentException("Bad requested string length (" + requestedLength + ")");
+        }
+        int availableChars = pIsCompressedEncoding ? ris.remaining() : ris.remaining() / LittleEndianConsts.SHORT_SIZE;
+        //sometimes in older Excel 97 .xls files,
+        //the requested length is wrong.
+        //Read all available characters; when the record is well-formed that is exactly the requested length.
+        char[] buf = new char[availableChars];
+        for (int i = 0; i < buf.length; i++) {
+            char ch;
+            if (pIsCompressedEncoding) {
+                ch = (char) ris.readUByte();
+            } else {
+                ch = (char) ris.readShort();
+            }
+            buf[i] = ch;
+        }
+
+        //TIKA-2154's file shows that even in a unicode string
+        //there can be a remaining byte (without proper final '00')
+        //that should be read as a byte
+        if (ris.available() == 1) {
+            char[] tmp = new char[buf.length+1];
+            System.arraycopy(buf, 0, tmp, 0, buf.length);
+            tmp[buf.length] = (char)ris.readUByte();
+            buf = tmp;
+        }
+
+        if (ris.available() > 0) {
+            logger.log(POILogger.INFO, "FormatRecord has "+ris.available()+" unexplained bytes. Silently skipping");
+            //swallow what's left
+            while (ris.available() > 0) {
+                ris.readByte();
+            }
+        }
+        return new String(buf);
+    }
+
 }
diff --git a/src/testcases/org/apache/poi/hssf/extractor/TestExcelExtractor.java b/src/testcases/org/apache/poi/hssf/extractor/TestExcelExtractor.java
index 2953a5b1a..1a67ec53a 100644
--- a/src/testcases/org/apache/poi/hssf/extractor/TestExcelExtractor.java
+++ b/src/testcases/org/apache/poi/hssf/extractor/TestExcelExtractor.java
@@ -17,11 +17,11 @@
 
 package org.apache.poi.hssf.extractor;
 
+import static org.apache.poi.POITestCase.assertContains;
+import static org.apache.poi.POITestCase.assertStartsWith;
 import static org.junit.Assert.assertEquals;
 import static org.junit.Assert.assertNotNull;
 import static org.junit.Assert.assertTrue;
-import static org.apache.poi.POITestCase.assertContains;
-import static org.apache.poi.POITestCase.assertStartsWith;
 
 import java.io.File;
 import java.io.IOException;
@@ -388,4 +388,18 @@ public final class TestExcelExtractor {
         assertNotNull(extractor.getText());
         extractor.close();
     }
+
+    @Test
+    public void test61045() throws IOException {
+        //bug 61045. File is govdocs1 626534
+        ExcelExtractor extractor = createExtractor("61045_govdocs1_626534.xls");
+        try {
+            String txt = extractor.getText();
+            assertContains(txt, "NONBUSINESS");
+        } finally {
+            //close the extractor even when the assertion fails
+            extractor.close();
+        }
+    }
+
 }
diff --git a/test-data/spreadsheet/61045_govdocs1_626534.xls b/test-data/spreadsheet/61045_govdocs1_626534.xls
new file mode 100644
index 000000000..e285403fe
Binary files /dev/null and b/test-data/spreadsheet/61045_govdocs1_626534.xls differ