From 649280df8ca2ff3b7fb8ee06e3a5ca70557be65d Mon Sep 17 00:00:00 2001 From: Tim Allison Date: Tue, 20 Jun 2017 18:11:34 +0000 Subject: [PATCH] bug 61045 -- allow for (and log!) extra bytes in FormatRecord. git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@1799360 13f79535-47bb-0310-9956-ffa450edef68 --- .../poi/hssf/record/DimensionsRecord.java | 10 +++ .../apache/poi/hssf/record/FormatRecord.java | 61 +++++++++++++++++- .../hssf/extractor/TestExcelExtractor.java | 13 +++- .../spreadsheet/61045_govdocs1_626534.xls | Bin 0 -> 10752 bytes 4 files changed, 80 insertions(+), 4 deletions(-) create mode 100644 test-data/spreadsheet/61045_govdocs1_626534.xls diff --git a/src/java/org/apache/poi/hssf/record/DimensionsRecord.java b/src/java/org/apache/poi/hssf/record/DimensionsRecord.java index e326b5cb6..1525d58c6 100644 --- a/src/java/org/apache/poi/hssf/record/DimensionsRecord.java +++ b/src/java/org/apache/poi/hssf/record/DimensionsRecord.java @@ -20,6 +20,8 @@ package org.apache.poi.hssf.record; import org.apache.poi.util.LittleEndianOutput; +import org.apache.poi.util.POILogFactory; +import org.apache.poi.util.POILogger; /** * Title: Dimensions Record

@@ -32,6 +34,9 @@ import org.apache.poi.util.LittleEndianOutput; */ public final class DimensionsRecord extends StandardRecord implements Cloneable { + + private static final POILogger logger = POILogFactory.getLogger(DimensionsRecord.class); + public final static short sid = 0x200; private int field_1_first_row; private int field_2_last_row; // plus 1 @@ -50,6 +55,11 @@ public final class DimensionsRecord extends StandardRecord implements Cloneable field_3_first_col = in.readShort(); field_4_last_col = in.readShort(); field_5_zero = in.readShort(); + //POI-61045 -- in practice, there can be an extra 2 bytes + if (in.available() == 2) { + logger.log(POILogger.INFO, "DimensionsRecord has extra 2 bytes."); + in.readShort(); + } } /** diff --git a/src/java/org/apache/poi/hssf/record/FormatRecord.java b/src/java/org/apache/poi/hssf/record/FormatRecord.java index 955c52c22..575f709fb 100644 --- a/src/java/org/apache/poi/hssf/record/FormatRecord.java +++ b/src/java/org/apache/poi/hssf/record/FormatRecord.java @@ -18,7 +18,10 @@ package org.apache.poi.hssf.record; import org.apache.poi.util.HexDump; +import org.apache.poi.util.LittleEndianConsts; import org.apache.poi.util.LittleEndianOutput; +import org.apache.poi.util.POILogFactory; +import org.apache.poi.util.POILogger; import org.apache.poi.util.StringUtil; /** @@ -28,6 +31,9 @@ import org.apache.poi.util.StringUtil; * REFERENCE: PG 317 Microsoft Excel 97 Developer's Kit (ISBN: 1-57231-498-2) */ public final class FormatRecord extends StandardRecord implements Cloneable { + + private static final POILogger logger = POILogFactory.getLogger(FormatRecord.class); + public final static short sid = 0x041E; private final int field_1_index_code; @@ -52,9 +58,9 @@ public final class FormatRecord extends StandardRecord implements Cloneable { field_3_hasMultibyte = (in.readByte() & 0x01) != 0; if (field_3_hasMultibyte) { - field_4_formatstring = in.readUnicodeLEString(field_3_unicode_len); + field_4_formatstring = readStringCommon(in, field_3_unicode_len, false); } else { - field_4_formatstring = in.readCompressedUnicode(field_3_unicode_len); + field_4_formatstring = readStringCommon(in, field_3_unicode_len, true); } } @@ -113,4 +119,55 @@ public final class FormatRecord extends StandardRecord implements Cloneable { public FormatRecord clone() { return new FormatRecord(this); } + + private static String readStringCommon(RecordInputStream ris, int requestedLength, boolean pIsCompressedEncoding) { + //custom copy of ris.readUnicodeLEString to allow for extra bytes at the end + + // Sanity check to detect garbage string lengths + if (requestedLength < 0 || requestedLength > 0x100000) { // 16 million chars? + throw new IllegalArgumentException("Bad requested string length (" + requestedLength + ")"); + } + char[] buf = null; + boolean isCompressedEncoding = pIsCompressedEncoding; + int availableChars = isCompressedEncoding ? ris.remaining() : ris.remaining() / LittleEndianConsts.SHORT_SIZE; + //everything worked out. Great! + int remaining = ris.remaining(); + if (requestedLength == availableChars) { + buf = new char[requestedLength]; + } else { + //sometimes in older Excel 97 .xls files, + //the requested length is wrong. + //Read all available characters. + buf = new char[availableChars]; + } + for (int i = 0; i < buf.length; i++) { + char ch; + if (isCompressedEncoding) { + ch = (char) ris.readUByte(); + } else { + ch = (char) ris.readShort(); + } + buf[i] = ch; + } + + //TIKA-2154's file shows that even in a unicode string + //there can be a remaining byte (without proper final '00') + //that should be read as a byte + if (ris.available() == 1) { + char[] tmp = new char[buf.length+1]; + System.arraycopy(buf, 0, tmp, 0, buf.length); + tmp[buf.length] = (char)ris.readUByte(); + buf = tmp; + } + + if (ris.available() > 0) { + logger.log(POILogger.INFO, "FormatRecord has "+ris.available()+" unexplained bytes. Silently skipping"); + //swallow what's left + while (ris.available() > 0) { + ris.readByte(); + } + } + return new String(buf); + } + } diff --git a/src/testcases/org/apache/poi/hssf/extractor/TestExcelExtractor.java b/src/testcases/org/apache/poi/hssf/extractor/TestExcelExtractor.java index 2953a5b1a..1a67ec53a 100644 --- a/src/testcases/org/apache/poi/hssf/extractor/TestExcelExtractor.java +++ b/src/testcases/org/apache/poi/hssf/extractor/TestExcelExtractor.java @@ -17,11 +17,11 @@ package org.apache.poi.hssf.extractor; +import static org.apache.poi.POITestCase.assertContains; +import static org.apache.poi.POITestCase.assertStartsWith; import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertNotNull; import static org.junit.Assert.assertTrue; -import static org.apache.poi.POITestCase.assertContains; -import static org.apache.poi.POITestCase.assertStartsWith; import java.io.File; import java.io.IOException; @@ -388,4 +388,13 @@ public final class TestExcelExtractor { assertNotNull(extractor.getText()); extractor.close(); } + + @Test + public void test61045() throws IOException { + //bug 61045. File is govdocs1 626534 + ExcelExtractor extractor = createExtractor("61045_govdocs1_626534.xls"); + String txt = extractor.getText(); + assertContains(txt, "NONBUSINESS"); + } + } diff --git a/test-data/spreadsheet/61045_govdocs1_626534.xls b/test-data/spreadsheet/61045_govdocs1_626534.xls new file mode 100644 index 0000000000000000000000000000000000000000..e285403fe51fe4e6beb29c3c00bc4d777b75b549 GIT binary patch literal 10752 zcmeI2e{5S<6~}L0UVb=^lQ?dgg)Yr=GDH&6V5g+X8oJs^TvDxZf}M(%WlA~-YPX6_ zQMv^-*kjXBC-}p#{ek!~+6j&QH3ex@MF?R)OxZfI00M1AZ0-Q5hzS)m=mgu$IrpA( z-g7!nK>1?`KkMcBe&4z0p8I+C-CxeebK%12Z@=q;+H~BYs@0ptfU5QBRq!28eY7ZL z!52)wSu7Tfs1$sy9REWE-hf!~`Qd59L#%-lpE;&CAB~`g;B11k8BS;dDhy{UoDn#y z;oO|3l743s$f(=lv!Wc8QlEh9vic-E#ee~}0YHF@j3PgVK*iw{HT z%j!0Di;m;AGC9l3t;(w6^6^^U(z_FcuT9r@>8t(r@0NeH?O}A0^m1jt4Bt^}Ahm;WKQe&hv zvCVKkVOz~Q)r^DDP%JhaAHoo~!Fmwg&0M>@5iK6v+Mf%NU>Q_*OOAi zxsI)qQp?K250N4`oX?UXI0`N)9uD>_IMflc5gh%$Bt>v^{)iO8;f|9cINrUkadZMw z#<43FM;x7kLp}Lxovl-FxL-2tJ7HMH!5R){m25m5!=wm~;yF?T$2DIhMQ{`hj!q>w z)=rR(hl4!}4tJ4k1jnQ2NnsrB%G0EHI3`FD9N#~uadbh2JRG`pba^=bbU|n95*+H> zFGvv_i=QFI!?Bwb#^FYvBSmnWeV7!%(Q96xE)PfkZnE)kuxG)M_&C`xj>Htl%sBE< zj#+T16y;(Z?)pzPj%W#vo!B~}f@ARmf6&>Y9*(D9Bt>vgla4CEK}|6VL%ey#5?v;s zD7=;Q($#;3{0NSgFOVXwBXKV&fma)DMCX$H?Ym&HL_tE%`fbxvY5u#Ryk&&Q8-Dtn8twFHE@A~iXHg-uK)XXjEGXJ(%>HCp1Ura4MOxtq3T!P;+3e-Ro9r0jRE6qYLZ_zy(-SQ7B^bFA*_` z{eiFi2uK8E(DBN__z_5H{OA0J9-2mcXBGRYt;jSnU1Sbq0Zz>EK3P zTX#%B8Co_*6)7^b}431_Wjw!NX9L}0)9as>xjzb0qHfd6(3=Zr7$n%<+hheKGMX_fOhZ)wfnHxVR zAK{o8$HwKqQ#^u0-AD@K$lq^T2kwV?I7SdhSa8&5%&QR=9L~Adw4<Le@U&S#qj>63xv*1X~l4r(| z_}q)Sb>Oz3hhr3RLOmF(Q43v*Ts5k=AkE^c^_P zQtQ}$nrs9IwT_6db&QdX;E0*NBO*8sb#ly%feqqzI0`J(hr2lnT_tE^cj& z`d;W+oTTVu5A5i!!e>wCgP~EaRH;K&97sN?Q#qV6Ru_TE#O)`Rk1Z`b>7~2kP?L4< zHhim5$>n25j{g^LY`3okukA{8U)S9;(9;uJ=o_2w<-~m7b{khF*OuJBDYNvkn@f$l zp;f?Yn)40DO8r0@UA+hTk?ze_o4k?fi0W56m0G}1`W=pplJmVlM07y;sG3g>rr=mow|4 zy_^NLML*%c79D*xfAUU={wm+9=vT|9N;wP@u7+m~JUh+*HNM4qUm1O%jNV^H54@$W zpvTei;F?Vz{CsQ6=yhfErZRd%8NIcPUVVlBkY%fuTy+tz;asi0w&rYojpXX|Rppev zZpqaYAArej1g6gTlUMfW($Y#C7r~}WY$@pyKMkE79(R~d5|dT%$-_Jz4oq6`Su;kM ztcFhx=HPVK26~GytEOZPbYm{8C@5JAfJmuAh9y0T{1*$p)DJ`#j?; z_mo-6V;GI_@g+krRrTd-1kUTdJ)8VIlVxM`isF?X|N02L!yFuB`tn z>wn#GR@VQO^}lbyTUr0ZTTxm6t9Ru3-|Rlfc1XhP9pVl(Z3H` zI~2Sfa`ti^DY6}sI7qt>ydC1a@LRnd0xNm1b;M&BP&>@3X&-J+k9O29+aX(Dbx4t) z>7LJN*X_RT5VOPAF54j+_Fu?`cZb{~KOsf7LyrEC6v5$so)ixUdzPQ+M$J#-?J&_c zFEVe>6 U-ewv9mj>@0s`)(G;2ci<6OxQ)#Q*>R literal 0 HcmV?d00001