From ede8beb2278b26852141f1dc6a97ec699d49136d Mon Sep 17 00:00:00 2001 From: Nick Burch Date: Wed, 5 May 2010 17:49:59 +0000 Subject: [PATCH] Fix bug #49020 - Workaround Excel outputting invalid XML in button definitions by not closing BR tags git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@941399 13f79535-47bb-0310-9956-ffa450edef68 --- src/documentation/content/xdocs/status.xml | 1 + .../poi/xssf/usermodel/XSSFVMLDrawing.java | 10 +- .../util/EvilUnclosedBRFixingInputStream.java | 116 ++++++++++++++++++ .../poi/xssf/usermodel/TestXSSFBugs.java | 10 ++ .../TestEvilUnclosedBRFixingInputStream.java | 79 ++++++++++++ test-data/spreadsheet/BrNotClosed.xlsx | Bin 0 -> 9625 bytes 6 files changed, 215 insertions(+), 1 deletion(-) create mode 100644 src/ooxml/java/org/apache/poi/xssf/util/EvilUnclosedBRFixingInputStream.java create mode 100644 src/ooxml/testcases/org/apache/poi/xssf/util/TestEvilUnclosedBRFixingInputStream.java create mode 100644 test-data/spreadsheet/BrNotClosed.xlsx diff --git a/src/documentation/content/xdocs/status.xml b/src/documentation/content/xdocs/status.xml index 78cb7d240..6d8c5e58c 100644 --- a/src/documentation/content/xdocs/status.xml +++ b/src/documentation/content/xdocs/status.xml @@ -34,6 +34,7 @@ + 49020 - Workaround Excel outputting invalid XML in button definitions by not closing BR tags 49050 - Improve performance of AbstractEscherHolderRecord when there are lots of Continue Records 49194 - Correct text size limit for OOXML .xlsx files 49254 - Fix CellUtils.setFont to use the correct type internally diff --git a/src/ooxml/java/org/apache/poi/xssf/usermodel/XSSFVMLDrawing.java b/src/ooxml/java/org/apache/poi/xssf/usermodel/XSSFVMLDrawing.java index a27d657bc..a1458662b 100644 --- a/src/ooxml/java/org/apache/poi/xssf/usermodel/XSSFVMLDrawing.java +++ b/src/ooxml/java/org/apache/poi/xssf/usermodel/XSSFVMLDrawing.java @@ -20,6 +20,7 @@ package org.apache.poi.xssf.usermodel; import org.apache.poi.POIXMLDocumentPart; import org.apache.poi.openxml4j.opc.PackagePart; import org.apache.poi.openxml4j.opc.PackageRelationship; +import org.apache.poi.xssf.util.EvilUnclosedBRFixingInputStream; import org.apache.xmlbeans.XmlException; import org.apache.xmlbeans.XmlOptions; import org.apache.xmlbeans.XmlObject; @@ -53,6 +54,11 @@ import schemasMicrosoftComOfficeExcel.STObjectType; * considered a deprecated format included in Office Open XML for legacy reasons only and new applications that * need a file format for drawings are strongly encouraged to use preferentially DrawingML *

+ * + *

+ * Warning - Excel is known to put invalid XML into these files! + * For example, >br< without being closed or escaped crops up. + *

* * See 6.4 VML - SpreadsheetML Drawing in Office Open XML Part 4 - Markup Language Reference.pdf * @@ -98,7 +104,9 @@ public final class XSSFVMLDrawing extends POIXMLDocumentPart { protected void read(InputStream is) throws IOException, XmlException { - XmlObject root = XmlObject.Factory.parse(is); + XmlObject root = XmlObject.Factory.parse( + new EvilUnclosedBRFixingInputStream(is) + ); _qnames = new ArrayList(); _items = new ArrayList(); diff --git a/src/ooxml/java/org/apache/poi/xssf/util/EvilUnclosedBRFixingInputStream.java b/src/ooxml/java/org/apache/poi/xssf/util/EvilUnclosedBRFixingInputStream.java new file mode 100644 index 000000000..7e373b393 --- /dev/null +++ b/src/ooxml/java/org/apache/poi/xssf/util/EvilUnclosedBRFixingInputStream.java @@ -0,0 +1,116 @@ +/* ==================================================================== + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +==================================================================== */ +package org.apache.poi.xssf.util; + +import java.io.IOException; +import java.io.InputStream; +import java.util.ArrayList; + +/** + * This is a seriously sick fix for the fact that some .xlsx + * files contain raw bits of HTML, without being escaped + * or properly turned into XML. + * The result is that they contain things like >br<, + * which breaks the XML parsing. + * This very sick InputStream wrapper attempts to spot + * these go past, and fix them. + * Only works for UTF-8 and US-ASCII based streams! + * It should only be used where experience shows the problem + * can occur... + */ +public class EvilUnclosedBRFixingInputStream extends InputStream { + private InputStream source; + private byte[] spare; + + private static byte[] detect = new byte[] { + (byte)'<', (byte)'b', (byte)'r', (byte)'>' + }; + + public EvilUnclosedBRFixingInputStream(InputStream source) { + this.source = source; + } + + /** + * Warning - doesn't fix! + */ + @Override + public int read() throws IOException { + return source.read(); + } + + @Override + public int read(byte[] b, int off, int len) throws IOException { + if(spare != null) { + // This is risky, but spare is normally only a byte or two... + System.arraycopy(spare, 0, b, off, spare.length); + int ret = spare.length; + spare = null; + return ret; + } + + int read = source.read(b, off, len); + read = fixUp(b, off, read); + return read; + } + + @Override + public int read(byte[] b) throws IOException { + return this.read(b, 0, b.length); + } + + private int fixUp(byte[] b, int offset, int read) { + // Find places to fix + ArrayList fixAt = new ArrayList(); + for(int i=offset; i 0) { + spare = new byte[overshoot]; + System.arraycopy(b, b.length-overshoot, spare, 0, overshoot); + read -= overshoot; + } + + // Fix them, in reverse order so the + // positions are valid + for(int j=fixAt.size()-1; j>=0; j--) { + int i = fixAt.get(j); + + byte[] tmp = new byte[read-i-3]; + System.arraycopy(b, i+3, tmp, 0, tmp.length); + b[i+3] = (byte)'/'; + System.arraycopy(tmp, 0, b, i+4, tmp.length); + // It got one longer + read++; + } + return read; + } +} diff --git a/src/ooxml/testcases/org/apache/poi/xssf/usermodel/TestXSSFBugs.java b/src/ooxml/testcases/org/apache/poi/xssf/usermodel/TestXSSFBugs.java index 1d8a8fb54..7d3df7427 100644 --- a/src/ooxml/testcases/org/apache/poi/xssf/usermodel/TestXSSFBugs.java +++ b/src/ooxml/testcases/org/apache/poi/xssf/usermodel/TestXSSFBugs.java @@ -138,4 +138,14 @@ public final class TestXSSFBugs extends BaseTestBugzillaIssues { assertEquals(1, rels.size()); assertEquals("Sheet1!A1", rels.get(0).getPackageRelationship().getTargetURI().getFragment()); } + + /** + * Excel will sometimes write a button with a textbox + * containing >br< (not closed!). + * Clearly Excel shouldn't do this, but test that we can + * read the file despite the naughtyness + */ + public void test49020() throws Exception { + XSSFWorkbook wb = XSSFTestDataSamples.openSampleWorkbook("BrNotClosed.xlsx"); + } } diff --git a/src/ooxml/testcases/org/apache/poi/xssf/util/TestEvilUnclosedBRFixingInputStream.java b/src/ooxml/testcases/org/apache/poi/xssf/util/TestEvilUnclosedBRFixingInputStream.java new file mode 100644 index 000000000..799d3df36 --- /dev/null +++ b/src/ooxml/testcases/org/apache/poi/xssf/util/TestEvilUnclosedBRFixingInputStream.java @@ -0,0 +1,79 @@ +/* ==================================================================== + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +==================================================================== */ + +package org.apache.poi.xssf.util; + +import java.io.ByteArrayInputStream; +import java.io.ByteArrayOutputStream; + +import junit.framework.TestCase; + +public final class TestEvilUnclosedBRFixingInputStream extends TestCase { + public void testOK() throws Exception { + byte[] ok = "

Hello There!
Tags!

".getBytes("UTF-8"); + + EvilUnclosedBRFixingInputStream inp = new EvilUnclosedBRFixingInputStream( + new ByteArrayInputStream(ok) + ); + + ByteArrayOutputStream bout = new ByteArrayOutputStream(); + boolean going = true; + while(going) { + byte[] b = new byte[1024]; + int r = inp.read(b); + if(r > 0) { + bout.write(b, 0, r); + } else { + going = false; + } + } + + byte[] result = bout.toByteArray(); + assertEquals(ok, result); + } + + public void testProblem() throws Exception { + byte[] orig = "

Hello
There!
Tags!

".getBytes("UTF-8"); + byte[] fixed = "

Hello
There!
Tags!

".getBytes("UTF-8"); + + EvilUnclosedBRFixingInputStream inp = new EvilUnclosedBRFixingInputStream( + new ByteArrayInputStream(orig) + ); + + ByteArrayOutputStream bout = new ByteArrayOutputStream(); + boolean going = true; + while(going) { + byte[] b = new byte[1024]; + int r = inp.read(b); + if(r > 0) { + bout.write(b, 0, r); + } else { + going = false; + } + } + + byte[] result = bout.toByteArray(); + assertEquals(fixed, result); + } + + protected void assertEquals(byte[] a, byte[] b) { + assertEquals(a.length, b.length); + for(int i=0; ivC}Qh*3nd&W7aUmwF( z7uN)RG+;81xB9y;i$J1+ky~x8_l+lVm4Y=xVP^p0y_mFv>Lq!>1ue5G#E^l_z^FWI zG!Y9)#x~iRud6&>ra+IJ0*0vIKr+BAcUxN+Au<|~a(!J5r#$rj zAoS1|`Z!v;n*v9nwxxN{Oynd42h9HI?P9|g_Srm-HAzg*H+6+de8#!g4=SLeY;z9?#9BmSYLTMP{E9H-v)Ukt?$ZPIg+~L-s^25xx|>sA=R@aBye);%%Iw z7%7s4D=-Mz5Hm+B9+@vn7af!dz(3BM!2WPWWdff?H}_oWj0c_d+Y3nV1;&X?v87X)G-~r$s(A}2p4|8{O zaI!IWaIpE2fBrOmgde64%T)ijkJk8)*4^w_fd@gY0W+Sh4HL;YR9{t5-zJ{`1KYk+ zHqg|~gWh=uq1nTCwgo;%$dlj#mCo+H3)c5mULMxQ(2n<~(rqrOjS6mD8@xopi>OZ8 zdPR#t&9NIXxv-z)L4AbX_{subkJ-jj;+i0CFd3;k?AUZIA7aFgV`e#;@y+x>m!JZO z#3-GD;TW z$kx5i7yKc`u!f}DRE`~UI?#ytYNqxg#^*%!`i5tXwjsr5fFle$G*JZ0WD_0D^O;kMdh-Mi9_ufb|o6YL?QyHxWJ3_ni+D&kZg z^i32QiH;?7eZH0w(@J7kd77dJKk!N%UHZ#td%1*-RdrEd;Kwbht~qEB&*l}y!?>Yl zDOS?MOhv41avya0=|wluOFIG!ZpLA$_6J=##;;s#1Ra$7TE}J56@@0Gs$mv9(WR+{*-t^7d) z)A*Nti$jRK<~_KThY_#ztE!*WPCS`iL*s>7K>3@{RxOK7Yu@L3q0o)XBoD!voNI@( z!4&<^rPwN@ZSqF~0Mtl+5XYZO;bLiKXU6vP%<;o zLN%sJCe3NPxS`6C)D)hcoH@cz#T3Tp2SU$?T6<-;f!SexbPAdHNCH_?+@ix~DJU+X z+Vk+jCO+IWn28}o$*6R5`jgkK15=Zzi}HjV^~QEadP~>iu3P>`E*}r^IvFfoc=}hD*o_<;)W4r{5*Z6ULZ*ch@-g?CVv^Th8HbPHEG0Un#yB zlQpXr$LJTHN;xE%H23!W*hv!G! z{1M=Q^E|a$to`|e@z&TAL^B~mYN0PeLo1wRu(GRa`_{3v!?rw zQEplyM-1@#*$Zx_WP!$omMC9IjMf2X{1#-avKU~oW5#Fh{1E|8UyL>1lNowF>urfJ z>8o$khST6elkI~p#PS>ObCEKU#-=jle?3o(#o60e*`9HYx%8J4R@o$%Ad{C<;LN9{ zk0pqJSRweNrEH;BKx}K>2wnM^COkq&o28}i4ao+AaK5A-J8zLd4?3LD#K%zycf96`r`YCR($VO zRD;Pz`I(KM#H!s>iG;po3^;J)6FU(&2lg>7c&Fg>ExaN07vKhq>mY?I ziL$(Q>q%89>g7dR#9wP@#L!jKmfg)_=qv@_c6PM{SR4uaJu_HzUz}WUbDj3w+1W37 zE^Je;hxe?gt@%ZjclB0$UXsGMKu%s|XGwe633|C1=3Z~2Pdh`vxb=`in!vugCV?gq z!&)Si3gKKv(m;Mc@|G`FRDSX@$T+mmyJm;cQ^u#H8&FjF)FIzOk622pJ4 zbqpx*=h{qxc{?K`>lgNrGT-*NuTbgHhzZga$J8wj3YQSmn^haL!c#zGz4Aw1ym?mM zQC{$&9oSgQU)*2O?yvWyjqga6pc9R4t5qm2nwgc3K4MH|-iBxD0r@w_aVQ;SM=prkF`efB1U z5XMceLL7T~d5KXra8hBpUE$}|Ogg3z!UT z0!tm7ZTlU~spxh-vzA+4r@L5*Y!rZkdKHI?NO`54EPwJ!v@CgFFPTOTB`uUOlnIv! z8cK@8`-!Kj_zZ#>gtc9mo^RH%tk|1- zs3~+rf_YYB&!bjw-ZLV6GfW~wW26Qy6~Dn>GaXv_uA~7F8J$`2+k^*`cQtoBEonS0$ECh@Yl|WY4nJ=R~QCObP zpwpmj0DhBkEM#QMGOK;Ghn@&-h_zAQ6vVcmxU-gC4KJ8(_GvGFuh%4_ zkNaxg)$%4;(`q&ctpa!O??%a&!U57I;z1Emqv|3KKyQ=>^_g$6S7r-941m z8pVVraNX_sv4vUN1#0bM6#~$#NGXZ(dsb+c6ud)7N(jBOD)Sj6S{`YbrfO^dV9GPy zu2_uof~G|E8M=HyzJ<6WE+W%oy47@a4rE>HG%5Kh@0mF8BzH_I8}TShs`WHUe~S;| zHdxX1cxjg(eQ}Jp*@S|XPjmU<(7f%F9mgc=W9c5>shL~oI+cT~H=6r=@r}}(L?Oa~ zN_E-O_@Eh$U@h0uC81mC)m6S`hu`X_Nmh;p7p9v=*qi7d`uV9D&L4R_Qn^$1Jv&w# z+IO+k76mQsq=G#v<_K~G?9(hedb=4K+<0xR+r>IF!p?}s7cU(2zqDE3Y?Ph9Pes3Y zKp=qv=Pmc@$Q$FpVZPK8D=g_Ckv4|M%f#Q+`GpGE!~F^0eky@;_3jQ*P1Qn($Q#wC zKyPzyi8uV+GlO_kCH6W_n))dl{v$D5vW`o$-ivMxX!3w*N zR9A4AXGd?toOwC=NsLv-#KLa8xWTVNthzgVPkC1rFNk?C?145n^CQQr9VzW$LS;Oz z*cmzy=!4XNjFy%E6O&oG0y~pbd?sy2f#O8V4$&<-IL3xz?we~XkuE7|202<9v_>)c zGZ04&uIdE(F-^yA__XS=mL=IN`|R7q0nxXMdp9LT?%u1Eh;D)f9Iv2w6S&dLvk0&zHc9$oJ zIi`N#sY+f)OswA5ptdQ3ZIrKX>0|dj{;NVW9nwmoSDO;@@8piDYtjUi$_O_kV;9X2HfqRHY)0S zdNVFEC#6|xKPHemJ|3LY!{kNgX*r+LZ72aVLF)Mp+;3$|HU_FCK3y%+4dB(7I4;++ z&+@irooJ^W_hZW@H4gj96z9E1qZScM7&mNN6OHe?SPhGRy}Z0Y#bFClHapQ5JUxET z!>QSfii#KCcpr*;QrCpvFW?zM^o){hTmPv|!Kmyq`V&%7_-N0!(S{^ns**_`Gf!0Z z7j>DVZS;k8piDi5fN z)4njHEi>=fY)P(zk0I(pdS#Rr__@&NiElZt#orFg?I9_?1Z5~0FoaQL;`IvjY?V=Y zmpt()q8gHH%Swdv$76N!fFpXxLC}&=LnVB!@ND>bT(2^2psr$Q3=+TR8&rsQI4zN{% zqMoP~7m2b(vJ5b|mB>-KC40T)_Bkc1=>so87VR7Nx2n^XSaQ!+x*Bb9ZiVK%ZeY1) z>+HJUJq60TrI3~X!ugVG!AOh`bzE`I7vkCVT-a$~upCh$9S};8Eo(ho=>ICVgfPKx z4bs1bajUGV1l{>^Dwi!}imK>vPODJjHKZ0j?^$;H+eOayaHIZF*${_CyML2=f3lKD zRRtv&A!(yN0#Pk#3t)lLrzj0YcYOniz-Ky*QPOMi?j%=d?!+VlPPJ<(XV?0i?Kk_G z+A5*R-oWms_k+DYB%LY%4;*4F_zZJjSB_S{f#bznfjD_^>N}rDvcwUY9aj5l=7Y?% znRU0>?>Es`CZvmnTN%VWPR1r4w>qXHp2Vf-RWv4m+tg-w|_92CRS|h z#>D38UV@{UYXgrd+$7>>49bkigx!z=SHY5~Wn;x?*&Nx0fZ1jp_Lug4TtWj$taKg8 z_vNaYJSe`RHoY4+Y>)R%W{f4ni{5FQGZ|Q+O%1nx{`kelDei0`eo~v@E}z8Hg6--Y z3-qYdG5!+#1k8%gKr-RUF7L-}2(^aT`%t}7>;%a)OxBK1Lp2UQFTT zP!`fRsXb11anAwz^V+sDZNx&`Smz&yhxm+cC1O)^{d(hsc`!0x z)8Gh)OV@V*TP3K)VE^0Iup{4$>jB*DFi6|)<`w80MO01otIOD!8HatD*V{>N<4}E= zbGe6m`U4JC`OxE-W=F?8{5U_Us|(BWNbO$TBiP8Sd6Ukr*g79-9!erJkSJ*#L_?6) zbe}WQMI$L#C+u}Wg{GZw1q*9zA`efo7hRx!@{6vi9lCL@u8m1=z4;68KBVV(`1|iy zTxWl)@v_$zgyJxbTfw{l8&(fCbudwOasWHCnK(F^{Rnpyx&JK#!!!qoj8{?WroanY zQn<#%IA9s2vzDHxwT77&l4D&W z|BeY*#9Cn31;i{bogV0KIQ}pYz;q?4J)J5?N4Pd-Q*pf!DBIg2+o;JF*#_0q+I6mfgP=P;gGNh z6tT6U1(%)c*~7kzdU4S06C*7GEoP4t`n(MC+qkb9xnrXpCNaY=zEfOobdgkKXteSn zgoI`DPS)_E$ok|5_}CUnkwd=e_3PPnAq@n4Z5iF`8$86_+_H^7p04c4d9&`gaQY(` z{Xr9fa4dj-_bkW17VKZw-#po&EcbVSzqg+MCHUhS5A%w@G@|bc{=JX)S3w_Gi{RgP z_3q-_?eF|Ts(?MNa;MvKSNLv6;+OCv)Zg|e?gHGc5B&nz#rpr>{y$ZV?gHMe1N;I+ z$GHpmtGx1`qiDbyFMlfK&!6@4^0T&Z7v=6&`qyvC!uyHx?>5$Vf$nZcegWkX{sj89 zIk_u(m*oBu4JH2VKlw>{@1oqLWxr5JVKoAnKJJjRyW^iE?Jnrwna3|k03eza0Qd(F mxhwwn?D1D|6wqJ9|I8)Ia)>au008d8UT