From ede8beb2278b26852141f1dc6a97ec699d49136d Mon Sep 17 00:00:00 2001
From: Nick Burch
Date: Wed, 5 May 2010 17:49:59 +0000
Subject: [PATCH] Fix bug #49020 - Workaround Excel outputting invalid XML in
button definitions by not closing BR tags
git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@941399 13f79535-47bb-0310-9956-ffa450edef68
---
src/documentation/content/xdocs/status.xml | 1 +
.../poi/xssf/usermodel/XSSFVMLDrawing.java | 10 +-
.../util/EvilUnclosedBRFixingInputStream.java | 116 ++++++++++++++++++
.../poi/xssf/usermodel/TestXSSFBugs.java | 10 ++
.../TestEvilUnclosedBRFixingInputStream.java | 79 ++++++++++++
test-data/spreadsheet/BrNotClosed.xlsx | Bin 0 -> 9625 bytes
6 files changed, 215 insertions(+), 1 deletion(-)
create mode 100644 src/ooxml/java/org/apache/poi/xssf/util/EvilUnclosedBRFixingInputStream.java
create mode 100644 src/ooxml/testcases/org/apache/poi/xssf/util/TestEvilUnclosedBRFixingInputStream.java
create mode 100644 test-data/spreadsheet/BrNotClosed.xlsx
diff --git a/src/documentation/content/xdocs/status.xml b/src/documentation/content/xdocs/status.xml
index 78cb7d240..6d8c5e58c 100644
--- a/src/documentation/content/xdocs/status.xml
+++ b/src/documentation/content/xdocs/status.xml
@@ -34,6 +34,7 @@
+ 49020 - Workaround Excel outputting invalid XML in button definitions by not closing BR tags
49050 - Improve performance of AbstractEscherHolderRecord when there are lots of Continue Records
49194 - Correct text size limit for OOXML .xlsx files
49254 - Fix CellUtils.setFont to use the correct type internally
diff --git a/src/ooxml/java/org/apache/poi/xssf/usermodel/XSSFVMLDrawing.java b/src/ooxml/java/org/apache/poi/xssf/usermodel/XSSFVMLDrawing.java
index a27d657bc..a1458662b 100644
--- a/src/ooxml/java/org/apache/poi/xssf/usermodel/XSSFVMLDrawing.java
+++ b/src/ooxml/java/org/apache/poi/xssf/usermodel/XSSFVMLDrawing.java
@@ -20,6 +20,7 @@ package org.apache.poi.xssf.usermodel;
import org.apache.poi.POIXMLDocumentPart;
import org.apache.poi.openxml4j.opc.PackagePart;
import org.apache.poi.openxml4j.opc.PackageRelationship;
+import org.apache.poi.xssf.util.EvilUnclosedBRFixingInputStream;
import org.apache.xmlbeans.XmlException;
import org.apache.xmlbeans.XmlOptions;
import org.apache.xmlbeans.XmlObject;
@@ -53,6 +54,11 @@ import schemasMicrosoftComOfficeExcel.STObjectType;
* considered a deprecated format included in Office Open XML for legacy reasons only and new applications that
* need a file format for drawings are strongly encouraged to use preferentially DrawingML
*
+ *
+ *
+ * Warning - Excel is known to put invalid XML into these files!
+ * For example, >br< without being closed or escaped crops up.
+ *
*
* See 6.4 VML - SpreadsheetML Drawing in Office Open XML Part 4 - Markup Language Reference.pdf
*
@@ -98,7 +104,9 @@ public final class XSSFVMLDrawing extends POIXMLDocumentPart {
protected void read(InputStream is) throws IOException, XmlException {
- XmlObject root = XmlObject.Factory.parse(is);
+ XmlObject root = XmlObject.Factory.parse(
+ new EvilUnclosedBRFixingInputStream(is)
+ );
_qnames = new ArrayList();
_items = new ArrayList();
diff --git a/src/ooxml/java/org/apache/poi/xssf/util/EvilUnclosedBRFixingInputStream.java b/src/ooxml/java/org/apache/poi/xssf/util/EvilUnclosedBRFixingInputStream.java
new file mode 100644
index 000000000..7e373b393
--- /dev/null
+++ b/src/ooxml/java/org/apache/poi/xssf/util/EvilUnclosedBRFixingInputStream.java
@@ -0,0 +1,116 @@
+/* ====================================================================
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+==================================================================== */
+package org.apache.poi.xssf.util;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.ArrayList;
+
+/**
+ * This is a seriously sick fix for the fact that some .xlsx
+ * files contain raw bits of HTML, without being escaped
+ * or properly turned into XML.
+ * The result is that they contain things like >br<,
+ * which breaks the XML parsing.
+ * This very sick InputStream wrapper attempts to spot
+ * these go past, and fix them.
+ * Only works for UTF-8 and US-ASCII based streams!
+ * It should only be used where experience shows the problem
+ * can occur...
+ */
+public class EvilUnclosedBRFixingInputStream extends InputStream {
+ private InputStream source;
+ private byte[] spare;
+
+ private static byte[] detect = new byte[] {
+ (byte)'<', (byte)'b', (byte)'r', (byte)'>'
+ };
+
+ public EvilUnclosedBRFixingInputStream(InputStream source) {
+ this.source = source;
+ }
+
+ /**
+ * Warning - doesn't fix!
+ */
+ @Override
+ public int read() throws IOException {
+ return source.read();
+ }
+
+ @Override
+ public int read(byte[] b, int off, int len) throws IOException {
+ if(spare != null) {
+ // This is risky, but spare is normally only a byte or two...
+ System.arraycopy(spare, 0, b, off, spare.length);
+ int ret = spare.length;
+ spare = null;
+ return ret;
+ }
+
+ int read = source.read(b, off, len);
+ read = fixUp(b, off, read);
+ return read;
+ }
+
+ @Override
+ public int read(byte[] b) throws IOException {
+ return this.read(b, 0, b.length);
+ }
+
+ private int fixUp(byte[] b, int offset, int read) {
+ // Find places to fix
+ ArrayList fixAt = new ArrayList();
+ for(int i=offset; i 0) {
+ spare = new byte[overshoot];
+ System.arraycopy(b, b.length-overshoot, spare, 0, overshoot);
+ read -= overshoot;
+ }
+
+ // Fix them, in reverse order so the
+ // positions are valid
+ for(int j=fixAt.size()-1; j>=0; j--) {
+ int i = fixAt.get(j);
+
+ byte[] tmp = new byte[read-i-3];
+ System.arraycopy(b, i+3, tmp, 0, tmp.length);
+ b[i+3] = (byte)'/';
+ System.arraycopy(tmp, 0, b, i+4, tmp.length);
+ // It got one longer
+ read++;
+ }
+ return read;
+ }
+}
diff --git a/src/ooxml/testcases/org/apache/poi/xssf/usermodel/TestXSSFBugs.java b/src/ooxml/testcases/org/apache/poi/xssf/usermodel/TestXSSFBugs.java
index 1d8a8fb54..7d3df7427 100644
--- a/src/ooxml/testcases/org/apache/poi/xssf/usermodel/TestXSSFBugs.java
+++ b/src/ooxml/testcases/org/apache/poi/xssf/usermodel/TestXSSFBugs.java
@@ -138,4 +138,14 @@ public final class TestXSSFBugs extends BaseTestBugzillaIssues {
assertEquals(1, rels.size());
assertEquals("Sheet1!A1", rels.get(0).getPackageRelationship().getTargetURI().getFragment());
}
+
+ /**
+ * Excel will sometimes write a button with a textbox
+ * containing >br< (not closed!).
+ * Clearly Excel shouldn't do this, but test that we can
+ * read the file despite the naughtyness
+ */
+ public void test49020() throws Exception {
+ XSSFWorkbook wb = XSSFTestDataSamples.openSampleWorkbook("BrNotClosed.xlsx");
+ }
}
diff --git a/src/ooxml/testcases/org/apache/poi/xssf/util/TestEvilUnclosedBRFixingInputStream.java b/src/ooxml/testcases/org/apache/poi/xssf/util/TestEvilUnclosedBRFixingInputStream.java
new file mode 100644
index 000000000..799d3df36
--- /dev/null
+++ b/src/ooxml/testcases/org/apache/poi/xssf/util/TestEvilUnclosedBRFixingInputStream.java
@@ -0,0 +1,79 @@
+/* ====================================================================
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+==================================================================== */
+
+package org.apache.poi.xssf.util;
+
+import java.io.ByteArrayInputStream;
+import java.io.ByteArrayOutputStream;
+
+import junit.framework.TestCase;
+
+public final class TestEvilUnclosedBRFixingInputStream extends TestCase {
+ public void testOK() throws Exception {
+ byte[] ok = "Hello There!
Tags!
".getBytes("UTF-8");
+
+ EvilUnclosedBRFixingInputStream inp = new EvilUnclosedBRFixingInputStream(
+ new ByteArrayInputStream(ok)
+ );
+
+ ByteArrayOutputStream bout = new ByteArrayOutputStream();
+ boolean going = true;
+ while(going) {
+ byte[] b = new byte[1024];
+ int r = inp.read(b);
+ if(r > 0) {
+ bout.write(b, 0, r);
+ } else {
+ going = false;
+ }
+ }
+
+ byte[] result = bout.toByteArray();
+ assertEquals(ok, result);
+ }
+
+ public void testProblem() throws Exception {
+ byte[] orig = "Hello
There!
Tags!
".getBytes("UTF-8");
+ byte[] fixed = "Hello
There!
Tags!
".getBytes("UTF-8");
+
+ EvilUnclosedBRFixingInputStream inp = new EvilUnclosedBRFixingInputStream(
+ new ByteArrayInputStream(orig)
+ );
+
+ ByteArrayOutputStream bout = new ByteArrayOutputStream();
+ boolean going = true;
+ while(going) {
+ byte[] b = new byte[1024];
+ int r = inp.read(b);
+ if(r > 0) {
+ bout.write(b, 0, r);
+ } else {
+ going = false;
+ }
+ }
+
+ byte[] result = bout.toByteArray();
+ assertEquals(fixed, result);
+ }
+
+ protected void assertEquals(byte[] a, byte[] b) {
+ assertEquals(a.length, b.length);
+ for(int i=0; ivC}Qh*3nd&W7aUmwF(
z7uN)RG+;81xB9y;i$J1+ky~x8_l+lVm4Y=xVP^p0y_mFv>Lq!>1ue5G#E^l_z^FWI
zG!Y9)#x~iRud6&>ra+IJ0*0vIKr+BAcUxN+Au<|~a(!J5r#$rj
zAoS1|`Z!v;n*v9nwxxN{Oynd42h9HI?P9|g_Srm-HAzg*H+6+de8#!g4=SLeY;z9?#9BmSYLTMP{E9H-v)Ukt?$ZPIg+~L-s^25xx|>sA=R@aBye);%%Iw
z7%7s4D=-Mz5Hm+B9+@vn7af!dz(3BM!2WPWWdff?H}_oWj0c_d+Y3nV1;&X?v87X)G-~r$s(A}2p4|8{O
zaI!IWaIpE2fBrOmgde64%T)ijkJk8)*4^w_fd@gY0W+Sh4HL;YR9{t5-zJ{`1KYk+
zHqg|~gWh=uq1nTCwgo;%$dlj#mCo+H3)c5mULMxQ(2n<~(rqrOjS6mD8@xopi>OZ8
zdPR#t&9NIXxv-z)L4AbX_{subkJ-jj;+i0CFd3;k?AUZIA7aFgV`e#;@y+x>m!JZO
z#3-GD;TW
z$kx5i7yKc`u!f}DRE`~UI?#ytYNqxg#^*%!`i5tXwjsr5fFle$G*JZ0WD_0D^O;kMdh-Mi9_ufb|o6YL?QyHxWJ3_ni+D&kZg
z^i32QiH;?7eZH0w(@J7kd77dJKk!N%UHZ#td%1*-RdrEd;Kwbht~qEB&*l}y!?>Yl
zDOS?MOhv41avya0=|wluOFIG!ZpLA$_6J=##;;s#1Ra$7TE}J56@@0Gs$mv9(WR+{*-t^7d)
z)A*Nti$jRK<~_KThY_#ztE!*WPCS`iL*s>7K>3@{RxOK7Yu@L3q0o)XBoD!voNI@(
z!4&<^rPwN@ZSqF~0Mtl+5XYZO;bLiKXU6vP%<;o
zLN%sJCe3NPxS`6C)D)hcoH@cz#T3Tp2SU$?T6<-;f!SexbPAdHNCH_?+@ix~DJU+X
z+Vk+jCO+IWn28}o$*6R5`jgkK15=Zzi}HjV^~QEadP~>iu3P>`E*}r^IvFfoc=}hD*o_<;)W4r{5*Z6ULZ*ch@-g?CVv^Th8HbPHEG0Un#yB
zlQpXr$LJTHN;xE%H23!W*hv!G!
z{1M=Q^E|a$to`|e@z&TAL^B~mYN0PeLo1wRu(GRa`_{3v!?rw
zQEplyM-1@#*$Zx_WP!$omMC9IjMf2X{1#-avKU~oW5#Fh{1E|8UyL>1lNowF>urfJ
z>8o$khST6elkI~p#PS>ObCEKU#-=jle?3o(#o60e*`9HYx%8J4R@o$%Ad{C<;LN9{
zk0pqJSRweNrEH;BKx}K>2wnM^COkq&o28}i4ao+AaK5A-J8zLd4?3LD#K%zycf96`r`YCR($VO
zRD;Pz`I(KM#H!s>iG;po3^;J)6FU(&2lg>7c&Fg>ExaN07vKhq>mY?I
ziL$(Q>q%89>g7dR#9wP@#L!jKmfg)_=qv@_c6PM{SR4uaJu_HzUz}WUbDj3w+1W37
zE^Je;hxe?gt@%ZjclB0$UXsGMKu%s|XGwe633|C1=3Z~2Pdh`vxb=`in!vugCV?gq
z!&)Si3gKKv(m;Mc@|G`FRDSX@$T+mmyJm;cQ^u#H8&FjF)FIzOk622pJ4
zbqpx*=h{qxc{?K`>lgNrGT-*NuTbgHhzZga$J8wj3YQSmn^haL!c#zGz4Aw1ym?mM
zQC{$&9oSgQU)*2O?yvWyjqga6pc9R4t5qm2nwgc3K4MH|-iBxD0r@w_aVQ;SM=prkF`efB1U
z5XMceLL7T~d5KXra8hBpUE$}|Ogg3z!UT
z0!tm7ZTlU~spxh-vzA+4r@L5*Y!rZkdKHI?NO`54EPwJ!v@CgFFPTOTB`uUOlnIv!
z8cK@8`-!Kj_zZ#>gtc9mo^RH%tk|1-
zs3~+rf_YYB&!bjw-ZLV6GfW~wW26Qy6~Dn>GaXv_uA~7F8J$`2+k^*`cQtoBEonS0$ECh@Yl|WY4nJ=R~QCObP
zpwpmj0DhBkEM#QMGOK;Ghn@&-h_zAQ6vVcmxU-gC4KJ8(_GvGFuh%4_
zkNaxg)$%4;(`q&ctpa!O??%a&!U57I;z1Emqv|3KKyQ=>^_g$6S7r-941m
z8pVVraNX_sv4vUN1#0bM6#~$#NGXZ(dsb+c6ud)7N(jBOD)Sj6S{`YbrfO^dV9GPy
zu2_uof~G|E8M=HyzJ<6WE+W%oy47@a4rE>HG%5Kh@0mF8BzH_I8}TShs`WHUe~S;|
zHdxX1cxjg(eQ}Jp*@S|XPjmU<(7f%F9mgc=W9c5>shL~oI+cT~H=6r=@r}}(L?Oa~
zN_E-O_@Eh$U@h0uC81mC)m6S`hu`X_Nmh;p7p9v=*qi7d`uV9D&L4R_Qn^$1Jv&w#
z+IO+k76mQsq=G#v<_K~G?9(hedb=4K+<0xR+r>IF!p?}s7cU(2zqDE3Y?Ph9Pes3Y
zKp=qv=Pmc@$Q$FpVZPK8D=g_Ckv4|M%f#Q+`GpGE!~F^0eky@;_3jQ*P1Qn($Q#wC
zKyPzyi8uV+GlO_kCH6W_n))dl{v$D5vW`o$-ivMxX!3w*N
zR9A4AXGd?toOwC=NsLv-#KLa8xWTVNthzgVPkC1rFNk?C?145n^CQQr9VzW$LS;Oz
z*cmzy=!4XNjFy%E6O&oG0y~pbd?sy2f#O8V4$&<-IL3xz?we~XkuE7|202<9v_>)c
zGZ04&uIdE(F-^yA__XS=mL=IN`|R7q0nxXMdp9LT?%u1Eh;D)f9Iv2w6S&dLvk0&zHc9$oJ
zIi`N#sY+f)OswA5ptdQ3ZIrKX>0|dj{;NVW9nwmoSDO;@@8piDYtjUi$_O_kV;9X2HfqRHY)0S
zdNVFEC#6|xKPHemJ|3LY!{kNgX*r+LZ72aVLF)Mp+;3$|HU_FCK3y%+4dB(7I4;++
z&+@irooJ^W_hZW@H4gj96z9E1qZScM7&mNN6OHe?SPhGRy}Z0Y#bFClHapQ5JUxET
z!>QSfii#KCcpr*;QrCpvFW?zM^o){hTmPv|!Kmyq`V&%7_-N0!(S{^ns**_`Gf!0Z
z7j>DVZS;k8piDi5fN
z)4njHEi>=fY)P(zk0I(pdS#Rr__@&NiElZt#orFg?I9_?1Z5~0FoaQL;`IvjY?V=Y
zmpt()q8gHH%Swdv$76N!fFpXxLC}&=LnVB!@#Yy9y3Wf<(~wIxzBK{{sj^xRiMi
zq^`o_fFgca={~ND>bT(2^2psr$Q3=+TR8&rsQI4zN{%
zqMoP~7m2b(vJ5b|mB>-KC40T)_Bkc1=>so87VR7Nx2n^XSaQ!+x*Bb9ZiVK%ZeY1)
z>+HJUJq60TrI3~X!ugVG!AOh`bzE`I7vkCVT-a$~upCh$9S};8Eo(ho=>ICVgfPKx
z4bs1bajUGV1l{>^Dwi!}imK>vPODJjHKZ0j?^$;H+eOayaHIZF*${_CyML2=f3lKD
zRRtv&A!(yN0#Pk#3t)lLrzj0YcYOniz-Ky*QPOMi?j%=d?!+VlPPJ<(XV?0i?Kk_G
z+A5*R-oWms_k+DYB%LY%4;*4F_zZJjSB_S{f#bznfjD_^>N}rDvcwUY9aj5l=7Y?%
znRU0>?>Es`CZvmnTN%VWPR1r4w>qXHp2Vf-RWv4m+tg-w|_92CRS|h
z#>D38UV@{UYXgrd+$7>>49bkigx!z=SHY5~Wn;x?*&Nx0fZ1jp_Lug4TtWj$taKg8
z_vNaYJSe`RHoY4+Y>)R%W{f4ni{5FQGZ|Q+O%1nx{`kelDei0`eo~v@E}z8Hg6--Y
z3-qYdG5!+#1k8%gKr-RUF7L-}2(^aT`%t}7>;%a)OxBK1Lp2UQFTT
zP!`fRsXb11anAwz^V+sDZNx&`Smz&yhxm+cC1O)^{d(hsc`!0x
z)8Gh)OV@V*TP3K)VE^0Iup{4$>jB*DFi6|)<`w80MO01otIOD!8HatD*V{>N<4}E=
zbGe6m`U4JC`OxE-W=F?8{5U_Us|(BWNbO$TBiP8Sd6Ukr*g79-9!erJkSJ*#L_?6)
zbe}WQMI$L#C+u}Wg{GZw1q*9zA`efo7hRx!@{6vi9lCL@u8m1=z4;68KBVV(`1|iy
zTxWl)@v_$zgyJxbTfw{l8&(fCbudwOasWHCnK(F^{Rnpyx&JK#!!!qoj8{?WroanY
zQn<#%IA9s2vzDHxwT77&l4D&W
z|BeY*#9Cn31;i{bogV0KIQ}pYz;q?4J)J5?N4Pd-Q*pf!DBIg2+o;JF*#_0q+I6mfgP=P;gGNh
z6tT6U1(%)c*~7kzdU4S06C*7GEoP4t`n(MC+qkb9xnrXpCNaY=zEfOobdgkKXteSn
zgoI`DPS)_E$ok|5_}CUnkwd=e_3PPnAq@n4Z5iF`8$86_+_H^7p04c4d9&`gaQY(`
z{Xr9fa4dj-_bkW17VKZw-#po&EcbVSzqg+MCHUhS5A%w@G@|bc{=JX)S3w_Gi{RgP
z_3q-_?eF|Ts(?MNa;MvKSNLv6;+OCv)Zg|e?gHGc5B&nz#rpr>{y$ZV?gHMe1N;I+
z$GHpmtGx1`qiDbyFMlfK&!6@4^0T&Z7v=6&`qyvC!uyHx?>5$Vf$nZcegWkX{sj89
zIk_u(m*oBu4JH2VKlw>{@1oqLWxr5JVKoAnKJJjRyW^iE?Jnrwna3|k03eza0Qd(F
mxhwwn?D1D|6wqJ9|I8)Ia)>au008d8UT