Fix bug #49020 - Workaround Excel outputting invalid XML in button definitions by not closing BR tags

git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@941399 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Nick Burch 2010-05-05 17:49:59 +00:00
parent 7b043c2bc4
commit ede8beb227
6 changed files with 215 additions and 1 deletions

View File

@ -34,6 +34,7 @@
<changes> <changes>
<release version="3.7-SNAPSHOT" date="2010-??-??"> <release version="3.7-SNAPSHOT" date="2010-??-??">
<action dev="POI-DEVELOPERS" type="fix">49020 - Workaround Excel outputting invalid XML in button definitions by not closing BR tags</action>
<action dev="POI-DEVELOPERS" type="fix">49050 - Improve performance of AbstractEscherHolderRecord when there are lots of Continue Records</action> <action dev="POI-DEVELOPERS" type="fix">49050 - Improve performance of AbstractEscherHolderRecord when there are lots of Continue Records</action>
<action dev="POI-DEVELOPERS" type="fix">49194 - Correct text size limit for OOXML .xlsx files</action> <action dev="POI-DEVELOPERS" type="fix">49194 - Correct text size limit for OOXML .xlsx files</action>
<action dev="POI-DEVELOPERS" type="fix">49254 - Fix CellUtils.setFont to use the correct type internally</action> <action dev="POI-DEVELOPERS" type="fix">49254 - Fix CellUtils.setFont to use the correct type internally</action>

View File

@ -20,6 +20,7 @@ package org.apache.poi.xssf.usermodel;
import org.apache.poi.POIXMLDocumentPart; import org.apache.poi.POIXMLDocumentPart;
import org.apache.poi.openxml4j.opc.PackagePart; import org.apache.poi.openxml4j.opc.PackagePart;
import org.apache.poi.openxml4j.opc.PackageRelationship; import org.apache.poi.openxml4j.opc.PackageRelationship;
import org.apache.poi.xssf.util.EvilUnclosedBRFixingInputStream;
import org.apache.xmlbeans.XmlException; import org.apache.xmlbeans.XmlException;
import org.apache.xmlbeans.XmlOptions; import org.apache.xmlbeans.XmlOptions;
import org.apache.xmlbeans.XmlObject; import org.apache.xmlbeans.XmlObject;
@ -54,6 +55,11 @@ import schemasMicrosoftComOfficeExcel.STObjectType;
* need a file format for drawings are strongly encouraged to use preferentially DrawingML * need a file format for drawings are strongly encouraged to use preferentially DrawingML
* </p> * </p>
* *
* <p>
* Warning - Excel is known to put invalid XML into these files!
* For example, &lt;br&gt; without being closed or escaped crops up.
* </p>
*
* See 6.4 VML - SpreadsheetML Drawing in Office Open XML Part 4 - Markup Language Reference.pdf * See 6.4 VML - SpreadsheetML Drawing in Office Open XML Part 4 - Markup Language Reference.pdf
* *
* @author Yegor Kozlov * @author Yegor Kozlov
@ -98,7 +104,9 @@ public final class XSSFVMLDrawing extends POIXMLDocumentPart {
protected void read(InputStream is) throws IOException, XmlException { protected void read(InputStream is) throws IOException, XmlException {
XmlObject root = XmlObject.Factory.parse(is); XmlObject root = XmlObject.Factory.parse(
new EvilUnclosedBRFixingInputStream(is)
);
_qnames = new ArrayList<QName>(); _qnames = new ArrayList<QName>();
_items = new ArrayList<XmlObject>(); _items = new ArrayList<XmlObject>();

View File

@ -0,0 +1,116 @@
/* ====================================================================
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==================================================================== */
package org.apache.poi.xssf.util;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
/**
 * This is a seriously sick fix for the fact that some .xlsx
 *  files contain raw bits of HTML, without being escaped
 *  or properly turned into XML.
 * The result is that they contain things like &lt;br&gt;,
 *  which breaks the XML parsing.
 * This very sick InputStream wrapper spots each literal,
 *  lower-case &lt;br&gt; going past and rewrites it as &lt;br/&gt;.
 *
 * <p>The rewrite is applied on every read path (single-byte and bulk),
 *  regardless of how the tag happens to be split across reads of the
 *  underlying stream, and never writes more than the requested number
 *  of bytes into the caller's buffer.</p>
 *
 * <p>Only works for UTF-8 and US-ASCII based streams!
 * It should only be used where experience shows the problem
 *  can occur...</p>
 *
 * <p>Instances hold unsynchronized mutable state, so a single instance
 *  must not be shared between threads.</p>
 */
public class EvilUnclosedBRFixingInputStream extends InputStream {
    /** The byte sequence to look for. */
    private static final byte[] DETECT = new byte[] {
        (byte)'<', (byte)'b', (byte)'r', (byte)'>'
    };
    /** What each occurrence of {@link #DETECT} is turned into. */
    private static final byte[] REPLACEMENT = new byte[] {
        (byte)'<', (byte)'b', (byte)'r', (byte)'/', (byte)'>'
    };
    /** Largest number of bytes pulled from the source in one go. */
    private static final int CHUNK = 4096;

    private final InputStream source;

    /** Raw bytes as read from the source. */
    private final byte[] in = new byte[CHUNK];
    /**
     * Fixed-up bytes waiting to be handed to the caller. Sized for the
     *  worst case of one chunk: up to three held-back bytes plus a full
     *  chunk, with every four input bytes growing to five.
     */
    private final byte[] out = new byte[CHUNK + CHUNK / 4 + 8];
    /** Next byte of {@link #out} to hand out. */
    private int outPos;
    /** One past the last valid byte of {@link #out}. */
    private int outLimit;

    /**
     * How many leading bytes of {@link #DETECT} have been seen but not
     *  yet emitted, because we don't know yet if they start a match.
     */
    private int matched;
    /** Set once the source has reported end of stream. */
    private boolean eof;

    /**
     * @param source the stream to read from and fix up
     */
    public EvilUnclosedBRFixingInputStream(InputStream source) {
        this.source = source;
    }

    /**
     * Reads a single, fixed-up byte.
     *
     * @return the next byte as 0-255, or -1 at the end of the stream
     * @throws IOException if the underlying stream fails
     */
    @Override
    public int read() throws IOException {
        if(outPos >= outLimit && !fill(1)) {
            return -1;
        }
        return out[outPos++] & 0xFF;
    }

    /**
     * Reads up to <code>len</code> fixed-up bytes into <code>b</code>.
     *
     * @param b destination buffer
     * @param off index in <code>b</code> of the first byte to write
     * @param len maximum number of bytes to write
     * @return the number of bytes written, 0 if <code>len</code> is 0,
     *  or -1 at the end of the stream
     * @throws IOException if the underlying stream fails
     * @throws IndexOutOfBoundsException if <code>off</code>/<code>len</code>
     *  don't describe a range inside <code>b</code>
     */
    @Override
    public int read(byte[] b, int off, int len) throws IOException {
        if(off < 0 || len < 0 || len > b.length - off) {
            throw new IndexOutOfBoundsException();
        }
        if(len == 0) {
            return 0;
        }
        if(outPos >= outLimit && !fill(len)) {
            return -1;
        }

        // Hand over what fits; anything left stays queued for next time
        int count = Math.min(len, outLimit - outPos);
        System.arraycopy(out, outPos, b, off, count);
        outPos += count;
        return count;
    }

    /**
     * Reads up to <code>b.length</code> fixed-up bytes into <code>b</code>.
     *
     * @return the number of bytes written, or -1 at the end of the stream
     * @throws IOException if the underlying stream fails
     */
    @Override
    public int read(byte[] b) throws IOException {
        return this.read(b, 0, b.length);
    }

    /**
     * Refills the (empty) output queue from the source, reading until
     *  at least one output byte is available or the source is exhausted.
     * No more than <code>wanted</code> bytes are requested from the source
     *  per underlying read, so we don't read further ahead than the caller
     *  asked for.
     *
     * @return false if the stream is finished and nothing is left to return
     */
    private boolean fill(int wanted) throws IOException {
        outPos = 0;
        outLimit = 0;
        int chunk = Math.min(wanted, in.length);

        // A read may yield only held-back bytes (eg just "<b"), in
        //  which case there is nothing to return yet and we go again
        while(outLimit == 0) {
            if(eof) {
                return false;
            }

            int got = source.read(in, 0, chunk);
            if(got == 0) {
                // Badly behaved source; the single byte read has to
                //  either block for data or report the end
                int single = source.read();
                if(single < 0) {
                    got = -1;
                } else {
                    in[0] = (byte)single;
                    got = 1;
                }
            }

            if(got < 0) {
                // No match can complete any more, let the held bytes go
                eof = true;
                releaseHeld();
            } else {
                for(int k=0; k<got; k++) {
                    accept(in[k]);
                }
            }
        }
        return true;
    }

    /**
     * Feeds one source byte through the matcher, appending whatever
     *  is now known to be safe to emit to the output queue.
     */
    private void accept(byte c) {
        if(c == DETECT[matched]) {
            matched++;
            if(matched == DETECT.length) {
                System.arraycopy(REPLACEMENT, 0, out, outLimit, REPLACEMENT.length);
                outLimit += REPLACEMENT.length;
                matched = 0;
            }
            return;
        }

        // Not a match after all. As '<' only occurs at the start of the
        //  pattern, none of the held bytes can begin a new match, but
        //  the current byte still might
        releaseHeld();
        if(c == DETECT[0]) {
            matched = 1;
        } else {
            out[outLimit++] = c;
        }
    }

    /**
     * Emits, unchanged, the bytes held back as a possible match start.
     */
    private void releaseHeld() {
        System.arraycopy(DETECT, 0, out, outLimit, matched);
        outLimit += matched;
        matched = 0;
    }
}

View File

@ -138,4 +138,14 @@ public final class TestXSSFBugs extends BaseTestBugzillaIssues {
assertEquals(1, rels.size()); assertEquals(1, rels.size());
assertEquals("Sheet1!A1", rels.get(0).getPackageRelationship().getTargetURI().getFragment()); assertEquals("Sheet1!A1", rels.get(0).getPackageRelationship().getTargetURI().getFragment());
} }
/**
 * Bug 49020: Excel will sometimes write a button whose textbox
 *  contains a raw &lt;br&gt; that is never closed.
 * Excel clearly shouldn't do this, but check that we can still
 *  read the file despite the naughtiness.
 */
public void test49020() throws Exception {
    // Being able to open the workbook without an exception is the whole test
    XSSFTestDataSamples.openSampleWorkbook("BrNotClosed.xlsx");
}
} }

View File

@ -0,0 +1,79 @@
/* ====================================================================
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==================================================================== */
package org.apache.poi.xssf.util;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import junit.framework.TestCase;
/**
 * Checks that {@link EvilUnclosedBRFixingInputStream} leaves
 *  well-formed markup alone, and closes an unclosed br tag.
 */
public final class TestEvilUnclosedBRFixingInputStream extends TestCase {
    /**
     * Markup with nothing to fix must come back byte-for-byte unchanged.
     */
    public void testOK() throws Exception {
        byte[] ok = "<p><div>Hello There!</div> <div>Tags!</div></p>".getBytes("UTF-8");

        byte[] result = drain(
            new EvilUnclosedBRFixingInputStream(new ByteArrayInputStream(ok))
        );

        assertEquals(ok, result);
    }

    /**
     * An unclosed br tag in the input must come back self-closed.
     */
    public void testProblem() throws Exception {
        byte[] orig = "<p><div>Hello<br>There!</div> <div>Tags!</div></p>".getBytes("UTF-8");
        byte[] fixed = "<p><div>Hello<br/>There!</div> <div>Tags!</div></p>".getBytes("UTF-8");

        byte[] result = drain(
            new EvilUnclosedBRFixingInputStream(new ByteArrayInputStream(orig))
        );

        assertEquals(fixed, result);
    }

    /**
     * Byte-array comparison: lengths first, then each byte in turn,
     *  reporting the index of the first difference.
     */
    protected void assertEquals(byte[] a, byte[] b) {
        assertEquals(a.length, b.length);
        int i = 0;
        while(i < a.length) {
            assertEquals("Wrong byte at index " + i, a[i], b[i]);
            i++;
        }
    }

    /**
     * Reads the stream in 1024 byte blocks until a read returns
     *  nothing, and returns everything that was read.
     */
    private static byte[] drain(EvilUnclosedBRFixingInputStream inp) throws Exception {
        ByteArrayOutputStream collected = new ByteArrayOutputStream();
        byte[] block = new byte[1024];
        for(int count = inp.read(block); count > 0; count = inp.read(block)) {
            collected.write(block, 0, count);
        }
        return collected.toByteArray();
    }
}

Binary file not shown.