poi/src/ooxml/java/org/apache/poi/xssf/util/EvilUnclosedBRFixingInputSt...

210 lines
6.4 KiB
Java
Raw Normal View History

/* ====================================================================
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==================================================================== */
package org.apache.poi.xssf.util;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
/**
 * This is a seriously sick fix for the fact that some .xlsx
 *  files contain raw bits of HTML, without being escaped
 *  or properly turned into XML.
 * The result is that they contain things like {@code <br>},
 *  which breaks the XML parsing.
 * This very sick InputStream wrapper attempts to spot
 *  these go past, and turns each one into {@code <br/>}.
 * Only works for UTF-8 and US-ASCII based streams!
 * It should only be used where experience shows the problem
 *  can occur...
 *
 * <p>Implementation note: data is pulled from the wrapped stream in
 *  chunks, fixed up into an internal buffer, and then handed out from
 *  there. A possible start of a tag at the very end of a chunk
 *  ({@code <}, {@code <b} or {@code <br}) is held back until the next
 *  chunk (or the end of the stream) shows whether it really is one.
 *  Instances are not safe for use from several threads at once.</p>
 */
public class EvilUnclosedBRFixingInputStream extends InputStream {
    /** The byte sequence we are hunting for: {@code <br>} */
    private static final byte[] DETECT = {
        (byte)'<', (byte)'b', (byte)'r', (byte)'>'
    };
    /** What every occurrence of {@link #DETECT} is replaced with: {@code <br/>} */
    private static final byte[] REPLACEMENT = {
        (byte)'<', (byte)'b', (byte)'r', (byte)'/', (byte)'>'
    };
    /** How many bytes we ask the wrapped stream for in one go */
    private static final int CHUNK_SIZE = 4096;

    private final InputStream source;
    /** Already fixed bytes waiting to be handed out, valid in [sparePos, spareEnd) */
    private byte[] spare = new byte[0];
    private int sparePos;
    private int spareEnd;
    /**
     * Number of bytes held back from the end of the previous chunk because
     *  they could be the start of a tag. They are, by construction, always
     *  equal to the first <code>carried</code> bytes of {@link #DETECT}, so
     *  only the count needs to be remembered.
     */
    private int carried;
    /** Set once the wrapped stream has reported end-of-stream */
    private boolean eof;

    /**
     * @param source the stream to read the (possibly broken) data from
     */
    public EvilUnclosedBRFixingInputStream(InputStream source) {
        this.source = source;
    }

    /**
     * Reads a single, already fixed, byte.
     *
     * @return the next byte as 0-255, or -1 at the end of the stream
     * @throws IOException if the wrapped stream fails
     */
    @Override
    public int read() throws IOException {
        if(! ensureSpare()) {
            return -1;
        }
        return spare[sparePos++] & 0xFF;
    }

    /**
     * Reads up to <code>len</code> already fixed bytes into <code>b</code>,
     *  starting at <code>off</code>. Never touches anything outside of
     *  <code>b[off] .. b[off+len-1]</code>, and never returns 0 unless
     *  <code>len</code> is 0.
     *
     * @param b the buffer to read into
     * @param off where in the buffer to start writing
     * @param len the maximum number of bytes to deliver
     * @return the number of bytes delivered, or -1 at the end of the stream
     * @throws IOException if the wrapped stream fails
     * @throws IndexOutOfBoundsException if off / len don't fit the buffer
     */
    @Override
    public int read(byte[] b, int off, int len) throws IOException {
        if(b == null) {
            throw new NullPointerException("Buffer must not be null");
        }
        if(off < 0 || len < 0 || len > b.length - off) {
            throw new IndexOutOfBoundsException(
                    "off=" + off + ", len=" + len + ", buffer length=" + b.length
            );
        }
        if(len == 0) {
            return 0;
        }
        if(! ensureSpare()) {
            return -1;
        }

        int count = Math.min(len, spareEnd - sparePos);
        System.arraycopy(spare, sparePos, b, off, count);
        sparePos += count;
        return count;
    }

    @Override
    public int read(byte[] b) throws IOException {
        return this.read(b, 0, b.length);
    }

    /**
     * Closes the wrapped stream too, so that closing just
     *  the wrapper doesn't leak it.
     */
    @Override
    public void close() throws IOException {
        source.close();
    }

    /**
     * Makes sure there is at least one fixed byte waiting in the
     *  spare buffer, pulling in more chunks if needed.
     *
     * @return false if the stream is exhausted, true otherwise
     */
    private boolean ensureSpare() throws IOException {
        // A chunk can yield nothing at all (when all of it is held
        //  back as a possible tag start), hence the loop
        while(sparePos == spareEnd) {
            if(eof) {
                return false;
            }
            fill();
        }
        return true;
    }

    /**
     * Pulls the next chunk from the wrapped stream, and replaces the
     *  (empty) spare buffer with the fixed up version of it.
     */
    private void fill() throws IOException {
        byte[] raw = new byte[carried + CHUNK_SIZE];
        // Put back what we held over from last time
        System.arraycopy(DETECT, 0, raw, 0, carried);

        int got = source.read(raw, carried, CHUNK_SIZE);
        if(got == 0) {
            // The source didn't give us anything, even though we asked
            //  for some. Use the single byte read, which has to block
            //  until there's data, so we don't spin
            int single = source.read();
            if(single < 0) {
                got = -1;
            } else {
                raw[carried] = (byte)single;
                got = 1;
            }
        }

        int total = carried;
        if(got < 0) {
            eof = true;
        } else {
            total += got;
        }
        carried = 0;

        // Worst case is back-to-back tags, each growing 4 bytes into 5
        byte[] fixed = new byte[total + total / DETECT.length];
        int out = 0;
        int in = 0;
        while(in < total) {
            int matched = matchLength(raw, in, total);
            if(matched == DETECT.length) {
                // One to fix
                System.arraycopy(REPLACEMENT, 0, fixed, out, REPLACEMENT.length);
                out += REPLACEMENT.length;
                in += DETECT.length;
            } else if(matched > 0 && in + matched == total && !eof) {
                // There could be a <br> hanging over the end, eg <br|
                // Hold it back until we've seen what follows
                carried = matched;
                in = total;
            } else {
                // Plain data (or a tag start with nothing left to
                //  complete it), pass it through as-is
                fixed[out++] = raw[in++];
            }
        }

        spare = fixed;
        sparePos = 0;
        spareEnd = out;
    }

    /**
     * Works out how many leading bytes of {@link #DETECT} are found
     *  in <code>data</code> starting at <code>from</code>, without looking
     *  at or beyond <code>end</code>.
     */
    private static int matchLength(byte[] data, int from, int end) {
        int matched = 0;
        while(matched < DETECT.length
                && from + matched < end
                && data[from + matched] == DETECT[matched]) {
            matched++;
        }
        return matched;
    }
}