Refactor the common LZW decompression code out into utils
git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@1051377 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
b31c0f88da
commit
d2583d83d5
178
src/java/org/apache/poi/util/LZWDecompresser.java
Normal file
178
src/java/org/apache/poi/util/LZWDecompresser.java
Normal file
@ -0,0 +1,178 @@
|
|||||||
|
/* ====================================================================
|
||||||
|
Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
contributor license agreements. See the NOTICE file distributed with
|
||||||
|
this work for additional information regarding copyright ownership.
|
||||||
|
The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
(the "License"); you may not use this file except in compliance with
|
||||||
|
the License. You may obtain a copy of the License at
|
||||||
|
|
||||||
|
http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
|
||||||
|
Unless required by applicable law or agreed to in writing, software
|
||||||
|
distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
See the License for the specific language governing permissions and
|
||||||
|
limitations under the License.
|
||||||
|
==================================================================== */
|
||||||
|
package org.apache.poi.util;
|
||||||
|
|
||||||
|
import java.io.ByteArrayOutputStream;
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.io.InputStream;
|
||||||
|
import java.io.OutputStream;
|
||||||
|
|
||||||
|
/**
 * This class provides common functionality for the
 *  various LZW implementations in the different file
 *  formats.
 * It's currently used by HDGF and HMEF.
 *
 * The dictionary is a 4096 byte circular buffer which records
 *  the most recently produced output. Subclasses decide how the
 *  dictionary starts out, how a stored dictionary offset maps
 *  onto the buffer, and whether a set flag bit marks a compressed
 *  or an un-compressed code.
 *
 * Two good resources on LZW are:
 *  http://en.wikipedia.org/wiki/LZW
 *  http://marknelson.us/1989/10/01/lzw-data-compression/
 */
public abstract class LZWDecompresser {
    /** Size of the circular dictionary; offsets are 12 bits wide */
    private static final int DICTIONARY_SIZE = 4096;
    /** Wraps a position onto the circular dictionary */
    private static final int DICTIONARY_MASK = DICTIONARY_SIZE - 1;
    /** The stored 4 bit length is biased by this much */
    private static final int MIN_RUN_LENGTH = 3;
    /** Longest run a single compressed code can describe (15 + 3) */
    private static final int MAX_RUN_LENGTH = 15 + MIN_RUN_LENGTH;

    /**
     * Does the mask bit mean it's compressed or uncompressed?
     */
    private final boolean maskMeansCompressed;

    /**
     * @param maskMeansCompressed <code>true</code> if a set flag bit
     *  marks a compressed code, <code>false</code> if a set flag bit
     *  marks an un-compressed (literal) byte
     */
    protected LZWDecompresser(boolean maskMeansCompressed) {
        this.maskMeansCompressed = maskMeansCompressed;
    }

    /**
     * Populates the dictionary. May not need
     *  to do anything if all zeros is fine.
     *
     * @param dict the 4096 byte dictionary, initially all zeros
     */
    protected abstract void populateDictionary(byte[] dict);

    /**
     * Adjusts the position offset if needed when looking
     *  something up in the dictionary.
     *
     * @param offset the 12 bit offset as read from the stream
     * @return the offset to start copying from; it is wrapped
     *  onto the dictionary by the caller
     */
    protected abstract int adjustDictionaryOffset(int offset);

    /**
     * Decompresses the given input stream, returning the array of bytes
     *  of the decompressed input.
     *
     * @param src the compressed data
     * @return the decompressed bytes
     * @throws IOException if reading from <code>src</code> fails
     */
    public byte[] decompress(InputStream src) throws IOException {
        ByteArrayOutputStream res = new ByteArrayOutputStream();
        decompress(src, res);
        return res.toByteArray();
    }

    /**
     * Perform a streaming decompression of the input.
     * Works by:
     * 1) Reading a flag byte, the 8 bits of which tell you if the
     *     following 8 codes are compressed or un-compressed
     * 2) Consider the 8 bits in turn, lowest bit first
     * 3) If the bit marks an un-compressed code, read the next byte,
     *     add it to the dictionary and output it
     * 4) If the bit marks a compressed code, then read in the length
     *     and start position in the dictionary, and output the bytes
     *     there (adding them to the dictionary as we go)
     * 5) Loop until we've done all 8 bits, then read in the next
     *     flag byte
     * Decompression stops quietly when the input runs out, even if
     *  that happens part way through a group of 8 codes.
     *
     * @param src the compressed data
     * @param res where the decompressed bytes are written
     * @throws IOException if reading or writing fails
     */
    public void decompress(InputStream src, OutputStream res) throws IOException {
        // Java handily initialises our buffer / dictionary
        //  to all zeros
        byte[] buffer = new byte[DICTIONARY_SIZE];
        populateDictionary(buffer);

        // How far through the output we've got
        // (This is always used masked, so it nicely wraps)
        int pos = 0;

        // Holds the bytes of one compressed run while they
        //  are copied out of the dictionary
        byte[] run = new byte[MAX_RUN_LENGTH];

        // The flag byte is treated as its 8 individual
        //  bits, which tell us if the following 8 codes
        //  are compressed or un-compressed
        int flag;
        while ((flag = src.read()) != -1) {
            // Compare each bit in our flag byte in turn:
            for (int mask = 1; mask < 256; mask <<= 1) {
                boolean isMaskSet = (flag & mask) != 0;

                // The code is un-compressed when the bit is the
                //  opposite of what "compressed" looks like for
                //  this format
                if (isMaskSet != maskMeansCompressed) {
                    // Retrieve the un-compressed code
                    int literal = src.read();
                    if (literal == -1) {
                        return;
                    }

                    // Save the byte into the dictionary
                    buffer[pos & DICTIONARY_MASK] = (byte) literal;
                    pos++;
                    // And output the byte
                    res.write(literal);
                } else {
                    // We have a compressed sequence
                    // Grab the next 16 bits of data
                    int dataIPt1 = src.read();
                    int dataIPt2 = src.read();
                    if (dataIPt1 == -1 || dataIPt2 == -1) {
                        return;
                    }

                    // Build up how long the code sequence is, and
                    //  what position of the code to start at
                    // (The position is the first byte plus the top
                    //  4 bits of the second, the length is the
                    //  bottom 4 bits of the second)
                    int len = (dataIPt2 & 15) + MIN_RUN_LENGTH;
                    int pntr = (dataIPt2 & 240) * 16 + dataIPt1;

                    // Adjust the pointer as needed
                    pntr = adjustDictionaryOffset(pntr);

                    // Copy byte by byte, writing each one back into
                    //  the dictionary straight away, so that a run
                    //  may overlap the bytes it is producing
                    for (int i = 0; i < len; i++) {
                        run[i] = buffer[(pntr + i) & DICTIONARY_MASK];
                        buffer[(pos + i) & DICTIONARY_MASK] = run[i];
                    }
                    res.write(run, 0, len);

                    // Record how far along the stream we have moved
                    pos = pos + len;
                }
            }
        }
    }

    /**
     * Given an integer, turn it into a java byte, handling
     *  the wrapping.
     * This is a convenience method
     *
     * @param b the value, normally between 0 and 255
     * @return the low 8 bits of <code>b</code> as a signed byte
     */
    public static byte fromInt(int b) {
        return (byte) b;
    }

    /**
     * Given a java byte, turn it into an integer between 0
     *  and 255 (i.e. handle the unwrapping).
     * This is a convenience method
     *
     * @param b the signed byte
     * @return the same 8 bits as a value between 0 and 255
     */
    public static int fromByte(byte b) {
        return b & 0xFF;
    }
}
|
@ -21,6 +21,8 @@ import java.io.IOException;
|
|||||||
import java.io.InputStream;
|
import java.io.InputStream;
|
||||||
import java.io.OutputStream;
|
import java.io.OutputStream;
|
||||||
|
|
||||||
|
import org.apache.poi.util.LZWDecompresser;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* A decoder for the crazy LZW implementation used
|
* A decoder for the crazy LZW implementation used
|
||||||
* in Visio.
|
* in Visio.
|
||||||
@ -33,27 +35,10 @@ import java.io.OutputStream;
|
|||||||
* http://en.wikipedia.org/wiki/LZW
|
* http://en.wikipedia.org/wiki/LZW
|
||||||
* http://marknelson.us/1989/10/01/lzw-data-compression/
|
* http://marknelson.us/1989/10/01/lzw-data-compression/
|
||||||
*/
|
*/
|
||||||
public class HDGFLZW {
|
public class HDGFLZW extends LZWDecompresser {
|
||||||
|
public HDGFLZW() {
|
||||||
/**
|
// We're the wrong way round!
|
||||||
* Given an integer, turn it into a java byte, handling
|
super(false);
|
||||||
* the wrapping.
|
|
||||||
* This is a convenience method
|
|
||||||
*/
|
|
||||||
public static byte fromInt(int b) {
|
|
||||||
if(b < 128) return (byte)b;
|
|
||||||
return (byte)(b - 256);
|
|
||||||
}
|
|
||||||
/**
|
|
||||||
* Given a java byte, turn it into an integer between 0
|
|
||||||
* and 255 (i.e. handle the unwrapping).
|
|
||||||
* This is a convenience method
|
|
||||||
*/
|
|
||||||
public static int fromByte(byte b) {
|
|
||||||
if(b >= 0) {
|
|
||||||
return b;
|
|
||||||
}
|
|
||||||
return b + 256;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@ -67,108 +52,23 @@ public class HDGFLZW {
|
|||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Decompresses the given input stream, returning the array of bytes
|
* We have a slight shift by 18 bytes
|
||||||
* of the decompressed input.
|
|
||||||
*/
|
*/
|
||||||
public byte[] decode(InputStream src) throws IOException {
|
@Override
|
||||||
ByteArrayOutputStream res = new ByteArrayOutputStream();
|
protected int adjustDictionaryOffset(int pntr) {
|
||||||
decode(src,res);
|
if(pntr > 4078) {
|
||||||
return res.toByteArray();
|
pntr = pntr - 4078;
|
||||||
|
} else {
|
||||||
|
pntr = pntr + 18;
|
||||||
|
}
|
||||||
|
return pntr;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Perform a streaming decompression of the input.
|
* We want an empty dictionary, so do nothing
|
||||||
* Works by:
|
|
||||||
* 1) Reading a flag byte, the 8 bits of which tell you if the
|
|
||||||
* following 8 codes are compressed our un-compressed
|
|
||||||
* 2) Consider the 8 bits in turn
|
|
||||||
* 3) If the bit is set, the next code is un-compressed, so
|
|
||||||
* add it to the dictionary and output it
|
|
||||||
* 4) If the bit isn't set, then read in the length and start
|
|
||||||
* position in the dictionary, and output the bytes there
|
|
||||||
* 5) Loop until we've done all 8 bits, then read in the next
|
|
||||||
* flag byte
|
|
||||||
*/
|
*/
|
||||||
public void decode(InputStream src, OutputStream res) throws IOException {
|
@Override
|
||||||
// We use 12 bit codes:
|
protected void populateDictionary(byte[] dict) {
|
||||||
// * 0-255 are real bytes
|
|
||||||
// * 256-4095 are the substring codes
|
|
||||||
// Java handily initialises our buffer / dictionary
|
|
||||||
// to all zeros
|
|
||||||
byte[] buffer = new byte[4096];
|
|
||||||
|
|
||||||
// How far through the output we've got
|
|
||||||
// (This is normally used &4095, so it nicely wraps)
|
|
||||||
int pos = 0;
|
|
||||||
// The flag byte is treated as its 8 individual
|
|
||||||
// bits, which tell us if the following 8 codes
|
|
||||||
// are compressed or un-compressed
|
|
||||||
int flag;
|
|
||||||
// The mask, between 1 and 255, which is used when
|
|
||||||
// processing each bit of the flag byte in turn
|
|
||||||
int mask;
|
|
||||||
|
|
||||||
// These are bytes as looked up in the dictionary
|
|
||||||
// It needs to be signed, as it'll get passed on to
|
|
||||||
// the output stream
|
|
||||||
byte[] dataB = new byte[19];
|
|
||||||
// This is an unsigned byte read from the stream
|
|
||||||
// It needs to be unsigned, so that bit stuff works
|
|
||||||
int dataI;
|
|
||||||
// The compressed code sequence is held over 2 bytes
|
|
||||||
int dataIPt1, dataIPt2;
|
|
||||||
// How long a code sequence is, and where in the
|
|
||||||
// dictionary to start at
|
|
||||||
int len, pntr;
|
|
||||||
|
|
||||||
while( (flag = src.read()) != -1 ) {
|
|
||||||
// Compare each bit in our flag byte in turn:
|
|
||||||
for(mask = 1; mask < 256 ; mask <<= 1) {
|
|
||||||
// Is this a new code (un-compressed), or
|
|
||||||
// the use of existing codes (compressed)?
|
|
||||||
if( (flag & mask) > 0 ) {
|
|
||||||
// Retrieve the un-compressed code
|
|
||||||
if( (dataI = src.read()) != -1) {
|
|
||||||
// Save the byte into the dictionary
|
|
||||||
buffer[(pos&4095)] = fromInt(dataI);
|
|
||||||
pos++;
|
|
||||||
// And output the byte
|
|
||||||
res.write( new byte[] {fromInt(dataI)} );
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
// We have a compressed sequence
|
|
||||||
// Grab the next 16 bits of data
|
|
||||||
dataIPt1 = src.read();
|
|
||||||
dataIPt2 = src.read();
|
|
||||||
if(dataIPt1 == -1 || dataIPt2 == -1) break;
|
|
||||||
|
|
||||||
// Build up how long the code sequence is, and
|
|
||||||
// what position of the code to start at
|
|
||||||
// (The position is the first 12 bits, the
|
|
||||||
// length is the last 4 bits)
|
|
||||||
len = (dataIPt2 & 15) + 3;
|
|
||||||
pntr = (dataIPt2 & 240)*16 + dataIPt1;
|
|
||||||
|
|
||||||
// If the pointer happens to be passed the end
|
|
||||||
// of our buffer, then wrap around
|
|
||||||
if(pntr > 4078) {
|
|
||||||
pntr = pntr - 4078;
|
|
||||||
} else {
|
|
||||||
pntr = pntr + 18;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Loop over the codes, outputting what they correspond to
|
|
||||||
for(int i=0; i<len; i++) {
|
|
||||||
dataB[i] = buffer[(pntr + i) & 4095];
|
|
||||||
buffer[ (pos + i) & 4095 ] = dataB[i];
|
|
||||||
}
|
|
||||||
res.write(dataB, 0, len);
|
|
||||||
|
|
||||||
// Record how far along the stream we have moved
|
|
||||||
pos = pos + len;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -78,7 +78,7 @@ public final class CompressedStreamStore extends StreamStore {
|
|||||||
|
|
||||||
// Decompress
|
// Decompress
|
||||||
HDGFLZW lzw = new HDGFLZW();
|
HDGFLZW lzw = new HDGFLZW();
|
||||||
byte[] decompressed = lzw.decode(bais);
|
byte[] decompressed = lzw.decompress(bais);
|
||||||
|
|
||||||
// Split into header and contents
|
// Split into header and contents
|
||||||
byte[][] ret = new byte[2][];
|
byte[][] ret = new byte[2][];
|
||||||
|
@ -139,9 +139,9 @@ public final class TestHDGFLZW extends TestCase {
|
|||||||
assertEquals(339, testTrailerComp.length);
|
assertEquals(339, testTrailerComp.length);
|
||||||
assertEquals(632, testTrailerDecomp.length);
|
assertEquals(632, testTrailerDecomp.length);
|
||||||
|
|
||||||
// Decode it using our engine
|
// decompress it using our engine
|
||||||
HDGFLZW lzw = new HDGFLZW();
|
HDGFLZW lzw = new HDGFLZW();
|
||||||
byte[] dec = lzw.decode(new ByteArrayInputStream(testTrailerComp));
|
byte[] dec = lzw.decompress(new ByteArrayInputStream(testTrailerComp));
|
||||||
|
|
||||||
// Check it's of the right size
|
// Check it's of the right size
|
||||||
assertEquals(632, dec.length);
|
assertEquals(632, dec.length);
|
||||||
@ -159,9 +159,9 @@ public final class TestHDGFLZW extends TestCase {
|
|||||||
assertEquals(339, testTrailerComp.length);
|
assertEquals(339, testTrailerComp.length);
|
||||||
assertEquals(632, testTrailerDecomp.length);
|
assertEquals(632, testTrailerDecomp.length);
|
||||||
|
|
||||||
// Decode it using our engine
|
// decompress it using our engine
|
||||||
HDGFLZW lzw = new HDGFLZW();
|
HDGFLZW lzw = new HDGFLZW();
|
||||||
byte[] dec = lzw.decode(new ByteArrayInputStream(testTrailerComp));
|
byte[] dec = lzw.decompress(new ByteArrayInputStream(testTrailerComp));
|
||||||
|
|
||||||
// Now check it's the right data
|
// Now check it's the right data
|
||||||
assertEquals(632, dec.length);
|
assertEquals(632, dec.length);
|
||||||
@ -188,7 +188,7 @@ public final class TestHDGFLZW extends TestCase {
|
|||||||
byte[] comp = lzw.compress(new ByteArrayInputStream(sourceDecomp));
|
byte[] comp = lzw.compress(new ByteArrayInputStream(sourceDecomp));
|
||||||
|
|
||||||
// Now decompress it again
|
// Now decompress it again
|
||||||
byte[] decomp = lzw.decode(new ByteArrayInputStream(comp));
|
byte[] decomp = lzw.decompress(new ByteArrayInputStream(comp));
|
||||||
|
|
||||||
// First up, check the round tripping
|
// First up, check the round tripping
|
||||||
assertEquals(12, decomp.length);
|
assertEquals(12, decomp.length);
|
||||||
@ -223,7 +223,7 @@ public final class TestHDGFLZW extends TestCase {
|
|||||||
assertEquals(27, comp.length);
|
assertEquals(27, comp.length);
|
||||||
|
|
||||||
// Now decompress it again
|
// Now decompress it again
|
||||||
byte[] decomp = lzw.decode(new ByteArrayInputStream(comp));
|
byte[] decomp = lzw.decompress(new ByteArrayInputStream(comp));
|
||||||
|
|
||||||
// We can only check the round-tripping, as for now
|
// We can only check the round-tripping, as for now
|
||||||
// visio cheats on re-using a block
|
// visio cheats on re-using a block
|
||||||
@ -246,7 +246,7 @@ public final class TestHDGFLZW extends TestCase {
|
|||||||
byte[] comp = lzw.compress(new ByteArrayInputStream(testTrailerDecomp));
|
byte[] comp = lzw.compress(new ByteArrayInputStream(testTrailerDecomp));
|
||||||
|
|
||||||
// Now decompress it again
|
// Now decompress it again
|
||||||
byte[] decomp = lzw.decode(new ByteArrayInputStream(comp));
|
byte[] decomp = lzw.decompress(new ByteArrayInputStream(comp));
|
||||||
|
|
||||||
// for(int i=0; i<comp.length; i++) {
|
// for(int i=0; i<comp.length; i++) {
|
||||||
// System.err.println(i + "\t" + comp[i] + "\t" + testTrailerComp[i]);
|
// System.err.println(i + "\t" + comp[i] + "\t" + testTrailerComp[i]);
|
||||||
|
Loading…
Reference in New Issue
Block a user