A bit more on HDGF LZW compression, but it's still not quite complete

git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@589233 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Nick Burch 2007-10-27 22:50:41 +00:00
parent 90b1f22b4d
commit a1ed3f51e4

View File

@ -170,18 +170,27 @@ public void decode(InputStream src, OutputStream res) throws IOException {
/**
* Performs the Visio compatible streaming LZW compression.
* Works by:
* 1) ???
* 2) ???
* TODO - Finish
*/
public void compress(InputStream src, OutputStream res) throws IOException {
Compressor c = new Compressor();
c.compress(src, res);
}
/**
* Helper class to handle the Visio compatible
* streaming LZW compression.
* Need our own class to handle keeping track of the
* code buffer, pending bytes to write out etc.
*/
private class Compressor {
// We use 12 bit codes:
// * 0-255 are real bytes
// * 256-4095 are the substring codes
// Java handily initialises our buffer / dictionary
// to all zeros
byte[] dict = new byte[4096];
// The next block of data to be written out, minus
// its mask byte
byte[] buffer = new byte[16];
@ -190,6 +199,11 @@ public void compress(InputStream src, OutputStream res) throws IOException {
// are two)
int bufferLen = 0;
// The raw length of a code is limited to 4 bits
byte[] rawCode = new byte[16];
// And how much we're using
int rawCodeLen = 0;
// How far through the input and output streams we are
int posInp = 0;
int posOut = 0;
@ -199,6 +213,101 @@ public void compress(InputStream src, OutputStream res) throws IOException {
// And how many bits we've already set
int maskBitsSet = 0;
/**
* Returns the last place that the bytes from rawCode are found
* at in the buffer, or -1 if they can't be found
*/
private int findRawCodeInBuffer() {
// Work our way back from the end
// (Visio always seems to use the last possible code)
for(int i=(buffer.length - rawCodeLen); i>=0; i--) {
boolean matches = true;
for(int j=0; matches && j<rawCodeLen; j++) {
if(buffer[i] == rawCode[j]) {
// Fits
} else {
// Doesn't fit, can't be a match
matches = false;
}
}
// Was this position a match?
if(matches) {
return i;
}
}
// Not found
return -1;
}
/**
* Output the compressed representation for the bytes
* found in rawCode
*/
private void outputCompressed(OutputStream res) throws IOException {
// It's not worth compressing only 1 or two bytes,
// due to the overheads
// So if asked, just output uncompressed
if(rawCodeLen < 3) {
for(int i=0; i<rawCodeLen; i++) {
outputUncompressed(rawCode[i], res);
}
return;
}
// Increment the mask bit count, we've done another code
maskBitsSet++;
// Add the length+code to the buffer
// TODO
posOut += 2;
// If we're now at 8 codes, output
if(maskBitsSet == 8) {
output8Codes(res);
}
}
/**
* Output the un-compressed byte
*/
private void outputUncompressed(byte b, OutputStream res) throws IOException {
// Set the mask bit for us
nextMask += (1<<maskBitsSet);
// And add us to the buffer + dictionary
buffer[bufferLen] = fromInt(b);
bufferLen++;
dict[(posOut&4095)] = fromInt(b);
posOut++;
// If we're now at 8 codes, output
if(maskBitsSet == 8) {
output8Codes(res);
}
}
/**
* We've got 8 code worth to write out, so
* output along with the header
*/
private void output8Codes(OutputStream res) throws IOException {
// Output the mask and the data
res.write(new byte[] { fromInt(nextMask) } );
res.write(buffer, 0, bufferLen);
// Reset things
nextMask = 0;
maskBitsSet = 0;
bufferLen = 0;
}
/**
* Does the compression
*/
private void compress(InputStream src, OutputStream res) throws IOException {
// Have we hit the end of the file yet?
boolean going = true;
// This is a byte as looked up in the dictionary
// It needs to be signed, as it'll get passed on to
// the output stream
@ -207,49 +316,68 @@ public void compress(InputStream src, OutputStream res) throws IOException {
// It needs to be unsigned, so that bit stuff works
int dataI;
// Have we hit the end of the file yet?
boolean going = true;
while( going ) {
dataI = src.read();
posInp++;
if(dataI == -1) { going = false; }
dataB = fromInt(dataI);
// Decide if we're going to output uncompressed or compressed
// for this byte
// (It takes 2 bytes to hold a compressed code, so it's only
// worth doing for 3+ byte long sequences)
// TODO
boolean compressThis = true;
if(compressThis) {
// Set the mask bit for us
nextMask += (1<<maskBitsSet);
// And add us to the buffer + dictionary
buffer[bufferLen] = fromInt(dataI);
bufferLen++;
dict[(posOut&4095)] = fromInt(dataI);
posOut++;
} else {
// ????
// If we've run out of data, output anything that's
// pending then finish
if(!going && rawCodeLen > 0) {
outputCompressed(res);
break;
}
// Increment the mask bit count, we've done another code
maskBitsSet++;
// Try adding this new byte onto rawCode, and
// see if all of that is still found in the
// buffer dictionary or not
rawCode[rawCodeLen] = dataB;
rawCodeLen++;
int rawAt = findRawCodeInBuffer();
// If we've just done the 8th bit, or reached the end
// of the stream, output our mask and data
if(maskBitsSet == 8 || !going) {
// Output
res.write(new byte[] { fromInt(nextMask) } );
res.write(buffer, 0, bufferLen);
// If we found it and are now at 16 bytes,
// we need to output our pending code block
if(rawCodeLen == 16 && rawAt > -1) {
outputCompressed(res);
rawCodeLen = 0;
continue;
}
// If we did find all of rawCode with our new
// byte added on, we can wait to see what happens
// with the next byte
if(rawAt > -1) {
continue;
}
// If there was something in rawCode before, then we
// need to output that
rawCodeLen--;
if(rawCodeLen > 0) {
// Output the old rawCode
outputCompressed(res);
// Reset things
nextMask = 0;
maskBitsSet = 0;
bufferLen = 0;
// Can this byte start a new rawCode, or does
// it need outputting itself?
rawCode[0] = dataB;
rawCodeLen = 1;
if(findRawCodeInBuffer() > -1) {
// Fits in, wait for next byte
continue;
} else {
// Doesn't fit, output
outputUncompressed(dataB,res);
rawCodeLen = 0;
}
} else {
// Nothing in rawCode before, so this byte
// isn't in the buffer dictionary
// Output it un-compressed
outputUncompressed(dataB,res);
}
}
}
}
}