A bit more on HDGF LZW compression, but it's still not quite complete

git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@589233 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Nick Burch 2007-10-27 22:50:41 +00:00
parent 90b1f22b4d
commit a1ed3f51e4

View File

@ -170,18 +170,27 @@ public void decode(InputStream src, OutputStream res) throws IOException {
/** /**
* Performs the Visio compatible streaming LZW compression. * Performs the Visio compatible streaming LZW compression.
* Works by:
* 1) ???
* 2) ???
* TODO - Finish * TODO - Finish
*/ */
public void compress(InputStream src, OutputStream res) throws IOException { public void compress(InputStream src, OutputStream res) throws IOException {
Compressor c = new Compressor();
c.compress(src, res);
}
/**
* Helper class to handle the Visio compatible
* streaming LZW compression.
* Need our own class to handle keeping track of the
* code buffer, pending bytes to write out etc.
*/
private class Compressor {
// We use 12 bit codes: // We use 12 bit codes:
// * 0-255 are real bytes // * 0-255 are real bytes
// * 256-4095 are the substring codes // * 256-4095 are the substring codes
// Java handily initialises our buffer / dictionary // Java handily initialises our buffer / dictionary
// to all zeros // to all zeros
byte[] dict = new byte[4096]; byte[] dict = new byte[4096];
// The next block of data to be written out, minus // The next block of data to be written out, minus
// its mask byte // its mask byte
byte[] buffer = new byte[16]; byte[] buffer = new byte[16];
@ -190,6 +199,11 @@ public void compress(InputStream src, OutputStream res) throws IOException {
// are two) // are two)
int bufferLen = 0; int bufferLen = 0;
// The raw length of a code is limited to 4 bits
byte[] rawCode = new byte[16];
// And how much we're using
int rawCodeLen = 0;
// How far through the input and output streams we are // How far through the input and output streams we are
int posInp = 0; int posInp = 0;
int posOut = 0; int posOut = 0;
@ -199,48 +213,85 @@ public void compress(InputStream src, OutputStream res) throws IOException {
// And how many bits we've already set // And how many bits we've already set
int maskBitsSet = 0; int maskBitsSet = 0;
// This is a byte as looked up in the dictionary /**
// It needs to be signed, as it'll get passed on to * Returns the last place that the bytes from rawCode are found
// the output stream * at in the buffer, or -1 if they can't be found
byte dataB; */
// This is an unsigned byte read from the stream private int findRawCodeInBuffer() {
// It needs to be unsigned, so that bit stuff works // Work our way back from the end
int dataI; // (Visio always seems to use the last possible code)
for(int i=(buffer.length - rawCodeLen); i>=0; i--) {
boolean matches = true;
for(int j=0; matches && j<rawCodeLen; j++) {
if(buffer[i] == rawCode[j]) {
// Fits
} else {
// Doesn't fit, can't be a match
matches = false;
}
}
// Have we hit the end of the file yet? // Was this position a match?
boolean going = true; if(matches) {
return i;
}
}
while( going ) { // Not found
dataI = src.read(); return -1;
posInp++; }
if(dataI == -1) { going = false; }
// Decide if we're going to output uncompressed or compressed /**
// for this byte * Output the compressed representation for the bytes
// (It takes 2 bytes to hold a compressed code, so it's only * found in rawCode
// worth doing for 3+ byte long sequences) */
private void outputCompressed(OutputStream res) throws IOException {
// It's not worth compressing only 1 or two bytes,
// due to the overheads
// So if asked, just output uncompressed
if(rawCodeLen < 3) {
for(int i=0; i<rawCodeLen; i++) {
outputUncompressed(rawCode[i], res);
}
return;
}
// Increment the mask bit count, we've done another code
maskBitsSet++;
// Add the length+code to the buffer
// TODO // TODO
posOut += 2;
boolean compressThis = true; // If we're now at 8 codes, output
if(compressThis) { if(maskBitsSet == 8) {
output8Codes(res);
}
}
/**
* Output the un-compressed byte
*/
private void outputUncompressed(byte b, OutputStream res) throws IOException {
// Set the mask bit for us // Set the mask bit for us
nextMask += (1<<maskBitsSet); nextMask += (1<<maskBitsSet);
// And add us to the buffer + dictionary // And add us to the buffer + dictionary
buffer[bufferLen] = fromInt(dataI); buffer[bufferLen] = fromInt(b);
bufferLen++; bufferLen++;
dict[(posOut&4095)] = fromInt(dataI); dict[(posOut&4095)] = fromInt(b);
posOut++; posOut++;
} else {
// ????
}
// Increment the mask bit count, we've done another code
maskBitsSet++;
// If we've just done the 8th bit, or reached the end // If we're now at 8 codes, output
// of the stream, output our mask and data if(maskBitsSet == 8) {
if(maskBitsSet == 8 || !going) { output8Codes(res);
// Output }
}
/**
* We've got 8 code worth to write out, so
* output along with the header
*/
private void output8Codes(OutputStream res) throws IOException {
// Output the mask and the data
res.write(new byte[] { fromInt(nextMask) } ); res.write(new byte[] { fromInt(nextMask) } );
res.write(buffer, 0, bufferLen); res.write(buffer, 0, bufferLen);
@ -249,6 +300,83 @@ public void compress(InputStream src, OutputStream res) throws IOException {
maskBitsSet = 0; maskBitsSet = 0;
bufferLen = 0; bufferLen = 0;
} }
/**
* Does the compression
*/
private void compress(InputStream src, OutputStream res) throws IOException {
// Have we hit the end of the file yet?
boolean going = true;
// This is a byte as looked up in the dictionary
// It needs to be signed, as it'll get passed on to
// the output stream
byte dataB;
// This is an unsigned byte read from the stream
// It needs to be unsigned, so that bit stuff works
int dataI;
while( going ) {
dataI = src.read();
posInp++;
if(dataI == -1) { going = false; }
dataB = fromInt(dataI);
// If we've run out of data, output anything that's
// pending then finish
if(!going && rawCodeLen > 0) {
outputCompressed(res);
break;
}
// Try adding this new byte onto rawCode, and
// see if all of that is still found in the
// buffer dictionary or not
rawCode[rawCodeLen] = dataB;
rawCodeLen++;
int rawAt = findRawCodeInBuffer();
// If we found it and are now at 16 bytes,
// we need to output our pending code block
if(rawCodeLen == 16 && rawAt > -1) {
outputCompressed(res);
rawCodeLen = 0;
continue;
}
// If we did find all of rawCode with our new
// byte added on, we can wait to see what happens
// with the next byte
if(rawAt > -1) {
continue;
}
// If there was something in rawCode before, then we
// need to output that
rawCodeLen--;
if(rawCodeLen > 0) {
// Output the old rawCode
outputCompressed(res);
// Can this byte start a new rawCode, or does
// it need outputting itself?
rawCode[0] = dataB;
rawCodeLen = 1;
if(findRawCodeInBuffer() > -1) {
// Fits in, wait for next byte
continue;
} else {
// Doesn't fit, output
outputUncompressed(dataB,res);
rawCodeLen = 0;
}
} else {
// Nothing in rawCode before, so this byte
// isn't in the buffer dictionary
// Output it un-compressed
outputUncompressed(dataB,res);
}
}
} }
} }