A bit more on HDGF LZW compression, but it's still not quite complete
git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@589233 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
90b1f22b4d
commit
a1ed3f51e4
@ -170,18 +170,27 @@ public void decode(InputStream src, OutputStream res) throws IOException {
|
||||
|
||||
/**
|
||||
* Performs the Visio compatible streaming LZW compression.
|
||||
* Works by:
|
||||
* 1) ???
|
||||
* 2) ???
|
||||
* TODO - Finish
|
||||
*/
|
||||
public void compress(InputStream src, OutputStream res) throws IOException {
|
||||
Compressor c = new Compressor();
|
||||
c.compress(src, res);
|
||||
}
|
||||
|
||||
/**
|
||||
* Helper class to handle the Visio compatible
|
||||
* streaming LZW compression.
|
||||
* Need our own class to handle keeping track of the
|
||||
* code buffer, pending bytes to write out etc.
|
||||
*/
|
||||
private class Compressor {
|
||||
// We use 12 bit codes:
|
||||
// * 0-255 are real bytes
|
||||
// * 256-4095 are the substring codes
|
||||
// Java handily initialises our buffer / dictionary
|
||||
// to all zeros
|
||||
byte[] dict = new byte[4096];
|
||||
|
||||
// The next block of data to be written out, minus
|
||||
// its mask byte
|
||||
byte[] buffer = new byte[16];
|
||||
@ -190,6 +199,11 @@ public void compress(InputStream src, OutputStream res) throws IOException {
|
||||
// are two)
|
||||
int bufferLen = 0;
|
||||
|
||||
// The raw length of a code is limited to 4 bits
|
||||
byte[] rawCode = new byte[16];
|
||||
// And how much we're using
|
||||
int rawCodeLen = 0;
|
||||
|
||||
// How far through the input and output streams we are
|
||||
int posInp = 0;
|
||||
int posOut = 0;
|
||||
@ -199,6 +213,101 @@ public void compress(InputStream src, OutputStream res) throws IOException {
|
||||
// And how many bits we've already set
|
||||
int maskBitsSet = 0;
|
||||
|
||||
/**
|
||||
* Returns the last place that the bytes from rawCode are found
|
||||
* at in the buffer, or -1 if they can't be found
|
||||
*/
|
||||
private int findRawCodeInBuffer() {
|
||||
// Work our way back from the end
|
||||
// (Visio always seems to use the last possible code)
|
||||
for(int i=(buffer.length - rawCodeLen); i>=0; i--) {
|
||||
boolean matches = true;
|
||||
for(int j=0; matches && j<rawCodeLen; j++) {
|
||||
if(buffer[i] == rawCode[j]) {
|
||||
// Fits
|
||||
} else {
|
||||
// Doesn't fit, can't be a match
|
||||
matches = false;
|
||||
}
|
||||
}
|
||||
|
||||
// Was this position a match?
|
||||
if(matches) {
|
||||
return i;
|
||||
}
|
||||
}
|
||||
|
||||
// Not found
|
||||
return -1;
|
||||
}
|
||||
|
||||
/**
|
||||
* Output the compressed representation for the bytes
|
||||
* found in rawCode
|
||||
*/
|
||||
private void outputCompressed(OutputStream res) throws IOException {
|
||||
// It's not worth compressing only 1 or two bytes,
|
||||
// due to the overheads
|
||||
// So if asked, just output uncompressed
|
||||
if(rawCodeLen < 3) {
|
||||
for(int i=0; i<rawCodeLen; i++) {
|
||||
outputUncompressed(rawCode[i], res);
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
// Increment the mask bit count, we've done another code
|
||||
maskBitsSet++;
|
||||
// Add the length+code to the buffer
|
||||
// TODO
|
||||
posOut += 2;
|
||||
|
||||
// If we're now at 8 codes, output
|
||||
if(maskBitsSet == 8) {
|
||||
output8Codes(res);
|
||||
}
|
||||
}
|
||||
/**
|
||||
* Output the un-compressed byte
|
||||
*/
|
||||
private void outputUncompressed(byte b, OutputStream res) throws IOException {
|
||||
// Set the mask bit for us
|
||||
nextMask += (1<<maskBitsSet);
|
||||
|
||||
// And add us to the buffer + dictionary
|
||||
buffer[bufferLen] = fromInt(b);
|
||||
bufferLen++;
|
||||
dict[(posOut&4095)] = fromInt(b);
|
||||
posOut++;
|
||||
|
||||
// If we're now at 8 codes, output
|
||||
if(maskBitsSet == 8) {
|
||||
output8Codes(res);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* We've got 8 code worth to write out, so
|
||||
* output along with the header
|
||||
*/
|
||||
private void output8Codes(OutputStream res) throws IOException {
|
||||
// Output the mask and the data
|
||||
res.write(new byte[] { fromInt(nextMask) } );
|
||||
res.write(buffer, 0, bufferLen);
|
||||
|
||||
// Reset things
|
||||
nextMask = 0;
|
||||
maskBitsSet = 0;
|
||||
bufferLen = 0;
|
||||
}
|
||||
|
||||
/**
|
||||
* Does the compression
|
||||
*/
|
||||
private void compress(InputStream src, OutputStream res) throws IOException {
|
||||
// Have we hit the end of the file yet?
|
||||
boolean going = true;
|
||||
|
||||
// This is a byte as looked up in the dictionary
|
||||
// It needs to be signed, as it'll get passed on to
|
||||
// the output stream
|
||||
@ -207,49 +316,68 @@ public void compress(InputStream src, OutputStream res) throws IOException {
|
||||
// It needs to be unsigned, so that bit stuff works
|
||||
int dataI;
|
||||
|
||||
// Have we hit the end of the file yet?
|
||||
boolean going = true;
|
||||
|
||||
while( going ) {
|
||||
dataI = src.read();
|
||||
posInp++;
|
||||
if(dataI == -1) { going = false; }
|
||||
dataB = fromInt(dataI);
|
||||
|
||||
// Decide if we're going to output uncompressed or compressed
|
||||
// for this byte
|
||||
// (It takes 2 bytes to hold a compressed code, so it's only
|
||||
// worth doing for 3+ byte long sequences)
|
||||
// TODO
|
||||
|
||||
boolean compressThis = true;
|
||||
if(compressThis) {
|
||||
// Set the mask bit for us
|
||||
nextMask += (1<<maskBitsSet);
|
||||
|
||||
// And add us to the buffer + dictionary
|
||||
buffer[bufferLen] = fromInt(dataI);
|
||||
bufferLen++;
|
||||
dict[(posOut&4095)] = fromInt(dataI);
|
||||
posOut++;
|
||||
} else {
|
||||
// ????
|
||||
// If we've run out of data, output anything that's
|
||||
// pending then finish
|
||||
if(!going && rawCodeLen > 0) {
|
||||
outputCompressed(res);
|
||||
break;
|
||||
}
|
||||
// Increment the mask bit count, we've done another code
|
||||
maskBitsSet++;
|
||||
|
||||
// Try adding this new byte onto rawCode, and
|
||||
// see if all of that is still found in the
|
||||
// buffer dictionary or not
|
||||
rawCode[rawCodeLen] = dataB;
|
||||
rawCodeLen++;
|
||||
int rawAt = findRawCodeInBuffer();
|
||||
|
||||
// If we've just done the 8th bit, or reached the end
|
||||
// of the stream, output our mask and data
|
||||
if(maskBitsSet == 8 || !going) {
|
||||
// Output
|
||||
res.write(new byte[] { fromInt(nextMask) } );
|
||||
res.write(buffer, 0, bufferLen);
|
||||
// If we found it and are now at 16 bytes,
|
||||
// we need to output our pending code block
|
||||
if(rawCodeLen == 16 && rawAt > -1) {
|
||||
outputCompressed(res);
|
||||
rawCodeLen = 0;
|
||||
continue;
|
||||
}
|
||||
|
||||
// If we did find all of rawCode with our new
|
||||
// byte added on, we can wait to see what happens
|
||||
// with the next byte
|
||||
if(rawAt > -1) {
|
||||
continue;
|
||||
}
|
||||
|
||||
// If there was something in rawCode before, then we
|
||||
// need to output that
|
||||
rawCodeLen--;
|
||||
if(rawCodeLen > 0) {
|
||||
// Output the old rawCode
|
||||
outputCompressed(res);
|
||||
|
||||
// Reset things
|
||||
nextMask = 0;
|
||||
maskBitsSet = 0;
|
||||
bufferLen = 0;
|
||||
// Can this byte start a new rawCode, or does
|
||||
// it need outputting itself?
|
||||
rawCode[0] = dataB;
|
||||
rawCodeLen = 1;
|
||||
if(findRawCodeInBuffer() > -1) {
|
||||
// Fits in, wait for next byte
|
||||
continue;
|
||||
} else {
|
||||
// Doesn't fit, output
|
||||
outputUncompressed(dataB,res);
|
||||
rawCodeLen = 0;
|
||||
}
|
||||
} else {
|
||||
// Nothing in rawCode before, so this byte
|
||||
// isn't in the buffer dictionary
|
||||
// Output it un-compressed
|
||||
outputUncompressed(dataB,res);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
}
|
Loading…
Reference in New Issue
Block a user