A bit more on HDGF LZW compression, but it's still not quite complete
git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@589233 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
90b1f22b4d
commit
a1ed3f51e4
@ -170,18 +170,27 @@ public void decode(InputStream src, OutputStream res) throws IOException {
|
|||||||
|
|
||||||
/**
|
/**
|
||||||
* Performs the Visio compatible streaming LZW compression.
|
* Performs the Visio compatible streaming LZW compression.
|
||||||
* Works by:
|
|
||||||
* 1) ???
|
|
||||||
* 2) ???
|
|
||||||
* TODO - Finish
|
* TODO - Finish
|
||||||
*/
|
*/
|
||||||
public void compress(InputStream src, OutputStream res) throws IOException {
|
public void compress(InputStream src, OutputStream res) throws IOException {
|
||||||
|
Compressor c = new Compressor();
|
||||||
|
c.compress(src, res);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Helper class to handle the Visio compatible
|
||||||
|
* streaming LZW compression.
|
||||||
|
* Need our own class to handle keeping track of the
|
||||||
|
* code buffer, pending bytes to write out etc.
|
||||||
|
*/
|
||||||
|
private class Compressor {
|
||||||
// We use 12 bit codes:
|
// We use 12 bit codes:
|
||||||
// * 0-255 are real bytes
|
// * 0-255 are real bytes
|
||||||
// * 256-4095 are the substring codes
|
// * 256-4095 are the substring codes
|
||||||
// Java handily initialises our buffer / dictionary
|
// Java handily initialises our buffer / dictionary
|
||||||
// to all zeros
|
// to all zeros
|
||||||
byte[] dict = new byte[4096];
|
byte[] dict = new byte[4096];
|
||||||
|
|
||||||
// The next block of data to be written out, minus
|
// The next block of data to be written out, minus
|
||||||
// its mask byte
|
// its mask byte
|
||||||
byte[] buffer = new byte[16];
|
byte[] buffer = new byte[16];
|
||||||
@ -190,6 +199,11 @@ public void compress(InputStream src, OutputStream res) throws IOException {
|
|||||||
// are two)
|
// are two)
|
||||||
int bufferLen = 0;
|
int bufferLen = 0;
|
||||||
|
|
||||||
|
// The raw length of a code is limited to 4 bits
|
||||||
|
byte[] rawCode = new byte[16];
|
||||||
|
// And how much we're using
|
||||||
|
int rawCodeLen = 0;
|
||||||
|
|
||||||
// How far through the input and output streams we are
|
// How far through the input and output streams we are
|
||||||
int posInp = 0;
|
int posInp = 0;
|
||||||
int posOut = 0;
|
int posOut = 0;
|
||||||
@ -199,6 +213,101 @@ public void compress(InputStream src, OutputStream res) throws IOException {
|
|||||||
// And how many bits we've already set
|
// And how many bits we've already set
|
||||||
int maskBitsSet = 0;
|
int maskBitsSet = 0;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Returns the last place that the bytes from rawCode are found
|
||||||
|
* at in the buffer, or -1 if they can't be found
|
||||||
|
*/
|
||||||
|
private int findRawCodeInBuffer() {
|
||||||
|
// Work our way back from the end
|
||||||
|
// (Visio always seems to use the last possible code)
|
||||||
|
for(int i=(buffer.length - rawCodeLen); i>=0; i--) {
|
||||||
|
boolean matches = true;
|
||||||
|
for(int j=0; matches && j<rawCodeLen; j++) {
|
||||||
|
if(buffer[i] == rawCode[j]) {
|
||||||
|
// Fits
|
||||||
|
} else {
|
||||||
|
// Doesn't fit, can't be a match
|
||||||
|
matches = false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Was this position a match?
|
||||||
|
if(matches) {
|
||||||
|
return i;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Not found
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Output the compressed representation for the bytes
|
||||||
|
* found in rawCode
|
||||||
|
*/
|
||||||
|
private void outputCompressed(OutputStream res) throws IOException {
|
||||||
|
// It's not worth compressing only 1 or two bytes,
|
||||||
|
// due to the overheads
|
||||||
|
// So if asked, just output uncompressed
|
||||||
|
if(rawCodeLen < 3) {
|
||||||
|
for(int i=0; i<rawCodeLen; i++) {
|
||||||
|
outputUncompressed(rawCode[i], res);
|
||||||
|
}
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Increment the mask bit count, we've done another code
|
||||||
|
maskBitsSet++;
|
||||||
|
// Add the length+code to the buffer
|
||||||
|
// TODO
|
||||||
|
posOut += 2;
|
||||||
|
|
||||||
|
// If we're now at 8 codes, output
|
||||||
|
if(maskBitsSet == 8) {
|
||||||
|
output8Codes(res);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
/**
|
||||||
|
* Output the un-compressed byte
|
||||||
|
*/
|
||||||
|
private void outputUncompressed(byte b, OutputStream res) throws IOException {
|
||||||
|
// Set the mask bit for us
|
||||||
|
nextMask += (1<<maskBitsSet);
|
||||||
|
|
||||||
|
// And add us to the buffer + dictionary
|
||||||
|
buffer[bufferLen] = fromInt(b);
|
||||||
|
bufferLen++;
|
||||||
|
dict[(posOut&4095)] = fromInt(b);
|
||||||
|
posOut++;
|
||||||
|
|
||||||
|
// If we're now at 8 codes, output
|
||||||
|
if(maskBitsSet == 8) {
|
||||||
|
output8Codes(res);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* We've got 8 code worth to write out, so
|
||||||
|
* output along with the header
|
||||||
|
*/
|
||||||
|
private void output8Codes(OutputStream res) throws IOException {
|
||||||
|
// Output the mask and the data
|
||||||
|
res.write(new byte[] { fromInt(nextMask) } );
|
||||||
|
res.write(buffer, 0, bufferLen);
|
||||||
|
|
||||||
|
// Reset things
|
||||||
|
nextMask = 0;
|
||||||
|
maskBitsSet = 0;
|
||||||
|
bufferLen = 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Does the compression
|
||||||
|
*/
|
||||||
|
private void compress(InputStream src, OutputStream res) throws IOException {
|
||||||
|
// Have we hit the end of the file yet?
|
||||||
|
boolean going = true;
|
||||||
|
|
||||||
// This is a byte as looked up in the dictionary
|
// This is a byte as looked up in the dictionary
|
||||||
// It needs to be signed, as it'll get passed on to
|
// It needs to be signed, as it'll get passed on to
|
||||||
// the output stream
|
// the output stream
|
||||||
@ -207,49 +316,68 @@ public void compress(InputStream src, OutputStream res) throws IOException {
|
|||||||
// It needs to be unsigned, so that bit stuff works
|
// It needs to be unsigned, so that bit stuff works
|
||||||
int dataI;
|
int dataI;
|
||||||
|
|
||||||
// Have we hit the end of the file yet?
|
|
||||||
boolean going = true;
|
|
||||||
|
|
||||||
while( going ) {
|
while( going ) {
|
||||||
dataI = src.read();
|
dataI = src.read();
|
||||||
posInp++;
|
posInp++;
|
||||||
if(dataI == -1) { going = false; }
|
if(dataI == -1) { going = false; }
|
||||||
|
dataB = fromInt(dataI);
|
||||||
|
|
||||||
// Decide if we're going to output uncompressed or compressed
|
// If we've run out of data, output anything that's
|
||||||
// for this byte
|
// pending then finish
|
||||||
// (It takes 2 bytes to hold a compressed code, so it's only
|
if(!going && rawCodeLen > 0) {
|
||||||
// worth doing for 3+ byte long sequences)
|
outputCompressed(res);
|
||||||
// TODO
|
break;
|
||||||
|
|
||||||
boolean compressThis = true;
|
|
||||||
if(compressThis) {
|
|
||||||
// Set the mask bit for us
|
|
||||||
nextMask += (1<<maskBitsSet);
|
|
||||||
|
|
||||||
// And add us to the buffer + dictionary
|
|
||||||
buffer[bufferLen] = fromInt(dataI);
|
|
||||||
bufferLen++;
|
|
||||||
dict[(posOut&4095)] = fromInt(dataI);
|
|
||||||
posOut++;
|
|
||||||
} else {
|
|
||||||
// ????
|
|
||||||
}
|
}
|
||||||
// Increment the mask bit count, we've done another code
|
|
||||||
maskBitsSet++;
|
|
||||||
|
|
||||||
// If we've just done the 8th bit, or reached the end
|
// Try adding this new byte onto rawCode, and
|
||||||
// of the stream, output our mask and data
|
// see if all of that is still found in the
|
||||||
if(maskBitsSet == 8 || !going) {
|
// buffer dictionary or not
|
||||||
// Output
|
rawCode[rawCodeLen] = dataB;
|
||||||
res.write(new byte[] { fromInt(nextMask) } );
|
rawCodeLen++;
|
||||||
res.write(buffer, 0, bufferLen);
|
int rawAt = findRawCodeInBuffer();
|
||||||
|
|
||||||
// Reset things
|
// If we found it and are now at 16 bytes,
|
||||||
nextMask = 0;
|
// we need to output our pending code block
|
||||||
maskBitsSet = 0;
|
if(rawCodeLen == 16 && rawAt > -1) {
|
||||||
bufferLen = 0;
|
outputCompressed(res);
|
||||||
|
rawCodeLen = 0;
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
// If we did find all of rawCode with our new
|
||||||
|
// byte added on, we can wait to see what happens
|
||||||
|
// with the next byte
|
||||||
|
if(rawAt > -1) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
// If there was something in rawCode before, then we
|
||||||
|
// need to output that
|
||||||
|
rawCodeLen--;
|
||||||
|
if(rawCodeLen > 0) {
|
||||||
|
// Output the old rawCode
|
||||||
|
outputCompressed(res);
|
||||||
|
|
||||||
|
// Can this byte start a new rawCode, or does
|
||||||
|
// it need outputting itself?
|
||||||
|
rawCode[0] = dataB;
|
||||||
|
rawCodeLen = 1;
|
||||||
|
if(findRawCodeInBuffer() > -1) {
|
||||||
|
// Fits in, wait for next byte
|
||||||
|
continue;
|
||||||
|
} else {
|
||||||
|
// Doesn't fit, output
|
||||||
|
outputUncompressed(dataB,res);
|
||||||
|
rawCodeLen = 0;
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
// Nothing in rawCode before, so this byte
|
||||||
|
// isn't in the buffer dictionary
|
||||||
|
// Output it un-compressed
|
||||||
|
outputUncompressed(dataB,res);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
Loading…
Reference in New Issue
Block a user