A bit more on HDGF LZW compression, but it's still not quite complete

git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@589233 13f79535-47bb-0310-9956-ffa450edef68
2007-10-27 22:50:41 +00:00 · 2007-10-27 22:50:41 +00:00 · a1ed3f51e4
commit a1ed3f51e4
parent 90b1f22b4d
1 changed files with 164 additions and 36 deletions
--- a/src/scratchpad/src/org/apache/poi/hdgf/HDGFLZW.java
+++ b/src/scratchpad/src/org/apache/poi/hdgf/HDGFLZW.java
@ -170,18 +170,27 @@ public void decode(InputStream src, OutputStream res) throws IOException {
 /**
 * Performs the Visio compatible streaming LZW compression.
 * Works by:
 * 1) ???
 * 2) ???
 * TODO - Finish
 */
 public void compress(InputStream src, OutputStream res) throws IOException {
 	Compressor c = new Compressor();
 	c.compress(src, res);
 }
 /**
 * Helper class to handle the Visio compatible
 *  streaming LZW compression.
 * Need our own class to handle keeping track of the
 *  code buffer, pending bytes to write out etc.
 */
 private class Compressor {
 	// We use 12 bit codes:
 	// * 0-255 are real bytes
 	// * 256-4095 are the substring codes
 	// Java handily initialises our buffer / dictionary
 	//  to all zeros
 	byte[] dict = new byte[4096];
 	// The next block of data to be written out, minus
 	//  its mask byte
 	byte[] buffer = new byte[16];
@ -190,6 +199,11 @@ public void compress(InputStream src, OutputStream res) throws IOException {
 	//   are two)
 	int bufferLen = 0;
 	// The raw length of a code is limited to 4 bits
 	byte[] rawCode = new byte[16];
 	// And how much we're using
 	int rawCodeLen = 0;
 	// How far through the input and output streams we are
 	int posInp = 0;
 	int posOut = 0;
@ -199,6 +213,101 @@ public void compress(InputStream src, OutputStream res) throws IOException {
 	// And how many bits we've already set
 	int maskBitsSet = 0;
 /**
 * Returns the last place that the bytes from rawCode are found
 *  at in the buffer, or -1 if they can't be found
 */
 private int findRawCodeInBuffer() {
 	// Work our way back from the end
 	// (Visio always seems to use the last possible code)
 	for(int i=(buffer.length - rawCodeLen); i>=0; i--) {
 		boolean matches = true;
 		for(int j=0; matches && j<rawCodeLen; j++) {
 			if(buffer[i] == rawCode[j]) {
 				// Fits
 			} else {
 				// Doesn't fit, can't be a match
 				matches = false;
 			}
 		}
 		// Was this position a match?
 		if(matches) {
 			return i;
 		}
 	}
 	// Not found
 	return -1;
 }
 /**
 * Output the compressed representation for the bytes
 *  found in rawCode
 */
 private void outputCompressed(OutputStream res) throws IOException {
 	// It's not worth compressing only 1 or two bytes,
 	//  due to the overheads
 	// So if asked, just output uncompressed
 	if(rawCodeLen < 3) {
 		for(int i=0; i<rawCodeLen; i++) {
 			outputUncompressed(rawCode[i], res);
 		}
 		return;
 	}
 	// Increment the mask bit count, we've done another code
 	maskBitsSet++;
 	// Add the length+code to the buffer
 	// TODO
 	posOut += 2;
 	// If we're now at 8 codes, output
 	if(maskBitsSet == 8) {
 		output8Codes(res);
 	}
 }
 /**
 * Output the un-compressed byte
 */
 private void outputUncompressed(byte b, OutputStream res) throws IOException {
 	// Set the mask bit for us 
 	nextMask += (1<<maskBitsSet);
 	// And add us to the buffer + dictionary
 	buffer[bufferLen] = fromInt(b);
 	bufferLen++;
 	dict[(posOut&4095)] = fromInt(b);
 	posOut++;
 	// If we're now at 8 codes, output
 	if(maskBitsSet == 8) {
 		output8Codes(res);
 	}
 }
 /**
 * We've got 8 code worth to write out, so
 *  output along with the header
 */
 private void output8Codes(OutputStream res) throws IOException {
 	// Output the mask and the data
 	res.write(new byte[] { fromInt(nextMask) } );
 	res.write(buffer, 0, bufferLen);
 	// Reset things
 	nextMask = 0;
 	maskBitsSet = 0;
 	bufferLen = 0;
 }
 /**
 * Does the compression
 */
 private void compress(InputStream src, OutputStream res) throws IOException {
 	// Have we hit the end of the file yet?
 	boolean going = true;
 	// This is a byte as looked up in the dictionary
 	// It needs to be signed, as it'll get passed on to
 	//  the output stream
@ -207,49 +316,68 @@ public void compress(InputStream src, OutputStream res) throws IOException {
 	// It needs to be unsigned, so that bit stuff works
 	int dataI;
 	// Have we hit the end of the file yet?
 	boolean going = true;
 	while( going ) {
 		dataI = src.read();
 		posInp++;
 		if(dataI == -1) { going = false; }
 		dataB = fromInt(dataI);
-		// Decide if we're going to output uncompressed or compressed
+		// If we've run out of data, output anything that's
-		//  for this byte
+		//  pending then finish
-		// (It takes 2 bytes to hold a compressed code, so it's only
+		if(!going && rawCodeLen > 0) {
-		//  worth doing for 3+ byte long sequences)
+			outputCompressed(res);
-		// TODO
+			break;
 		boolean compressThis = true;
 		if(compressThis) {
 			// Set the mask bit for us 
 			nextMask += (1<<maskBitsSet);
 			// And add us to the buffer + dictionary
 			buffer[bufferLen] = fromInt(dataI);
 			bufferLen++;
 			dict[(posOut&4095)] = fromInt(dataI);
 			posOut++;
 		} else {
 			// ????
 		}
 		// Increment the mask bit count, we've done another code
 		maskBitsSet++;
-		// If we've just done the 8th bit, or reached the end
+		// Try adding this new byte onto rawCode, and
-		//  of the stream, output our mask and data
+		//  see if all of that is still found in the
-		if(maskBitsSet == 8 || !going) {
+		//  buffer dictionary or not
-			// Output
+		rawCode[rawCodeLen] = dataB;
-			res.write(new byte[] { fromInt(nextMask) } );
+		rawCodeLen++;
-			res.write(buffer, 0, bufferLen);
+		int rawAt = findRawCodeInBuffer();
-			// Reset things
+		// If we found it and are now at 16 bytes,
-			nextMask = 0;
+		//  we need to output our pending code block
-			maskBitsSet = 0;
+		if(rawCodeLen == 16 && rawAt > -1) {
-			bufferLen = 0;
+			outputCompressed(res);
 			rawCodeLen = 0;
 			continue;
 		}
 		// If we did find all of rawCode with our new
 		//  byte added on, we can wait to see what happens
 		//  with the next byte
 		if(rawAt > -1) {
 			continue;
 		}
 		// If there was something in rawCode before, then we
 		//  need to output that
 		rawCodeLen--;
 		if(rawCodeLen > 0) {
 			// Output the old rawCode
 			outputCompressed(res);
 			// Can this byte start a new rawCode, or does
 			//  it need outputting itself?
 			rawCode[0] = dataB;
 			rawCodeLen = 1;
 			if(findRawCodeInBuffer() > -1) {
 				// Fits in, wait for next byte
 				continue;
 			} else {
 				// Doesn't fit, output
 				outputUncompressed(dataB,res);
 				rawCodeLen = 0;
 			}
 		} else {
 			// Nothing in rawCode before, so this byte
 			//  isn't in the buffer dictionary
 			// Output it un-compressed
 			outputUncompressed(dataB,res);
 		}
 	}
 }
 }
 }