A bit more on HDGF LZW compression, but it's still not quite complete

git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@589233 13f79535-47bb-0310-9956-ffa450edef68
2007-10-27 22:50:41 +00:00 · 2007-10-27 22:50:41 +00:00 · a1ed3f51e4
commit a1ed3f51e4
parent 90b1f22b4d
1 changed file with 164 additions and 36 deletions
--- a/src/scratchpad/src/org/apache/poi/hdgf/HDGFLZW.java
+++ b/src/scratchpad/src/org/apache/poi/hdgf/HDGFLZW.java
@ -170,18 +170,27 @@ public void decode(InputStream src, OutputStream res) throws IOException {
 /**
  * Compresses everything read from <code>src</code> into
  *  <code>res</code>, using the Visio compatible streaming LZW
  *  compression.
  * A new Compressor is created for each call and is handed both
  *  streams to do the actual work.
  * TODO - Describe the individual steps once the algorithm is finished
  */
 public void compress(InputStream src, OutputStream res) throws IOException {
 	new Compressor().compress(src, res);
 }
 /**
 * Helper class to handle the Visio compatible
 *  streaming LZW compression.
 * Need our own class to handle keeping track of the
 *  code buffer, pending bytes to write out etc.
 */
 private class Compressor {
 	// We use 12 bit codes:
 	// * 0-255 are real bytes
 	// * 256-4095 are the substring codes
 	// Java handily initialises our buffer / dictionary
 	//  to all zeros
 	byte[] dict = new byte[4096];
 	// The next block of data to be written out, minus
 	//  its mask byte
 	byte[] buffer = new byte[16];
@ -190,6 +199,11 @@ public void compress(InputStream src, OutputStream res) throws IOException {
 	//   are two)
 	int bufferLen = 0;
 	// The raw length of a code is limited to 4 bits
 	byte[] rawCode = new byte[16];
 	// And how much we're using
 	int rawCodeLen = 0;
 	// How far through the input and output streams we are
 	int posInp = 0;
 	int posOut = 0;
@ -199,48 +213,85 @@ public void compress(InputStream src, OutputStream res) throws IOException {
 	// And how many bits we've already set
 	int maskBitsSet = 0;
-	// This is a byte as looked up in the dictionary
+/**
-	// It needs to be signed, as it'll get passed on to
+ * Returns the last place that the bytes from rawCode are found
-	//  the output stream
+ *  at in the buffer, or -1 if they can't be found
-	byte dataB;
+ */
-	// This is an unsigned byte read from the stream
+private int findRawCodeInBuffer() {
-	// It needs to be unsigned, so that bit stuff works
+	// Work our way back from the end
-	int dataI;
+	// (Visio always seems to use the last possible code)
 	for(int i=(buffer.length - rawCodeLen); i>=0; i--) {
 		boolean matches = true;
 		for(int j=0; matches && j<rawCodeLen; j++) {
 			if(buffer[i] == rawCode[j]) {
 				// Fits
 			} else {
 				// Doesn't fit, can't be a match
 				matches = false;
 			}
 		}
-	// Have we hit the end of the file yet?
+		// Was this position a match?
-	boolean going = true;
+		if(matches) {
 			return i;
 		}
 	}
-	while( going ) {
+	// Not found
-		dataI = src.read();
+	return -1;
-		posInp++;
+}
 		if(dataI == -1) { going = false; }
-		// Decide if we're going to output uncompressed or compressed
+/**
-		//  for this byte
+ * Output the compressed representation for the bytes
-		// (It takes 2 bytes to hold a compressed code, so it's only
+ *  found in rawCode
-		//  worth doing for 3+ byte long sequences)
+ */
 private void outputCompressed(OutputStream res) throws IOException {
 	// Handles the rawCodeLen bytes currently held in rawCode: short
 	//  runs are passed on one byte at a time to outputUncompressed,
 	//  longer ones are counted as a single code in the current block.
 	// Any IOException from writing to res is passed up to the caller.
 	// It's not worth compressing only 1 or two bytes,
 	//  due to the overheads
 	// So if asked, just output uncompressed
 	if(rawCodeLen < 3) {
 		for(int i=0; i<rawCodeLen; i++) {
 			outputUncompressed(rawCode[i], res);
 		}
 		return;
 	}
 	// Increment the mask bit count, we've done another code
 	// (nextMask isn't touched in this method, so the bit for this
 	//  code stays clear; outputUncompressed is where bits get set)
 	maskBitsSet++;
 	// Add the length+code to the buffer
 	// TODO
 	// NOTE(review): until the TODO above is done, nothing is added
 	//  to buffer and bufferLen doesn't change, but posOut still
 	//  moves on by 2 - confirm the right amount when implementing
 	posOut += 2;
-		boolean compressThis = true;
+	// If we're now at 8 codes, output
-		if(compressThis) {
+	if(maskBitsSet == 8) {
 		output8Codes(res);
 	}
 }
 /**
 * Output the un-compressed byte
 */
 private void outputUncompressed(byte b, OutputStream res) throws IOException {
 	// Queues the single byte b, as-is, in the block being built up,
 	//  and writes the block out to res once it holds 8 codes.
 	// Any IOException from writing to res is passed up to the caller.
 	// Set the mask bit for us 
 	// (the bit used is the one at index maskBitsSet, ie the number
 	//  of codes already queued in this block)
 	nextMask += (1<<maskBitsSet);
 	// And add us to the buffer + dictionary
-			buffer[bufferLen] = fromInt(dataI);
+	buffer[bufferLen] = fromInt(b);
 	bufferLen++;
 	// The &4095 wraps the output position around, so that the
 	//  index always lands inside the 4096 entry dictionary
-			dict[(posOut&4095)] = fromInt(dataI);
+	dict[(posOut&4095)] = fromInt(b);
 	posOut++;
 		} else {
 			// ????
 		}
 		// Increment the mask bit count, we've done another code
 		maskBitsSet++;
-		// If we've just done the 8th bit, or reached the end
+	// If we're now at 8 codes, output
-		//  of the stream, output our mask and data
+	if(maskBitsSet == 8) {
-		if(maskBitsSet == 8 || !going) {
+		output8Codes(res);
-			// Output
+	}
 }
 /**
 * We've got 8 codes' worth to write out, so
 *  output along with the header
 */
 private void output8Codes(OutputStream res) throws IOException {
 	// Output the mask and the data
 	res.write(new byte[] { fromInt(nextMask) } );
 	res.write(buffer, 0, bufferLen);
@ -249,6 +300,83 @@ public void compress(InputStream src, OutputStream res) throws IOException {
 	maskBitsSet = 0;
 	bufferLen = 0;
 }
 /**
  * Does the compression.
  * Reads src a byte at a time until the end of the stream. Each
  *  byte is added onto rawCode for as long as findRawCodeInBuffer()
  *  can still find the result; when it can't, the previous rawCode
  *  goes to outputCompressed() and any byte that can't start a new
  *  rawCode goes to outputUncompressed().
  * Once the input runs out, whatever is still pending in rawCode
  *  is output, and a final block holding fewer than 8 codes is
  *  written out too.
  * @param src the stream to read the raw bytes from
  * @param res the stream to write the compressed data to
  * @throws IOException if reading from src or writing to res fails
  */
 private void compress(InputStream src, OutputStream res) throws IOException {
 	// This is an unsigned byte read from the stream
 	// It needs to be unsigned, so that bit stuff works
 	int dataI;

 	// read() hands back -1 at the end of the stream. That's a
 	//  marker, not data, so it must never get as far as rawCode
 	//  or the output (it would turn up there as a stray 0xFF)
 	while( (dataI = src.read()) != -1 ) {
 		// Only real bytes count towards the input position
 		posInp++;

 		// The same byte, but signed, as that's what rawCode and
 		//  the output stream want
 		byte dataB = fromInt(dataI);

 		// Try adding this new byte onto rawCode, and
 		//  see if all of that is still found in the
 		//  buffer dictionary or not
 		rawCode[rawCodeLen] = dataB;
 		rawCodeLen++;
 		int rawAt = findRawCodeInBuffer();

 		if(rawAt > -1) {
 			// Still found. If we're now at 16 bytes then
 			//  rawCode is full, so output our pending code
 			//  block; otherwise wait to see what happens
 			//  with the next byte
 			if(rawCodeLen == 16) {
 				outputCompressed(res);
 				rawCodeLen = 0;
 			}
 			continue;
 		}

 		// Not found with the new byte on the end, so take
 		//  it back off again
 		rawCodeLen--;

 		if(rawCodeLen > 0) {
 			// There was something in rawCode before, which
 			//  needs to be output
 			outputCompressed(res);

 			// Can this byte start a new rawCode, or does
 			//  it need outputting itself?
 			rawCode[0] = dataB;
 			rawCodeLen = 1;
 			if(findRawCodeInBuffer() > -1) {
 				// Fits in, wait for next byte
 				continue;
 			}
 			// Doesn't fit, so nothing is pending any more
 			rawCodeLen = 0;
 		}

 		// This byte isn't in the buffer dictionary
 		// Output it un-compressed
 		outputUncompressed(dataB,res);
 	}

 	// We've run out of data, so output anything that's
 	//  still pending in rawCode
 	if(rawCodeLen > 0) {
 		outputCompressed(res);
 		rawCodeLen = 0;
 	}

 	// Codes only get written out 8 at a time above, so a final
 	//  block with fewer than 8 in it would otherwise be lost
 	if(maskBitsSet > 0) {
 		output8Codes(res);
 	}
 }
 }