A bit more on HDGF LZW compression, but it's still not quite complete

git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@589233 13f79535-47bb-0310-9956-ffa450edef68
2007-10-27 22:50:41 +00:00 · 2007-10-27 22:50:41 +00:00 · a1ed3f51e4
commit a1ed3f51e4
parent 90b1f22b4d
1 changed files with 164 additions and 36 deletions
--- a/src/scratchpad/src/org/apache/poi/hdgf/HDGFLZW.java
+++ b/src/scratchpad/src/org/apache/poi/hdgf/HDGFLZW.java
@ -170,18 +170,27 @@ public void decode(InputStream src, OutputStream res) throws IOException {

 /**
 * Performs the Visio compatible streaming LZW compression.
- * Works by:
- * 1) ???
- * 2) ???
 * TODO - Finish
 */
 public void compress(InputStream src, OutputStream res) throws IOException {
+	// All of the working state lives in a fresh helper instance,
+	//  so each call starts from a clean slate - just delegate
+	new Compressor().compress(src, res);
+}
+
+/**
+ * Helper class to handle the Visio compatible
+ *  streaming LZW compression.
+ * Need our own class to handle keeping track of the
+ *  code buffer, pending bytes to write out etc.
+ */
+private class Compressor {
 	// We use 12 bit codes:
 	// * 0-255 are real bytes
 	// * 256-4095 are the substring codes
 	// Java handily initialises our buffer / dictionary
 	//  to all zeros
 	byte[] dict = new byte[4096];
+	
 	// The next block of data to be written out, minus
 	//  its mask byte
 	byte[] buffer = new byte[16];
@ -190,6 +199,11 @@ public void compress(InputStream src, OutputStream res) throws IOException {
 	//   are two)
 	int bufferLen = 0;
 	
+	// The raw length of a code is limited to 4 bits
+	byte[] rawCode = new byte[16];
+	// And how much we're using
+	int rawCodeLen = 0;
+	
 	// How far through the input and output streams we are
 	int posInp = 0;
 	int posOut = 0;
@ -199,6 +213,101 @@ public void compress(InputStream src, OutputStream res) throws IOException {
 	// And how many bits we've already set
 	int maskBitsSet = 0;
 	
+/**
+ * Searches the buffer for the bytes currently held in rawCode.
+ * The search runs backwards from the end of the buffer, so when
+ *  there are several matches the highest start position wins.
+ *
+ * @return the last position in the buffer at which the first
+ *  rawCodeLen bytes of rawCode are found, or -1 if they can't
+ *  be found
+ */
+private int findRawCodeInBuffer() {
+	// Work our way back from the end
+	// (Visio always seems to use the last possible code)
+	// NOTE(review): this scans the whole buffer array, not just the
+	//  first bufferLen bytes - confirm that is what's wanted
+	for(int i=(buffer.length - rawCodeLen); i>=0; i--) {
+		boolean matches = true;
+		for(int j=0; matches && j<rawCodeLen; j++) {
+			// Compare against the buffer byte at the same offset
+			//  into the candidate match (i+j), not just the
+			//  first byte of the candidate
+			if(buffer[i+j] != rawCode[j]) {
+				// Doesn't fit, can't be a match
+				matches = false;
+			}
+		}
+		
+		// Was this position a match?
+		if(matches) {
+			return i;
+		}
+	}
+
+	// Not found
+	return -1;
+}
+
+/**
+ * Emits whatever is pending in rawCode.
+ * Sequences shorter than 3 bytes are handed, byte by byte, to
+ *  outputUncompressed, as the overhead of a code makes
+ *  compressing them pointless. Longer sequences count as one
+ *  code in the current block (their mask bit is left unset).
+ *
+ * @param res the stream that completed blocks are written to
+ * @throws IOException if writing a completed block fails
+ */
+private void outputCompressed(OutputStream res) throws IOException {
+	if(rawCodeLen >= 3) {
+		// One more code done for this block
+		maskBitsSet += 1;
+		// Add the length+code to the buffer
+		// TODO
+		posOut = posOut + 2;
+		
+		// A full set of 8 codes gets written out straight away
+		if(maskBitsSet == 8) {
+			output8Codes(res);
+		}
+	} else {
+		// Too short to be worth a code, send each byte as-is
+		int idx = 0;
+		while(idx < rawCodeLen) {
+			outputUncompressed(rawCode[idx], res);
+			idx++;
+		}
+	}
+}
+/**
+ * Output the un-compressed byte.
+ * Sets the mask bit for the current code slot, stores the byte
+ *  in both the pending block buffer and the dictionary, and
+ *  writes the block out once 8 codes have been collected.
+ *
+ * @param b the literal byte to output
+ * @param res the stream that completed blocks are written to
+ * @throws IOException if writing a completed block fails
+ */
+private void outputUncompressed(byte b, OutputStream res) throws IOException {
+	// Set the mask bit for us 
+	nextMask += (1<<maskBitsSet);
+	// Increment the mask bit count, we've done another code
+	// Without this, every literal would land on the same mask
+	//  bit, and the 8 codes check below could never trigger
+	//  from here (so the buffer would eventually overflow)
+	maskBitsSet++;
+	
+	// And add us to the buffer + dictionary
+	buffer[bufferLen] = fromInt(b);
+	bufferLen++;
+	// The dictionary index wraps round at 4096 entries
+	dict[(posOut&4095)] = fromInt(b);
+	posOut++;
+	
+	// If we're now at 8 codes, output
+	if(maskBitsSet == 8) {
+		output8Codes(res);
+	}
+}
+
+/**
+ * Flushes the pending block: writes the mask byte followed
+ *  by the bufferLen buffered bytes, then clears the block
+ *  state ready for the next set of codes.
+ *
+ * @param res the stream to write the block to
+ * @throws IOException if the write fails
+ */
+private void output8Codes(OutputStream res) throws IOException {
+	// Header (the mask) goes first, then the data it describes
+	byte[] header = new byte[] { fromInt(nextMask) };
+	res.write(header);
+	res.write(buffer, 0, bufferLen);
+	
+	// Start afresh for the next block
+	bufferLen = 0;
+	maskBitsSet = 0;
+	nextMask = 0;
+}
+	
+/**
+ * Does the compression
+ */
+private void compress(InputStream src, OutputStream res) throws IOException {
+	// Have we hit the end of the file yet?
+	boolean going = true;
+	
 	// This is a byte as looked up in the dictionary
 	// It needs to be signed, as it'll get passed on to
 	//  the output stream
@ -207,49 +316,68 @@ public void compress(InputStream src, OutputStream res) throws IOException {
 	// It needs to be unsigned, so that bit stuff works
 	int dataI;
 	
-	// Have we hit the end of the file yet?
-	boolean going = true;
-	
 	while( going ) {
 		dataI = src.read();
 		posInp++;
 		if(dataI == -1) { going = false; }
+		dataB = fromInt(dataI);
 		
-		// Decide if we're going to output uncompressed or compressed
-		//  for this byte
-		// (It takes 2 bytes to hold a compressed code, so it's only
-		//  worth doing for 3+ byte long sequences)
-		// TODO
-		
-		boolean compressThis = true;
-		if(compressThis) {
-			// Set the mask bit for us 
-			nextMask += (1<<maskBitsSet);
-			
-			// And add us to the buffer + dictionary
-			buffer[bufferLen] = fromInt(dataI);
-			bufferLen++;
-			dict[(posOut&4095)] = fromInt(dataI);
-			posOut++;
-		} else {
-			// ????
+		// If we've run out of data, output anything that's
+		//  pending then finish
+		if(!going && rawCodeLen > 0) {
+			outputCompressed(res);
+			break;
 		}
-		// Increment the mask bit count, we've done another code
-		maskBitsSet++;
 	
-		// If we've just done the 8th bit, or reached the end
-		//  of the stream, output our mask and data
-		if(maskBitsSet == 8 || !going) {
-			// Output
-			res.write(new byte[] { fromInt(nextMask) } );
-			res.write(buffer, 0, bufferLen);
+		// Try adding this new byte onto rawCode, and
+		//  see if all of that is still found in the
+		//  buffer dictionary or not
+		rawCode[rawCodeLen] = dataB;
+		rawCodeLen++;
+		int rawAt = findRawCodeInBuffer();
 		
-			// Reset things
-			nextMask = 0;
-			maskBitsSet = 0;
-			bufferLen = 0;
+		// If we found it and are now at 16 bytes,
+		//  we need to output our pending code block
+		if(rawCodeLen == 16 && rawAt > -1) {
+			outputCompressed(res);
+			rawCodeLen = 0;
+			continue;
+		}
+		
+		// If we did find all of rawCode with our new
+		//  byte added on, we can wait to see what happens
+		//  with the next byte
+		if(rawAt > -1) {
+			continue;
+		}
+		
+		// If there was something in rawCode before, then we
+		//  need to output that
+		rawCodeLen--;
+		if(rawCodeLen > 0) {
+			// Output the old rawCode
+			outputCompressed(res);
+			
+			// Can this byte start a new rawCode, or does
+			//  it need outputting itself?
+			rawCode[0] = dataB;
+			rawCodeLen = 1;
+			if(findRawCodeInBuffer() > -1) {
+				// Fits in, wait for next byte
+				continue;
+			} else {
+				// Doesn't fit, output
+				outputUncompressed(dataB,res);
+				rawCodeLen = 0;
+			}
+		} else {
+			// Nothing in rawCode before, so this byte
+			//  isn't in the buffer dictionary
+			// Output it un-compressed
+			outputUncompressed(dataB,res);
 		}
 	}
 }
+}

 }