From 786af85cc078bd36d168d9a6578b823ae2242f12 Mon Sep 17 00:00:00 2001
From: Nick Burch <nick@apache.org>
Date: Sat, 13 Oct 2007 15:46:09 +0000
Subject: [PATCH] Replace the HDGW LZW engine with a fully documented, ASL
 licenced version. (Doesn't do compression yet, but is a much better start for
 that)

git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@584414 13f79535-47bb-0310-9956-ffa450edef68
---
 .../src/org/apache/poi/hdgf/HDGFLZW.java      | 161 ++++++++++++++++++
 .../src/org/apache/poi/hdgf/LZW4HDGF.java     |  80 ---------
 .../hdgf/streams/CompressedStreamStore.java   |   4 +-
 .../{TestLZW4HDGF.java => TestHDGFLZW.java}   |   6 +-
 4 files changed, 166 insertions(+), 85 deletions(-)
 create mode 100644 src/scratchpad/src/org/apache/poi/hdgf/HDGFLZW.java
 delete mode 100644 src/scratchpad/src/org/apache/poi/hdgf/LZW4HDGF.java
 rename src/scratchpad/testcases/org/apache/poi/hdgf/{TestLZW4HDGF.java => TestHDGFLZW.java} (97%)

diff --git a/src/scratchpad/src/org/apache/poi/hdgf/HDGFLZW.java b/src/scratchpad/src/org/apache/poi/hdgf/HDGFLZW.java
new file mode 100644
index 000000000..91ae1a24e
--- /dev/null
+++ b/src/scratchpad/src/org/apache/poi/hdgf/HDGFLZW.java
@@ -0,0 +1,161 @@
+/* ====================================================================
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+==================================================================== */
+package org.apache.poi.hdgf;
+
+import java.io.ByteArrayOutputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.OutputStream;
+
+/**
+ * A decoder for the crazy LZW implementation used
+ *  in Visio.
+ * According to VSDump, "it's a slightly perverted version of LZW 
+ *  compression, with inverted meaning of flag byte and 0xFEE as an 
+ *  'initial shift'". It uses 12 bit codes
+ * (http://www.gnome.ru/projects/vsdump_en.html)
+ *
+ * Two good resources on LZW are:
+ *  http://en.wikipedia.org/wiki/LZW
+ *  http://marknelson.us/1989/10/01/lzw-data-compression/
+ */
+public class HDGFLZW {
+
+/**
+ * Converts an int, such as a value from InputStream.read() in the range
+ *  0-255, into a signed Java byte: values below 128 are cast unchanged,
+ *  anything else has 256 subtracted first so 128-255 become -128 to -1.
+ */
+public byte fromInt(int b) {
+	if(b < 128) return (byte)b;
+	return (byte)(b - 256);
+}
+/**
+ * Converts a signed Java byte into the equivalent unsigned value, an
+ *  int between 0 and 255: non-negative bytes are returned unchanged,
+ *  and negative bytes (-128 to -1) have 256 added to give 128-255.
+ */
+public int fromByte(byte b) {
+	if(b >= 0) return (int)b;
+	return (int)(b + 256);
+}
+
+/**
+ * Decompresses everything readable from the given input stream into an
+ *  in-memory buffer, and returns the decompressed bytes as a new array.
+ */
+public byte[] decode(InputStream src) throws IOException {
+	ByteArrayOutputStream res = new ByteArrayOutputStream();
+	decode(src,res);
+    return res.toByteArray();
+}
+/**
+ * Perform a streaming decompression of the input, writing the
+ *  decompressed bytes to the output stream. Works by:
+ * 1) Reading a flag byte, the 8 bits of which tell you if the
+ *     following 8 codes are compressed or un-compressed
+ * 2) Consider the 8 bits in turn, lowest bit first
+ * 3) If the bit is set, the next byte is un-compressed, so
+ *     add it to the dictionary and output it
+ * 4) If the bit isn't set, then read in the length and start
+ *     position in the dictionary, and output the bytes there
+ * 5) Loop until we've done all 8 bits, then read in the next
+ *     flag byte, stopping once the input has run out
+ */
+public void decode(InputStream src, OutputStream res) throws IOException {
+	// Our dictionary is a 4096 byte circular buffer of the
+	//  most recently output bytes, which the compressed
+	//  sequences point back into using a 12 bit position
+	// Java handily initialises our buffer / dictionary
+	//  to all zeros
+	byte[] buffer = new byte[4096];
+
+	// How far through the output we've got
+	// (Always used &4095 as a buffer index, so it nicely wraps)
+	int pos = 0;
+	// The flag byte is treated as its 8 individual
+	//  bits, which tell us if the following 8 codes
+	//  are compressed or un-compressed
+	int flag;
+	// The mask, a single bit from 1 up to 128, used when
+	//  processing each bit of the flag byte in turn
+	int mask;
+
+	// This is a byte as looked up in the dictionary
+	// It needs to be signed, as it'll get passed on to
+	//  the output stream
+	byte dataB;
+	// This is an unsigned byte read from the stream
+	// It needs to be unsigned, so that bit stuff works
+	int dataI;
+	// The compressed code sequence is held over 2 bytes
+	int dataIPt1, dataIPt2; 
+	// How long a code sequence is, and where in the
+	//  dictionary to start at
+	int len, pntr;
+
+	while( (flag = src.read()) != -1 ) {
+		// Check each bit in our flag byte in turn, lowest first:
+		for(mask = 1; mask < 256 ; mask <<= 1) {
+			// Is this a new code (un-compressed), or
+			//  the use of existing codes (compressed)?
+			if( (flag & mask) > 0 ) {
+				// Retrieve the un-compressed byte, if there is one
+				if( (dataI = src.read()) != -1) {
+					// Save the byte into the dictionary
+					buffer[(pos&4095)] = fromInt(dataI);
+					pos++;
+					// And output the byte
+					res.write( new byte[] {fromInt(dataI)} );
+				}
+			} else {
+				// We have a compressed sequence: grab its 2 bytes,
+				//  abandoning this flag byte if the input runs out
+				dataIPt1 = src.read();
+				dataIPt2 = src.read();
+				if(dataIPt1 == -1 || dataIPt2 == -1) break;
+				
+				// Build up how long the code sequence is, and
+				//  what position of the code to start at
+				// (Position: first byte plus 256 * the top 4 bits of
+				//  the second. Length: its bottom 4 bits, plus 3)
+				len = (dataIPt2 & 15) + 3;
+				pntr = (dataIPt2 & 240)*16 + dataIPt1;
+                
+				// Shift the pointer on by 18, wrapping around if
+				//  that takes it past the end of our buffer
+				if(pntr > 4078) {
+					pntr = pntr - 4078;
+				} else {
+					pntr = pntr + 18;
+				}
+				
+				// Copy each referenced byte to the current position, and output it
+				for(int i=0; i<len; i++) {
+					buffer [(pos + i) & 4095] = buffer [(pntr + i) & 4095];
+					dataB = buffer[(pntr + i) & 4095];
+					res.write(new byte[] {dataB});
+				}
+				
+				// Record how far along the stream we have moved
+				pos = pos + len;
+			}
+		}
+    }
+}
+
+}
\ No newline at end of file
diff --git a/src/scratchpad/src/org/apache/poi/hdgf/LZW4HDGF.java b/src/scratchpad/src/org/apache/poi/hdgf/LZW4HDGF.java
deleted file mode 100644
index 32953a042..000000000
--- a/src/scratchpad/src/org/apache/poi/hdgf/LZW4HDGF.java
+++ /dev/null
@@ -1,80 +0,0 @@
-/* ====================================================================
-   This program is free software; you can redistribute it and/or modify
-   it under the terms of the GNU General Public License as published by
-   the Free Software Foundation; version 3 of the License.
-   
-   This program is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-   GNU General Public License for more details.
-==================================================================== */
-package org.apache.poi.hdgf;
-
-import java.io.ByteArrayOutputStream;
-import java.io.IOException;
-import java.io.InputStream;
-
-/**
- * A decoder for the crazy LZW implementation used
- *  in Visio.
- * This is a port of vsd_inflate.c from vsdump
- *  (http://www.gnome.ru/projects/vsdump_en.html)
- */
-public class LZW4HDGF {
-
-public byte fromInt(int b) {
-	if(b < 128) return (byte)b;
-	return (byte)(b - 256);
-}
-
-public byte[] decode(InputStream src) throws IOException {
-	ByteArrayOutputStream res = new ByteArrayOutputStream();
-	int pos = 0;
-	int flag;
-	byte[] buffer = new byte[4096];
-	buffer[0] = 0;
-	
-	byte data;
-	int tmp;
-	int addr1, addr2; 
-	int len, pntr;
-
-	while ( (flag = src.read()) != -1 ) {
-		for (int mask = 1; mask < 0x100 ; mask <<= 1) {
-			if ( (flag & mask) > 0) {
-				if( (tmp = src.read()) != -1) {
-					buffer[(pos&4095)] = fromInt(tmp);
-					pos++;
-					res.write( new byte[] {fromInt(tmp)} );
-				}
-			} else {
-				tmp = src.read();
-				if(tmp == -1) break;
-				addr1 = tmp;
-                                                            
-				tmp = src.read();
-				if(tmp == -1) break;
-				addr2 = tmp;
-				
-				len = (addr2 & 15) + 3;
-				pntr = (addr2 & 240)*16 + addr1;
-                
-				if(pntr > 4078) {
-					pntr = pntr - 4078;
-				} else {
-					pntr = pntr + 18;
-				}
-				
-				for(int i=0; i<len; i++) {
-					buffer [(pos + i) & 4095] = buffer [(pntr + i) & 4095];
-					data = buffer[(pntr + i ) & 4095];
-					res.write(new byte[] {data});
-				}
-                                    
-				pos = pos + len;
-			}
-		}
-    }
-    return res.toByteArray();
-}
-}
diff --git a/src/scratchpad/src/org/apache/poi/hdgf/streams/CompressedStreamStore.java b/src/scratchpad/src/org/apache/poi/hdgf/streams/CompressedStreamStore.java
index 8b1559624..4bf70417d 100644
--- a/src/scratchpad/src/org/apache/poi/hdgf/streams/CompressedStreamStore.java
+++ b/src/scratchpad/src/org/apache/poi/hdgf/streams/CompressedStreamStore.java
@@ -19,7 +19,7 @@ package org.apache.poi.hdgf.streams;
 import java.io.ByteArrayInputStream;
 import java.io.IOException;
 
-import org.apache.poi.hdgf.LZW4HDGF;
+import org.apache.poi.hdgf.HDGFLZW;
 
 /**
  * A StreamStore where the data on-disk is compressed,
@@ -76,7 +76,7 @@ public class CompressedStreamStore extends StreamStore {
 		ByteArrayInputStream bais = new ByteArrayInputStream(data, offset, length);
 		
 		// Decompress
-		LZW4HDGF lzw = new LZW4HDGF();
+		HDGFLZW lzw = new HDGFLZW();
 		byte[] decompressed = lzw.decode(bais);
 		
 		// Split into header and contents
diff --git a/src/scratchpad/testcases/org/apache/poi/hdgf/TestLZW4HDGF.java b/src/scratchpad/testcases/org/apache/poi/hdgf/TestHDGFLZW.java
similarity index 97%
rename from src/scratchpad/testcases/org/apache/poi/hdgf/TestLZW4HDGF.java
rename to src/scratchpad/testcases/org/apache/poi/hdgf/TestHDGFLZW.java
index c2576b292..3e3986eee 100644
--- a/src/scratchpad/testcases/org/apache/poi/hdgf/TestLZW4HDGF.java
+++ b/src/scratchpad/testcases/org/apache/poi/hdgf/TestHDGFLZW.java
@@ -20,7 +20,7 @@ import java.io.ByteArrayInputStream;
 
 import junit.framework.TestCase;
 
-public class TestLZW4HDGF extends TestCase {
+public class TestHDGFLZW extends TestCase {
 	public static final byte[] testTrailerComp = new byte[] {
 		123, -60, 2, -21, -16, 1, 0, 0, -72, -13, -16, 78, -32, -5, 1, 
 		0, 3, -21, -16, 10, 5, 4, -21, -16, 21, 9, -21, -16, 103, -21, 
@@ -86,8 +86,8 @@ public class TestLZW4HDGF extends TestCase {
 		assertEquals(632, testTrailerDecomp.length);
 		
 		// Decode it using our engine
-		LZW4HDGF lzw2 = new LZW4HDGF();
-		byte[] dec = lzw2.decode(new ByteArrayInputStream(testTrailerComp));
+		HDGFLZW lzw = new HDGFLZW();
+		byte[] dec = lzw.decode(new ByteArrayInputStream(testTrailerComp));
 		
 		// Check it's of the right size
 		assertEquals(632, dec.length);