Refactor the common LZW decompression code out into utils

git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@1051377 13f79535-47bb-0310-9956-ffa450edef68
2010-12-21 05:18:34 +00:00 · 2010-12-21 05:18:34 +00:00 · d2583d83d5
commit d2583d83d5
parent b31c0f88da
4 changed files with 207 additions and 129 deletions
--- a/src/java/org/apache/poi/util/LZWDecompresser.java
+++ b/src/java/org/apache/poi/util/LZWDecompresser.java
@ -0,0 +1,178 @@
 /* ====================================================================
   Licensed to the Apache Software Foundation (ASF) under one or more
   contributor license agreements.  See the NOTICE file distributed with
   this work for additional information regarding copyright ownership.
   The ASF licenses this file to You under the Apache License, Version 2.0
   (the "License"); you may not use this file except in compliance with
   the License.  You may obtain a copy of the License at
       http://www.apache.org/licenses/LICENSE-2.0
   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
 ==================================================================== */
 package org.apache.poi.util;
 import java.io.ByteArrayOutputStream;
 import java.io.IOException;
 import java.io.InputStream;
 import java.io.OutputStream;
 /**
 * This class provides common functionality for the
 *  various LZW implementations in the different file
 *  formats.
 * It's currently used by HDGF and HMEF.
 *
 * Two good resources on LZW are:
 *  http://en.wikipedia.org/wiki/LZW
 *  http://marknelson.us/1989/10/01/lzw-data-compression/
 */
 public abstract class LZWDecompresser {
   /**
    * Does the mask bit mean it's compressed or uncompressed?
    */
   private boolean maskMeansCompressed;
   protected LZWDecompresser(boolean maskMeansCompressed) {
      this.maskMeansCompressed = maskMeansCompressed;
   }
   /**
    * Populates the dictionary. May not need
    *  to do anything if all zeros is fine.
    */
   protected abstract void populateDictionary(byte[] dict);
   /**
    * Adjusts the position offset if needed when looking
    *  something up in the dictionary.
    */
   protected abstract int adjustDictionaryOffset(int offset); 
   /**
    * Decompresses the given input stream, returning the array of bytes
    *  of the decompressed input.
    */
   public byte[] decompress(InputStream src) throws IOException {
      ByteArrayOutputStream res = new ByteArrayOutputStream();
      decompress(src,res);
      return res.toByteArray();
   }
   /**
    * Perform a streaming decompression of the input.
    * Works by:
    * 1) Reading a flag byte, the 8 bits of which tell you if the
    *     following 8 codes are compressed our un-compressed
    * 2) Consider the 8 bits in turn
    * 3) If the bit is set, the next code is un-compressed, so
    *     add it to the dictionary and output it
    * 4) If the bit isn't set, then read in the length and start
    *     position in the dictionary, and output the bytes there
    * 5) Loop until we've done all 8 bits, then read in the next
    *     flag byte
    */
   public void decompress(InputStream src, OutputStream res) throws IOException {
      // We use 12 bit codes:
      // * 0-255 are real bytes
      // * 256-4095 are the substring codes
      // Java handily initialises our buffer / dictionary
      //  to all zeros
      byte[] buffer = new byte[4096];
      populateDictionary(buffer);
      // How far through the output we've got
      // (This is normally used &4095, so it nicely wraps)
      int pos = 0;
      // The flag byte is treated as its 8 individual
      //  bits, which tell us if the following 8 codes
      //  are compressed or un-compressed
      int flag;
      // The mask, between 1 and 255, which is used when
      //  processing each bit of the flag byte in turn
      int mask;
      // These are bytes as looked up in the dictionary
      // It needs to be signed, as it'll get passed on to
      //  the output stream
      byte[] dataB = new byte[19];
      // This is an unsigned byte read from the stream
      // It needs to be unsigned, so that bit stuff works
      int dataI;
      // The compressed code sequence is held over 2 bytes
      int dataIPt1, dataIPt2;
      // How long a code sequence is, and where in the
      //  dictionary to start at
      int len, pntr;
      while( (flag = src.read()) != -1 ) {
         // Compare each bit in our flag byte in turn:
         for(mask = 1; mask < 256 ; mask <<= 1) {
            // Is this a new code (un-compressed), or
            //  the use of existing codes (compressed)?
            boolean isMaskSet = (flag & mask) > 0;
            if( isMaskSet && !maskMeansCompressed ) {
               // Retrieve the un-compressed code
               if( (dataI = src.read()) != -1) {
                  // Save the byte into the dictionary
                  buffer[(pos&4095)] = fromInt(dataI);
                  pos++;
                  // And output the byte
                  res.write( new byte[] {fromInt(dataI)} );
               }
            } else {
               // We have a compressed sequence
               // Grab the next 16 bits of data
               dataIPt1 = src.read();
               dataIPt2 = src.read();
               if(dataIPt1 == -1 || dataIPt2 == -1) break;
               // Build up how long the code sequence is, and
               //  what position of the code to start at
               // (The position is the first 12 bits, the
               //  length is the last 4 bits)
               len = (dataIPt2 & 15) + 3;
               pntr = (dataIPt2 & 240)*16 + dataIPt1;
               // Adjust the pointer as needed
               pntr = adjustDictionaryOffset(pntr);
               // Loop over the codes, outputting what they correspond to
               for(int i=0; i<len; i++) {
                  dataB[i] = buffer[(pntr + i) & 4095];
                  buffer[ (pos + i) & 4095 ] = dataB[i];
               }
               res.write(dataB, 0, len);
               // Record how far along the stream we have moved
               pos = pos + len;
            }
         }
      }
   }
   /**
    * Given an integer, turn it into a java byte, handling
    *  the wrapping.
    * This is a convenience method
    */
   public static byte fromInt(int b) {
      if(b < 128) return (byte)b;
      return (byte)(b - 256);
   }
   /**
    * Given a java byte, turn it into an integer between 0
    *  and 255 (i.e. handle the unwrapping).
    * This is a convenience method
    */
   public static int fromByte(byte b) {
      if(b >= 0) {
         return b;
      }
      return b + 256;
   }
 }
--- a/src/scratchpad/src/org/apache/poi/hdgf/HDGFLZW.java
+++ b/src/scratchpad/src/org/apache/poi/hdgf/HDGFLZW.java
@ -21,6 +21,8 @@ import java.io.IOException;
 import java.io.InputStream;
 import java.io.OutputStream;
 import org.apache.poi.util.LZWDecompresser;
 /**
 * A decoder for the crazy LZW implementation used
 *  in Visio.
@ -33,27 +35,10 @@ import java.io.OutputStream;
 *  http://en.wikipedia.org/wiki/LZW
 *  http://marknelson.us/1989/10/01/lzw-data-compression/
 */
-public class HDGFLZW {
+public class HDGFLZW extends LZWDecompresser {
-
+   public HDGFLZW() {
-   /**
+      // We're the wrong way round!
-    * Given an integer, turn it into a java byte, handling
+      super(false);
    *  the wrapping.
    * This is a convenience method
    */
   public static byte fromInt(int b) {
      if(b < 128) return (byte)b;
      return (byte)(b - 256);
   }
   /**
    * Given a java byte, turn it into an integer between 0
    *  and 255 (i.e. handle the unwrapping).
    * This is a convenience method
    */
   public static int fromByte(byte b) {
      if(b >= 0) {
         return b;
      }
      return b + 256;
   }
   /**
@ -67,108 +52,23 @@ public class HDGFLZW {
   }
   /**
-    * Decompresses the given input stream, returning the array of bytes
+    * We have a slight shift by 18 bytes
    *  of the decompressed input.
    */
-   public byte[] decode(InputStream src) throws IOException {
+   @Override
-      ByteArrayOutputStream res = new ByteArrayOutputStream();
+   protected int adjustDictionaryOffset(int pntr) {
-      decode(src,res);
+      if(pntr > 4078) {
-      return res.toByteArray();
+         pntr = pntr - 4078;
      } else {
         pntr = pntr + 18;
      }
      return pntr;
   }
   /**
-    * Perform a streaming decompression of the input.
+    * We want an empty dictionary, so do nothing
    * Works by:
    * 1) Reading a flag byte, the 8 bits of which tell you if the
    *     following 8 codes are compressed our un-compressed
    * 2) Consider the 8 bits in turn
    * 3) If the bit is set, the next code is un-compressed, so
    *     add it to the dictionary and output it
    * 4) If the bit isn't set, then read in the length and start
    *     position in the dictionary, and output the bytes there
    * 5) Loop until we've done all 8 bits, then read in the next
    *     flag byte
    */
-   public void decode(InputStream src, OutputStream res) throws IOException {
+   @Override
-      // We use 12 bit codes:
+   protected void populateDictionary(byte[] dict) {
      // * 0-255 are real bytes
      // * 256-4095 are the substring codes
      // Java handily initialises our buffer / dictionary
      //  to all zeros
      byte[] buffer = new byte[4096];
      // How far through the output we've got
      // (This is normally used &4095, so it nicely wraps)
      int pos = 0;
      // The flag byte is treated as its 8 individual
      //  bits, which tell us if the following 8 codes
      //  are compressed or un-compressed
      int flag;
      // The mask, between 1 and 255, which is used when
      //  processing each bit of the flag byte in turn
      int mask;
      // These are bytes as looked up in the dictionary
      // It needs to be signed, as it'll get passed on to
      //  the output stream
      byte[] dataB = new byte[19];
      // This is an unsigned byte read from the stream
      // It needs to be unsigned, so that bit stuff works
      int dataI;
      // The compressed code sequence is held over 2 bytes
      int dataIPt1, dataIPt2;
      // How long a code sequence is, and where in the
      //  dictionary to start at
      int len, pntr;
      while( (flag = src.read()) != -1 ) {
         // Compare each bit in our flag byte in turn:
         for(mask = 1; mask < 256 ; mask <<= 1) {
            // Is this a new code (un-compressed), or
            //  the use of existing codes (compressed)?
            if( (flag & mask) > 0 ) {
               // Retrieve the un-compressed code
               if( (dataI = src.read()) != -1) {
                  // Save the byte into the dictionary
                  buffer[(pos&4095)] = fromInt(dataI);
                  pos++;
                  // And output the byte
                  res.write( new byte[] {fromInt(dataI)} );
               }
            } else {
               // We have a compressed sequence
               // Grab the next 16 bits of data
               dataIPt1 = src.read();
               dataIPt2 = src.read();
               if(dataIPt1 == -1 || dataIPt2 == -1) break;
               // Build up how long the code sequence is, and
               //  what position of the code to start at
               // (The position is the first 12 bits, the
               //  length is the last 4 bits)
               len = (dataIPt2 & 15) + 3;
               pntr = (dataIPt2 & 240)*16 + dataIPt1;
               // If the pointer happens to be passed the end
               //  of our buffer, then wrap around
               if(pntr > 4078) {
                  pntr = pntr - 4078;
               } else {
                  pntr = pntr + 18;
               }
               // Loop over the codes, outputting what they correspond to
               for(int i=0; i<len; i++) {
                  dataB[i] = buffer[(pntr + i) & 4095];
                  buffer[ (pos + i) & 4095 ] = dataB[i];
               }
               res.write(dataB, 0, len);
               // Record how far along the stream we have moved
               pos = pos + len;
            }
         }
      }
   }
   /**
--- a/src/scratchpad/src/org/apache/poi/hdgf/streams/CompressedStreamStore.java
+++ b/src/scratchpad/src/org/apache/poi/hdgf/streams/CompressedStreamStore.java
@ -78,7 +78,7 @@ public final class CompressedStreamStore extends StreamStore {
 		// Decompress
 		HDGFLZW lzw = new HDGFLZW();
-		byte[] decompressed = lzw.decode(bais);
+		byte[] decompressed = lzw.decompress(bais);
 		// Split into header and contents
 		byte[][] ret = new byte[2][];
--- a/src/scratchpad/testcases/org/apache/poi/hdgf/TestHDGFLZW.java
+++ b/src/scratchpad/testcases/org/apache/poi/hdgf/TestHDGFLZW.java
@ -139,9 +139,9 @@ public final class TestHDGFLZW extends TestCase {
 		assertEquals(339, testTrailerComp.length);
 		assertEquals(632, testTrailerDecomp.length);
-		// Decode it using our engine
+		// decompress it using our engine
 		HDGFLZW lzw = new HDGFLZW();
-		byte[] dec = lzw.decode(new ByteArrayInputStream(testTrailerComp));
+		byte[] dec = lzw.decompress(new ByteArrayInputStream(testTrailerComp));
 		// Check it's of the right size
 		assertEquals(632, dec.length);
@ -159,9 +159,9 @@ public final class TestHDGFLZW extends TestCase {
 		assertEquals(339, testTrailerComp.length);
 		assertEquals(632, testTrailerDecomp.length);
-		// Decode it using our engine
+		// decompress it using our engine
 		HDGFLZW lzw = new HDGFLZW();
-		byte[] dec = lzw.decode(new ByteArrayInputStream(testTrailerComp));
+		byte[] dec = lzw.decompress(new ByteArrayInputStream(testTrailerComp));
 		// Now check it's the right data
 		assertEquals(632, dec.length);
@ -188,7 +188,7 @@ public final class TestHDGFLZW extends TestCase {
 		byte[] comp = lzw.compress(new ByteArrayInputStream(sourceDecomp));
 		// Now decompress it again
-		byte[] decomp = lzw.decode(new ByteArrayInputStream(comp));
+		byte[] decomp = lzw.decompress(new ByteArrayInputStream(comp));
 		// First up, check the round tripping
 		assertEquals(12, decomp.length);
@ -223,7 +223,7 @@ public final class TestHDGFLZW extends TestCase {
      assertEquals(27, comp.length);
      // Now decompress it again
-      byte[] decomp = lzw.decode(new ByteArrayInputStream(comp));
+      byte[] decomp = lzw.decompress(new ByteArrayInputStream(comp));
      // We can only check the round-tripping, as for now
      //  visio cheats on re-using a block
@ -246,7 +246,7 @@ public final class TestHDGFLZW extends TestCase {
      byte[] comp = lzw.compress(new ByteArrayInputStream(testTrailerDecomp));
      // Now decompress it again
-      byte[] decomp = lzw.decode(new ByteArrayInputStream(comp));
+      byte[] decomp = lzw.decompress(new ByteArrayInputStream(comp));
 //      for(int i=0; i<comp.length; i++) {
 //         System.err.println(i + "\t" + comp[i] + "\t" + testTrailerComp[i]);