Two more differences between the LZW in HDGF and HMEF:

* Little Endian vs Big Endian storage of the code position
* Initial dictionary position is the end of pre-fill, if there is one, rather than always being position 0

git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@1078300 13f79535-47bb-0310-9956-ffa450edef68
2011-03-05 15:25:39 +00:00 · 2011-03-05 15:25:39 +00:00 · 0df8c11c4b
commit 0df8c11c4b
parent 2c4134e89b
4 changed files with 61 additions and 32 deletions
--- a/src/java/org/apache/poi/util/LZWDecompresser.java
+++ b/src/java/org/apache/poi/util/LZWDecompresser.java
@ -41,23 +41,36 @@ public abstract class LZWDecompresser {
    *  to get the real code length? Normally 2 or 3
    */
   private final int codeLengthIncrease;
+   /**
+    * Are the 12 bits of the position stored in
+    *  Little Endian or Big Endian form?
+    * This controls whether a pos+length of 0x12 0x34
+    *  becomes a position of 0x123 or 0x312
+    */
+   private final boolean positionIsBigEndian;
   
-   protected LZWDecompresser(boolean maskMeansCompressed, int codeLengthIncrease) {
+   protected LZWDecompresser(boolean maskMeansCompressed, 
+            int codeLengthIncrease, boolean positionIsBigEndian) {
      this.maskMeansCompressed = maskMeansCompressed;
      this.codeLengthIncrease = codeLengthIncrease;
+      this.positionIsBigEndian = positionIsBigEndian;
   }
   
   /**
-    * Populates the dictionary. May not need
-    *  to do anything if all zeros is fine.
+    * Populates the dictionary, and returns where in it
+    *  to begin writing new codes.
+    * Generally, if the dictionary is pre-populated, then new
+    *  codes should be placed at the end of that block.
+    * Equally, if the dictionary is left with all zeros, then
+    *  usually the new codes can go in at the start.
    */
-   protected abstract void populateDictionary(byte[] dict);
+   protected abstract int populateDictionary(byte[] dict);
   
   /**
    * Adjusts the position offset if needed when looking
    *  something up in the dictionary.
    */
-   protected abstract int adjustDictionaryOffset(int offset); 
+   protected abstract int adjustDictionaryOffset(int offset);
   
   /**
    * Decompresses the given input stream, returning the array of bytes
@ -83,17 +96,10 @@ public abstract class LZWDecompresser {
    *     flag byte
    */
   public void decompress(InputStream src, OutputStream res) throws IOException {
-      // We use 12 bit codes:
-      // * 0-255 are real bytes
-      // * 256-4095 are the substring codes
-      // Java handily initialises our buffer / dictionary
-      //  to all zeros
-      byte[] buffer = new byte[4096];
-      populateDictionary(buffer);
-
      // How far through the output we've got
      // (This is normally used &4095, so it nicely wraps)
-      int pos = 0;
+      // The initial value is set when populating the dictionary
+      int pos;
      // The flag byte is treated as its 8 individual
      //  bits, which tell us if the following 8 codes
      //  are compressed or un-compressed
@ -102,10 +108,18 @@ public abstract class LZWDecompresser {
      //  processing each bit of the flag byte in turn
      int mask;

+      // We use 12 bit codes:
+      // * 0-255 are real bytes
+      // * 256-4095 are the substring codes
+      // Java handily initialises our buffer / dictionary
+      //  to all zeros
+      byte[] buffer = new byte[4096];
+      pos = populateDictionary(buffer);
+
      // These are bytes as looked up in the dictionary
      // It needs to be signed, as it'll get passed on to
      //  the output stream
-      byte[] dataB = new byte[19];
+      byte[] dataB = new byte[16+codeLengthIncrease];
      // This is an unsigned byte read from the stream
      // It needs to be unsigned, so that bit stuff works
      int dataI;
@ -121,7 +135,7 @@ public abstract class LZWDecompresser {
            // Is this a new code (un-compressed), or
            //  the use of existing codes (compressed)?
            boolean isMaskSet = (flag & mask) > 0;
-            if( isMaskSet && !maskMeansCompressed ) {
+            if( isMaskSet ^ maskMeansCompressed ) {
               // Retrieve the un-compressed code
               if( (dataI = src.read()) != -1) {
                  // Save the byte into the dictionary
@ -139,11 +153,15 @@ public abstract class LZWDecompresser {

               // Build up how long the code sequence is, and
               //  what position of the code to start at
-               // (The position is the first 12 bits, the
-               //  length is the last 4 bits)
+               // (The position is usually the first 12 bits,
+               //  and the length is usually the last 4 bits)
               len = (dataIPt2 & 15) + codeLengthIncrease;
-               pntr = (dataIPt2 & 240)*16 + dataIPt1;
-
+               if(positionIsBigEndian) {
+                  pntr = (dataIPt1<<4) + (dataIPt2>>4);
+               } else {
+                  pntr = dataIPt1 + ((dataIPt2&0xF0)<<4);
+               }
+               
               // Adjust the pointer as needed
               pntr = adjustDictionaryOffset(pntr);

--- a/src/scratchpad/src/org/apache/poi/hdgf/HDGFLZW.java
+++ b/src/scratchpad/src/org/apache/poi/hdgf/HDGFLZW.java
@ -37,8 +37,10 @@ import org.apache.poi.util.LZWDecompresser;
 */
 public class HDGFLZW extends LZWDecompresser {
   public HDGFLZW() {
-      // We're the wrong way round!
-      super(false, 3);
+      // Our flag is the wrong way round!
+      // Length wise, we're 3 longer than we say, so the max len is 19
+      // Endian wise, we're little endian, so 0x1234 is pos 0x312
+      super(false, 3, false);
   }

   /**
@ -63,12 +65,13 @@ public class HDGFLZW extends LZWDecompresser {
      }
      return pntr;
   }
-
+   
   /**
    * We want an empty dictionary, so do nothing
    */
   @Override
-   protected void populateDictionary(byte[] dict) {
+   protected int populateDictionary(byte[] dict) {
+      return 0;
   }

   /**
--- a/src/scratchpad/src/org/apache/poi/hmef/CompressedRTF.java
+++ b/src/scratchpad/src/org/apache/poi/hmef/CompressedRTF.java
@ -54,7 +54,10 @@ public final class CompressedRTF extends LZWDecompresser {
      "{\\colortbl\\red0\\green0\\blue0\n\r\\par \\pard\\plain\\f0\\fs20\\b\\i\\u\\tab\\tx";
   
   public CompressedRTF() {
-      super(true, 2);
+      // Our flag has the normal meaning
+      // Length wise, we're 2 longer than we say, so the max len is 18
+      // Endian wise, we're big endian, so 0x1234 is pos 0x123
+      super(true, 2, true);
   }

   public void decompress(InputStream src, OutputStream res) throws IOException {
@ -80,17 +83,24 @@ public final class CompressedRTF extends LZWDecompresser {
      super.decompress(src, res);
   }

+   /**
+    * We use regular dictionary offsets, so no
+    *  need to change anything
+    */
   @Override
   protected int adjustDictionaryOffset(int offset) {
-      // TODO Do we need to change anything?
-      return 0;
+      return offset;
   }

   @Override
-   protected void populateDictionary(byte[] dict) {
+   protected int populateDictionary(byte[] dict) {
      try {
+         // Copy in the RTF constants 
         byte[] preload = LZW_RTF_PRELOAD.getBytes("US-ASCII");
         System.arraycopy(preload, 0, dict, 0, preload.length);
+         
+         // Start adding new codes after the constants
+         return preload.length;
      } catch(UnsupportedEncodingException e) {
         throw new RuntimeException("Your JVM is broken as it doesn't support US ASCII");
      }
--- a/src/scratchpad/testcases/org/apache/poi/hmef/TestCompressedRTF.java
+++ b/src/scratchpad/testcases/org/apache/poi/hmef/TestCompressedRTF.java
@ -93,7 +93,7 @@ public final class TestCompressedRTF extends TestCase {
     * Check that we can decode the first 8 codes
     * (1 flag byte + 8 codes)  
     */
-    public void DISABLEDtestFirstBlock() throws Exception {
+    public void testFirstBlock() throws Exception {
       HMEFMessage msg = new HMEFMessage(
             _samples.openResourceAsStream("quick-winmail.dat")
       );
@ -112,7 +112,6 @@ public final class TestCompressedRTF extends TestCase {
       String decompStr = new String(decomp, "ASCII");
       
       // Test
-System.err.println(decompStr);       
       assertEquals(block1.length(), decomp.length);
       assertEquals(block1, decompStr);
    }
@ -121,7 +120,7 @@ System.err.println(decompStr);
     * Check that we can decode the first 16 codes
     * (flag + 8 codes, flag + 8 codes)  
     */
-    public void DISABLEDtestFirstTwoBlocks() throws Exception {
+    public void testFirstTwoBlocks() throws Exception {
       HMEFMessage msg = new HMEFMessage(
             _samples.openResourceAsStream("quick-winmail.dat")
       );
@ -140,7 +139,6 @@ System.err.println(decompStr);
       String decompStr = new String(decomp, "ASCII");
       
       // Test
-System.err.println(decompStr);       
       assertEquals(block2.length(), decomp.length);
       assertEquals(block2, decompStr);
    }