From 0df8c11c4b066aac316614d181f6fc9f5fae5fce Mon Sep 17 00:00:00 2001 From: Nick Burch Date: Sat, 5 Mar 2011 15:25:39 +0000 Subject: [PATCH] Two more differences between the LZW in HDGF and HMEF: * Little Endian vs Big Endian storage of the code position * Initial dictionary position is the end of pre-fill, if there is one, rather than always being position 0 git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@1078300 13f79535-47bb-0310-9956-ffa450edef68 --- .../org/apache/poi/util/LZWDecompresser.java | 58 ++++++++++++------- .../src/org/apache/poi/hdgf/HDGFLZW.java | 11 ++-- .../org/apache/poi/hmef/CompressedRTF.java | 18 ++++-- .../apache/poi/hmef/TestCompressedRTF.java | 6 +- 4 files changed, 61 insertions(+), 32 deletions(-) diff --git a/src/java/org/apache/poi/util/LZWDecompresser.java b/src/java/org/apache/poi/util/LZWDecompresser.java index 51926b6c2..91aeb23b9 100644 --- a/src/java/org/apache/poi/util/LZWDecompresser.java +++ b/src/java/org/apache/poi/util/LZWDecompresser.java @@ -41,23 +41,36 @@ public abstract class LZWDecompresser { * to get the real code length? Normally 2 or 3 */ private final int codeLengthIncrease; + /** + * Does the 12 bits of the position get stored in + * Little Endian or Big Endian form? + * This controls whether a pos+length of 0x12 0x34 + * becomes a position of 0x123 or 0x312 + */ + private final boolean positionIsBigEndian; - protected LZWDecompresser(boolean maskMeansCompressed, int codeLengthIncrease) { + protected LZWDecompresser(boolean maskMeansCompressed, + int codeLengthIncrease, boolean positionIsBigEndian) { this.maskMeansCompressed = maskMeansCompressed; this.codeLengthIncrease = codeLengthIncrease; + this.positionIsBigEndian = positionIsBigEndian; } /** - * Populates the dictionary. May not need - * to do anything if all zeros is fine. + * Populates the dictionary, and returns where in it + * to begin writing new codes. 
+ * Generally, if the dictionary is pre-populated, then new + * codes should be placed at the end of that block. + * Equally, if the dictionary is left with all zeros, then + * usually the new codes can go in at the start. */ - protected abstract void populateDictionary(byte[] dict); + protected abstract int populateDictionary(byte[] dict); /** * Adjusts the position offset if needed when looking * something up in the dictionary. */ - protected abstract int adjustDictionaryOffset(int offset); + protected abstract int adjustDictionaryOffset(int offset); /** * Decompresses the given input stream, returning the array of bytes @@ -83,17 +96,10 @@ public abstract class LZWDecompresser { * flag byte */ public void decompress(InputStream src, OutputStream res) throws IOException { - // We use 12 bit codes: - // * 0-255 are real bytes - // * 256-4095 are the substring codes - // Java handily initialises our buffer / dictionary - // to all zeros - byte[] buffer = new byte[4096]; - populateDictionary(buffer); - // How far through the output we've got // (This is normally used &4095, so it nicely wraps) - int pos = 0; + // The initial value is set when populating the dictionary + int pos; // The flag byte is treated as its 8 individual // bits, which tell us if the following 8 codes // are compressed or un-compressed @@ -102,10 +108,18 @@ public abstract class LZWDecompresser { // processing each bit of the flag byte in turn int mask; + // We use 12 bit codes: + // * 0-255 are real bytes + // * 256-4095 are the substring codes + // Java handily initialises our buffer / dictionary + // to all zeros + byte[] buffer = new byte[4096]; + pos = populateDictionary(buffer); + // These are bytes as looked up in the dictionary // It needs to be signed, as it'll get passed on to // the output stream - byte[] dataB = new byte[19]; + byte[] dataB = new byte[16+codeLengthIncrease]; // This is an unsigned byte read from the stream // It needs to be unsigned, so that bit stuff works int 
dataI; @@ -121,7 +135,7 @@ public abstract class LZWDecompresser { // Is this a new code (un-compressed), or // the use of existing codes (compressed)? boolean isMaskSet = (flag & mask) > 0; - if( isMaskSet && !maskMeansCompressed ) { + if( isMaskSet ^ maskMeansCompressed ) { // Retrieve the un-compressed code if( (dataI = src.read()) != -1) { // Save the byte into the dictionary @@ -139,11 +153,15 @@ public abstract class LZWDecompresser { // Build up how long the code sequence is, and // what position of the code to start at - // (The position is the first 12 bits, the - // length is the last 4 bits) + // (The position is usually the first 12 bits, + // and the length is usually the last 4 bits) len = (dataIPt2 & 15) + codeLengthIncrease; - pntr = (dataIPt2 & 240)*16 + dataIPt1; - + if(positionIsBigEndian) { + pntr = (dataIPt1<<4) + (dataIPt2>>4); + } else { + pntr = dataIPt1 + ((dataIPt2&0xF0)<<4); + } + // Adjust the pointer as needed pntr = adjustDictionaryOffset(pntr); diff --git a/src/scratchpad/src/org/apache/poi/hdgf/HDGFLZW.java b/src/scratchpad/src/org/apache/poi/hdgf/HDGFLZW.java index f122c40f1..e6d4aa2e6 100644 --- a/src/scratchpad/src/org/apache/poi/hdgf/HDGFLZW.java +++ b/src/scratchpad/src/org/apache/poi/hdgf/HDGFLZW.java @@ -37,8 +37,10 @@ import org.apache.poi.util.LZWDecompresser; */ public class HDGFLZW extends LZWDecompresser { public HDGFLZW() { - // We're the wrong way round! - super(false, 3); + // Our flag is the wrong way round! 
+ // Length wise, we're 3 longer than we say, so the max len is 19 + // Endian wise, we're little endian, so 0x1234 is pos 0x312 + super(false, 3, false); } /** @@ -63,12 +65,13 @@ public class HDGFLZW extends LZWDecompresser { } return pntr; } - + /** * We want an empty dictionary, so do nothing */ @Override - protected void populateDictionary(byte[] dict) { + protected int populateDictionary(byte[] dict) { + return 0; } /** diff --git a/src/scratchpad/src/org/apache/poi/hmef/CompressedRTF.java b/src/scratchpad/src/org/apache/poi/hmef/CompressedRTF.java index 81218bc9b..70c3e5929 100644 --- a/src/scratchpad/src/org/apache/poi/hmef/CompressedRTF.java +++ b/src/scratchpad/src/org/apache/poi/hmef/CompressedRTF.java @@ -54,7 +54,10 @@ public final class CompressedRTF extends LZWDecompresser { "{\\colortbl\\red0\\green0\\blue0\n\r\\par \\pard\\plain\\f0\\fs20\\b\\i\\u\\tab\\tx"; public CompressedRTF() { - super(true, 2); + // Our flag has the normal meaning + // Length wise, we're 2 longer than we say, so the max len is 18 + // Endian wise, we're big endian, so 0x1234 is pos 0x123 + super(true, 2, true); } public void decompress(InputStream src, OutputStream res) throws IOException { @@ -80,17 +83,24 @@ public final class CompressedRTF extends LZWDecompresser { super.decompress(src, res); } + /** + * We use regular dictionary offsets, so no + * need to change anything + */ @Override protected int adjustDictionaryOffset(int offset) { - // TODO Do we need to change anything? 
- return 0; + return offset; } @Override - protected void populateDictionary(byte[] dict) { + protected int populateDictionary(byte[] dict) { try { + // Copy in the RTF constants byte[] preload = LZW_RTF_PRELOAD.getBytes("US-ASCII"); System.arraycopy(preload, 0, dict, 0, preload.length); + + // Start adding new codes after the constants + return preload.length; } catch(UnsupportedEncodingException e) { throw new RuntimeException("Your JVM is broken as it doesn't support US ASCII"); } diff --git a/src/scratchpad/testcases/org/apache/poi/hmef/TestCompressedRTF.java b/src/scratchpad/testcases/org/apache/poi/hmef/TestCompressedRTF.java index ad8f6692b..36991c43c 100644 --- a/src/scratchpad/testcases/org/apache/poi/hmef/TestCompressedRTF.java +++ b/src/scratchpad/testcases/org/apache/poi/hmef/TestCompressedRTF.java @@ -93,7 +93,7 @@ public final class TestCompressedRTF extends TestCase { * Check that we can decode the first 8 codes * (1 flag byte + 8 codes) */ - public void DISABLEDtestFirstBlock() throws Exception { + public void testFirstBlock() throws Exception { HMEFMessage msg = new HMEFMessage( _samples.openResourceAsStream("quick-winmail.dat") ); @@ -112,7 +112,6 @@ public final class TestCompressedRTF extends TestCase { String decompStr = new String(decomp, "ASCII"); // Test -System.err.println(decompStr); assertEquals(block1.length(), decomp.length); assertEquals(block1, decompStr); } @@ -121,7 +120,7 @@ System.err.println(decompStr); * Check that we can decode the first 16 codes * (flag + 8 codes, flag + 8 codes) */ - public void DISABLEDtestFirstTwoBlocks() throws Exception { + public void testFirstTwoBlocks() throws Exception { HMEFMessage msg = new HMEFMessage( _samples.openResourceAsStream("quick-winmail.dat") ); @@ -140,7 +139,6 @@ System.err.println(decompStr); String decompStr = new String(decomp, "ASCII"); // Test -System.err.println(decompStr); assertEquals(block2.length(), decomp.length); assertEquals(block2, decompStr); }