Handle two more differences between the LZW variants used in HDGF and HMEF:

* The 12-bit code position is stored Little Endian in one and Big Endian in the other
* The initial dictionary position is the end of the pre-fill, if there is one, rather than always being position 0


git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@1078300 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Nick Burch 2011-03-05 15:25:39 +00:00
parent 2c4134e89b
commit 0df8c11c4b
4 changed files with 61 additions and 32 deletions

View File

@ -41,17 +41,30 @@ public abstract class LZWDecompresser {
* to get the real code length? Normally 2 or 3 * to get the real code length? Normally 2 or 3
*/ */
private final int codeLengthIncrease; private final int codeLengthIncrease;
/**
* Does the 12 bits of the position get stored in
* Little Endian or Big Endian form?
* This controls whether a pos+length of 0x12 0x34
* becomes a position of 0x123 or 0x312
*/
private final boolean positionIsBigEndian;
protected LZWDecompresser(boolean maskMeansCompressed, int codeLengthIncrease) { protected LZWDecompresser(boolean maskMeansCompressed,
int codeLengthIncrease, boolean positionIsBigEndian) {
this.maskMeansCompressed = maskMeansCompressed; this.maskMeansCompressed = maskMeansCompressed;
this.codeLengthIncrease = codeLengthIncrease; this.codeLengthIncrease = codeLengthIncrease;
this.positionIsBigEndian = positionIsBigEndian;
} }
/** /**
* Populates the dictionary. May not need * Populates the dictionary, and returns where in it
* to do anything if all zeros is fine. * to begin writing new codes.
* Generally, if the dictionary is pre-populated, then new
* codes should be placed at the end of that block.
* Equally, if the dictionary is left with all zeros, then
* usually the new codes can go in at the start.
*/ */
protected abstract void populateDictionary(byte[] dict); protected abstract int populateDictionary(byte[] dict);
/** /**
* Adjusts the position offset if needed when looking * Adjusts the position offset if needed when looking
@ -83,17 +96,10 @@ public abstract class LZWDecompresser {
* flag byte * flag byte
*/ */
public void decompress(InputStream src, OutputStream res) throws IOException { public void decompress(InputStream src, OutputStream res) throws IOException {
// We use 12 bit codes:
// * 0-255 are real bytes
// * 256-4095 are the substring codes
// Java handily initialises our buffer / dictionary
// to all zeros
byte[] buffer = new byte[4096];
populateDictionary(buffer);
// How far through the output we've got // How far through the output we've got
// (This is normally used &4095, so it nicely wraps) // (This is normally used &4095, so it nicely wraps)
int pos = 0; // The initial value is set when populating the dictionary
int pos;
// The flag byte is treated as its 8 individual // The flag byte is treated as its 8 individual
// bits, which tell us if the following 8 codes // bits, which tell us if the following 8 codes
// are compressed or un-compressed // are compressed or un-compressed
@ -102,10 +108,18 @@ public abstract class LZWDecompresser {
// processing each bit of the flag byte in turn // processing each bit of the flag byte in turn
int mask; int mask;
// We use 12 bit codes:
// * 0-255 are real bytes
// * 256-4095 are the substring codes
// Java handily initialises our buffer / dictionary
// to all zeros
byte[] buffer = new byte[4096];
pos = populateDictionary(buffer);
// These are bytes as looked up in the dictionary // These are bytes as looked up in the dictionary
// It needs to be signed, as it'll get passed on to // It needs to be signed, as it'll get passed on to
// the output stream // the output stream
byte[] dataB = new byte[19]; byte[] dataB = new byte[16+codeLengthIncrease];
// This is an unsigned byte read from the stream // This is an unsigned byte read from the stream
// It needs to be unsigned, so that bit stuff works // It needs to be unsigned, so that bit stuff works
int dataI; int dataI;
@ -121,7 +135,7 @@ public abstract class LZWDecompresser {
// Is this a new code (un-compressed), or // Is this a new code (un-compressed), or
// the use of existing codes (compressed)? // the use of existing codes (compressed)?
boolean isMaskSet = (flag & mask) > 0; boolean isMaskSet = (flag & mask) > 0;
if( isMaskSet && !maskMeansCompressed ) { if( isMaskSet ^ maskMeansCompressed ) {
// Retrieve the un-compressed code // Retrieve the un-compressed code
if( (dataI = src.read()) != -1) { if( (dataI = src.read()) != -1) {
// Save the byte into the dictionary // Save the byte into the dictionary
@ -139,10 +153,14 @@ public abstract class LZWDecompresser {
// Build up how long the code sequence is, and // Build up how long the code sequence is, and
// what position of the code to start at // what position of the code to start at
// (The position is the first 12 bits, the // (The position is usually the first 12 bits,
// length is the last 4 bits) // and the length is usually the last 4 bits)
len = (dataIPt2 & 15) + codeLengthIncrease; len = (dataIPt2 & 15) + codeLengthIncrease;
pntr = (dataIPt2 & 240)*16 + dataIPt1; if(positionIsBigEndian) {
pntr = (dataIPt1<<4) + (dataIPt2>>4);
} else {
pntr = dataIPt1 + ((dataIPt2&0xF0)<<4);
}
// Adjust the pointer as needed // Adjust the pointer as needed
pntr = adjustDictionaryOffset(pntr); pntr = adjustDictionaryOffset(pntr);

View File

@ -37,8 +37,10 @@ import org.apache.poi.util.LZWDecompresser;
*/ */
public class HDGFLZW extends LZWDecompresser { public class HDGFLZW extends LZWDecompresser {
public HDGFLZW() { public HDGFLZW() {
// We're the wrong way round! // Our flag is the wrong way round!
super(false, 3); // Length wise, we're 3 longer than we say, so the max len is 19
// Endian wise, we're little endian, so 0x1234 is pos 0x312
super(false, 3, false);
} }
/** /**
@ -68,7 +70,8 @@ public class HDGFLZW extends LZWDecompresser {
* We want an empty dictionary, so do nothing * We want an empty dictionary, so do nothing
*/ */
@Override @Override
protected void populateDictionary(byte[] dict) { protected int populateDictionary(byte[] dict) {
return 0;
} }
/** /**

View File

@ -54,7 +54,10 @@ public final class CompressedRTF extends LZWDecompresser {
"{\\colortbl\\red0\\green0\\blue0\n\r\\par \\pard\\plain\\f0\\fs20\\b\\i\\u\\tab\\tx"; "{\\colortbl\\red0\\green0\\blue0\n\r\\par \\pard\\plain\\f0\\fs20\\b\\i\\u\\tab\\tx";
public CompressedRTF() { public CompressedRTF() {
super(true, 2); // Our flag has the normal meaning
// Length wise, we're 2 longer than we say, so the max len is 18
// Endian wise, we're big endian, so 0x1234 is pos 0x123
super(true, 2, true);
} }
public void decompress(InputStream src, OutputStream res) throws IOException { public void decompress(InputStream src, OutputStream res) throws IOException {
@ -80,17 +83,24 @@ public final class CompressedRTF extends LZWDecompresser {
super.decompress(src, res); super.decompress(src, res);
} }
/**
* We use regular dictionary offsets, so no
* need to change anything
*/
@Override @Override
protected int adjustDictionaryOffset(int offset) { protected int adjustDictionaryOffset(int offset) {
// TODO Do we need to change anything? return offset;
return 0;
} }
@Override @Override
protected void populateDictionary(byte[] dict) { protected int populateDictionary(byte[] dict) {
try { try {
// Copy in the RTF constants
byte[] preload = LZW_RTF_PRELOAD.getBytes("US-ASCII"); byte[] preload = LZW_RTF_PRELOAD.getBytes("US-ASCII");
System.arraycopy(preload, 0, dict, 0, preload.length); System.arraycopy(preload, 0, dict, 0, preload.length);
// Start adding new codes after the constants
return preload.length;
} catch(UnsupportedEncodingException e) { } catch(UnsupportedEncodingException e) {
throw new RuntimeException("Your JVM is broken as it doesn't support US ASCII"); throw new RuntimeException("Your JVM is broken as it doesn't support US ASCII");
} }

View File

@ -93,7 +93,7 @@ public final class TestCompressedRTF extends TestCase {
* Check that we can decode the first 8 codes * Check that we can decode the first 8 codes
* (1 flag byte + 8 codes) * (1 flag byte + 8 codes)
*/ */
public void DISABLEDtestFirstBlock() throws Exception { public void testFirstBlock() throws Exception {
HMEFMessage msg = new HMEFMessage( HMEFMessage msg = new HMEFMessage(
_samples.openResourceAsStream("quick-winmail.dat") _samples.openResourceAsStream("quick-winmail.dat")
); );
@ -112,7 +112,6 @@ public final class TestCompressedRTF extends TestCase {
String decompStr = new String(decomp, "ASCII"); String decompStr = new String(decomp, "ASCII");
// Test // Test
System.err.println(decompStr);
assertEquals(block1.length(), decomp.length); assertEquals(block1.length(), decomp.length);
assertEquals(block1, decompStr); assertEquals(block1, decompStr);
} }
@ -121,7 +120,7 @@ System.err.println(decompStr);
* Check that we can decode the first 16 codes * Check that we can decode the first 16 codes
* (flag + 8 codes, flag + 8 codes) * (flag + 8 codes, flag + 8 codes)
*/ */
public void DISABLEDtestFirstTwoBlocks() throws Exception { public void testFirstTwoBlocks() throws Exception {
HMEFMessage msg = new HMEFMessage( HMEFMessage msg = new HMEFMessage(
_samples.openResourceAsStream("quick-winmail.dat") _samples.openResourceAsStream("quick-winmail.dat")
); );
@ -140,7 +139,6 @@ System.err.println(decompStr);
String decompStr = new String(decomp, "ASCII"); String decompStr = new String(decomp, "ASCII");
// Test // Test
System.err.println(decompStr);
assertEquals(block2.length(), decomp.length); assertEquals(block2.length(), decomp.length);
assertEquals(block2, decompStr); assertEquals(block2, decompStr);
} }