From cbf86ed0bcb01cba2de2400cb3daa1cebd7a1fbb Mon Sep 17 00:00:00 2001 From: Nick Burch Date: Thu, 16 Dec 2010 07:41:41 +0000 Subject: [PATCH] Improve the HDGF LZW code. Some tweaks to the decompression, and more tests, but mostly work on the compression side. We can now compress small streams properly, and these round-trip fine. However, some longer streams don't compress correctly, and more work on that is still needed (see the disabled unit test) git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@1049805 13f79535-47bb-0310-9956-ffa450edef68 --- .../content/xdocs/hdgf/index.xml | 3 +- src/documentation/content/xdocs/status.xml | 2 + .../src/org/apache/poi/hdgf/HDGFLZW.java | 497 +++++------------- .../apache/poi/hdgf/HDGFLZWCompressor.java | 241 +++++++++ .../org/apache/poi/hdgf/TestHDGFLZW.java | 107 +++- 5 files changed, 482 insertions(+), 368 deletions(-) create mode 100644 src/scratchpad/src/org/apache/poi/hdgf/HDGFLZWCompressor.java diff --git a/src/documentation/content/xdocs/hdgf/index.xml b/src/documentation/content/xdocs/hdgf/index.xml index 9e4ca894c..fc24c108d 100644 --- a/src/documentation/content/xdocs/hdgf/index.xml +++ b/src/documentation/content/xdocs/hdgf/index.xml @@ -72,7 +72,8 @@ HDGFLZW, which will be much better documented, and also under the ASL. Completed October 2007
  • Add compression support to HDGFLZW. - In progress
  • + In progress - works for small streams but encoding + goes wrong on larger ones
  • Have HDGF just write back the raw bytes it read in, and have a test to ensure the file is un-changed.
  • Have HDGF generate the bytes to write out from the diff --git a/src/documentation/content/xdocs/status.xml b/src/documentation/content/xdocs/status.xml index 589ce752c..fed871344 100644 --- a/src/documentation/content/xdocs/status.xml +++ b/src/documentation/content/xdocs/status.xml @@ -34,6 +34,8 @@ + Inside ExtractorFactory, support finding embedded OOXML documents and providing extractors for them + Partial HDGF LZW compression support 50244 - Support for continued NameRecords 50416 - Correct shifting of the first or last row in a sheet by multiple rows 50440 - Support evaluating formulas with newlines in them, which XSSF may have (but HSSF may not) diff --git a/src/scratchpad/src/org/apache/poi/hdgf/HDGFLZW.java b/src/scratchpad/src/org/apache/poi/hdgf/HDGFLZW.java index 0595343d4..d6d8d6b76 100644 --- a/src/scratchpad/src/org/apache/poi/hdgf/HDGFLZW.java +++ b/src/scratchpad/src/org/apache/poi/hdgf/HDGFLZW.java @@ -35,358 +35,147 @@ import java.io.OutputStream; */ public class HDGFLZW { -/** - * Given an integer, turn it into a java byte, handling - * the wrapping. - * This is a convenience method - */ -public static byte fromInt(int b) { - if(b < 128) return (byte)b; - return (byte)(b - 256); -} -/** - * Given a java byte, turn it into an integer between 0 - * and 255 (i.e. handle the unwrapping). - * This is a convenience method - */ -public static int fromByte(byte b) { - if(b >= 0) { - return b; - } - return b + 256; -} - -/** - * Compress the given input stream, returning the array of bytes - * of the compressed input - */ -public byte[] compress(InputStream src) throws IOException { - ByteArrayOutputStream res = new ByteArrayOutputStream(); - compress(src,res); - return res.toByteArray(); -} - -/** - * Decompresses the given input stream, returning the array of bytes - * of the decompressed input. - */ -public byte[] decode(InputStream src) throws IOException { - ByteArrayOutputStream res = new ByteArrayOutputStream(); - decode(src,res); - return res.toByteArray(); -} -/** - * Perform a streaming decompression of the input. - * Works by: - * 1) Reading a flag byte, the 8 bits of which tell you if the - * following 8 codes are compressed our un-compressed - * 2) Consider the 8 bits in turn - * 3) If the bit is set, the next code is un-compressed, so - * add it to the dictionary and output it - * 4) If the bit isn't set, then read in the length and start - * position in the dictionary, and output the bytes there - * 5) Loop until we've done all 8 bits, then read in the next - * flag byte - */ -public void decode(InputStream src, OutputStream res) throws IOException { - // We use 12 bit codes: - // * 0-255 are real bytes - // * 256-4095 are the substring codes - // Java handily initialises our buffer / dictionary - // to all zeros - byte[] buffer = new byte[4096]; - - // How far through the output we've got - // (This is normally used &4095, so it nicely wraps) - int pos = 0; - // The flag byte is treated as its 8 individual - // bits, which tell us if the following 8 codes - // are compressed or un-compressed - int flag; - // The mask, between 1 and 255, which is used when - // processing each bit of the flag byte in turn - int mask; - - // This is a byte as looked up in the dictionary - // It needs to be signed, as it'll get passed on to - // the output stream - byte dataB; - // This is an unsigned byte read from the stream - // It needs to be unsigned, so that bit stuff works - int dataI; - // The compressed code sequence is held over 2 bytes - int dataIPt1, dataIPt2; - // How long a code sequence is, and where in the - // dictionary to start at - int len, pntr; - - while( (flag = src.read()) != -1 ) { - // Compare each bit in our flag byte in turn: - for(mask = 1; mask < 256 ; mask <<= 1) { - // Is this a new code (un-compressed), or - // the use of existing codes (compressed)? - if( (flag & mask) > 0 ) { - // Retrieve the un-compressed code - if( (dataI = src.read()) != -1) { - // Save the byte into the dictionary - buffer[(pos&4095)] = fromInt(dataI); - pos++; - // And output the byte - res.write( new byte[] {fromInt(dataI)} ); - } - } else { - // We have a compressed sequence - // Grab the next 16 bits of data - dataIPt1 = src.read(); - dataIPt2 = src.read(); - if(dataIPt1 == -1 || dataIPt2 == -1) break; - - // Build up how long the code sequence is, and - // what position of the code to start at - // (The position is the first 12 bits, the - // length is the last 4 bits) - len = (dataIPt2 & 15) + 3; - pntr = (dataIPt2 & 240)*16 + dataIPt1; - - // If the pointer happens to be passed the end - // of our buffer, then wrap around - if(pntr > 4078) { - pntr = pntr - 4078; - } else { - pntr = pntr + 18; - } - - // Loop over the codes, outputting what they correspond to - for(int i=0; i=0; i--) { - boolean matches = true; - for(int j=0; matches && j 0) { - outputCompressed(res); - break; - } - - // Try adding this new byte onto rawCode, and - // see if all of that is still found in the - // buffer dictionary or not - rawCode[rawCodeLen] = dataB; - rawCodeLen++; - int rawAt = findRawCodeInBuffer(); - - // If we found it and are now at 16 bytes, - // we need to output our pending code block - if(rawCodeLen == 16 && rawAt > -1) { - outputCompressed(res); - rawCodeLen = 0; - continue; - } - - // If we did find all of rawCode with our new - // byte added on, we can wait to see what happens - // with the next byte - if(rawAt > -1) { - continue; - } - - // If we get here, then the rawCode + this byte weren't - // found in the dictionary - - // If there was something in rawCode before, then that was - // found in the dictionary, so output that compressed - rawCodeLen--; - if(rawCodeLen > 0) { - // Output the old rawCode - outputCompressed(res); - - // Can this byte start a new rawCode, or does - // it need outputting itself? - rawCode[0] = dataB; - rawCodeLen = 1; - if(findRawCodeInBuffer() > -1) { - // Fits in, wait for next byte - continue; - } - // Doesn't fit, output - outputUncompressed(dataB,res); - rawCodeLen = 0; - } else { - // Nothing in rawCode before, so this byte - // isn't in the buffer dictionary - // Output it un-compressed - outputUncompressed(dataB,res); - } - } -} -} - + /** + * Given an integer, turn it into a java byte, handling + * the wrapping. + * This is a convenience method + */ + public static byte fromInt(int b) { + if(b < 128) return (byte)b; + return (byte)(b - 256); + } + /** + * Given a java byte, turn it into an integer between 0 + * and 255 (i.e. handle the unwrapping). + * This is a convenience method + */ + public static int fromByte(byte b) { + if(b >= 0) { + return b; + } + return b + 256; + } + + /** + * Compress the given input stream, returning the array of bytes + * of the compressed input + */ + public byte[] compress(InputStream src) throws IOException { + ByteArrayOutputStream res = new ByteArrayOutputStream(); + compress(src,res); + return res.toByteArray(); + } + + /** + * Decompresses the given input stream, returning the array of bytes + * of the decompressed input. + */ + public byte[] decode(InputStream src) throws IOException { + ByteArrayOutputStream res = new ByteArrayOutputStream(); + decode(src,res); + return res.toByteArray(); + } + + /** + * Perform a streaming decompression of the input. + * Works by: + * 1) Reading a flag byte, the 8 bits of which tell you if the + * following 8 codes are compressed our un-compressed + * 2) Consider the 8 bits in turn + * 3) If the bit is set, the next code is un-compressed, so + * add it to the dictionary and output it + * 4) If the bit isn't set, then read in the length and start + * position in the dictionary, and output the bytes there + * 5) Loop until we've done all 8 bits, then read in the next + * flag byte + */ + public void decode(InputStream src, OutputStream res) throws IOException { + // We use 12 bit codes: + // * 0-255 are real bytes + // * 256-4095 are the substring codes + // Java handily initialises our buffer / dictionary + // to all zeros + byte[] buffer = new byte[4096]; + + // How far through the output we've got + // (This is normally used &4095, so it nicely wraps) + int pos = 0; + // The flag byte is treated as its 8 individual + // bits, which tell us if the following 8 codes + // are compressed or un-compressed + int flag; + // The mask, between 1 and 255, which is used when + // processing each bit of the flag byte in turn + int mask; + + // These are bytes as looked up in the dictionary + // It needs to be signed, as it'll get passed on to + // the output stream + byte[] dataB = new byte[19]; + // This is an unsigned byte read from the stream + // It needs to be unsigned, so that bit stuff works + int dataI; + // The compressed code sequence is held over 2 bytes + int dataIPt1, dataIPt2; + // How long a code sequence is, and where in the + // dictionary to start at + int len, pntr; + + while( (flag = src.read()) != -1 ) { + // Compare each bit in our flag byte in turn: + for(mask = 1; mask < 256 ; mask <<= 1) { + // Is this a new code (un-compressed), or + // the use of existing codes (compressed)? + if( (flag & mask) > 0 ) { + // Retrieve the un-compressed code + if( (dataI = src.read()) != -1) { + // Save the byte into the dictionary + buffer[(pos&4095)] = fromInt(dataI); + pos++; + // And output the byte + res.write( new byte[] {fromInt(dataI)} ); + } + } else { + // We have a compressed sequence + // Grab the next 16 bits of data + dataIPt1 = src.read(); + dataIPt2 = src.read(); + if(dataIPt1 == -1 || dataIPt2 == -1) break; + + // Build up how long the code sequence is, and + // what position of the code to start at + // (The position is the first 12 bits, the + // length is the last 4 bits) + len = (dataIPt2 & 15) + 3; + pntr = (dataIPt2 & 240)*16 + dataIPt1; + + // If the pointer happens to be passed the end + // of our buffer, then wrap around + if(pntr > 4078) { + pntr = pntr - 4078; + } else { + pntr = pntr + 18; + } + + // Loop over the codes, outputting what they correspond to + for(int i=0; i0; i--) { + boolean matches = true; + for(int j=0; matches && j> 4); + buffer[bufferLen] = HDGFLZW.fromInt(bp1); + bufferLen++; + buffer[bufferLen] = HDGFLZW.fromInt(bp2); + bufferLen++; + + // Copy the data to the dictionary in the new place + for(int i=0; i 0) { + outputCompressed(res); + if(maskBitsSet > 0) { + output8Codes(res); + } + } + break; + } + + // Try adding this new byte onto rawCode, and + // see if all of that is still found in the + // buffer dictionary or not + rawCode[rawCodeLen] = dataB; + rawCodeLen++; + int rawAt = findRawCodeInBuffer(); + + // If we found it and are now at 18 bytes, + // we need to output our pending code block + if(rawCodeLen == 18 && rawAt > -1) { + outputCompressed(res); + rawCodeLen = 0; + continue; + } + + // If we did find all of rawCode with our new + // byte added on, we can wait to see what happens + // with the next byte + if(rawAt > -1) { + continue; + } + + // If we get here, then the rawCode + this byte weren't + // found in the dictionary + + // If there was something in rawCode before, then that was + // found in the dictionary, so output that compressed + rawCodeLen--; + if(rawCodeLen > 0) { + // Output the old rawCode + outputCompressed(res); + + // Can this byte start a new rawCode, or does + // it need outputting itself? + rawCode[0] = dataB; + rawCodeLen = 1; + if(findRawCodeInBuffer() > -1) { + // Fits in, wait for next byte + continue; + } + // Doesn't fit, output + outputUncompressed(dataB,res); + rawCodeLen = 0; + } else { + // Nothing in rawCode before, so this byte + // isn't in the buffer dictionary + // Output it un-compressed + outputUncompressed(dataB,res); + } + } +} +} \ No newline at end of file diff --git a/src/scratchpad/testcases/org/apache/poi/hdgf/TestHDGFLZW.java b/src/scratchpad/testcases/org/apache/poi/hdgf/TestHDGFLZW.java index f3af7c375..b997ce5c4 100644 --- a/src/scratchpad/testcases/org/apache/poi/hdgf/TestHDGFLZW.java +++ b/src/scratchpad/testcases/org/apache/poi/hdgf/TestHDGFLZW.java @@ -28,17 +28,19 @@ public final class TestHDGFLZW extends TestCase { -21, -16, // 3 @ 4093 1, 0, 0, -72, -13, -16, // 3 @ 5 - 78, // *mask bit* + 78, // *mask bit* 2,3,4,7 -32, -5, // 14 @ 4082 1, 0, 3, -21, -16, // 3 @ 4093 10, 5, // 8 @ 28 4, -21, -16, // 3 @ 4093 - 21, // *mask bit* + 21, // *mask bit* 1,3,5 9, -21, -16, // 3 @ 4093 - 103, -21, -16, 34, + 103, + -21, -16, // 3 @ 4093 + 34, -36, -1, // 18 @ 4078 52, 15, // 18 @ 70 70, 15, // 18 @ 88 @@ -169,19 +171,98 @@ public final class TestHDGFLZW extends TestCase { } } - public void DISABLEDtestCompress() throws Exception { - assertEquals(339, testTrailerComp.length); - assertEquals(632, testTrailerDecomp.length); + /** + * Test that we can round-trip a little bit. + * Uses a part short enough that we agree with visio + * on the best way to compress it + */ + public void testCompressMini() throws Exception { + // first 11 bytes compressed = 12 bytes uncompressed + byte[] sourceComp = new byte[11]; + byte[] sourceDecomp = new byte[12]; + System.arraycopy(testTrailerComp, 0, sourceComp, 0, sourceComp.length); + System.arraycopy(testTrailerDecomp, 0, sourceDecomp, 0, sourceDecomp.length); // Compress it using our engine HDGFLZW lzw = new HDGFLZW(); - byte[] comp = lzw.compress(new ByteArrayInputStream(testTrailerDecomp)); + byte[] comp = lzw.compress(new ByteArrayInputStream(sourceDecomp)); + + // Now decompress it again + byte[] decomp = lzw.decode(new ByteArrayInputStream(comp)); - // Now check it's the right data - assertEquals(339, comp.length); - for(int i=0; i 11 + // Next 32 -> 13 + byte[] sourceComp = new byte[24]; + byte[] sourceDecomp = new byte[44]; + System.arraycopy(testTrailerComp, 0, sourceComp, 0, sourceComp.length); + System.arraycopy(testTrailerDecomp, 0, sourceDecomp, 0, sourceDecomp.length); + + // Compress it using our engine + HDGFLZW lzw = new HDGFLZW(); + byte[] comp = lzw.compress(new ByteArrayInputStream(sourceDecomp)); + + // We should be 3 characters bigger, as + // we split one compressed bit into two + assertEquals(27, comp.length); + + // Now decompress it again + byte[] decomp = lzw.decode(new ByteArrayInputStream(comp)); + + // We can only check the round-tripping, as for now + // visio cheats on re-using a block + assertEquals(44, decomp.length); + for(int i=0; i