Replace the HDGF LZW engine with a fully documented, ASL licenced version. (It doesn't do compression yet, but it is a much better starting point for that.)

git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@584414 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Nick Burch 2007-10-13 15:46:09 +00:00
parent 6b70b7cfad
commit 786af85cc0
4 changed files with 166 additions and 85 deletions

View File

@ -0,0 +1,161 @@
/* ====================================================================
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==================================================================== */
package org.apache.poi.hdgf;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
/**
 * A decoder for the crazy LZW implementation used
 *  in Visio.
 * According to VSDump, "it's a slightly perverted version of LZW
 *  compression, with inverted meaning of flag byte and 0xFEE as an
 *  'initial shift'". It uses 12 bit codes
 * (http://www.gnome.ru/projects/vsdump_en.html)
 *
 * As implemented here, the stream is a sequence of flag bytes, each
 *  followed by up to eight items. An item is either a single literal
 *  byte, or a two byte reference (12 bit position + 4 bit length) into
 *  a 4096 byte circular buffer holding the most recent output.
 * NOTE(review): structurally this looks closer to a sliding-window
 *  (LZ77 / LZSS style) scheme than to dictionary-building LZW.
 *
 * Two good resources on LZW are:
 *  http://en.wikipedia.org/wiki/LZW
 *  http://marknelson.us/1989/10/01/lzw-data-compression/
 */
public class HDGFLZW {
    /** Size of the circular buffer of recent output (12 bit positions) */
    private static final int BUFFER_SIZE = 4096;
    /** ANDing a position with this wraps it into the buffer */
    private static final int BUFFER_MASK = BUFFER_SIZE - 1;
    /**
     * Offset added to every stored position. Adding 18 modulo 4096
     *  is the same as subtracting 0xFEE (4078) modulo 4096, which is
     *  the "initial shift" VSDump talks about.
     */
    private static final int POSITION_SHIFT = 18;
    /** The stored 4 bit length is this much less than the real length */
    private static final int MIN_RUN_LENGTH = 3;

    /**
     * Given an integer, turn it into a java byte, handling
     *  the wrapping.
     * This is a convenience method
     *
     * @param b the value to convert, normally between 0 and 255
     * @return the byte holding the low 8 bits of the value
     */
    public byte fromInt(int b) {
        // The narrowing cast keeps the low 8 bits, which is exactly
        //  the "subtract 256 if 128 or more" wrapping
        return (byte)b;
    }

    /**
     * Given a java byte, turn it into an integer between 0
     *  and 255 (i.e. handle the unwrapping).
     * This is a convenience method
     *
     * @param b the (signed) byte to convert
     * @return the same 8 bits, as a value between 0 and 255
     */
    public int fromByte(byte b) {
        return b & 0xFF;
    }

    /**
     * Decompresses the given input stream, returning the array of bytes
     *  of the decompressed input.
     *
     * @param src the compressed data, read until it is exhausted
     * @return the decompressed bytes
     * @throws IOException if reading from the input fails
     */
    public byte[] decode(InputStream src) throws IOException {
        ByteArrayOutputStream res = new ByteArrayOutputStream();
        decode(src,res);
        return res.toByteArray();
    }

    /**
     * Perform a streaming decompression of the input.
     * Works by:
     * 1) Reading a flag byte, the 8 bits of which tell you if the
     *     following 8 codes are compressed or un-compressed
     * 2) Consider the 8 bits in turn, lowest bit first
     * 3) If the bit is set, the next code is un-compressed, so
     *     add it to the dictionary and output it
     * 4) If the bit isn't set, then read in the length and start
     *     position in the dictionary, and output the bytes there
     * 5) Loop until we've done all 8 bits, then read in the next
     *     flag byte
     * Decoding stops quietly as soon as the input runs out, even if
     *  that is part way through a flag byte's worth of codes.
     *
     * @param src the compressed data, read until it is exhausted
     * @param res where the decompressed bytes are written, one at a time
     * @throws IOException if reading the input or writing the output fails
     */
    public void decode(InputStream src, OutputStream res) throws IOException {
        // The dictionary is a circular buffer of the most recent
        //  output. Java handily initialises it to all zeros, so a
        //  reference to a not-yet-written position yields zero bytes
        byte[] buffer = new byte[BUFFER_SIZE];
        // How far through the output we've got
        // (Always used masked, so it nicely wraps)
        int pos = 0;

        // The flag byte is treated as its 8 individual
        //  bits, which tell us if the following 8 codes
        //  are compressed or un-compressed
        int flag;
        while( (flag = src.read()) != -1 ) {
            // Compare each bit in our flag byte in turn:
            for(int mask = 1; mask < 256 ; mask <<= 1) {
                if( (flag & mask) != 0 ) {
                    // Un-compressed: a single literal byte
                    int dataI = src.read();
                    if(dataI == -1) {
                        // Out of input, nothing more can follow
                        return;
                    }

                    // Save the byte into the dictionary
                    byte literal = fromInt(dataI);
                    buffer[pos & BUFFER_MASK] = literal;
                    pos++;
                    // And output the byte (write(int) only uses
                    //  the low 8 bits, so no temporary array needed)
                    res.write(literal);
                } else {
                    // We have a compressed sequence
                    // Grab the next 16 bits of data
                    int dataIPt1 = src.read();
                    int dataIPt2 = src.read();
                    if(dataIPt1 == -1 || dataIPt2 == -1) {
                        // Truncated code, so we're out of input
                        return;
                    }

                    // Build up how long the code sequence is, and
                    //  what position of the code to start at
                    // (The length is the low 4 bits of the 2nd byte;
                    //  the position is the high 4 bits of the 2nd
                    //  byte followed by all 8 bits of the 1st byte)
                    int len = (dataIPt2 & 15) + MIN_RUN_LENGTH;
                    int pntr = ((dataIPt2 & 240) << 4) + dataIPt1;

                    // Apply the shift. If that takes the pointer past
                    //  the end of our buffer, the mask wraps it around
                    pntr = (pntr + POSITION_SHIFT) & BUFFER_MASK;

                    // Loop over the codes, outputting what they correspond to.
                    // This must go one byte at a time, as the source and
                    //  destination ranges are allowed to overlap (that's
                    //  how a run of a repeated byte gets encoded)
                    for(int i=0; i<len; i++) {
                        byte dataB = buffer[(pntr + i) & BUFFER_MASK];
                        buffer[(pos + i) & BUFFER_MASK] = dataB;
                        res.write(dataB);
                    }

                    // Record how far along the stream we have moved
                    pos = pos + len;
                }
            }
        }
    }
}

View File

@ -1,80 +0,0 @@
/* ====================================================================
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; version 3 of the License.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
==================================================================== */
package org.apache.poi.hdgf;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
/**
 * Decoder for the unusual LZW-like compression that Visio
 *  applies to its streams.
 * This is a port of vsd_inflate.c from vsdump
 * (http://www.gnome.ru/projects/vsdump_en.html)
 */
public class LZW4HDGF {
    /**
     * Converts a value (normally 0 to 255) into the java byte
     *  with the same low 8 bits.
     */
    public byte fromInt(int b) {
        return (byte)(b < 128 ? b : b - 256);
    }

    /**
     * Reads the whole of the compressed input and returns the
     *  decompressed bytes. Each flag byte controls up to eight
     *  following items, lowest bit first: a set bit means one literal
     *  byte, a clear bit means a two byte (position, length) reference
     *  into a 4096 byte circular buffer of recent output.
     */
    public byte[] decode(InputStream src) throws IOException {
        ByteArrayOutputStream out = new ByteArrayOutputStream();
        // Circular buffer of recent output, zero filled to begin with
        byte[] window = new byte[4096];
        // Count of bytes produced so far; always masked before use
        int produced = 0;

        int flags = src.read();
        while (flags != -1) {
            int bit = 1;
            while (bit < 0x100) {
                if ((flags & bit) == 0) {
                    // Reference: low byte first, then the byte holding
                    //  the top 4 position bits and the 4 length bits
                    int low = src.read();
                    if (low == -1) {
                        break;
                    }
                    int high = src.read();
                    if (high == -1) {
                        break;
                    }

                    int count = 3 + (high & 15);
                    int stored = (high & 240) * 16 + low;
                    int from = (stored > 4078) ? (stored - 4078) : (stored + 18);

                    // Byte by byte, since source and target may overlap
                    for (int k = 0; k < count; k++) {
                        int target = (produced + k) & 4095;
                        window[target] = window[(from + k) & 4095];
                        out.write(window[(from + k) & 4095]);
                    }
                    produced += count;
                } else {
                    // Literal: remember it and emit it
                    int literal = src.read();
                    if (literal != -1) {
                        window[produced & 4095] = fromInt(literal);
                        produced++;
                        out.write(fromInt(literal));
                    }
                }
                bit <<= 1;
            }
            flags = src.read();
        }

        return out.toByteArray();
    }
}

View File

@ -19,7 +19,7 @@ package org.apache.poi.hdgf.streams;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import org.apache.poi.hdgf.LZW4HDGF;
import org.apache.poi.hdgf.HDGFLZW;
/**
* A StreamStore where the data on-disk is compressed,
@ -76,7 +76,7 @@ public class CompressedStreamStore extends StreamStore {
ByteArrayInputStream bais = new ByteArrayInputStream(data, offset, length);
// Decompress
LZW4HDGF lzw = new LZW4HDGF();
HDGFLZW lzw = new HDGFLZW();
byte[] decompressed = lzw.decode(bais);
// Split into header and contents

View File

@ -20,7 +20,7 @@ import java.io.ByteArrayInputStream;
import junit.framework.TestCase;
public class TestLZW4HDGF extends TestCase {
public class TestHDGFLZW extends TestCase {
public static final byte[] testTrailerComp = new byte[] {
123, -60, 2, -21, -16, 1, 0, 0, -72, -13, -16, 78, -32, -5, 1,
0, 3, -21, -16, 10, 5, 4, -21, -16, 21, 9, -21, -16, 103, -21,
@ -86,8 +86,8 @@ public class TestLZW4HDGF extends TestCase {
assertEquals(632, testTrailerDecomp.length);
// Decode it using our engine
LZW4HDGF lzw2 = new LZW4HDGF();
byte[] dec = lzw2.decode(new ByteArrayInputStream(testTrailerComp));
HDGFLZW lzw = new HDGFLZW();
byte[] dec = lzw.decode(new ByteArrayInputStream(testTrailerComp));
// Check it's of the right size
assertEquals(632, dec.length);