poi/src/java/org/apache/poi/hssf/record/SSTDeserializer.java

563 lines
22 KiB
Java

/* ====================================================================
* The Apache Software License, Version 1.1
*
* Copyright (c) 2002 The Apache Software Foundation. All rights
* reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* 3. The end-user documentation included with the redistribution,
* if any, must include the following acknowledgment:
* "This product includes software developed by the
* Apache Software Foundation (http://www.apache.org/)."
* Alternately, this acknowledgment may appear in the software itself,
* if and wherever such third-party acknowledgments normally appear.
*
* 4. The names "Apache" and "Apache Software Foundation" and
* "Apache POI" must not be used to endorse or promote products
* derived from this software without prior written permission. For
* written permission, please contact apache@apache.org.
*
* 5. Products derived from this software may not be called "Apache",
* "Apache POI", nor may "Apache" appear in their name, without
* prior written permission of the Apache Software Foundation.
*
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
* ====================================================================
*
* This software consists of voluntary contributions made by many
* individuals on behalf of the Apache Software Foundation. For more
* information on the Apache Software Foundation, please see
* <http://www.apache.org/>.
*/
package org.apache.poi.hssf.record;
import org.apache.poi.util.BinaryTree;
import org.apache.poi.util.LittleEndian;
import org.apache.poi.util.LittleEndianConsts;
/**
* Handles the task of deserializing a SST string. The two main entry points are
*
* @author Glen Stampoultzis (glens at apache.org)
*/
class SSTDeserializer
{
private BinaryTree strings;
/** this is the number of characters we expect in the first sub-record in a subsequent continuation record */
private int continuationExpectedChars;
/** this is the string we were working on before hitting the end of the current record. This string is NOT finished. */
private String unfinishedString;
/** this is true if the string uses wide characters */
private boolean wideChar;
/** this is true if the string is a rich text string */
private boolean richText;
/** this is true if the string is a far east string or some other wierd string */
private boolean extendedText;
/** Number of formatting runs in this rich text field */
private short runCount;
/** Number of characters in current string */
private int charCount;
private int extensionLength;
public SSTDeserializer( BinaryTree strings )
{
this.strings = strings;
initVars();
}
private void initVars()
{
runCount = 0;
continuationExpectedChars = 0;
unfinishedString = "";
// bytesInCurrentSegment = 0;
// stringDataOffset = 0;
wideChar = false;
richText = false;
extendedText = false;
}
/**
* This is the starting point where strings are constructed. Note that
* strings may span across multiple continuations. Read the SST record
* carefully before beginning to hack.
*/
public void manufactureStrings( final byte[] data, final int initialOffset, short dataSize )
{
initVars();
int offset = initialOffset;
while ( ( offset - initialOffset ) < dataSize )
{
int remaining = dataSize - offset + initialOffset;
if ( ( remaining > 0 ) && ( remaining < LittleEndianConsts.SHORT_SIZE ) )
{
throw new RecordFormatException( "Cannot get length of the last string in SSTRecord" );
}
if ( remaining == LittleEndianConsts.SHORT_SIZE )
{
setContinuationExpectedChars( LittleEndian.getUShort( data, offset ) );
unfinishedString = "";
break;
}
charCount = LittleEndian.getUShort( data, offset );
readStringHeader( data, offset );
boolean stringContinuesOverContinuation = remaining < totalStringSize();
if ( stringContinuesOverContinuation )
{
int remainingBytes = ( initialOffset + dataSize ) - offset - stringHeaderOverhead();
setContinuationExpectedChars( charCount - calculateCharCount( remainingBytes ) );
charCount -= getContinuationExpectedChars();
}
else
{
setContinuationExpectedChars( 0 );
}
processString( data, offset, charCount );
offset += totalStringSize();
if ( getContinuationExpectedChars() != 0 )
{
break;
}
}
}
// private void dump( final byte[] data, int offset, int length )
// {
// try
// {
// System.out.println( "------------------- SST DUMP -------------------------" );
// HexDump.dump( (byte[]) data, offset, System.out, offset, length );
// }
// catch ( IOException e )
// {
// }
// catch ( ArrayIndexOutOfBoundsException e )
// {
// }
// catch ( IllegalArgumentException e )
// {
// }
// }
/**
* Detemines the option types for the string (ie, compressed or uncompressed unicode, rich text string or
* plain string etc) and calculates the length and offset for the string.
*
*/
private void readStringHeader( final byte[] data, final int index )
{
byte optionFlag = data[index + LittleEndianConsts.SHORT_SIZE];
wideChar = ( optionFlag & 1 ) == 1;
extendedText = ( optionFlag & 4 ) == 4;
richText = ( optionFlag & 8 ) == 8;
runCount = 0;
if ( richText )
{
runCount = LittleEndian.getShort( data, index + SSTRecord.STRING_MINIMAL_OVERHEAD );
}
extensionLength = 0;
if ( extendedText )
{
extensionLength = LittleEndian.getInt( data, index + SSTRecord.STRING_MINIMAL_OVERHEAD
+ (richText ? LittleEndianConsts.SHORT_SIZE : 0) );
}
}
/**
* Reads a string or the first part of a string.
*
* @param characters the number of characters to write.
*
* @return the number of bytes written.
*/
private int processString( final byte[] data, final int dataIndex, final int characters )
{
// length is the length we store it as. not the length that is read.
int length = SSTRecord.STRING_MINIMAL_OVERHEAD + calculateByteCount( characters );
byte[] unicodeStringBuffer = new byte[length];
int offset = 0;
// Set the length in characters
LittleEndian.putUShort( unicodeStringBuffer, offset, characters );
offset += LittleEndianConsts.SHORT_SIZE;
// Set the option flags
unicodeStringBuffer[offset] = data[dataIndex + offset];
// Copy in the string data
int bytesRead = unicodeStringBuffer.length - SSTRecord.STRING_MINIMAL_OVERHEAD;
arraycopy( data, dataIndex + stringHeaderOverhead(), unicodeStringBuffer, SSTRecord.STRING_MINIMAL_OVERHEAD, bytesRead );
// Create the unicode string
UnicodeString string = new UnicodeString( UnicodeString.sid,
(short) unicodeStringBuffer.length,
unicodeStringBuffer );
if ( isStringFinished() )
{
Integer integer = new Integer( strings.size() );
addToStringTable( strings, integer, string );
}
else
{
unfinishedString = string.getString();
}
return bytesRead;
}
private boolean isStringFinished()
{
return getContinuationExpectedChars() == 0;
}
/**
* Okay, we are doing some major cheating here. Because we can't handle rich text strings properly
* we end up getting duplicate strings. To get around this I'm doing two things: 1. Converting rich
* text to normal text and 2. If there's a duplicate I'm adding a space onto the end. Sneaky perhaps
* but it gets the job done until we can handle this a little better.
*/
static public void addToStringTable( BinaryTree strings, Integer integer, UnicodeString string )
{
if ( string.isRichText() )
string.setOptionFlags( (byte) ( string.getOptionFlags() & ( ~8 ) ) );
if ( string.isExtendedText() )
string.setOptionFlags( (byte) ( string.getOptionFlags() & ( ~4 ) ) );
boolean added = false;
while ( added == false )
{
try
{
strings.put( integer, string );
added = true;
}
catch ( Exception ignore )
{
string.setString( string.getString() + " " );
}
}
}
private int calculateCharCount( final int byte_count )
{
return byte_count / ( wideChar ? LittleEndianConsts.SHORT_SIZE : LittleEndianConsts.BYTE_SIZE );
}
/**
* Process a Continue record. A Continue record for an SST record
* contains the same kind of data that the SST record contains,
* with the following exceptions:
* <P>
* <OL>
* <LI>The string counts at the beginning of the SST record are
* not in the Continue record
* <LI>The first string in the Continue record might NOT begin
* with a size. If the last string in the previous record is
* continued in this record, the size is determined by that
* last string in the previous record; the first string will
* begin with a flag byte, followed by the remaining bytes (or
* words) of the last string from the previous
* record. Otherwise, the first string in the record will
* begin with a string length
* </OL>
*
* @param record the Continue record's byte data
*/
public void processContinueRecord( final byte[] record )
{
if ( isStringFinished() )
{
initVars();
manufactureStrings( record, 0, (short) record.length );
}
else
{
// reset the wide bit because that can change across a continuation. the fact that it's
// actually rich text doesn't change across continuations even though the rich text
// may on longer be set in the "new" option flag. confusing huh?
wideChar = ( record[0] & 1 ) == 1;
if ( stringSpansContinuation( record.length - LittleEndianConsts.BYTE_SIZE ) )
{
processEntireContinuation( record );
}
else
{
readStringRemainder( record );
}
}
}
/**
* Reads the remainder string and any subsequent strings from the continuation record.
*
* @param record The entire continuation record data.
*/
private void readStringRemainder( final byte[] record )
{
int stringRemainderSizeInBytes = calculateByteCount( getContinuationExpectedChars() );
// stringDataOffset = LittleEndianConsts.BYTE_SIZE;
byte[] unicodeStringData = new byte[SSTRecord.STRING_MINIMAL_OVERHEAD
+ calculateByteCount( getContinuationExpectedChars() )];
// write the string length
LittleEndian.putShort( unicodeStringData, 0, (short) getContinuationExpectedChars() );
// write the options flag
unicodeStringData[LittleEndianConsts.SHORT_SIZE] = createOptionByte( wideChar, richText, extendedText );
// copy the bytes/words making up the string; skipping
// past all the overhead of the str_data array
arraycopy( record, LittleEndianConsts.BYTE_SIZE, unicodeStringData,
SSTRecord.STRING_MINIMAL_OVERHEAD,
unicodeStringData.length - SSTRecord.STRING_MINIMAL_OVERHEAD );
// use special constructor to create the final string
UnicodeString string = new UnicodeString( UnicodeString.sid,
(short) unicodeStringData.length, unicodeStringData,
unfinishedString );
Integer integer = new Integer( strings.size() );
addToStringTable( strings, integer, string );
int newOffset = offsetForContinuedRecord( stringRemainderSizeInBytes );
manufactureStrings( record, newOffset, (short) ( record.length - newOffset ) );
}
/**
* Calculates the size of the string in bytes based on the character width
*/
private int stringSizeInBytes()
{
return calculateByteCount( charCount );
}
/**
* Calculates the size of the string in byes. This figure includes all the over
* heads for the string.
*/
private int totalStringSize()
{
return stringSizeInBytes()
+ stringHeaderOverhead()
+ LittleEndianConsts.INT_SIZE * runCount
+ extensionLength;
}
private int stringHeaderOverhead()
{
return SSTRecord.STRING_MINIMAL_OVERHEAD
+ ( richText ? LittleEndianConsts.SHORT_SIZE : 0 )
+ ( extendedText ? LittleEndianConsts.INT_SIZE : 0 );
}
private int offsetForContinuedRecord( int stringRemainderSizeInBytes )
{
return stringRemainderSizeInBytes + LittleEndianConsts.BYTE_SIZE
+ runCount * LittleEndianConsts.INT_SIZE + extensionLength;
}
private byte createOptionByte( boolean wideChar, boolean richText, boolean farEast )
{
return (byte) ( ( wideChar ? 1 : 0 ) + ( farEast ? 4 : 0 ) + ( richText ? 8 : 0 ) );
}
/**
* If the continued record is so long is spans into the next continue then
* simply suck the remaining string data into the existing <code>unfinishedString</code>.
*
* @param record The data from the continuation record.
*/
private void processEntireContinuation( final byte[] record )
{
// create artificial data to create a UnicodeString
int dataLengthInBytes = record.length - LittleEndianConsts.BYTE_SIZE;
byte[] unicodeStringData = new byte[record.length + LittleEndianConsts.SHORT_SIZE];
LittleEndian.putShort( unicodeStringData, (byte) 0, (short) calculateCharCount( dataLengthInBytes ) );
arraycopy( record, 0, unicodeStringData, LittleEndianConsts.SHORT_SIZE, record.length );
UnicodeString ucs = new UnicodeString( UnicodeString.sid, (short) unicodeStringData.length, unicodeStringData );
unfinishedString = unfinishedString + ucs.getString();
setContinuationExpectedChars( getContinuationExpectedChars() - calculateCharCount( dataLengthInBytes ) );
}
private boolean stringSpansContinuation( int continuationSizeInBytes )
{
return calculateByteCount( getContinuationExpectedChars() ) > continuationSizeInBytes;
}
/**
* @return the number of characters we expect in the first
* sub-record in a subsequent continuation record
*/
int getContinuationExpectedChars()
{
return continuationExpectedChars;
}
private void setContinuationExpectedChars( final int count )
{
continuationExpectedChars = count;
}
private int calculateByteCount( final int character_count )
{
return character_count * ( wideChar ? LittleEndianConsts.SHORT_SIZE : LittleEndianConsts.BYTE_SIZE );
}
/**
* Copies an array from the specified source array, beginning at the
* specified position, to the specified position of the destination array.
* A subsequence of array components are copied from the source
* array referenced by <code>src</code> to the destination array
* referenced by <code>dst</code>. The number of components copied is
* equal to the <code>length</code> argument. The components at
* positions <code>srcOffset</code> through
* <code>srcOffset+length-1</code> in the source array are copied into
* positions <code>dstOffset</code> through
* <code>dstOffset+length-1</code>, respectively, of the destination
* array.
* <p>
* If the <code>src</code> and <code>dst</code> arguments refer to the
* same array object, then the copying is performed as if the
* components at positions <code>srcOffset</code> through
* <code>srcOffset+length-1</code> were first copied to a temporary
* array with <code>length</code> components and then the contents of
* the temporary array were copied into positions
* <code>dstOffset</code> through <code>dstOffset+length-1</code> of the
* destination array.
* <p>
* If <code>dst</code> is <code>null</code>, then a
* <code>NullPointerException</code> is thrown.
* <p>
* If <code>src</code> is <code>null</code>, then a
* <code>NullPointerException</code> is thrown and the destination
* array is not modified.
* <p>
* Otherwise, if any of the following is true, an
* <code>ArrayStoreException</code> is thrown and the destination is
* not modified:
* <ul>
* <li>The <code>src</code> argument refers to an object that is not an
* array.
* <li>The <code>dst</code> argument refers to an object that is not an
* array.
* <li>The <code>src</code> argument and <code>dst</code> argument refer to
* arrays whose component types are different primitive types.
* <li>The <code>src</code> argument refers to an array with a primitive
* component type and the <code>dst</code> argument refers to an array
* with a reference component type.
* <li>The <code>src</code> argument refers to an array with a reference
* component type and the <code>dst</code> argument refers to an array
* with a primitive component type.
* </ul>
* <p>
* Otherwise, if any of the following is true, an
* <code>IndexOutOfBoundsException</code> is
* thrown and the destination is not modified:
* <ul>
* <li>The <code>srcOffset</code> argument is negative.
* <li>The <code>dstOffset</code> argument is negative.
* <li>The <code>length</code> argument is negative.
* <li><code>srcOffset+length</code> is greater than
* <code>src.length</code>, the length of the source array.
* <li><code>dstOffset+length</code> is greater than
* <code>dst.length</code>, the length of the destination array.
* </ul>
* <p>
* Otherwise, if any actual component of the source array from
* position <code>srcOffset</code> through
* <code>srcOffset+length-1</code> cannot be converted to the component
* type of the destination array by assignment conversion, an
* <code>ArrayStoreException</code> is thrown. In this case, let
* <b><i>k</i></b> be the smallest nonnegative integer less than
* length such that <code>src[srcOffset+</code><i>k</i><code>]</code>
* cannot be converted to the component type of the destination
* array; when the exception is thrown, source array components from
* positions <code>srcOffset</code> through
* <code>srcOffset+</code><i>k</i><code>-1</code>
* will already have been copied to destination array positions
* <code>dstOffset</code> through
* <code>dstOffset+</code><i>k</I><code>-1</code> and no other
* positions of the destination array will have been modified.
* (Because of the restrictions already itemized, this
* paragraph effectively applies only to the situation where both
* arrays have component types that are reference types.)
*
* @param src the source array.
* @param src_position start position in the source array.
* @param dst the destination array.
* @param dst_position pos start position in the destination data.
* @param length the number of array elements to be copied.
* @exception IndexOutOfBoundsException if copying would cause
* access of data outside array bounds.
* @exception ArrayStoreException if an element in the <code>src</code>
* array could not be stored into the <code>dest</code> array
* because of a type mismatch.
* @exception NullPointerException if either <code>src</code> or
* <code>dst</code> is <code>null</code>.
*/
private void arraycopy( byte[] src, int src_position,
byte[] dst, int dst_position,
int length )
{
System.arraycopy( src, src_position, dst, dst_position, length );
}
/**
* @return the unfinished string
*/
String getUnfinishedString()
{
return unfinishedString;
}
/**
* @return true if current string uses wide characters
*/
boolean isWideChar()
{
return wideChar;
}
}