poi/src/java/org/apache/poi/hssf/record/SSTDeserializer.java

package org.apache.poi.hssf.record;

import org.apache.poi.util.LittleEndian;
import org.apache.poi.util.LittleEndianConsts;
import org.apache.poi.util.BinaryTree;
import org.apache.poi.util.HexDump;

import java.io.IOException;

class SSTDeserializer
{

    private BinaryTree strings;
    /** this is the number of characters we expect in the first sub-record in a subsequent continuation record */
    private int continuationExpectedChars;
    /** this is the string we were working on before hitting the end of the current record. This string is NOT finished. */
    private String unfinishedString;
    /** this is the total length of the current string being handled */
    private int totalLengthBytes;
    /** this is the offset into a string field of the actual string data */
    private int stringDataOffset;
    /** this is true if the string uses wide characters */
    private boolean wideChar;


    public SSTDeserializer(BinaryTree strings)
    {
        this.strings = strings;
        setExpectedChars( 0 );
        unfinishedString = "";
        totalLengthBytes = 0;
        stringDataOffset = 0;
        wideChar = false;
    }

    /**
     * This is the starting point where strings are constructed.  Note that
     * strings may span across multiple continuations. Read the SST record
     * carefully before beginning to hack.
     */
    public void manufactureStrings( final byte[] data, final int index,
                                     short size )
    {
        int offset = index;

        while ( offset < size )
        {
            int remaining = size - offset;

            if ( ( remaining > 0 ) && ( remaining < LittleEndianConsts.SHORT_SIZE ) )
            {
                throw new RecordFormatException( "Cannot get length of the last string in SSTRecord" );
            }
            if ( remaining == LittleEndianConsts.SHORT_SIZE )
            {
                setExpectedChars( LittleEndian.getUShort( data, offset ) );
                unfinishedString = "";
                break;
            }
            short charCount = LittleEndian.getShort( data, offset );

            setupStringParameters( data, offset, charCount );
            if ( remaining < totalLengthBytes )
            {
                setExpectedChars( calculateCharCount( totalLengthBytes - remaining ) );
                charCount -= getExpectedChars();
                totalLengthBytes = remaining;
            }
            else
            {
                setExpectedChars( 0 );
            }
            processString( data, offset, charCount );
            offset += totalLengthBytes;
            if ( getExpectedChars() != 0 )
            {
                break;
            }
        }
    }


    /**
     * Detemines the option types for the string (ie, compressed or uncompressed unicode, rich text string or
     * plain string etc) and calculates the length and offset for the string.
     *
     * @param data
     * @param index
     * @param char_count
     */
    private void setupStringParameters( final byte[] data, final int index,
                                        final int char_count )
    {
        byte optionFlag = data[index + LittleEndianConsts.SHORT_SIZE];

        wideChar = ( optionFlag & 1 ) == 1;
        boolean farEast = ( optionFlag & 4 ) == 4;
        boolean richText = ( optionFlag & 8 ) == 8;

        totalLengthBytes = SSTRecord.STRING_MINIMAL_OVERHEAD + calculateByteCount( char_count );
        stringDataOffset = SSTRecord.STRING_MINIMAL_OVERHEAD;
        if ( richText )
        {
            short run_count = LittleEndian.getShort( data, index + stringDataOffset );

            stringDataOffset += LittleEndianConsts.SHORT_SIZE;
            totalLengthBytes += LittleEndianConsts.SHORT_SIZE + ( LittleEndianConsts.INT_SIZE * run_count );
        }
        if ( farEast )
        {
            int extension_length = LittleEndian.getInt( data, index + stringDataOffset );

            stringDataOffset += LittleEndianConsts.INT_SIZE;
            totalLengthBytes += LittleEndianConsts.INT_SIZE + extension_length;
        }
    }


    private void processString( final byte[] data, final int index,
                                final short char_count )
    {
        byte[] stringDataBuffer = new byte[totalLengthBytes];
        int length = SSTRecord.STRING_MINIMAL_OVERHEAD + calculateByteCount( char_count );
        byte[] bstring = new byte[length];

        System.arraycopy( data, index, stringDataBuffer, 0, stringDataBuffer.length );
        int offset = 0;

        LittleEndian.putShort( bstring, offset, char_count );
        offset += LittleEndianConsts.SHORT_SIZE;
        bstring[offset] = stringDataBuffer[offset];

//        System.out.println( "offset = " + stringDataOffset );
//        System.out.println( "length = " + (bstring.length - STRING_MINIMAL_OVERHEAD) );
//        System.out.println( "src.length = " + str_data.length );
//        try
//        {
//            System.out.println( "----------------------- DUMP -------------------------" );
//            HexDump.dump( stringDataBuffer, (long)stringDataOffset, System.out, 1);
//        }
//        catch ( IOException e )
//        {
//        }
//        catch ( ArrayIndexOutOfBoundsException e )
//        {
//        }
//        catch ( IllegalArgumentException e )
//        {
//        }
        System.arraycopy( stringDataBuffer, stringDataOffset, bstring,
                SSTRecord.STRING_MINIMAL_OVERHEAD,
                bstring.length - SSTRecord.STRING_MINIMAL_OVERHEAD );
        UnicodeString string = new UnicodeString( UnicodeString.sid,
                (short) bstring.length,
                bstring );

        if ( getExpectedChars() != 0 )
        {
            unfinishedString = string.getString();
        }
        else
        {
            Integer integer = new Integer( strings.size() );
            addToStringTable( strings, integer, string );
        }
    }

    /**
     * Okay, we are doing some major cheating here. Because we can't handle rich text strings properly
     * we end up getting duplicate strings.  To get around this I'm doing do things: 1. Converting rich
     * text to normal text and 2. If there's a duplicate I'm adding a space onto the end.  Sneaky perhaps
     * but it gets the job done until we can handle this a little better.
     */
    static public void addToStringTable( BinaryTree strings, Integer integer, UnicodeString string )
    {
        if (string.isRichText())
            string.setOptionFlags( (byte)(string.getOptionFlags() & (~8) ) );

        boolean added = false;
        while (added == false)
        {
            try
            {
                strings.put( integer, string );
                added = true;
            }
            catch( Exception ignore )
            {
                string.setString( string.getString() + " " );
            }
        }
    }


    private int calculateCharCount( final int byte_count )
    {
        return byte_count / ( wideChar ? LittleEndianConsts.SHORT_SIZE
                : LittleEndianConsts.BYTE_SIZE );
    }

    /**
     * Process a Continue record. A Continue record for an SST record
     * contains the same kind of data that the SST record contains,
     * with the following exceptions:
     * <P>
     * <OL>
     * <LI>The string counts at the beginning of the SST record are
     *     not in the Continue record
     * <LI>The first string in the Continue record might NOT begin
     *     with a size. If the last string in the previous record is
     *     continued in this record, the size is determined by that
     *     last string in the previous record; the first string will
     *     begin with a flag byte, followed by the remaining bytes (or
     *     words) of the last string from the previous
     *     record. Otherwise, the first string in the record will
     *     begin with a string length
     * </OL>
     *
     * @param record the Continue record's byte data
     */

    public void processContinueRecord( final byte[] record )
    {
        if ( getExpectedChars() == 0 )
        {
            unfinishedString = "";
            totalLengthBytes = 0;
            stringDataOffset = 0;
            wideChar = false;
            manufactureStrings( record, 0, (short) record.length );
        }
        else
        {
            int data_length = record.length - LittleEndianConsts.BYTE_SIZE;

            if ( calculateByteCount( getExpectedChars() ) > data_length )
            {

                // create artificial data to create a UnicodeString
                byte[] input =
                        new byte[record.length + LittleEndianConsts.SHORT_SIZE];
                short size = (short) ( ( ( record[0] & 1 ) == 1 )
                        ? ( data_length / LittleEndianConsts.SHORT_SIZE )
                        : ( data_length / LittleEndianConsts.BYTE_SIZE ) );

                LittleEndian.putShort( input, (byte) 0, size );
                System.arraycopy( record, 0, input, LittleEndianConsts.SHORT_SIZE, record.length );
                UnicodeString ucs = new UnicodeString( UnicodeString.sid, (short) input.length, input );

                unfinishedString = unfinishedString + ucs.getString();
                setExpectedChars( getExpectedChars() - size );
            }
            else
            {
                setupStringParameters( record, -LittleEndianConsts.SHORT_SIZE,
                        getExpectedChars() );
                byte[] str_data = new byte[totalLengthBytes];
                int length = SSTRecord.STRING_MINIMAL_OVERHEAD
                        + ( calculateByteCount( getExpectedChars() ) );
                byte[] bstring = new byte[length];

                // Copy data from the record into the string
                // buffer. Copy skips the length of a short in the
                // string buffer, to leave room for the string length.
                System.arraycopy( record, 0, str_data,
                        LittleEndianConsts.SHORT_SIZE,
                        str_data.length
                        - LittleEndianConsts.SHORT_SIZE );

                // write the string length
                LittleEndian.putShort( bstring, 0,
                        (short) getExpectedChars() );

                // write the options flag
                bstring[LittleEndianConsts.SHORT_SIZE] =
                        str_data[LittleEndianConsts.SHORT_SIZE];

                // copy the bytes/words making up the string; skipping
                // past all the overhead of the str_data array
                System.arraycopy( str_data, stringDataOffset, bstring,
                        SSTRecord.STRING_MINIMAL_OVERHEAD,
                        bstring.length - SSTRecord.STRING_MINIMAL_OVERHEAD );

                // use special constructor to create the final string
                UnicodeString string =
                        new UnicodeString( UnicodeString.sid,
                                (short) bstring.length, bstring,
                                unfinishedString );
                Integer integer = new Integer( strings.size() );

//                field_3_strings.put( integer, string );
                addToStringTable( strings, integer, string );
                manufactureStrings( record, totalLengthBytes - LittleEndianConsts.SHORT_SIZE, (short) record.length );
            }
        }
    }

    /**
     * @return the number of characters we expect in the first
     *         sub-record in a subsequent continuation record
     */

    int getExpectedChars()
    {
        return continuationExpectedChars;
    }

    private void setExpectedChars( final int count )
    {
        continuationExpectedChars = count;
    }

    private int calculateByteCount( final int character_count )
    {
        return character_count * ( wideChar ? LittleEndianConsts.SHORT_SIZE : LittleEndianConsts.BYTE_SIZE );
    }


    /**
     * @return the unfinished string
     */

    String getUnfinishedString()
    {
        return unfinishedString;
    }

    /**
     * @return the total length of the current string
     */

    int getTotalLength()
    {
        return totalLengthBytes;
    }

    /**
     * @return offset into current string data
     */

    int getStringDataOffset()
    {
        return stringDataOffset;
    }

    /**
     * @return true if current string uses wide characters
     */

    boolean isWideChar()
    {
        return wideChar;
    }


}