poi/src/java/org/apache/poi/hssf/record/SSTDeserializer.java

/* ====================================================================
 * The Apache Software License, Version 1.1
 *
 * Copyright (c) 2002 The Apache Software Foundation.  All rights
 * reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 *
 * 3. The end-user documentation included with the redistribution,
 *    if any, must include the following acknowledgment:
 *       "This product includes software developed by the
 *        Apache Software Foundation (http://www.apache.org/)."
 *    Alternately, this acknowledgment may appear in the software itself,
 *    if and wherever such third-party acknowledgments normally appear.
 *
 * 4. The names "Apache" and "Apache Software Foundation" and
 *    "Apache POI" must not be used to endorse or promote products
 *    derived from this software without prior written permission. For
 *    written permission, please contact apache@apache.org.
 *
 * 5. Products derived from this software may not be called "Apache",
 *    "Apache POI", nor may "Apache" appear in their name, without
 *    prior written permission of the Apache Software Foundation.
 *
 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
 * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED.  IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
 * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
 * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 * ====================================================================
 *
 * This software consists of voluntary contributions made by many
 * individuals on behalf of the Apache Software Foundation.  For more
 * information on the Apache Software Foundation, please see
 * <http://www.apache.org/>.
 */

package org.apache.poi.hssf.record;

import org.apache.poi.util.BinaryTree;
import org.apache.poi.util.LittleEndian;
import org.apache.poi.util.LittleEndianConsts;

/**
 * Handles the task of deserializing a SST string.  The two main entry points are
 *
 * @author Glen Stampoultzis (glens at apache.org)
 */
class SSTDeserializer
{

    private BinaryTree strings;
    /** this is the number of characters we expect in the first sub-record in a subsequent continuation record */
    private int continuationExpectedChars;
    /** this is the string we were working on before hitting the end of the current record. This string is NOT finished. */
    private String unfinishedString;
    /** this is true if the string uses wide characters */
    private boolean wideChar;
    /** this is true if the string is a rich text string */
    private boolean richText;
    /** this is true if the string is a far east string or some other wierd string */
    private boolean extendedText;
    /** Number of formatting runs in this rich text field */
    private short runCount;
    /** Number of characters in current string */
    private int charCount;
    private int extensionLength;


    public SSTDeserializer( BinaryTree strings )
    {
        this.strings = strings;
        initVars();
    }

    private void initVars()
    {
        runCount = 0;
        continuationExpectedChars = 0;
        unfinishedString = "";
//        bytesInCurrentSegment = 0;
//        stringDataOffset = 0;
        wideChar = false;
        richText = false;
        extendedText = false;
    }

    /**
     * This is the starting point where strings are constructed.  Note that
     * strings may span across multiple continuations. Read the SST record
     * carefully before beginning to hack.
     */
    public void manufactureStrings( final byte[] data, final int initialOffset, short dataSize )
    {
        initVars();

        int offset = initialOffset;
        while ( ( offset - initialOffset ) < dataSize )
        {
            int remaining = dataSize - offset + initialOffset;

            if ( ( remaining > 0 ) && ( remaining < LittleEndianConsts.SHORT_SIZE ) )
            {
                throw new RecordFormatException( "Cannot get length of the last string in SSTRecord" );
            }
            if ( remaining == LittleEndianConsts.SHORT_SIZE )
            {
                setContinuationExpectedChars( LittleEndian.getUShort( data, offset ) );
                unfinishedString = "";
                break;
            }
            charCount = LittleEndian.getUShort( data, offset );
            readStringHeader( data, offset );
            boolean stringContinuesOverContinuation = remaining < totalStringSize();
            if ( stringContinuesOverContinuation )
            {
                int remainingBytes = ( initialOffset + dataSize ) - offset - stringHeaderOverhead();
                setContinuationExpectedChars( charCount - calculateCharCount( remainingBytes ) );
                charCount -= getContinuationExpectedChars();
            }
            else
            {
                setContinuationExpectedChars( 0 );
            }
            processString( data, offset, charCount );
            offset += totalStringSize();
            if ( getContinuationExpectedChars() != 0 )
            {
                break;
            }
        }
    }

//    private void dump( final byte[] data, int offset, int length )
//    {
//        try
//        {
//            System.out.println( "------------------- SST DUMP -------------------------" );
//            HexDump.dump( (byte[]) data, offset, System.out, offset, length );
//        }
//        catch ( IOException e )
//        {
//        }
//        catch ( ArrayIndexOutOfBoundsException e )
//        {
//        }
//        catch ( IllegalArgumentException e )
//        {
//        }
//    }

    /**
     * Detemines the option types for the string (ie, compressed or uncompressed unicode, rich text string or
     * plain string etc) and calculates the length and offset for the string.
     *
     */
    private void readStringHeader( final byte[] data, final int index )
    {

        byte optionFlag = data[index + LittleEndianConsts.SHORT_SIZE];

        wideChar = ( optionFlag & 1 ) == 1;
        extendedText = ( optionFlag & 4 ) == 4;
        richText = ( optionFlag & 8 ) == 8;
        runCount = 0;
        if ( richText )
        {
            runCount = LittleEndian.getShort( data, index + SSTRecord.STRING_MINIMAL_OVERHEAD );
        }
        extensionLength = 0;
        if ( extendedText )
        {
            extensionLength = LittleEndian.getInt( data, index + SSTRecord.STRING_MINIMAL_OVERHEAD
                    + (richText ? LittleEndianConsts.SHORT_SIZE : 0) );
        }

    }


    /**
     * Reads a string or the first part of a string.
     *
     * @param characters the number of characters to write.
     *
     * @return the number of bytes written.
     */
    private int processString( final byte[] data, final int dataIndex, final int characters )
    {

        // length is the length we store it as.  not the length that is read.
        int length = SSTRecord.STRING_MINIMAL_OVERHEAD + calculateByteCount( characters );
        byte[] unicodeStringBuffer = new byte[length];

        int offset = 0;

        // Set the length in characters
        LittleEndian.putUShort( unicodeStringBuffer, offset, characters );
        offset += LittleEndianConsts.SHORT_SIZE;
        // Set the option flags
        unicodeStringBuffer[offset] = data[dataIndex + offset];
        // Copy in the string data
        int bytesRead = unicodeStringBuffer.length - SSTRecord.STRING_MINIMAL_OVERHEAD;
        arraycopy( data, dataIndex + stringHeaderOverhead(), unicodeStringBuffer, SSTRecord.STRING_MINIMAL_OVERHEAD, bytesRead );
        // Create the unicode string
        UnicodeString string = new UnicodeString( UnicodeString.sid,
                (short) unicodeStringBuffer.length,
                unicodeStringBuffer );

        if ( isStringFinished() )
        {
            Integer integer = new Integer( strings.size() );
            addToStringTable( strings, integer, string );
        }
        else
        {
            unfinishedString = string.getString();
        }

        return bytesRead;
    }

    private boolean isStringFinished()
    {
        return getContinuationExpectedChars() == 0;
    }

    /**
     * Okay, we are doing some major cheating here. Because we can't handle rich text strings properly
     * we end up getting duplicate strings.  To get around this I'm doing two things: 1. Converting rich
     * text to normal text and 2. If there's a duplicate I'm adding a space onto the end.  Sneaky perhaps
     * but it gets the job done until we can handle this a little better.
     */
    static public void addToStringTable( BinaryTree strings, Integer integer, UnicodeString string )
    {

        if ( string.isRichText() )
            string.setOptionFlags( (byte) ( string.getOptionFlags() & ( ~8 ) ) );
        if ( string.isExtendedText() )
            string.setOptionFlags( (byte) ( string.getOptionFlags() & ( ~4 ) ) );

        boolean added = false;
        while ( added == false )
        {
            try
            {
                strings.put( integer, string );
                added = true;
            }
            catch ( Exception ignore )
            {
                string.setString( string.getString() + " " );
            }
        }

    }


    private int calculateCharCount( final int byte_count )
    {
        return byte_count / ( wideChar ? LittleEndianConsts.SHORT_SIZE : LittleEndianConsts.BYTE_SIZE );
    }

    /**
     * Process a Continue record. A Continue record for an SST record
     * contains the same kind of data that the SST record contains,
     * with the following exceptions:
     * <P>
     * <OL>
     * <LI>The string counts at the beginning of the SST record are
     *     not in the Continue record
     * <LI>The first string in the Continue record might NOT begin
     *     with a size. If the last string in the previous record is
     *     continued in this record, the size is determined by that
     *     last string in the previous record; the first string will
     *     begin with a flag byte, followed by the remaining bytes (or
     *     words) of the last string from the previous
     *     record. Otherwise, the first string in the record will
     *     begin with a string length
     * </OL>
     *
     * @param record the Continue record's byte data
     */
    public void processContinueRecord( final byte[] record )
    {
        if ( isStringFinished() )
        {
            initVars();
            manufactureStrings( record, 0, (short) record.length );
        }
        else
        {
            // reset the wide bit because that can change across a continuation. the fact that it's
            // actually rich text doesn't change across continuations even though the rich text
            // may on longer be set in the "new" option flag.  confusing huh?
            wideChar = ( record[0] & 1 ) == 1;

            if ( stringSpansContinuation( record.length - LittleEndianConsts.BYTE_SIZE ) )
            {
                processEntireContinuation( record );
            }
            else
            {
                readStringRemainder( record );
            }
        }

    }

    /**
     * Reads the remainder string and any subsequent strings from the continuation record.
     *
     * @param record  The entire continuation record data.
     */
    private void readStringRemainder( final byte[] record )
    {
        int stringRemainderSizeInBytes = calculateByteCount( getContinuationExpectedChars() );
//        stringDataOffset = LittleEndianConsts.BYTE_SIZE;
        byte[] unicodeStringData = new byte[SSTRecord.STRING_MINIMAL_OVERHEAD
                + calculateByteCount( getContinuationExpectedChars() )];

        // write the string length
        LittleEndian.putShort( unicodeStringData, 0, (short) getContinuationExpectedChars() );

        // write the options flag
        unicodeStringData[LittleEndianConsts.SHORT_SIZE] = createOptionByte( wideChar, richText, extendedText );

        // copy the bytes/words making up the string; skipping
        // past all the overhead of the str_data array
        arraycopy( record, LittleEndianConsts.BYTE_SIZE, unicodeStringData,
                SSTRecord.STRING_MINIMAL_OVERHEAD,
                unicodeStringData.length - SSTRecord.STRING_MINIMAL_OVERHEAD );

        // use special constructor to create the final string
        UnicodeString string = new UnicodeString( UnicodeString.sid,
                (short) unicodeStringData.length, unicodeStringData,
                unfinishedString );
        Integer integer = new Integer( strings.size() );

        addToStringTable( strings, integer, string );

        int newOffset = offsetForContinuedRecord( stringRemainderSizeInBytes );
        manufactureStrings( record, newOffset, (short) ( record.length - newOffset ) );
    }

    /**
     * Calculates the size of the string in bytes based on the character width
     */
    private int stringSizeInBytes()
    {
        return calculateByteCount( charCount );
    }

    /**
     * Calculates the size of the string in byes.  This figure includes all the over
     * heads for the string.
     */
    private int totalStringSize()
    {
        return stringSizeInBytes()
                + stringHeaderOverhead()
                + LittleEndianConsts.INT_SIZE * runCount
                + extensionLength;
    }

    private int stringHeaderOverhead()
    {
        return SSTRecord.STRING_MINIMAL_OVERHEAD
                + ( richText ? LittleEndianConsts.SHORT_SIZE : 0 )
                + ( extendedText ? LittleEndianConsts.INT_SIZE : 0 );
    }

    private int offsetForContinuedRecord( int stringRemainderSizeInBytes )
    {
        return stringRemainderSizeInBytes + LittleEndianConsts.BYTE_SIZE
                + runCount * LittleEndianConsts.INT_SIZE + extensionLength;
    }

    private byte createOptionByte( boolean wideChar, boolean richText, boolean farEast )
    {
        return (byte) ( ( wideChar ? 1 : 0 ) + ( farEast ? 4 : 0 ) + ( richText ? 8 : 0 ) );
    }

    /**
     * If the continued record is so long is spans into the next continue then
     * simply suck the remaining string data into the existing <code>unfinishedString</code>.
     *
     * @param record    The data from the continuation record.
     */
    private void processEntireContinuation( final byte[] record )
    {
        // create artificial data to create a UnicodeString
        int dataLengthInBytes = record.length - LittleEndianConsts.BYTE_SIZE;
        byte[] unicodeStringData = new byte[record.length + LittleEndianConsts.SHORT_SIZE];

        LittleEndian.putShort( unicodeStringData, (byte) 0, (short) calculateCharCount( dataLengthInBytes ) );
        arraycopy( record, 0, unicodeStringData, LittleEndianConsts.SHORT_SIZE, record.length );
        UnicodeString ucs = new UnicodeString( UnicodeString.sid, (short) unicodeStringData.length, unicodeStringData );

        unfinishedString = unfinishedString + ucs.getString();
        setContinuationExpectedChars( getContinuationExpectedChars() - calculateCharCount( dataLengthInBytes ) );
    }

    private boolean stringSpansContinuation( int continuationSizeInBytes )
    {
        return calculateByteCount( getContinuationExpectedChars() ) > continuationSizeInBytes;
    }

    /**
     * @return the number of characters we expect in the first
     *         sub-record in a subsequent continuation record
     */

    int getContinuationExpectedChars()
    {
        return continuationExpectedChars;
    }

    private void setContinuationExpectedChars( final int count )
    {
        continuationExpectedChars = count;
    }

    private int calculateByteCount( final int character_count )
    {
        return character_count * ( wideChar ? LittleEndianConsts.SHORT_SIZE : LittleEndianConsts.BYTE_SIZE );
    }


    /**
     * Copies an array from the specified source array, beginning at the
     * specified position, to the specified position of the destination array.
     * A subsequence of array components are copied from the source
     * array referenced by <code>src</code> to the destination array
     * referenced by <code>dst</code>. The number of components copied is
     * equal to the <code>length</code> argument. The components at
     * positions <code>srcOffset</code> through
     * <code>srcOffset+length-1</code> in the source array are copied into
     * positions <code>dstOffset</code> through
     * <code>dstOffset+length-1</code>, respectively, of the destination
     * array.
     * <p>
     * If the <code>src</code> and <code>dst</code> arguments refer to the
     * same array object, then the copying is performed as if the
     * components at positions <code>srcOffset</code> through
     * <code>srcOffset+length-1</code> were first copied to a temporary
     * array with <code>length</code> components and then the contents of
     * the temporary array were copied into positions
     * <code>dstOffset</code> through <code>dstOffset+length-1</code> of the
     * destination array.
     * <p>
     * If <code>dst</code> is <code>null</code>, then a
     * <code>NullPointerException</code> is thrown.
     * <p>
     * If <code>src</code> is <code>null</code>, then a
     * <code>NullPointerException</code> is thrown and the destination
     * array is not modified.
     * <p>
     * Otherwise, if any of the following is true, an
     * <code>ArrayStoreException</code> is thrown and the destination is
     * not modified:
     * <ul>
     * <li>The <code>src</code> argument refers to an object that is not an
     *     array.
     * <li>The <code>dst</code> argument refers to an object that is not an
     *     array.
     * <li>The <code>src</code> argument and <code>dst</code> argument refer to
     *     arrays whose component types are different primitive types.
     * <li>The <code>src</code> argument refers to an array with a primitive
     *     component type and the <code>dst</code> argument refers to an array
     *     with a reference component type.
     * <li>The <code>src</code> argument refers to an array with a reference
     *     component type and the <code>dst</code> argument refers to an array
     *     with a primitive component type.
     * </ul>
     * <p>
     * Otherwise, if any of the following is true, an
     * <code>IndexOutOfBoundsException</code> is
     * thrown and the destination is not modified:
     * <ul>
     * <li>The <code>srcOffset</code> argument is negative.
     * <li>The <code>dstOffset</code> argument is negative.
     * <li>The <code>length</code> argument is negative.
     * <li><code>srcOffset+length</code> is greater than
     *     <code>src.length</code>, the length of the source array.
     * <li><code>dstOffset+length</code> is greater than
     *     <code>dst.length</code>, the length of the destination array.
     * </ul>
     * <p>
     * Otherwise, if any actual component of the source array from
     * position <code>srcOffset</code> through
     * <code>srcOffset+length-1</code> cannot be converted to the component
     * type of the destination array by assignment conversion, an
     * <code>ArrayStoreException</code> is thrown. In this case, let
     * <b><i>k</i></b> be the smallest nonnegative integer less than
     * length such that <code>src[srcOffset+</code><i>k</i><code>]</code>
     * cannot be converted to the component type of the destination
     * array; when the exception is thrown, source array components from
     * positions <code>srcOffset</code> through
     * <code>srcOffset+</code><i>k</i><code>-1</code>
     * will already have been copied to destination array positions
     * <code>dstOffset</code> through
     * <code>dstOffset+</code><i>k</I><code>-1</code> and no other
     * positions of the destination array will have been modified.
     * (Because of the restrictions already itemized, this
     * paragraph effectively applies only to the situation where both
     * arrays have component types that are reference types.)
     *
     * @param      src          the source array.
     * @param      src_position start position in the source array.
     * @param      dst          the destination array.
     * @param      dst_position pos   start position in the destination data.
     * @param      length       the number of array elements to be copied.
     * @exception  IndexOutOfBoundsException  if copying would cause
     *               access of data outside array bounds.
     * @exception  ArrayStoreException  if an element in the <code>src</code>
     *               array could not be stored into the <code>dest</code> array
     *               because of a type mismatch.
     * @exception  NullPointerException if either <code>src</code> or
     *               <code>dst</code> is <code>null</code>.
     */
    private void arraycopy( byte[] src, int src_position,
                            byte[] dst, int dst_position,
                            int length )
    {
        System.arraycopy( src, src_position, dst, dst_position, length );
    }

    /**
     * @return the unfinished string
     */
    String getUnfinishedString()
    {
        return unfinishedString;
    }

    /**
     * @return true if current string uses wide characters
     */
    boolean isWideChar()
    {
        return wideChar;
    }


}