/* ====================================================================
 * The Apache Software License, Version 1.1
 *
 * Copyright (c) 2002 The Apache Software Foundation.  All rights
 * reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 *
 * 3. The end-user documentation included with the redistribution,
 *    if any, must include the following acknowledgment:
 *       "This product includes software developed by the
 *        Apache Software Foundation (http://www.apache.org/)."
 *    Alternately, this acknowledgment may appear in the software itself,
 *    if and wherever such third-party acknowledgments normally appear.
 *
 * 4. The names "Apache" and "Apache Software Foundation" and
 *    "Apache POI" must not be used to endorse or promote products
 *    derived from this software without prior written permission. For
 *    written permission, please contact apache@apache.org.
 *
 * 5. Products derived from this software may not be called "Apache",
 *    "Apache POI", nor may "Apache" appear in their name, without
 *    prior written permission of the Apache Software Foundation.
 *
 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
 * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED.  IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
 * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
 * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 * ====================================================================
 *
 * This software consists of voluntary contributions made by many
 * individuals on behalf of the Apache Software Foundation.  For more
 * information on the Apache Software Foundation, please see
 * <http://www.apache.org/>.
 */

package org.apache.poi.hssf.record;

import org.apache.poi.util.BinaryTree;
import org.apache.poi.util.LittleEndian;
import org.apache.poi.util.LittleEndianConsts;

import java.util.Iterator;
import java.util.List;

/**
 * Title:        Static String Table Record
 * <P>
 * Description:  This holds all the strings for LabelSSTRecords.
 * <P>
 * REFERENCE:    PG 389 Microsoft Excel 97 Developer's Kit (ISBN:
 *               1-57231-498-2)
 * <P>
 * @author Andrew C. Oliver (acoliver at apache dot org)
 * @author Marc Johnson (mjohnson at apache dot org)
 * @author Glen Stampoultzis (glens at apache.org)
 * @version 2.0-pre
 * @see org.apache.poi.hssf.record.LabelSSTRecord
 * @see org.apache.poi.hssf.record.ContinueRecord
 */

public class SSTRecord
    extends Record
{

    /** how big can an SST record be? As big as any record can be: 8228 bytes */
    static final int MAX_RECORD_SIZE = 8228;

    /** standard record overhead: two shorts (record id plus data space size) */
    static final int STD_RECORD_OVERHEAD =
        2 * LittleEndianConsts.SHORT_SIZE;

    /** SST overhead: the standard record overhead, plus the number of strings and the number of unique strings -- two ints */
    static final int SST_RECORD_OVERHEAD =
        ( STD_RECORD_OVERHEAD + ( 2 * LittleEndianConsts.INT_SIZE ) );

    /** how much data can we stuff into an SST record? That would be MAX_RECORD_SIZE minus the standard SST record overhead */
    static final int MAX_DATA_SPACE =
        MAX_RECORD_SIZE - SST_RECORD_OVERHEAD;

    /** overhead for each string includes the string's character count (a short) and the flag describing its characteristics (a byte) */
    static final int STRING_MINIMAL_OVERHEAD =
        LittleEndianConsts.SHORT_SIZE + LittleEndianConsts.BYTE_SIZE;

    public static final short sid = 0xfc;

    /** union of strings in the SST and EXTSST */
    private int field_1_num_strings;

    /** according to docs ONLY SST */
    private int field_2_num_unique_strings;

    /** the string table itself: Integer index -> UnicodeString */
    private BinaryTree field_3_strings;

    /** this is the number of characters we expect in the first sub-record in a subsequent continuation record */
    private int __expected_chars;

    /** this is the string we were working on before hitting the end of the current record. This string is NOT finished. */
    private String _unfinished_string;

    /** this is the total length of the current string being handled */
    private int _total_length_bytes;

    /** this is the offset into a string field of the actual string data */
    private int _string_data_offset;

    /** this is true if the string uses wide characters */
    private boolean _wide_char;

    /** Record lengths for initial SST record and all continue records */
    private List _record_lengths = null;

    /**
     * default constructor
     */

    public SSTRecord()
    {
        field_1_num_strings = 0;
        field_2_num_unique_strings = 0;
        field_3_strings = new BinaryTree();
        setExpectedChars( 0 );
        _unfinished_string = "";
        _total_length_bytes = 0;
        _string_data_offset = 0;
        _wide_char = false;
    }

    /**
     * Constructs an SST record and sets its fields appropriately.
     *
     * @param id must be 0xfc or an exception will be thrown upon
     *           validation
     * @param size the size of the data area of the record
     * @param data of the record (should not contain sid/len)
     */

    public SSTRecord( final short id, final short size, final byte[] data )
    {
        super( id, size, data );
    }

    /**
     * Constructs an SST record and sets its fields appropriately.
     *
     * @param id must be 0xfc or an exception will be thrown upon
     *           validation
     * @param size the size of the data area of the record
     * @param data of the record (should not contain sid/len)
     * @param offset of the record
     */

    public SSTRecord( final short id, final short size, final byte[] data,
                      int offset )
    {
        super( id, size, data, offset );
    }

    /**
     * Add a string. Determines whether 8-bit encoding can be used, or
     * whether 16-bit encoding must be used.
     * <P>
     * THIS IS THE PREFERRED METHOD OF ADDING A STRING. IF YOU USE THE
     * OTHER <code>addString</code> METHOD AND FORCE 8-BIT ENCODING ON
     * A STRING THAT SHOULD USE 16-BIT ENCODING, YOU WILL CORRUPT THE
     * STRING; IF YOU USE THAT METHOD AND FORCE 16-BIT ENCODING, YOU
     * ARE WASTING SPACE WHEN THE WORKBOOK IS WRITTEN OUT.
     *
     * @param string string to be added; <code>null</code> is treated
     *               as the empty string
     *
     * @return the index of that string in the table
     */

    public int addString( final String string )
    {
        if ( string == null )
        {
            return addString( "", false );
        }

        // scan for characters greater than 255 ... if any are
        // present, we have to use 16-bit encoding. Otherwise, we
        // can use 8-bit encoding
        boolean useUTF16 = false;
        int strlen = string.length();

        for ( int j = 0; j < strlen; j++ )
        {
            if ( string.charAt( j ) > 255 )
            {
                useUTF16 = true;
                break;
            }
        }
        return addString( string, useUTF16 );
    }

    /**
     * Add a string and assert the encoding (8-bit or 16-bit) to be
     * used.
     * <P>
     * USE THIS METHOD AT YOUR OWN RISK. IF YOU FORCE 8-BIT ENCODING,
     * YOU MAY CORRUPT YOUR STRING. IF YOU FORCE 16-BIT ENCODING AND
     * IT ISN'T NECESSARY, YOU WILL WASTE SPACE WHEN THIS RECORD IS
     * WRITTEN OUT.
     * <P>
     * The total string count is always incremented; the unique string
     * count is incremented only when the string is not already in the
     * table.
     *
     * @param string string to be added; <code>null</code> is treated
     *               as the empty string
     * @param useUTF16 if true, forces 16-bit encoding. If false,
     *                 forces 8-bit encoding
     *
     * @return the index of that string in the table
     */

    public int addString( final String string, final boolean useUTF16 )
    {
        field_1_num_strings++;
        String str = ( string == null ) ? ""
                                        : string;
        int rval;
        UnicodeString ucs = new UnicodeString();

        ucs.setString( str );
        ucs.setCharCount( (short) str.length() );
        ucs.setOptionFlags( (byte) ( useUTF16 ? 1
                                              : 0 ) );
        Integer integer = (Integer) field_3_strings.getKeyForValue( ucs );

        if ( integer != null )
        {
            rval = integer.intValue();
        }
        else
        {

            // This is a new string -- we didn't see it among the
            // strings we've already collected
            rval = field_3_strings.size();
            field_2_num_unique_strings++;
            integer = new Integer( rval );
            addToStringTable( integer, ucs );
        }
        return rval;
    }

    /**
     * @return number of strings
     */

    public int getNumStrings()
    {
        return field_1_num_strings;
    }

    /**
     * @return number of unique strings
     */

    public int getNumUniqueStrings()
    {
        return field_2_num_unique_strings;
    }

    /**
     * USE THIS METHOD AT YOUR OWN PERIL: THE <code>addString</code>
     * METHODS MANIPULATE THE NUMBER OF STRINGS AS A SIDE EFFECT; YOUR
     * ATTEMPTS AT MANIPULATING THE STRING COUNT IS LIKELY TO BE VERY
     * WRONG AND WILL RESULT IN BAD BEHAVIOR WHEN THIS RECORD IS
     * WRITTEN OUT AND ANOTHER PROCESS ATTEMPTS TO READ THE RECORD
     *
     * @param count  number of strings
     *
     */

    public void setNumStrings( final int count )
    {
        field_1_num_strings = count;
    }

    /**
     * USE THIS METHOD AT YOUR OWN PERIL: THE <code>addString</code>
     * METHODS MANIPULATE THE NUMBER OF UNIQUE STRINGS AS A SIDE
     * EFFECT; YOUR ATTEMPTS AT MANIPULATING THE UNIQUE STRING COUNT
     * IS LIKELY TO BE VERY WRONG AND WILL RESULT IN BAD BEHAVIOR WHEN
     * THIS RECORD IS WRITTEN OUT AND ANOTHER PROCESS ATTEMPTS TO READ
     * THE RECORD
     *
     * @param count  number of unique strings
     */

    public void setNumUniqueStrings( final int count )
    {
        field_2_num_unique_strings = count;
    }

    /**
     * Sets the number of unique strings. Despite its name this is a
     * setter; it is retained only so existing callers keep working.
     *
     * @param count  number of unique strings
     *
     * @deprecated misnamed; use {@link #setNumUniqueStrings(int)}
     */

    public void getNumUniqueStrings( final int count )
    {
        setNumUniqueStrings( count );
    }

    /**
     * Get a particular string by its index
     *
     * @param id index into the array of strings
     *
     * @return the desired string
     */

    public String getString( final int id )
    {
        return ( (UnicodeString) field_3_strings.get( new Integer( id ) ) )
            .getString();
    }

    /**
     * Tell whether a particular string is flagged as 16-bit.
     *
     * @param id index into the array of strings
     *
     * @return true if bit 0 of the string's option flags is set
     */

    public boolean isString16bit( final int id )
    {
        UnicodeString unicodeString =
            ( (UnicodeString) field_3_strings.get( new Integer( id ) ) );

        return ( ( unicodeString.getOptionFlags() & 0x01 ) == 1 );
    }

    /**
     * Return a debugging string representation. Note that the two
     * counts are printed in hexadecimal.
     *
     * @return string representation
     */

    public String toString()
    {
        StringBuffer buffer = new StringBuffer();

        buffer.append( "[SST]\n" );
        buffer.append( " .numstrings = " )
            .append( Integer.toHexString( getNumStrings() ) ).append( "\n" );
        buffer.append( " .uniquestrings = " )
            .append( Integer.toHexString( getNumUniqueStrings() ) )
            .append( "\n" );
        for ( int k = 0; k < field_3_strings.size(); k++ )
        {
            buffer.append( " .string_" + k + " = " )
                .append( ( (UnicodeString) field_3_strings
                .get( new Integer( k ) ) ).toString() ).append( "\n" );
        }
        buffer.append( "[/SST]\n" );
        return buffer.toString();
    }

    /**
     * Process a Continue record. A Continue record for an SST record
     * contains the same kind of data that the SST record contains,
     * with the following exceptions:
     * <P>
     * <UL>
     * <LI>The string counts at the beginning of the SST record are
     *     not in the Continue record
     * <LI>The first string in the Continue record might NOT begin
     *     with a size. If the last string in the previous record is
     *     continued in this record, the size is determined by that
     *     last string in the previous record; the first string will
     *     begin with a flag byte, followed by the remaining bytes (or
     *     words) of the last string from the previous
     *     record. Otherwise, the first string in the record will
     *     begin with a string length
     * </UL>
     *
     * @param record the Continue record's byte data
     */

    public void processContinueRecord( final byte[] record )
    {
        if ( getExpectedChars() == 0 )
        {

            // nothing is pending from the previous record: this
            // record starts with a complete string header
            _unfinished_string = "";
            _total_length_bytes = 0;
            _string_data_offset = 0;
            _wide_char = false;
            manufactureStrings( record, 0, (short) record.length );
        }
        else
        {

            // the record starts with a flag byte followed by the rest
            // (or part of the rest) of the pending string
            int data_length = record.length - LittleEndianConsts.BYTE_SIZE;

            // NOTE(review): this comparison uses the wide-char flag
            // remembered from the previous record, while 'size' below
            // is derived from this record's own flag byte -- confirm
            // the two always agree.
            if ( calculateByteCount( getExpectedChars() ) > data_length )
            {

                // the pending string fills this whole record and
                // continues into the next one

                // create artificial data to create a UnicodeString
                byte[] input =
                    new byte[record.length + LittleEndianConsts.SHORT_SIZE];
                short size = (short) ( ( ( record[0] & 1 ) == 1 )
                                       ? ( data_length
                                           / LittleEndianConsts.SHORT_SIZE )
                                       : ( data_length
                                           / LittleEndianConsts.BYTE_SIZE ) );

                LittleEndian.putShort( input, (byte) 0, size );
                System.arraycopy( record, 0, input,
                                  LittleEndianConsts.SHORT_SIZE,
                                  record.length );
                UnicodeString ucs = new UnicodeString( UnicodeString.sid,
                                                       (short) input.length,
                                                       input );

                _unfinished_string = _unfinished_string + ucs.getString();
                setExpectedChars( getExpectedChars() - size );
            }
            else
            {

                // the pending string ends inside this record. The
                // negative index makes setupStringParameters read the
                // flag byte at record[0], as if a (missing) length
                // short preceded it.
                setupStringParameters( record,
                                       -LittleEndianConsts.SHORT_SIZE,
                                       getExpectedChars() );
                byte[] str_data = new byte[_total_length_bytes];
                int length = STRING_MINIMAL_OVERHEAD
                             + ( calculateByteCount( getExpectedChars() ) );
                byte[] bstring = new byte[length];

                // Copy data from the record into the string
                // buffer. Copy skips the length of a short in the
                // string buffer, to leave room for the string length.
                System.arraycopy( record, 0, str_data,
                                  LittleEndianConsts.SHORT_SIZE,
                                  str_data.length
                                  - LittleEndianConsts.SHORT_SIZE );

                // write the string length
                LittleEndian.putShort( bstring, 0,
                                       (short) getExpectedChars() );

                // write the options flag
                bstring[LittleEndianConsts.SHORT_SIZE] =
                    str_data[LittleEndianConsts.SHORT_SIZE];

                // copy the bytes/words making up the string; skipping
                // past all the overhead of the str_data array
                System.arraycopy( str_data, _string_data_offset, bstring,
                                  STRING_MINIMAL_OVERHEAD,
                                  bstring.length - STRING_MINIMAL_OVERHEAD );

                // use special constructor to create the final string
                UnicodeString string =
                    new UnicodeString( UnicodeString.sid,
                                       (short) bstring.length, bstring,
                                       _unfinished_string );
                Integer integer = new Integer( field_3_strings.size() );

                addToStringTable( integer, string );

                // carry on with whatever follows the completed string
                manufactureStrings( record,
                                    _total_length_bytes
                                    - LittleEndianConsts.SHORT_SIZE,
                                    (short) record.length );
            }
        }
    }

    /**
     * @return sid
     */

    public short getSid()
    {
        return sid;
    }

    /**
     * @return hashcode (the number of unique strings)
     */

    public int hashCode()
    {
        return field_2_num_unique_strings;
    }

    /**
     * Two SST records are equal when they are of exactly the same
     * class and have the same string count, the same unique string
     * count and equal string tables.
     *
     * @param o the object to compare with
     *
     * @return true if the records are equal
     */

    public boolean equals( Object o )
    {
        if ( ( o == null ) || ( o.getClass() != this.getClass() ) )
        {
            return false;
        }
        SSTRecord other = (SSTRecord) o;

        return ( ( field_1_num_strings == other.field_1_num_strings )
                 && ( field_2_num_unique_strings
                      == other.field_2_num_unique_strings )
                 && field_3_strings.equals( other.field_3_strings ) );
    }

    /**
     * validate SID
     *
     * @param id the alleged SID
     *
     * @exception RecordFormatException if validation fails
     */

    protected void validateSid( final short id )
        throws RecordFormatException
    {
        if ( id != sid )
        {
            throw new RecordFormatException( "NOT An SST RECORD" );
        }
    }

    /**
     * Fill the fields from the data
     * <P>
     * The data consists of sets of string data. This string data is
     * arranged as follows:
     * <P>
     * <CODE><PRE>
     * short  string_length;   // length of string data
     * byte   string_flag;     // flag specifying special string
     *                         // handling
     * short  run_count;       // optional count of formatting runs
     * int    extend_length;   // optional extension length
     * char[] string_data;     // string data, can be byte[] or
     *                         // short[] (length of array is
     *                         // string_length)
     * int[]  formatting_runs; // optional formatting runs (length of
     *                         // array is run_count)
     * byte[] extension;       // optional extension (length of array
     *                         // is extend_length)
     * </PRE></CODE>
     * <P>
     * The string_flag is bit mapped as follows:
     * <P>
     * <TABLE>
     *   <TR>
     *      <TH>Bit number</TH>
     *      <TH>Meaning if 0</TH>
     *      <TH>Meaning if 1</TH>
     *   </TR>
     *   <TR>
     *      <TD>0</TD>
     *      <TD>string_data is byte[]</TD>
     *      <TD>string_data is short[]</TD>
     *   </TR>
     *   <TR>
     *      <TD>1</TD>
     *      <TD>Should always be 0</TD>
     *      <TD>string_flag is defective</TD>
     *   </TR>
     *   <TR>
     *      <TD>2</TD>
     *      <TD>extension is not included</TD>
     *      <TD>extension is included</TD>
     *   </TR>
     *   <TR>
     *      <TD>3</TD>
     *      <TD>formatting run data is not included</TD>
     *      <TD>formatting run data is included</TD>
     *   </TR>
     *   <TR>
     *      <TD>4</TD>
     *      <TD>Should always be 0</TD>
     *      <TD>string_flag is defective</TD>
     *   </TR>
     *   <TR>
     *      <TD>5</TD>
     *      <TD>Should always be 0</TD>
     *      <TD>string_flag is defective</TD>
     *   </TR>
     *   <TR>
     *      <TD>6</TD>
     *      <TD>Should always be 0</TD>
     *      <TD>string_flag is defective</TD>
     *   </TR>
     *   <TR>
     *      <TD>7</TD>
     *      <TD>Should always be 0</TD>
     *      <TD>string_flag is defective</TD>
     *   </TR>
     * </TABLE>
     * <P>
     * We can handle eating the overhead associated with bits 2 or 3
     * (or both) being set, but we have no idea what to do with the
     * associated data. The UnicodeString class can handle the byte[]
     * vs short[] nature of the actual string data
     *
     * @param data raw data
     * @param size size of the raw data
     * @param offset index into <code>data</code> at which this
     *               record's data begins
     */

    protected void fillFields( final byte[] data, final short size,
                               int offset )
    {

        // this method is ALWAYS called after construction -- using
        // the nontrivial constructor, of course -- so this is where
        // we initialize our fields
        field_1_num_strings = LittleEndian.getInt( data, 0 + offset );
        field_2_num_unique_strings = LittleEndian.getInt( data, 4 + offset );
        field_3_strings = new BinaryTree();
        setExpectedChars( 0 );
        _unfinished_string = "";
        _total_length_bytes = 0;
        _string_data_offset = 0;
        _wide_char = false;

        // NOTE(review): manufactureStrings treats 'size' as an
        // absolute end index into 'data', yet the start index passed
        // here includes 'offset' -- confirm this is right when offset
        // is non-zero.
        manufactureStrings( data, 8 + offset, size );
    }

    /**
     * @return the number of characters we expect in the first
     *         sub-record in a subsequent continuation record
     */

    int getExpectedChars()
    {
        return __expected_chars;
    }

    /**
     * @return an iterator of the strings we hold. All instances are
     *         UnicodeStrings
     */

    Iterator getStrings()
    {
        return field_3_strings.values().iterator();
    }

    /**
     * @return count of the strings we hold.
     */

    int countStrings()
    {
        return field_3_strings.size();
    }

    /**
     * @return the unfinished string
     */

    String getUnfinishedString()
    {
        return _unfinished_string;
    }

    /**
     * @return the total length of the current string
     */

    int getTotalLength()
    {
        return _total_length_bytes;
    }

    /**
     * @return offset into current string data
     */

    int getStringDataOffset()
    {
        return _string_data_offset;
    }

    /**
     * @return true if current string uses wide characters
     */

    boolean isWideChar()
    {
        return _wide_char;
    }

    /**
     * Walk the raw data, turning each string entry found into a
     * UnicodeString in the table. Stops early when a string runs past
     * the end of the data; in that case the number of characters
     * still expected and the partial string are remembered for
     * <code>processContinueRecord</code>.
     *
     * @param data raw data
     * @param index index into <code>data</code> of the first string
     * @param size index into <code>data</code> at which to stop
     *
     * @exception RecordFormatException if only a single byte remains,
     *            so that a string length cannot be read
     */

    private void manufactureStrings( final byte[] data, final int index,
                                     short size )
    {
        int offset = index;

        while ( offset < size )
        {
            int remaining = size - offset;

            if ( ( remaining > 0 )
                    && ( remaining < LittleEndianConsts.SHORT_SIZE ) )
            {
                throw new RecordFormatException(
                    "Cannot get length of the last string in SSTRecord" );
            }
            if ( remaining == LittleEndianConsts.SHORT_SIZE )
            {

                // only the length fits in this record; the whole
                // string body is in the continuation
                setExpectedChars( LittleEndian.getShort( data, offset ) );
                _unfinished_string = "";
                break;
            }
            short char_count = LittleEndian.getShort( data, offset );

            setupStringParameters( data, offset, char_count );
            if ( remaining < _total_length_bytes )
            {

                // the string is cut off by the end of the data: work
                // out how many characters are missing and only
                // process those that are present
                setExpectedChars( calculateCharCount( _total_length_bytes
                                                      - remaining ) );
                char_count -= getExpectedChars();
                _total_length_bytes = remaining;
            }
            else
            {
                setExpectedChars( 0 );
            }
            processString( data, offset, char_count );
            offset += _total_length_bytes;
            if ( getExpectedChars() != 0 )
            {
                break;
            }
        }
    }

    /**
     * Examine the option flag of the string entry at
     * <code>index</code> and set the wide-char flag, the total length
     * of the entry and the offset of the character data within the
     * entry accordingly.
     *
     * @param data raw data
     * @param index index into <code>data</code> of the entry's length
     *              short (the flag byte is read from
     *              <code>index + 2</code>)
     * @param char_count number of characters in the string
     */

    private void setupStringParameters( final byte[] data, final int index,
                                        final int char_count )
    {
        byte optionFlag = data[index + LittleEndianConsts.SHORT_SIZE];

        _wide_char = ( optionFlag & 1 ) == 1;
        boolean extended = ( optionFlag & 4 ) == 4;
        boolean rich_text = ( optionFlag & 8 ) == 8;

        _total_length_bytes = STRING_MINIMAL_OVERHEAD
                              + calculateByteCount( char_count );
        _string_data_offset = STRING_MINIMAL_OVERHEAD;
        if ( rich_text )
        {

            // a short run count follows the flag; each run adds an
            // int to the total length
            short run_count = LittleEndian.getShort( data,
                                  index + _string_data_offset );

            _string_data_offset += LittleEndianConsts.SHORT_SIZE;
            _total_length_bytes += LittleEndianConsts.SHORT_SIZE
                                   + ( LittleEndianConsts.INT_SIZE
                                       * run_count );
        }
        if ( extended )
        {

            // an int extension length comes next; the extension data
            // itself is counted in the total length
            int extension_length = LittleEndian.getInt( data,
                                       index + _string_data_offset );

            _string_data_offset += LittleEndianConsts.INT_SIZE;
            _total_length_bytes += LittleEndianConsts.INT_SIZE
                                   + extension_length;
        }
    }

    /**
     * Build a UnicodeString from the entry at <code>index</code>,
     * dropping any rich text / extension overhead, and either add it
     * to the table or, if more characters are expected from a
     * continuation record, remember it as the unfinished string.
     *
     * @param data raw data
     * @param index index into <code>data</code> of the string entry
     * @param char_count number of characters available in this record
     */

    private void processString( final byte[] data, final int index,
                                final short char_count )
    {
        byte[] str_data = new byte[_total_length_bytes];
        int length = STRING_MINIMAL_OVERHEAD
                     + calculateByteCount( char_count );
        byte[] bstring = new byte[length];

        System.arraycopy( data, index, str_data, 0, str_data.length );
        int offset = 0;

        // write the character count, then copy the option flag
        LittleEndian.putShort( bstring, offset, char_count );
        offset += LittleEndianConsts.SHORT_SIZE;
        bstring[offset] = str_data[offset];

        // copy the character data, skipping the optional run count
        // and extension length
        System.arraycopy( str_data, _string_data_offset, bstring,
                          STRING_MINIMAL_OVERHEAD,
                          bstring.length - STRING_MINIMAL_OVERHEAD );
        UnicodeString string = new UnicodeString( UnicodeString.sid,
                                                  (short) bstring.length,
                                                  bstring );

        if ( getExpectedChars() != 0 )
        {
            _unfinished_string = string.getString();
        }
        else
        {
            Integer integer = new Integer( field_3_strings.size() );

            addToStringTable( integer, string );
        }
    }

    /**
     * Okay, we are doing some major cheating here. Because we can't
     * handle rich text strings properly we end up getting duplicate
     * strings. To get around this I'm doing two things: 1. Converting
     * rich text to normal text and 2. If there's a duplicate I'm
     * adding a space onto the end. Sneaky perhaps but it gets the job
     * done until we can handle this a little better.
     *
     * @param integer the index under which to store the string
     * @param string the string to store; may be modified as described
     *               above
     */

    private void addToStringTable( Integer integer, UnicodeString string )
    {
        if ( string.isRichText() )
        {

            // clear the rich text bit (bit 3)
            string.setOptionFlags(
                (byte) ( string.getOptionFlags() & ( ~8 ) ) );
        }

        boolean added = false;

        while ( !added )
        {
            try
            {
                field_3_strings.put( integer, string );
                added = true;
            }
            catch ( Exception ignore )
            {

                // NOTE(review): presumably put() rejects a duplicate
                // value; any failure that appending a space does not
                // cure (e.g. a duplicate key) would make this loop
                // spin forever -- confirm against BinaryTree.put and
                // narrow the catch.
                string.setString( string.getString() + " " );
            }
        }
    }

    private void setExpectedChars( final int count )
    {
        __expected_chars = count;
    }

    /**
     * @param character_count a number of characters
     *
     * @return the number of bytes those characters occupy, given the
     *         current wide-char setting
     */

    private int calculateByteCount( final int character_count )
    {
        return character_count * ( _wide_char ? LittleEndianConsts.SHORT_SIZE
                                              : LittleEndianConsts.BYTE_SIZE );
    }

    /**
     * @param byte_count a number of bytes
     *
     * @return the number of characters that fit in those bytes, given
     *         the current wide-char setting
     */

    private int calculateCharCount( final int byte_count )
    {
        return byte_count / ( _wide_char ? LittleEndianConsts.SHORT_SIZE
                                         : LittleEndianConsts.BYTE_SIZE );
    }

    /**
     * called by the class that is responsible for writing this sucker.
     * Delegates to an SSTSerializer built from the current string
     * table and counts.
     *
     * @param offset index into <code>data</code> at which to start
     *               writing
     * @param data the array to write into
     *
     * @return whatever SSTSerializer.serialize returns (presumably
     *         the number of bytes written -- TODO confirm)
     */

    public int serialize( int offset, byte[] data )
    {
        SSTSerializer serializer = new SSTSerializer( _record_lengths,
                                       field_3_strings, getNumStrings(),
                                       getNumUniqueStrings() );

        return serializer.serialize( offset, data );
    }

    // we can probably simplify this later...this calculates the size
    // w/o serializing but still is a bit slow

    public int getRecordSize()
    {
        SSTSerializer serializer = new SSTSerializer( _record_lengths,
                                       field_3_strings, getNumStrings(),
                                       getNumUniqueStrings() );

        return serializer.getRecordSize();
    }
}