diff --git a/src/java/org/apache/poi/hssf/record/common/UnicodeString.java b/src/java/org/apache/poi/hssf/record/common/UnicodeString.java index bd1758e81..b10d28204 100644 --- a/src/java/org/apache/poi/hssf/record/common/UnicodeString.java +++ b/src/java/org/apache/poi/hssf/record/common/UnicodeString.java @@ -26,9 +26,9 @@ import org.apache.poi.hssf.record.RecordInputStream; import org.apache.poi.hssf.record.cont.ContinuableRecordOutput; import org.apache.poi.util.BitField; import org.apache.poi.util.BitFieldFactory; -import org.apache.poi.util.HexDump; import org.apache.poi.util.LittleEndianInput; import org.apache.poi.util.LittleEndianOutput; +import org.apache.poi.util.StringUtil; /** * Title: Unicode String

@@ -42,8 +42,8 @@ public final class UnicodeString implements Comparable { private short field_1_charCount; private byte field_2_optionflags; private String field_3_string; - private List field_4_format_runs; - private byte[] field_5_ext_rst; + private List field_4_format_runs; + private ExtRst field_5_ext_rst; private static final BitField highByte = BitFieldFactory.getInstance(0x1); // 0x2 is reserved private static final BitField extBit = BitFieldFactory.getInstance(0x4); @@ -98,6 +98,225 @@ public final class UnicodeString implements Comparable { out.writeShort(_fontIndex); } } + + // See page 681 + public static class ExtRst implements Comparable { + private short reserved; + + // This is a Phs (see page 881) + private short formattingFontIndex; + private short formattingOptions; + + // This is a RPHSSub (see page 894) + private int numberOfRuns; + private String phoneticText; + + // This is an array of PhRuns (see page 881) + private PhRun[] phRuns; + // Sometimes there's some cruft at the end + private byte[] extraData; + + private void populateEmpty() { + reserved = 1; + phoneticText = ""; + phRuns = new PhRun[0]; + extraData = new byte[0]; + } + + protected ExtRst() { + populateEmpty(); + } + protected ExtRst(LittleEndianInput in, int expectedLength) { + reserved = in.readShort(); + + // Old style detection (Reserved = 0xFF) + if(reserved == -1) { + populateEmpty(); + return; + } + + // Spot corrupt records + if(reserved != 1) { + System.err.println("Warning - ExtRst was has wrong magic marker, expecting 1 but found " + reserved + " - ignoring"); + // Grab all the remaining data, and ignore it + for(int i=0; i 0) { + length2 = 0; + } + if(length1 != length2) { + throw new IllegalStateException( + "The two length fields of the Phonetic Text don't agree! " + + length1 + " vs " + length2 + ); + } + phoneticText = StringUtil.readUnicodeLE(in, length1); + + int runData = stringDataSize - 4 - 6 - (2*phoneticText.length()); + int numRuns = (runData / 6); + phRuns = new PhRun[numRuns]; + for(int i=0; i { return false; } - //Well the format runs are equal as well!, better check the ExtRst data - //Which by the way we dont know how to decode! - if ((field_5_ext_rst == null) && (other.field_5_ext_rst == null)) - return true; - if (((field_5_ext_rst == null) && (other.field_5_ext_rst != null)) || - ((field_5_ext_rst != null) && (other.field_5_ext_rst == null))) - return false; - size = field_5_ext_rst.length; - if (size != field_5_ext_rst.length) - return false; - - //Check individual bytes! - for (int i=0;i { } if (isExtendedText() && (extensionLength > 0)) { - field_5_ext_rst = new byte[extensionLength]; - for (int i=0;i { } - void setExtendedRst(byte[] ext_rst) { - if (ext_rst != null) - field_2_optionflags = extBit.setByte(field_2_optionflags); - else field_2_optionflags = extBit.clearByte(field_2_optionflags); + public ExtRst getExtendedRst() { + return this.field_5_ext_rst; + } + void setExtendedRst(ExtRst ext_rst) { + if (ext_rst != null) { + field_2_optionflags = extBit.setByte(field_2_optionflags); + } else { + field_2_optionflags = extBit.clearByte(field_2_optionflags); + } this.field_5_ext_rst = ext_rst; } @@ -452,12 +674,18 @@ public final class UnicodeString implements Comparable { } } if (field_5_ext_rst != null) { - buffer.append(" .field_5_ext_rst = ").append("\n").append(HexDump.toHex(field_5_ext_rst)).append("\n"); + buffer.append(" .field_5_ext_rst = ").append("\n"); + buffer.append( field_5_ext_rst.toString() ).append("\n"); } buffer.append("[/UNICODESTRING]\n"); return buffer.toString(); } + /** + * Serialises out the String. There are special rules + * about where we can and can't split onto + * Continue records. + */ public void serialize(ContinuableRecordOutput out) { int numberOfRichTextRuns = 0; int extendedDataSize = 0; @@ -465,9 +693,11 @@ public final class UnicodeString implements Comparable { numberOfRichTextRuns = field_4_format_runs.size(); } if (isExtendedText() && field_5_ext_rst != null) { - extendedDataSize = field_5_ext_rst.length; + extendedDataSize = 4 + field_5_ext_rst.getDataSize(); } - + + // Serialise the bulk of the String + // The writeString handles tricky continue stuff for us out.writeString(field_3_string, numberOfRichTextRuns, extendedDataSize); if (numberOfRichTextRuns > 0) { @@ -477,25 +707,13 @@ public final class UnicodeString implements Comparable { if (out.getAvailableSpace() < 4) { out.writeContinue(); } - FormatRun r = field_4_format_runs.get(i); - r.serialize(out); + FormatRun r = field_4_format_runs.get(i); + r.serialize(out); } } if (extendedDataSize > 0) { - // OK ExtRst is actually not documented, so i am going to hope - // that we can actually continue on byte boundaries - - int extPos = 0; - while (true) { - int nBytesToWrite = Math.min(extendedDataSize - extPos, out.getAvailableSpace()); - out.write(field_5_ext_rst, extPos, nBytesToWrite); - extPos += nBytesToWrite; - if (extPos >= extendedDataSize) { - break; - } - out.writeContinue(); - } + field_5_ext_rst.serialize(out); } } @@ -534,7 +752,6 @@ public final class UnicodeString implements Comparable { } //Well the format runs are equal as well!, better check the ExtRst data - //Which by the way we don't know how to decode! if ((field_5_ext_rst == null) && (str.field_5_ext_rst == null)) return 0; if ((field_5_ext_rst == null) && (str.field_5_ext_rst != null)) @@ -542,15 +759,10 @@ public final class UnicodeString implements Comparable { if ((field_5_ext_rst != null) && (str.field_5_ext_rst == null)) return -1; - size = field_5_ext_rst.length; - if (size != field_5_ext_rst.length) - return size - field_5_ext_rst.length; + result = field_5_ext_rst.compareTo(str.field_5_ext_rst); + if (result != 0) + return result; - //Check individual bytes! - for (int i=0;i { str.field_4_format_runs = new ArrayList(); for (FormatRun r : field_4_format_runs) { str.field_4_format_runs.add(new FormatRun(r._character, r._fontIndex)); - } + } } if (field_5_ext_rst != null) { - str.field_5_ext_rst = new byte[field_5_ext_rst.length]; - System.arraycopy(field_5_ext_rst, 0, str.field_5_ext_rst, 0, - field_5_ext_rst.length); + str.field_5_ext_rst = field_5_ext_rst.clone(); } return str; diff --git a/src/testcases/org/apache/poi/hssf/record/TestSSTRecordSizeCalculator.java b/src/testcases/org/apache/poi/hssf/record/TestSSTRecordSizeCalculator.java index b171a77a1..80380ac5e 100644 --- a/src/testcases/org/apache/poi/hssf/record/TestSSTRecordSizeCalculator.java +++ b/src/testcases/org/apache/poi/hssf/record/TestSSTRecordSizeCalculator.java @@ -33,9 +33,8 @@ public final class TestSSTRecordSizeCalculator extends TestCase { private static final int COMPRESSED_PLAIN_STRING_OVERHEAD = 3; private static final int OPTION_FIELD_SIZE = 1; - private final IntMapper strings = new IntMapper(); + private final IntMapper strings = new IntMapper(); - private void confirmSize(int expectedSize) { ContinuableRecordOutput cro = ContinuableRecordOutput.createForCountingOnly(); SSTSerializer ss = new SSTSerializer(strings, 0, 0); diff --git a/src/testcases/org/apache/poi/hssf/record/common/TestUnicodeString.java b/src/testcases/org/apache/poi/hssf/record/common/TestUnicodeString.java index 6ecab71a5..591042d7e 100644 --- a/src/testcases/org/apache/poi/hssf/record/common/TestUnicodeString.java +++ b/src/testcases/org/apache/poi/hssf/record/common/TestUnicodeString.java @@ -17,12 +17,19 @@ package org.apache.poi.hssf.record.common; +import java.io.ByteArrayInputStream; +import java.io.ByteArrayOutputStream; + import junit.framework.TestCase; import org.apache.poi.hssf.record.ContinueRecord; import org.apache.poi.hssf.record.RecordInputStream; import org.apache.poi.hssf.record.SSTRecord; +import org.apache.poi.hssf.record.common.UnicodeString.ExtRst; +import org.apache.poi.hssf.record.common.UnicodeString.FormatRun; import org.apache.poi.hssf.record.cont.ContinuableRecordOutput; +import org.apache.poi.util.LittleEndianInputStream; +import org.apache.poi.util.LittleEndianOutputStream; /** * Tests that {@link UnicodeString} record size calculates correctly. The record size @@ -85,13 +92,23 @@ public final class TestUnicodeString extends TestCase { //Test a compressed small string that has rich text and extended text s.setString("Test"); s.setOptionFlags((byte)0xC); - s.setExtendedRst(new byte[]{(byte)0x1,(byte)0x2,(byte)0x3,(byte)0x4,(byte)0x5}); - confirmSize(26, s); + confirmSize(17, s); + + // Extended phonetics data + // Minimum size is 14 + // Also adds 4 bytes to hold the length + s.setExtendedRst( + new ExtRst() + ); + confirmSize(35, s); //Test a uncompressed small string that has rich text and extended text s.setString(STR_16_BIT); s.setOptionFlags((byte)0xD); - confirmSize(30, s); + confirmSize(39, s); + + s.setExtendedRst(null); + confirmSize(21, s); } public void testPerfectStringSize() { @@ -144,6 +161,146 @@ public final class TestUnicodeString extends TestCase { UnicodeString s = makeUnicodeString(strSize); confirmSize(MAX_DATA_SIZE*2, s); } + + public void testFormatRun() throws Exception { + FormatRun fr = new FormatRun((short)4, (short)0x15c); + assertEquals(4, fr.getCharacterPos()); + assertEquals(0x15c, fr.getFontIndex()); + + ByteArrayOutputStream baos = new ByteArrayOutputStream(); + LittleEndianOutputStream out = new LittleEndianOutputStream(baos); + + fr.serialize(out); + + byte[] b = baos.toByteArray(); + assertEquals(4, b.length); + assertEquals(4, b[0]); + assertEquals(0, b[1]); + assertEquals(0x5c, b[2]); + assertEquals(0x01, b[3]); + + LittleEndianInputStream inp = new LittleEndianInputStream( + new ByteArrayInputStream(b) + ); + fr = new FormatRun(inp); + assertEquals(4, fr.getCharacterPos()); + assertEquals(0x15c, fr.getFontIndex()); + } + + public void testExtRstFromEmpty() throws Exception { + ExtRst ext = new ExtRst(); + + assertEquals(0, ext.getNumberOfRuns()); + assertEquals(0, ext.getFormattingFontIndex()); + assertEquals(0, ext.getFormattingOptions()); + assertEquals("", ext.getPhoneticText()); + assertEquals(0, ext.getPhRuns().length); + assertEquals(10, ext.getDataSize()); // Excludes 4 byte header + + ByteArrayOutputStream baos = new ByteArrayOutputStream(); + LittleEndianOutputStream out = new LittleEndianOutputStream(baos); + ContinuableRecordOutput cout = new ContinuableRecordOutput(out, 0xffff); + + ext.serialize(cout); + cout.writeContinue(); + + byte[] b = baos.toByteArray(); + assertEquals(20, b.length); + + // First 4 bytes from the outputstream + assertEquals(-1, b[0]); + assertEquals(-1, b[1]); + assertEquals(14, b[2]); + assertEquals(00, b[3]); + + // Reserved + assertEquals(1, b[4]); + assertEquals(0, b[5]); + // Data size + assertEquals(10, b[6]); + assertEquals(00, b[7]); + // Font*2 + assertEquals(0, b[8]); + assertEquals(0, b[9]); + assertEquals(0, b[10]); + assertEquals(0, b[11]); + // 0 Runs + assertEquals(0, b[12]); + assertEquals(0, b[13]); + // Size=0, *2 + assertEquals(0, b[14]); + assertEquals(0, b[15]); + assertEquals(0, b[16]); + assertEquals(0, b[17]); + + // Last 2 bytes from the outputstream + assertEquals(ContinueRecord.sid, b[18]); + assertEquals(0, b[19]); + + + // Load in again and re-test + byte[] data = new byte[14]; + System.arraycopy(b, 4, data, 0, data.length); + LittleEndianInputStream inp = new LittleEndianInputStream( + new ByteArrayInputStream(data) + ); + ext = new ExtRst(inp, data.length); + + assertEquals(0, ext.getNumberOfRuns()); + assertEquals(0, ext.getFormattingFontIndex()); + assertEquals(0, ext.getFormattingOptions()); + assertEquals("", ext.getPhoneticText()); + assertEquals(0, ext.getPhRuns().length); + } + + public void testExtRstFromData() throws Exception { + byte[] data = new byte[] { + 01, 00, 0x0C, 00, + 00, 00, 0x37, 00, + 00, 00, + 00, 00, 00, 00, + 00, 00 // Cruft at the end, as found from real files + }; + assertEquals(16, data.length); + + LittleEndianInputStream inp = new LittleEndianInputStream( + new ByteArrayInputStream(data) + ); + ExtRst ext = new ExtRst(inp, data.length); + assertEquals(0x0c, ext.getDataSize()); // Excludes 4 byte header + + assertEquals(0, ext.getNumberOfRuns()); + assertEquals(0x37, ext.getFormattingOptions()); + assertEquals(0, ext.getFormattingFontIndex()); + assertEquals("", ext.getPhoneticText()); + assertEquals(0, ext.getPhRuns().length); + } + + public void testCorruptExtRstDetection() throws Exception { + byte[] data = new byte[] { + 0x79, 0x79, 0x11, 0x11, + 0x22, 0x22, 0x33, 0x33, + }; + assertEquals(8, data.length); + + LittleEndianInputStream inp = new LittleEndianInputStream( + new ByteArrayInputStream(data) + ); + ExtRst ext = new ExtRst(inp, data.length); + + // Will be empty + assertEquals(ext, new ExtRst()); + + // If written, will be the usual size + assertEquals(10, ext.getDataSize()); // Excludes 4 byte header + + // Is empty + assertEquals(0, ext.getNumberOfRuns()); + assertEquals(0, ext.getFormattingOptions()); + assertEquals(0, ext.getFormattingFontIndex()); + assertEquals("", ext.getPhoneticText()); + assertEquals(0, ext.getPhRuns().length); + } private static UnicodeString makeUnicodeString(String s) { diff --git a/src/testcases/org/apache/poi/hssf/usermodel/TestBugs.java b/src/testcases/org/apache/poi/hssf/usermodel/TestBugs.java index d96ad7466..337499415 100644 --- a/src/testcases/org/apache/poi/hssf/usermodel/TestBugs.java +++ b/src/testcases/org/apache/poi/hssf/usermodel/TestBugs.java @@ -36,6 +36,7 @@ import org.apache.poi.hssf.record.CellValueRecordInterface; import org.apache.poi.hssf.record.EmbeddedObjectRefSubRecord; import org.apache.poi.hssf.record.NameRecord; import org.apache.poi.hssf.record.aggregates.FormulaRecordAggregate; +import org.apache.poi.hssf.record.common.UnicodeString; import org.apache.poi.hssf.record.formula.DeletedArea3DPtg; import org.apache.poi.hssf.record.formula.Ptg; import org.apache.poi.ss.usermodel.*; @@ -1538,12 +1539,37 @@ public final class TestBugs extends BaseTestBugzillaIssues { } /** - * Round trip a file with an unusual ExtRst record + * Round trip a file with an unusual UnicodeString/ExtRst record parts */ - public void test47847() { - HSSFWorkbook wb = openSample("47251.xls"); - assertEquals(1, wb.getNumberOfSheets()); + public void test47847() throws Exception { + HSSFWorkbook wb = openSample("47847.xls"); + assertEquals(3, wb.getNumberOfSheets()); + + // Find the SST record + UnicodeString withExt = wb.getWorkbook().getSSTString(0); + UnicodeString withoutExt = wb.getWorkbook().getSSTString(31); + + assertEquals("O:Alloc:Qty", withExt.getString()); + assertTrue((withExt.getOptionFlags() & 0x0004) == 0x0004); + + assertEquals("RT", withoutExt.getString()); + assertTrue((withoutExt.getOptionFlags() & 0x0004) == 0x0000); + + // Something about continues... + + + // Write out and re-read wb = writeOutAndReadBack(wb); - assertEquals(1, wb.getNumberOfSheets()); + assertEquals(3, wb.getNumberOfSheets()); + + // Check it's the same now + withExt = wb.getWorkbook().getSSTString(0); + withoutExt = wb.getWorkbook().getSSTString(31); + + assertEquals("O:Alloc:Qty", withExt.getString()); + assertTrue((withExt.getOptionFlags() & 0x0004) == 0x0004); + + assertEquals("RT", withoutExt.getString()); + assertTrue((withoutExt.getOptionFlags() & 0x0004) == 0x0000); } } diff --git a/test-data/spreadsheet/47847.xls b/test-data/spreadsheet/47847.xls new file mode 100644 index 000000000..4a7a631d4 Binary files /dev/null and b/test-data/spreadsheet/47847.xls differ