From be469c985ff10a78f0836776d1c2cc4be745b3e0 Mon Sep 17 00:00:00 2001 From: Nick Burch Date: Sun, 30 Nov 2014 16:21:39 +0000 Subject: [PATCH] Track the codepage in old excel files, to be able to correctly decode the 8 bit strings in them git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@1642561 13f79535-47bb-0310-9956-ffa450edef68 --- .../poi/hssf/extractor/OldExcelExtractor.java | 13 +++++++++- .../poi/hssf/record/CodepageRecord.java | 17 ++++++------ .../poi/hssf/record/OldLabelRecord.java | 13 ++++++---- .../poi/hssf/record/OldStringRecord.java | 26 ++++++++++++++++--- .../hssf/extractor/TestOldExcelExtractor.java | 3 +-- 5 files changed, 53 insertions(+), 19 deletions(-) diff --git a/src/java/org/apache/poi/hssf/extractor/OldExcelExtractor.java b/src/java/org/apache/poi/hssf/extractor/OldExcelExtractor.java index f1a28479e..e8cd0f63d 100644 --- a/src/java/org/apache/poi/hssf/extractor/OldExcelExtractor.java +++ b/src/java/org/apache/poi/hssf/extractor/OldExcelExtractor.java @@ -25,6 +25,7 @@ import java.io.IOException; import java.io.InputStream; import org.apache.poi.hssf.record.BOFRecord; +import org.apache.poi.hssf.record.CodepageRecord; import org.apache.poi.hssf.record.FormulaRecord; import org.apache.poi.hssf.record.NumberRecord; import org.apache.poi.hssf.record.OldFormulaRecord; @@ -110,6 +111,8 @@ public class OldExcelExtractor { * for these old file formats */ public String getText() { + StringBuffer text = new StringBuffer(); + // Work out what version we're dealing with int bofSid = ris.getNextSid(); switch (bofSid) { @@ -128,8 +131,10 @@ public class OldExcelExtractor { default: throw new IllegalArgumentException("File does not begin with a BOF, found sid of " + bofSid); } + + // To track formats and encodings + CodepageRecord codepage = null; - StringBuffer text = new StringBuffer(); while (ris.hasNextRecord()) { int sid = ris.getNextSid(); ris.nextRecord(); @@ -139,6 +144,7 @@ public class OldExcelExtractor { case OldLabelRecord.biff2_sid: 
case OldLabelRecord.biff345_sid: OldLabelRecord lr = new OldLabelRecord(ris); + lr.setCodePage(codepage); text.append(lr.getValue()); text.append('\n'); break; @@ -146,6 +152,7 @@ public class OldExcelExtractor { case OldStringRecord.biff2_sid: case OldStringRecord.biff345_sid: OldStringRecord sr = new OldStringRecord(ris); + sr.setCodePage(codepage); text.append(sr.getString()); text.append('\n'); break; @@ -175,6 +182,10 @@ public class OldExcelExtractor { handleNumericCell(text, rr.getRKNumber()); break; + case CodepageRecord.sid: + codepage = new CodepageRecord(ris); + break; + default: ris.readFully(new byte[ris.remaining()]); } diff --git a/src/java/org/apache/poi/hssf/record/CodepageRecord.java b/src/java/org/apache/poi/hssf/record/CodepageRecord.java index b317ca336..59f69af61 100644 --- a/src/java/org/apache/poi/hssf/record/CodepageRecord.java +++ b/src/java/org/apache/poi/hssf/record/CodepageRecord.java @@ -19,13 +19,15 @@ package org.apache.poi.hssf.record; +import org.apache.poi.util.CodePageUtil; import org.apache.poi.util.LittleEndianOutput; /** - * Title: Codepage Record

- * Description: the default characterset. for the workbook

- * REFERENCE: PG 293 Microsoft Excel 97 Developer's Kit (ISBN: 1-57231-498-2)

- * @author Andrew C. Oliver (acoliver at apache dot org) + * Title: Codepage Record + *

Description: the default character set for the workbook

+ *

REFERENCE: PG 293 Microsoft Excel 97 Developer's Kit (ISBN: 1-57231-498-2)

+ *

Use {@link CodePageUtil} to turn these values into Java code pages + * to encode/decode strings.

* @version 2.0-pre */ @@ -36,11 +38,10 @@ public final class CodepageRecord private short field_1_codepage; // = 0; /** - * the likely correct value for CODEPAGE (at least for US versions). We could use - * some help with international versions (which we do not have access to documentation - * for) + * Excel 97+ (Biff 8) should always store strings as UTF-16LE or + * compressed versions of that. As such, this should always be + * 0x4b0 = UTF_16, except for files coming from older versions. */ - public final static short CODEPAGE = ( short ) 0x4b0; public CodepageRecord() diff --git a/src/java/org/apache/poi/hssf/record/OldLabelRecord.java b/src/java/org/apache/poi/hssf/record/OldLabelRecord.java index 7c9aaace1..ad6772382 100644 --- a/src/java/org/apache/poi/hssf/record/OldLabelRecord.java +++ b/src/java/org/apache/poi/hssf/record/OldLabelRecord.java @@ -32,9 +32,9 @@ public final class OldLabelRecord extends OldCellRecord { public final static short biff2_sid = 0x0004; public final static short biff345_sid = 0x0204; - private short field_4_string_len; - private byte[] field_5_bytes; - //private XXXXX codepage; // TODO Implement for this and OldStringRecord + private short field_4_string_len; + private byte[] field_5_bytes; + private CodepageRecord codepage; /** * @param in the RecordInputstream to read the record from @@ -61,6 +61,10 @@ public final class OldLabelRecord extends OldCellRecord { } } + public void setCodePage(CodepageRecord codepage) { + this.codepage = codepage; + } + /** * get the number of characters this string contains * @return number of characters @@ -75,8 +79,7 @@ public final class OldLabelRecord extends OldCellRecord { */ public String getValue() { - // We really need the codepage here to do this right... 
- return new String(field_5_bytes); + return OldStringRecord.getString(field_5_bytes, codepage); } /** diff --git a/src/java/org/apache/poi/hssf/record/OldStringRecord.java b/src/java/org/apache/poi/hssf/record/OldStringRecord.java index 42549e2c2..d4296ba63 100644 --- a/src/java/org/apache/poi/hssf/record/OldStringRecord.java +++ b/src/java/org/apache/poi/hssf/record/OldStringRecord.java @@ -17,6 +17,10 @@ package org.apache.poi.hssf.record; +import java.io.UnsupportedEncodingException; + +import org.apache.poi.util.CodePageUtil; + /** * Biff2 - Biff 4 Label Record (0x0007 / 0x0207) - read only support for @@ -29,7 +33,7 @@ public final class OldStringRecord { private short sid; private short field_1_string_len; private byte[] field_2_bytes; - //private XXXXX codepage; // TODO Implement for this and OldLabelRecord + private CodepageRecord codepage; /** * @param in the RecordInputstream to read the record from @@ -55,14 +59,30 @@ public final class OldStringRecord { public short getSid() { return sid; } + + public void setCodePage(CodepageRecord codepage) { + this.codepage = codepage; + } /** * @return The string represented by this record. */ public String getString() { - // We really need the codepage here to do this right... - return new String(field_2_bytes); + return getString(field_2_bytes, codepage); + } + + protected static String getString(byte[] data, CodepageRecord codepage) { + int cp = CodePageUtil.CP_ISO_8859_1; + if (codepage != null) { + cp = codepage.getCodepage(); + } + try { + return CodePageUtil.getStringFromCodePage(data, cp); + } catch (UnsupportedEncodingException uee) { + // Hope the system default is ok... 
+ return new String(data); + } } public String toString() diff --git a/src/testcases/org/apache/poi/hssf/extractor/TestOldExcelExtractor.java b/src/testcases/org/apache/poi/hssf/extractor/TestOldExcelExtractor.java index 8742b92dc..d57129265 100644 --- a/src/testcases/org/apache/poi/hssf/extractor/TestOldExcelExtractor.java +++ b/src/testcases/org/apache/poi/hssf/extractor/TestOldExcelExtractor.java @@ -81,8 +81,7 @@ public final class TestOldExcelExtractor extends POITestCase { // More complicated strings assertContains(text, "$100,000 or more"); assertContains(text, "S corporation returns, Form 1120S [10,15]"); - // TODO Get these quotes working correctly -// assertContains(text, "individual income tax return \u201Cshort forms.\u201D"); + assertContains(text, "individual income tax return \u201Cshort forms.\u201D"); // Formula based strings // TODO Find some then test