Track the codepage in old excel files, to be able to correctly decode the 8 bit strings in them

git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@1642561 13f79535-47bb-0310-9956-ffa450edef68
2014-11-30 16:21:39 +00:00 · 2014-11-30 16:21:39 +00:00 · be469c985f
commit be469c985f
parent 544b21405c
5 changed files with 53 additions and 19 deletions
--- a/src/java/org/apache/poi/hssf/extractor/OldExcelExtractor.java
+++ b/src/java/org/apache/poi/hssf/extractor/OldExcelExtractor.java
@ -25,6 +25,7 @@ import java.io.IOException;
 import java.io.InputStream;

 import org.apache.poi.hssf.record.BOFRecord;
+import org.apache.poi.hssf.record.CodepageRecord;
 import org.apache.poi.hssf.record.FormulaRecord;
 import org.apache.poi.hssf.record.NumberRecord;
 import org.apache.poi.hssf.record.OldFormulaRecord;
@ -110,6 +111,8 @@ public class OldExcelExtractor {
     *  for these old file formats
     */
    public String getText() {
+        StringBuffer text = new StringBuffer();
+        
        // Work out what version we're dealing with
        int bofSid = ris.getNextSid();
        switch (bofSid) {
@ -128,8 +131,10 @@ public class OldExcelExtractor {
            default:
                throw new IllegalArgumentException("File does not begin with a BOF, found sid of " + bofSid); 
        }
+        
+        // Most recently read codepage record, used to decode 8 bit strings (null until one is seen)
+        CodepageRecord codepage = null;

-        StringBuffer text = new StringBuffer();
        while (ris.hasNextRecord()) {
            int sid = ris.getNextSid();
            ris.nextRecord();
@ -139,6 +144,7 @@ public class OldExcelExtractor {
                case OldLabelRecord.biff2_sid:
                case OldLabelRecord.biff345_sid:
                    OldLabelRecord lr = new OldLabelRecord(ris);
+                    lr.setCodePage(codepage);
                    text.append(lr.getValue());
                    text.append('\n');
                    break;
@ -146,6 +152,7 @@ public class OldExcelExtractor {
                case OldStringRecord.biff2_sid:
                case OldStringRecord.biff345_sid:
                    OldStringRecord sr = new OldStringRecord(ris);
+                    sr.setCodePage(codepage);
                    text.append(sr.getString());
                    text.append('\n');
                    break;
@ -175,6 +182,10 @@ public class OldExcelExtractor {
                    handleNumericCell(text, rr.getRKNumber());
                    break;
                    
+                case CodepageRecord.sid:
+                    codepage = new CodepageRecord(ris);
+                    break;
+                    
                default:
                    ris.readFully(new byte[ris.remaining()]);
            }
--- a/src/java/org/apache/poi/hssf/record/CodepageRecord.java
+++ b/src/java/org/apache/poi/hssf/record/CodepageRecord.java
@ -19,13 +19,15 @@

 package org.apache.poi.hssf.record;

+import org.apache.poi.util.CodePageUtil;
 import org.apache.poi.util.LittleEndianOutput;

 /**
- * Title: Codepage Record<P>
- * Description:  the default characterset. for the workbook<P>
- * REFERENCE:  PG 293 Microsoft Excel 97 Developer's Kit (ISBN: 1-57231-498-2)<P>
- * @author Andrew C. Oliver (acoliver at apache dot org)
+ * Title: Codepage Record
+ * <p>Description: the default character set for the workbook</p>
+ * <p>REFERENCE:  PG 293 Microsoft Excel 97 Developer's Kit (ISBN: 1-57231-498-2)</p>
+ * <p>Use {@link CodePageUtil} to turn these values into Java code pages
+ *  to encode/decode strings.</p>
 * @version 2.0-pre
 */

@ -36,11 +38,10 @@ public final class CodepageRecord
    private short             field_1_codepage;   // = 0;

    /**
-     * the likely correct value for CODEPAGE (at least for US versions).  We could use
-     * some help with international versions (which we do not have access to documentation
-     * for)
+     * Excel 97+ (Biff 8) should always store strings as UTF-16LE or
+     *  compressed versions of that. As such, this should always be
+     *  0x4b0 = UTF_16, except for files coming from older versions.
     */
-
    public final static short CODEPAGE = ( short ) 0x4b0;

    public CodepageRecord()
--- a/src/java/org/apache/poi/hssf/record/OldLabelRecord.java
+++ b/src/java/org/apache/poi/hssf/record/OldLabelRecord.java
@ -32,9 +32,9 @@ public final class OldLabelRecord extends OldCellRecord {
    public final static short biff2_sid = 0x0004;
    public final static short biff345_sid = 0x0204;

-    private short     field_4_string_len;
-    private byte[]    field_5_bytes;
-    //private XXXXX   codepage; // TODO Implement for this and OldStringRecord
+    private short          field_4_string_len;
+    private byte[]         field_5_bytes;
+    private CodepageRecord codepage;

    /**
     * @param in the RecordInputstream to read the record from
@ -61,6 +61,10 @@ public final class OldLabelRecord extends OldCellRecord {
        }
    }

+    public void setCodePage(CodepageRecord codepage) {
+        this.codepage = codepage;
+    }
+    
    /**
     * get the number of characters this string contains
     * @return number of characters
@ -75,8 +79,7 @@ public final class OldLabelRecord extends OldCellRecord {
     */
    public String getValue()
    {
-        // We really need the codepage here to do this right...
-        return new String(field_5_bytes);
+        return OldStringRecord.getString(field_5_bytes, codepage);
    }

    /**
--- a/src/java/org/apache/poi/hssf/record/OldStringRecord.java
+++ b/src/java/org/apache/poi/hssf/record/OldStringRecord.java
@ -17,6 +17,10 @@

 package org.apache.poi.hssf.record;

+import java.io.UnsupportedEncodingException;
+
+import org.apache.poi.util.CodePageUtil;
+

 /**
 * Biff2 - Biff 4 Label Record (0x0007 / 0x0207) - read only support for 
@ -29,7 +33,7 @@ public final class OldStringRecord {
    private short             sid;
    private short             field_1_string_len;
    private byte[]            field_2_bytes;
-    //private XXXXX           codepage; // TODO Implement for this and OldLabelRecord
+    private CodepageRecord    codepage;

    /**
     * @param in the RecordInputstream to read the record from
@ -55,14 +59,30 @@ public final class OldStringRecord {
    public short getSid() {
        return sid;
    }
+    
+    public void setCodePage(CodepageRecord codepage) {
+        this.codepage = codepage;
+    }

    /**
     * @return The string represented by this record.
     */
    public String getString()
    {
-        // We really need the codepage here to do this right...
-        return new String(field_2_bytes);
+        return getString(field_2_bytes, codepage);
+    }
+    
+    protected static String getString(byte[] data, CodepageRecord codepage) {
+        int cp = CodePageUtil.CP_ISO_8859_1;
+        if (codepage != null) {
+            cp = codepage.getCodepage();
+        }
+        try {
+            return CodePageUtil.getStringFromCodePage(data, cp);
+        } catch (UnsupportedEncodingException uee) {
+            // Unsupported codepage: fall back to the platform default charset as a best effort
+            return new String(data);
+        }
    }

    public String toString()
--- a/src/testcases/org/apache/poi/hssf/extractor/TestOldExcelExtractor.java
+++ b/src/testcases/org/apache/poi/hssf/extractor/TestOldExcelExtractor.java
@ -81,8 +81,7 @@ public final class TestOldExcelExtractor extends POITestCase {
        // More complicated strings
        assertContains(text, "$100,000 or more");
        assertContains(text, "S corporation returns, Form 1120S [10,15]");
-        // TODO Get these quotes working correctly
-//        assertContains(text, "individual income tax return \u201Cshort forms.\u201D");
+        assertContains(text, "individual income tax return \u201Cshort forms.\u201D");
        
        // Formula based strings
        // TODO Find some then test