Further Excel 4 text extractor support, for TIKA-1490

git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@1642491 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Nick Burch 2014-11-30 00:48:17 +00:00
parent 48ab7bf987
commit 553964a455
5 changed files with 155 additions and 10 deletions

View File

@ -22,9 +22,13 @@ import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import org.apache.poi.hssf.record.LabelRecord;
import org.apache.poi.hssf.record.FormulaRecord;
import org.apache.poi.hssf.record.NumberRecord;
import org.apache.poi.hssf.record.OldLabelRecord;
import org.apache.poi.hssf.record.OldStringRecord;
import org.apache.poi.hssf.record.RKRecord;
import org.apache.poi.hssf.record.RecordInputStream;
import org.apache.poi.ss.usermodel.Cell;
/**
* A text extractor for very old (pre-OLE2) Excel files,
@ -76,20 +80,44 @@ public class OldExcelExtractor {
ris.nextRecord();
switch (sid) {
case LabelRecord.sid:
// label - 5.63 - TODO Needs codepages
case OldLabelRecord.biff2_sid:
case OldLabelRecord.biff345_sid:
OldLabelRecord lr = new OldLabelRecord(ris);
text.append(lr.getValue());
text.append('\n');
break;
// string - 5.102 - TODO Needs codepages
case OldStringRecord.biff2_sid:
case OldStringRecord.biff345_sid:
OldStringRecord sr = new OldStringRecord(ris);
text.append(sr.getString());
text.append('\n');
break;
// number - 5.71 - TODO Needs format strings
case NumberRecord.sid:
NumberRecord nr = new NumberRecord(ris);
text.append(nr.getValue());
text.append('\n');
break;
/*
case OldFormulaRecord.sid:
FormulaRecord fr = new FormulaRecord(ris);
System.out.println(fr.getCachedResultType());
if (fr.getCachedResultType() == Cell.CELL_TYPE_NUMERIC) {
text.append(fr.getValue());
text.append('\n');
}
*/
case RKRecord.sid:
RKRecord rr = new RKRecord(ris);
text.append(rr.getRKNumber());
text.append('\n');
break;
default:
ris.readFully(new byte[ris.remaining()]);
// text.append(" = " + ris.getSid() + " = \n");
}
// label - 5.63 - TODO Needs codepages
// number - 5.71
// rk - 5.87
// string - 5.102
}
return text.toString();

View File

@ -36,6 +36,7 @@ import org.apache.poi.util.LittleEndianOutput;
public final class FormulaRecord extends CellRecord {
public static final short sid = 0x0006; // docs say 406...because of a bug Microsoft support site article #Q184647)
public static final short olderSid = 0x0406; // older biff versions do manage 406!
private static int FIXED_SIZE = 14; // double + short + int
private static final BitField alwaysCalc = BitFieldFactory.getInstance(0x0001);

View File

@ -39,7 +39,7 @@ public final class OldLabelRecord extends Record implements CellValueRecordInter
private short field_3_xf_index; // Biff 3+
private short field_4_string_len;
private byte[] field_5_bytes;
//private XXXXX codepage; // TODO
//private XXXXX codepage; // TODO Implement for this and OldStringRecord
/**
* @param in the RecordInputstream to read the record from

View File

@ -0,0 +1,78 @@
/* ====================================================================
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==================================================================== */
package org.apache.poi.hssf.record;
/**
 * Biff2 - Biff 4 String Record (0x0007 / 0x0207) - read only support for
 * formula string results.
 *
 * <p>The record body is a length prefix followed by that many raw bytes:
 * the prefix is a single unsigned byte for the Biff2 sid and a two byte
 * value for the Biff3/4/5 sid.</p>
 *
 * <p>The bytes are kept undecoded, because the workbook codepage is not yet
 * available to this class; see {@link #getString()} for how they are
 * currently turned into text.</p>
 */
public final class OldStringRecord {
    public final static short biff2_sid = 0x0007;
    public final static short biff345_sid = 0x0207;

    /**
     * Charset used until real codepage support exists. ISO-8859-1 maps every
     * byte value to exactly one character, so decoding is lossless and gives
     * the same result on every platform (unlike the JVM default charset).
     */
    private static final java.nio.charset.Charset FALLBACK_CHARSET =
            java.nio.charset.Charset.forName("ISO-8859-1");

    private final short sid;
    // Held as an int so that the unsigned 16 bit length of Biff3+ records
    // (up to 65535) can never turn negative.
    private final int field_1_string_len;
    private final byte[] field_2_bytes;
    //private XXXXX codepage; // TODO Implement for this and OldLabelRecord

    /**
     * @param in the RecordInputstream to read the record from
     */
    public OldStringRecord(RecordInputStream in) {
        sid = in.getSid();

        if (sid == biff2_sid) {
            // Biff2 stores the length in a single unsigned byte
            field_1_string_len = in.readUByte();
        } else {
            // Read the two byte length as unsigned: a signed read would make
            // lengths above 32767 negative and break the allocation below
            field_1_string_len = in.readUShort();
        }

        // Can only decode properly later when you know the codepage
        field_2_bytes = new byte[field_1_string_len];
        // NOTE(review): the result of read() is not checked, so a record
        //  shorter than its declared length would presumably leave trailing
        //  zero bytes in the buffer - confirm against RecordInputStream
        in.read(field_2_bytes, 0, field_1_string_len);
    }

    /**
     * @return true if this record was read using the Biff2 sid
     */
    public boolean isBiff2() {
        return sid == biff2_sid;
    }

    /**
     * @return the sid this record was read with
     */
    public short getSid() {
        return sid;
    }

    /**
     * @return The string represented by this record.
     */
    public String getString()
    {
        // We really need the codepage here to do this right...
        // Until then, decode with a fixed single-byte charset rather than
        //  the platform default, so results do not vary between machines
        return new String(field_2_bytes, FALLBACK_CHARSET);
    }

    @Override
    public String toString()
    {
        StringBuilder buffer = new StringBuilder();

        buffer.append("[OLD STRING]\n");
        buffer.append(" .string = ")
            .append(getString()).append("\n");
        buffer.append("[/OLD STRING]\n");
        return buffer.toString();
    }
}

View File

@ -46,7 +46,45 @@ public final class TestOldExcelExtractor extends TestCase {
// Check we find a few words we expect in there
assertTrue(text, text.contains("Size"));
assertTrue(text, text.contains("Returns"));
// Check we find a few numbers we expect in there
assertTrue(text, text.contains("11"));
assertTrue(text, text.contains("784"));
}
// TODO Rest of the tests
/**
 * Extracts the text of the testEXCEL_4.xls sample and checks that a
 * selection of known string cell contents appear in the output.
 */
public void testStrings() {
    final String extracted = createExtractor("testEXCEL_4.xls").getText();

    // Simple strings
    final String[] simpleStrings = {
        "Table 10 -- Examination Coverage:",
        "Recommended and Average Recommended Additional Tax After",
        "Individual income tax returns, total"
    };
    for (String expected : simpleStrings) {
        assertTrue(extracted, extracted.contains(expected));
    }

    // More complicated strings (currency symbol, brackets, commas)
    final String[] complexStrings = {
        "$100,000 or more",
        "S corporation returns, Form 1120S [10,15]"
    };
    for (String expected : complexStrings) {
        assertTrue(extracted, extracted.contains(expected));
    }

    // TODO Get these quotes working correctly
    // assertTrue(text, text.contains("individual income tax return “short forms.”"));

    // Formula based strings
    // TODO Find some then test
}
/**
 * Extracts the text of the testEXCEL_4.xls sample and checks that known
 * numeric cell values appear in the output. Formula results and formatted
 * numbers are not yet covered.
 */
public void testFormattedNumbers() {
    final String extracted = createExtractor("testEXCEL_4.xls").getText();

    // Simple numbers
    for (String expected : new String[] { "151", "784" }) {
        assertTrue(extracted, extracted.contains(expected));
    }

    // Numbers which come from formulas
    // TODO
    // assertTrue(text, text.contains("0.40"));
    // assertTrue(text, text.contains("624"));

    // Formatted numbers
    // TODO
}
}