Further Excel 4 text extractor support, for TIKA-1490

git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@1642492 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Nick Burch 2014-11-30 01:03:24 +00:00
parent 553964a455
commit 738c518474
4 changed files with 127 additions and 9 deletions

View File

@ -24,6 +24,7 @@ import java.io.InputStream;
import org.apache.poi.hssf.record.FormulaRecord; import org.apache.poi.hssf.record.FormulaRecord;
import org.apache.poi.hssf.record.NumberRecord; import org.apache.poi.hssf.record.NumberRecord;
import org.apache.poi.hssf.record.OldFormulaRecord;
import org.apache.poi.hssf.record.OldLabelRecord; import org.apache.poi.hssf.record.OldLabelRecord;
import org.apache.poi.hssf.record.OldStringRecord; import org.apache.poi.hssf.record.OldStringRecord;
import org.apache.poi.hssf.record.RKRecord; import org.apache.poi.hssf.record.RKRecord;
@ -100,15 +101,15 @@ public class OldExcelExtractor {
text.append(nr.getValue()); text.append(nr.getValue());
text.append('\n'); text.append('\n');
break; break;
/* case OldFormulaRecord.biff2_sid:
case OldFormulaRecord.sid: case OldFormulaRecord.biff3_sid:
FormulaRecord fr = new FormulaRecord(ris); case OldFormulaRecord.biff4_sid:
System.out.println(fr.getCachedResultType()); OldFormulaRecord fr = new OldFormulaRecord(ris);
if (fr.getCachedResultType() == Cell.CELL_TYPE_NUMERIC) { // if (fr.getCachedResultType() == Cell.CELL_TYPE_NUMERIC) {
text.append(fr.getValue()); text.append(fr.getValue());
text.append('\n'); text.append('\n');
} // }
*/ break;
case RKRecord.sid: case RKRecord.sid:
RKRecord rr = new RKRecord(ris); RKRecord rr = new RKRecord(ris);
text.append(rr.getRKNumber()); text.append(rr.getRKNumber());

View File

@ -36,7 +36,6 @@ import org.apache.poi.util.LittleEndianOutput;
public final class FormulaRecord extends CellRecord { public final class FormulaRecord extends CellRecord {
public static final short sid = 0x0006; // docs say 406...because of a bug Microsoft support site article #Q184647) public static final short sid = 0x0006; // docs say 406...because of a bug Microsoft support site article #Q184647)
public static final short olderSid = 0x0406; // older biff versions do manage 406!
private static int FIXED_SIZE = 14; // double + short + int private static int FIXED_SIZE = 14; // double + short + int
private static final BitField alwaysCalc = BitFieldFactory.getInstance(0x0001); private static final BitField alwaysCalc = BitFieldFactory.getInstance(0x0001);

View File

@ -0,0 +1,118 @@
/* ====================================================================
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==================================================================== */
package org.apache.poi.hssf.record;
import org.apache.poi.ss.formula.Formula;
import org.apache.poi.ss.formula.ptg.Ptg;
/**
 * Formula Record (0x0006 / 0x0206 / 0x0406) - holds a formula in
 * encoded form, along with the value if a number.
 *
 * <p>The record layout depends on the sid of the record being read:
 * when it equals {@link #biff2_sid} the cell attributes are read as
 * three bytes and the options as a single byte; for every other sid
 * a two-byte XF index and two-byte options are read instead.</p>
 *
 * <p>NOTE(review): {@link #biff5_sid} has the same value as
 * {@link #biff2_sid}, so a record with sid 0x0006 is always decoded
 * using the three-byte attribute layout - confirm this is intended
 * for BIFF5 input.</p>
 */
public final class OldFormulaRecord {
    public static final short biff2_sid = 0x0006;
    public static final short biff3_sid = 0x0206;
    public static final short biff4_sid = 0x0406;
    public static final short biff5_sid = 0x0006;

    /** The sid of the record this instance was read from. */
    private final short sid;
    private int field_1_row;
    private short field_2_column;
    private int field_3_cell_attrs; // Biff 2
    private short field_3_xf_index; // Biff 3+
    private double field_4_value;
    private short field_5_options;
    private Formula field_6_parsed_expr;

    /**
     * Reads a formula record from the given stream, which must be
     * positioned at the start of the record's data.
     *
     * @param ris the stream to read from; its current sid selects
     *            which of the two field layouts is used
     */
    public OldFormulaRecord(RecordInputStream ris) {
        // Remember which record type we were built from, so that
        // getSid() reports it rather than the field's default of 0
        sid = ris.getSid();
        boolean isBiff2 = (sid == biff2_sid);

        field_1_row = ris.readUShort();
        field_2_column = ris.readShort();

        if (isBiff2) {
            // Three bytes of cell attributes, packed into one int
            field_3_cell_attrs = ris.readUShort() << 8;
            field_3_cell_attrs += ris.readUByte();
        } else {
            field_3_xf_index = ris.readShort();
        }

        // TODO Handle special cached values, for Biff 3+
        field_4_value = ris.readDouble();

        if (isBiff2) {
            field_5_options = (short) ris.readUByte();
        } else {
            field_5_options = ris.readShort();
        }

        int expression_len = ris.readShort();
        int nBytesAvailable = ris.available();
        field_6_parsed_expr = Formula.read(expression_len, ris, nBytesAvailable);
    }

    /**
     * @return the row value, as read from the record (unsigned 16 bit)
     */
    public int getRow() {
        return field_1_row;
    }

    /**
     * @return the column value, as read from the record
     */
    public short getColumn() {
        return field_2_column;
    }

    /**
     * @return the XF index; only read from the record when its sid
     *         is not {@link #biff2_sid}, otherwise left at 0
     */
    public short getXFIndex() {
        return field_3_xf_index;
    }

    /**
     * @return the packed three-byte cell attributes; only read from the
     *         record when its sid is {@link #biff2_sid}, otherwise left at 0
     */
    public int getCellAttrs() {
        return field_3_cell_attrs;
    }

    /**
     * get the calculated value of the formula
     *
     * @return calculated value
     */
    public double getValue() {
        return field_4_value;
    }

    /**
     * get the option flags
     *
     * @return bitmask
     */
    public short getOptions() {
        return field_5_options;
    }

    /**
     * @return the formula tokens. never <code>null</code>
     */
    public Ptg[] getParsedExpression() {
        return field_6_parsed_expr.getTokens();
    }

    /**
     * @return the formula in its encoded form, as read from the record
     */
    public Formula getFormula() {
        return field_6_parsed_expr;
    }

    /**
     * @return the sid of the record this instance was read from
     */
    public short getSid() {
        return sid;
    }
}

View File

@ -65,7 +65,7 @@ public final class TestOldExcelExtractor extends TestCase {
assertTrue(text, text.contains("$100,000 or more")); assertTrue(text, text.contains("$100,000 or more"));
assertTrue(text, text.contains("S corporation returns, Form 1120S [10,15]")); assertTrue(text, text.contains("S corporation returns, Form 1120S [10,15]"));
// TODO Get these quotes working correctly // TODO Get these quotes working correctly
// assertTrue(text, text.contains("individual income tax return “short forms.”")); // assertTrue(text, text.contains("individual income tax return \u201Cshort forms.\u201D"));
// Formula based strings // Formula based strings
// TODO Find some then test // TODO Find some then test