Start on a Text Extractor for the pre-OLE2 Excel formats like Excel 4, for TIKA-1490

git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@1642490 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Nick Burch 2014-11-30 00:16:23 +00:00
parent 8bf3ebfcee
commit 48ab7bf987
7 changed files with 320 additions and 4 deletions

View File

@ -39,7 +39,7 @@
# 2. cd build/dist
# 3. ./mvn-deploy.sh
M2_REPOSITORY=M2_REPOSITORY=https://repository.apache.org/service/local/staging/deploy/maven2
M2_REPOSITORY=https://repository.apache.org/service/local/staging/deploy/maven2
VERSION=@VERSION@
DSTAMP=@DSTAMP@

View File

@ -0,0 +1,97 @@
/* ====================================================================
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==================================================================== */
package org.apache.poi.hssf.extractor;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import org.apache.poi.hssf.record.LabelRecord;
import org.apache.poi.hssf.record.OldLabelRecord;
import org.apache.poi.hssf.record.RecordInputStream;
/**
* A text extractor for very old (pre-OLE2) Excel files,
* such as Excel 4 files.
* <p>
* Returns much (but not all) of the textual content of the file,
* suitable for indexing by something like Apache Lucene, or used
* by Apache Tika, but not really intended for display to the user.
* </p>
*/
public class OldExcelExtractor {
private InputStream input;
private boolean _includeSheetNames = true;
public OldExcelExtractor(InputStream input) {
this.input = input;
}
public OldExcelExtractor(File f) throws IOException {
this.input = new FileInputStream(f);
}
public static void main(String[] args) throws Exception {
if (args.length < 1) {
System.err.println("Use:");
System.err.println(" OldExcelExtractor <filename>");
System.exit(1);
}
OldExcelExtractor extractor = new OldExcelExtractor(new File(args[0]));
System.out.println(extractor.getText());
}
/**
* Should sheet names be included? Default is true
*/
public void setIncludeSheetNames(boolean includeSheetNames) {
_includeSheetNames = includeSheetNames;
}
/**
* Retrieves the text contents of the file, as best we can
* for these old file formats
*/
public String getText() {
StringBuffer text = new StringBuffer();
RecordInputStream ris = new RecordInputStream(input);
while (ris.hasNextRecord()) {
int sid = ris.getNextSid();
ris.nextRecord();
switch (sid) {
case LabelRecord.sid:
OldLabelRecord lr = new OldLabelRecord(ris);
text.append(lr.getValue());
text.append('\n');
break;
default:
ris.readFully(new byte[ris.remaining()]);
}
// label - 5.63 - TODO Needs codepages
// number - 5.71
// rk - 5.87
// string - 5.102
}
return text.toString();
}
}

View File

@ -25,9 +25,7 @@ import org.apache.poi.util.POILogger;
* Label Record (0x0204) - read only support for strings stored directly in the cell. Don't
* use this (except to read); use LabelSST instead. <P>
* REFERENCE: PG 325 Microsoft Excel 97 Developer's Kit (ISBN: 1-57231-498-2)<P>
* @author Andrew C. Oliver (acoliver at apache dot org)
* @author Jason Height (jheight at chariot dot net dot au)
* @version 2.0-pre
*
* @see org.apache.poi.hssf.record.LabelSSTRecord
*/
public final class LabelRecord extends Record implements CellValueRecordInterface {

View File

@ -0,0 +1,168 @@
/* ====================================================================
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==================================================================== */
package org.apache.poi.hssf.record;
import org.apache.poi.util.HexDump;
import org.apache.poi.util.POILogFactory;
import org.apache.poi.util.POILogger;
/**
* Biff2 - Biff 4 Label Record (0x0004 / 0x0204) - read only support for
* strings stored directly in the cell, from the older file formats that
* didn't use {@link LabelSSTRecord}
*/
public final class OldLabelRecord extends Record implements CellValueRecordInterface {
    private final static POILogger logger = POILogFactory.getLogger(OldLabelRecord.class);

    /** Record id used for label records in Biff 2 files */
    public final static short biff2_sid = 0x0004;
    /** Record id used for label records in Biff 3, 4 and 5 files */
    public final static short biff345_sid = 0x0204;

    /**
     * Charset used to decode the string bytes until real codepage support
     * exists. Fixed, so that the extracted text does not change with the
     * default charset of the machine the code happens to run on.
     */
    private static final java.nio.charset.Charset FALLBACK_CHARSET = lookupFallbackCharset();

    private short sid;
    private int field_1_row;
    private short field_2_column;
    private int field_3_cell_attrs; // Biff 2
    private short field_3_xf_index; // Biff 3+
    private short field_4_string_len;
    private byte[] field_5_bytes;
    //private XXXXX codepage; // TODO

    /**
     * Picks windows-1252 where the JVM offers it, otherwise ISO-8859-1,
     * which every JVM is required to provide.
     */
    private static java.nio.charset.Charset lookupFallbackCharset() {
        try {
            return java.nio.charset.Charset.forName("windows-1252");
        } catch (IllegalArgumentException ignored) {
            // Charset not available on this JVM, use the guaranteed one
            return java.nio.charset.Charset.forName("ISO-8859-1");
        }
    }

    /**
     * Reads a label record in either the Biff 2 or the Biff 3+ layout,
     * chosen by the sid of the record the stream is currently on.
     *
     * @param in the RecordInputstream to read the record from
     * @throws RecordFormatException if the record declares a negative string length
     */
    public OldLabelRecord(RecordInputStream in)
    {
        sid = in.getSid();

        field_1_row = in.readUShort();
        field_2_column = in.readShort();

        if (sid == biff2_sid) {
            // Biff 2: three bytes of cell attributes, then a one byte length
            field_3_cell_attrs = in.readUShort() << 8;
            field_3_cell_attrs += in.readUByte();
            field_4_string_len = (short)in.readUByte();
        } else {
            // Biff 3+: two byte XF index, then a two byte length
            field_3_xf_index = in.readShort();
            field_4_string_len = in.readShort();
        }

        // A signed short length can come out negative on a corrupt record;
        // report that as a format problem rather than failing on allocation
        if (field_4_string_len < 0) {
            throw new RecordFormatException(
                    "Old Label Record has invalid string length " + field_4_string_len);
        }

        // Can only decode properly later when you know the codepage
        field_5_bytes = new byte[field_4_string_len];
        in.read(field_5_bytes, 0, field_4_string_len);

        if (in.remaining() > 0) {
            logger.log(POILogger.INFO,
                    "LabelRecord data remains: " + in.remaining() +
                    " : " + HexDump.toHex(in.readRemainder())
                    );
        }
    }

    /**
     * @return true if this record was read with the Biff 2 sid
     */
    public boolean isBiff2() {
        return sid == biff2_sid;
    }

    public int getRow()
    {
        return field_1_row;
    }

    public short getColumn()
    {
        return field_2_column;
    }

    /**
     * @return the XF index, only populated for Biff 3+ records
     */
    public short getXFIndex()
    {
        return field_3_xf_index;
    }

    /**
     * @return the packed cell attributes, only populated for Biff 2 records
     */
    public int getCellAttrs()
    {
        return field_3_cell_attrs;
    }

    /**
     * get the length of the string, as stored in the record
     * @return the stored string length
     */
    public short getStringLength()
    {
        return field_4_string_len;
    }

    /**
     * Get the String of the cell
     *
     * @return the string bytes decoded with a fixed single byte charset
     */
    public String getValue()
    {
        // We really need the codepage here to do this right... Until then
        // decode with a fixed charset instead of the platform default, so
        // the result is the same on every machine
        return FALLBACK_CHARSET.decode(java.nio.ByteBuffer.wrap(field_5_bytes)).toString();
    }

    /**
     * Not supported
     */
    public int serialize(int offset, byte [] data) {
        throw new RecordFormatException("Old Label Records are supported READ ONLY");
    }

    /**
     * Not supported
     */
    public int getRecordSize() {
        throw new RecordFormatException("Old Label Records are supported READ ONLY");
    }

    /**
     * @return the sid this record was read with
     */
    public short getSid()
    {
        return sid;
    }

    public String toString()
    {
        StringBuilder sb = new StringBuilder();
        sb.append("[OLD LABEL]\n");
        sb.append(" .row = ").append(HexDump.shortToHex(getRow())).append("\n");
        sb.append(" .column = ").append(HexDump.shortToHex(getColumn())).append("\n");
        if (isBiff2()) {
            sb.append(" .cellattrs = ").append(HexDump.shortToHex(getCellAttrs())).append("\n");
        } else {
            sb.append(" .xfindex = ").append(HexDump.shortToHex(getXFIndex())).append("\n");
        }
        sb.append(" .string_len= ").append(HexDump.shortToHex(field_4_string_len)).append("\n");
        sb.append(" .value = ").append(getValue()).append("\n");
        sb.append("[/OLD LABEL]\n");
        return sb.toString();
    }

    /**
     * NO-OP!
     */
    public void setColumn(short col)
    {
    }

    /**
     * NO-OP!
     */
    public void setRow(int row)
    {
    }

    /**
     * no op!
     */
    public void setXFIndex(short xf)
    {
    }
}

View File

@ -38,6 +38,7 @@ public class TestBiffViewer extends BaseXLSIteratingTest {
SILENT_EXCLUDED.add("46904.xls");
SILENT_EXCLUDED.add("35897-type4.xls"); // unsupported crypto api header
SILENT_EXCLUDED.add("xor-encryption-abc.xls"); // unsupported XOR-encryption
SILENT_EXCLUDED.add("testEXCEL_4.xls"); // Biff 4 / Excel 4, pre-OLE2
}
@Override

View File

@ -0,0 +1,52 @@
/* ====================================================================
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==================================================================== */
package org.apache.poi.hssf.extractor;
import java.io.InputStream;
import junit.framework.TestCase;
import org.apache.poi.hssf.HSSFTestDataSamples;
/**
* Unit tests for the Excel 4 (and older) text extractor
*/
public final class TestOldExcelExtractor extends TestCase {
    /**
     * Builds an extractor over the named HSSF sample file. Any exception
     * raised while constructing the extractor is rethrown wrapped in a
     * RuntimeException.
     */
    private static OldExcelExtractor createExtractor(String sampleFileName) {
        final InputStream sampleStream = HSSFTestDataSamples.openSampleFileStream(sampleFileName);

        OldExcelExtractor result;
        try {
            result = new OldExcelExtractor(sampleStream);
        } catch (Exception e) {
            throw new RuntimeException(e);
        }
        return result;
    }

    /**
     * Extracts the Excel 4 sample and looks for a couple of known words.
     */
    public void testSimple() {
        // Getting the text at all must not fail
        final String text = createExtractor("testEXCEL_4.xls").getText();

        // A few words which are known to be in the file
        final String[] expectedWords = { "Size", "Returns" };
        for (String word : expectedWords) {
            assertTrue(text, text.contains(word));
        }
    }

    // TODO Rest of the tests
}

Binary file not shown.