Start on a Text Extractor for the pre-OLE2 Excel formats like Excel 4, for TIKA-1490
git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@1642490 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
8bf3ebfcee
commit
48ab7bf987
maven
src
java/org/apache/poi/hssf
testcases/org/apache/poi/hssf
test-data/spreadsheet
@ -39,7 +39,7 @@
|
|||||||
# 2. cd build/dist
|
# 2. cd build/dist
|
||||||
# 3. ./mvn-deploy.sh
|
# 3. ./mvn-deploy.sh
|
||||||
|
|
||||||
M2_REPOSITORY=M2_REPOSITORY=https://repository.apache.org/service/local/staging/deploy/maven2
|
M2_REPOSITORY=https://repository.apache.org/service/local/staging/deploy/maven2
|
||||||
|
|
||||||
VERSION=@VERSION@
|
VERSION=@VERSION@
|
||||||
DSTAMP=@DSTAMP@
|
DSTAMP=@DSTAMP@
|
||||||
|
@ -0,0 +1,97 @@
|
|||||||
|
/* ====================================================================
|
||||||
|
Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
contributor license agreements. See the NOTICE file distributed with
|
||||||
|
this work for additional information regarding copyright ownership.
|
||||||
|
The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
(the "License"); you may not use this file except in compliance with
|
||||||
|
the License. You may obtain a copy of the License at
|
||||||
|
|
||||||
|
http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
|
||||||
|
Unless required by applicable law or agreed to in writing, software
|
||||||
|
distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
See the License for the specific language governing permissions and
|
||||||
|
limitations under the License.
|
||||||
|
==================================================================== */
|
||||||
|
|
||||||
|
package org.apache.poi.hssf.extractor;
|
||||||
|
|
||||||
|
import java.io.File;
|
||||||
|
import java.io.FileInputStream;
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.io.InputStream;
|
||||||
|
|
||||||
|
import org.apache.poi.hssf.record.LabelRecord;
|
||||||
|
import org.apache.poi.hssf.record.OldLabelRecord;
|
||||||
|
import org.apache.poi.hssf.record.RecordInputStream;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* A text extractor for very old (pre-OLE2) Excel files,
|
||||||
|
* such as Excel 4 files.
|
||||||
|
* <p>
|
||||||
|
* Returns much (but not all) of the textual content of the file,
|
||||||
|
* suitable for indexing by something like Apache Lucene, or used
|
||||||
|
* by Apache Tika, but not really intended for display to the user.
|
||||||
|
* </p>
|
||||||
|
*/
|
||||||
|
public class OldExcelExtractor {
|
||||||
|
private InputStream input;
|
||||||
|
private boolean _includeSheetNames = true;
|
||||||
|
|
||||||
|
public OldExcelExtractor(InputStream input) {
|
||||||
|
this.input = input;
|
||||||
|
}
|
||||||
|
public OldExcelExtractor(File f) throws IOException {
|
||||||
|
this.input = new FileInputStream(f);
|
||||||
|
}
|
||||||
|
|
||||||
|
public static void main(String[] args) throws Exception {
|
||||||
|
if (args.length < 1) {
|
||||||
|
System.err.println("Use:");
|
||||||
|
System.err.println(" OldExcelExtractor <filename>");
|
||||||
|
System.exit(1);
|
||||||
|
}
|
||||||
|
OldExcelExtractor extractor = new OldExcelExtractor(new File(args[0]));
|
||||||
|
System.out.println(extractor.getText());
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Should sheet names be included? Default is true
|
||||||
|
*/
|
||||||
|
public void setIncludeSheetNames(boolean includeSheetNames) {
|
||||||
|
_includeSheetNames = includeSheetNames;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Retrieves the text contents of the file, as best we can
|
||||||
|
* for these old file formats
|
||||||
|
*/
|
||||||
|
public String getText() {
|
||||||
|
StringBuffer text = new StringBuffer();
|
||||||
|
|
||||||
|
RecordInputStream ris = new RecordInputStream(input);
|
||||||
|
while (ris.hasNextRecord()) {
|
||||||
|
int sid = ris.getNextSid();
|
||||||
|
ris.nextRecord();
|
||||||
|
|
||||||
|
switch (sid) {
|
||||||
|
case LabelRecord.sid:
|
||||||
|
OldLabelRecord lr = new OldLabelRecord(ris);
|
||||||
|
text.append(lr.getValue());
|
||||||
|
text.append('\n');
|
||||||
|
break;
|
||||||
|
default:
|
||||||
|
ris.readFully(new byte[ris.remaining()]);
|
||||||
|
}
|
||||||
|
|
||||||
|
// label - 5.63 - TODO Needs codepages
|
||||||
|
// number - 5.71
|
||||||
|
// rk - 5.87
|
||||||
|
// string - 5.102
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
return text.toString();
|
||||||
|
}
|
||||||
|
}
|
@ -25,9 +25,7 @@ import org.apache.poi.util.POILogger;
|
|||||||
* Label Record (0x0204) - read only support for strings stored directly in the cell.. Don't
|
* Label Record (0x0204) - read only support for strings stored directly in the cell.. Don't
|
||||||
* use this (except to read), use LabelSST instead <P>
|
* use this (except to read), use LabelSST instead <P>
|
||||||
* REFERENCE: PG 325 Microsoft Excel 97 Developer's Kit (ISBN: 1-57231-498-2)<P>
|
* REFERENCE: PG 325 Microsoft Excel 97 Developer's Kit (ISBN: 1-57231-498-2)<P>
|
||||||
* @author Andrew C. Oliver (acoliver at apache dot org)
|
*
|
||||||
* @author Jason Height (jheight at chariot dot net dot au)
|
|
||||||
* @version 2.0-pre
|
|
||||||
* @see org.apache.poi.hssf.record.LabelSSTRecord
|
* @see org.apache.poi.hssf.record.LabelSSTRecord
|
||||||
*/
|
*/
|
||||||
public final class LabelRecord extends Record implements CellValueRecordInterface {
|
public final class LabelRecord extends Record implements CellValueRecordInterface {
|
||||||
|
168
src/java/org/apache/poi/hssf/record/OldLabelRecord.java
Normal file
168
src/java/org/apache/poi/hssf/record/OldLabelRecord.java
Normal file
@ -0,0 +1,168 @@
|
|||||||
|
/* ====================================================================
|
||||||
|
Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
contributor license agreements. See the NOTICE file distributed with
|
||||||
|
this work for additional information regarding copyright ownership.
|
||||||
|
The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
(the "License"); you may not use this file except in compliance with
|
||||||
|
the License. You may obtain a copy of the License at
|
||||||
|
|
||||||
|
http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
|
||||||
|
Unless required by applicable law or agreed to in writing, software
|
||||||
|
distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
See the License for the specific language governing permissions and
|
||||||
|
limitations under the License.
|
||||||
|
==================================================================== */
|
||||||
|
|
||||||
|
package org.apache.poi.hssf.record;
|
||||||
|
|
||||||
|
import org.apache.poi.util.HexDump;
|
||||||
|
import org.apache.poi.util.POILogFactory;
|
||||||
|
import org.apache.poi.util.POILogger;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Biff2 - Biff 4 Label Record (0x0004 / 0x0204) - read only support for
|
||||||
|
* strings stored directly in the cell, from the older file formats that
|
||||||
|
* didn't use {@link LabelSSTRecord}
|
||||||
|
*/
|
||||||
|
public final class OldLabelRecord extends Record implements CellValueRecordInterface {
|
||||||
|
private final static POILogger logger = POILogFactory.getLogger(OldLabelRecord.class);
|
||||||
|
|
||||||
|
public final static short biff2_sid = 0x0004;
|
||||||
|
public final static short biff345_sid = 0x0204;
|
||||||
|
|
||||||
|
private short sid;
|
||||||
|
private int field_1_row;
|
||||||
|
private short field_2_column;
|
||||||
|
private int field_3_cell_attrs; // Biff 2
|
||||||
|
private short field_3_xf_index; // Biff 3+
|
||||||
|
private short field_4_string_len;
|
||||||
|
private byte[] field_5_bytes;
|
||||||
|
//private XXXXX codepage; // TODO
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @param in the RecordInputstream to read the record from
|
||||||
|
*/
|
||||||
|
public OldLabelRecord(RecordInputStream in)
|
||||||
|
{
|
||||||
|
sid = in.getSid();
|
||||||
|
|
||||||
|
field_1_row = in.readUShort();
|
||||||
|
field_2_column = in.readShort();
|
||||||
|
|
||||||
|
if (in.getSid() == biff2_sid) {
|
||||||
|
field_3_cell_attrs = in.readUShort() << 8;
|
||||||
|
field_3_cell_attrs += in.readUByte();
|
||||||
|
field_4_string_len = (short)in.readUByte();
|
||||||
|
} else {
|
||||||
|
field_3_xf_index = in.readShort();
|
||||||
|
field_4_string_len = in.readShort();
|
||||||
|
}
|
||||||
|
|
||||||
|
// Can only decode properly later when you know the codepage
|
||||||
|
field_5_bytes = new byte[field_4_string_len];
|
||||||
|
in.read(field_5_bytes, 0, field_4_string_len);
|
||||||
|
|
||||||
|
if (in.remaining() > 0) {
|
||||||
|
logger.log(POILogger.INFO,
|
||||||
|
"LabelRecord data remains: " + in.remaining() +
|
||||||
|
" : " + HexDump.toHex(in.readRemainder())
|
||||||
|
);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public boolean isBiff2() {
|
||||||
|
return sid == biff2_sid;
|
||||||
|
}
|
||||||
|
|
||||||
|
public int getRow()
|
||||||
|
{
|
||||||
|
return field_1_row;
|
||||||
|
}
|
||||||
|
|
||||||
|
public short getColumn()
|
||||||
|
{
|
||||||
|
return field_2_column;
|
||||||
|
}
|
||||||
|
|
||||||
|
public short getXFIndex()
|
||||||
|
{
|
||||||
|
return field_3_xf_index;
|
||||||
|
}
|
||||||
|
public int getCellAttrs()
|
||||||
|
{
|
||||||
|
return field_3_cell_attrs;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* get the number of characters this string contains
|
||||||
|
* @return number of characters
|
||||||
|
*/
|
||||||
|
public short getStringLength()
|
||||||
|
{
|
||||||
|
return field_4_string_len;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Get the String of the cell
|
||||||
|
*/
|
||||||
|
public String getValue()
|
||||||
|
{
|
||||||
|
// We really need the codepage here to do this right...
|
||||||
|
return new String(field_5_bytes);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Not supported
|
||||||
|
*/
|
||||||
|
public int serialize(int offset, byte [] data) {
|
||||||
|
throw new RecordFormatException("Old Label Records are supported READ ONLY");
|
||||||
|
}
|
||||||
|
public int getRecordSize() {
|
||||||
|
throw new RecordFormatException("Old Label Records are supported READ ONLY");
|
||||||
|
}
|
||||||
|
|
||||||
|
public short getSid()
|
||||||
|
{
|
||||||
|
return sid;
|
||||||
|
}
|
||||||
|
|
||||||
|
public String toString()
|
||||||
|
{
|
||||||
|
StringBuffer sb = new StringBuffer();
|
||||||
|
sb.append("[OLD LABEL]\n");
|
||||||
|
sb.append(" .row = ").append(HexDump.shortToHex(getRow())).append("\n");
|
||||||
|
sb.append(" .column = ").append(HexDump.shortToHex(getColumn())).append("\n");
|
||||||
|
if (isBiff2()) {
|
||||||
|
sb.append(" .cellattrs = ").append(HexDump.shortToHex(getCellAttrs())).append("\n");
|
||||||
|
} else {
|
||||||
|
sb.append(" .xfindex = ").append(HexDump.shortToHex(getXFIndex())).append("\n");
|
||||||
|
}
|
||||||
|
sb.append(" .string_len= ").append(HexDump.shortToHex(field_4_string_len)).append("\n");
|
||||||
|
sb.append(" .value = ").append(getValue()).append("\n");
|
||||||
|
sb.append("[/OLD LABEL]\n");
|
||||||
|
return sb.toString();
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* NO-OP!
|
||||||
|
*/
|
||||||
|
public void setColumn(short col)
|
||||||
|
{
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* NO-OP!
|
||||||
|
*/
|
||||||
|
public void setRow(int row)
|
||||||
|
{
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* no op!
|
||||||
|
*/
|
||||||
|
public void setXFIndex(short xf)
|
||||||
|
{
|
||||||
|
}
|
||||||
|
}
|
@ -38,6 +38,7 @@ public class TestBiffViewer extends BaseXLSIteratingTest {
|
|||||||
SILENT_EXCLUDED.add("46904.xls");
|
SILENT_EXCLUDED.add("46904.xls");
|
||||||
SILENT_EXCLUDED.add("35897-type4.xls"); // unsupported crypto api header
|
SILENT_EXCLUDED.add("35897-type4.xls"); // unsupported crypto api header
|
||||||
SILENT_EXCLUDED.add("xor-encryption-abc.xls"); // unsupported XOR-encryption
|
SILENT_EXCLUDED.add("xor-encryption-abc.xls"); // unsupported XOR-encryption
|
||||||
|
SILENT_EXCLUDED.add("testEXCEL_4.xls"); // Biff 4 / Excel 4, pre-OLE2
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
|
@ -0,0 +1,52 @@
|
|||||||
|
/* ====================================================================
|
||||||
|
Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
contributor license agreements. See the NOTICE file distributed with
|
||||||
|
this work for additional information regarding copyright ownership.
|
||||||
|
The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
(the "License"); you may not use this file except in compliance with
|
||||||
|
the License. You may obtain a copy of the License at
|
||||||
|
|
||||||
|
http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
|
||||||
|
Unless required by applicable law or agreed to in writing, software
|
||||||
|
distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
See the License for the specific language governing permissions and
|
||||||
|
limitations under the License.
|
||||||
|
==================================================================== */
|
||||||
|
|
||||||
|
package org.apache.poi.hssf.extractor;
|
||||||
|
|
||||||
|
import java.io.InputStream;
|
||||||
|
|
||||||
|
import junit.framework.TestCase;
|
||||||
|
|
||||||
|
import org.apache.poi.hssf.HSSFTestDataSamples;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Unit tests for the Excel 4 (and older) text extractor
|
||||||
|
*/
|
||||||
|
public final class TestOldExcelExtractor extends TestCase {
|
||||||
|
private static OldExcelExtractor createExtractor(String sampleFileName) {
|
||||||
|
InputStream is = HSSFTestDataSamples.openSampleFileStream(sampleFileName);
|
||||||
|
|
||||||
|
try {
|
||||||
|
return new OldExcelExtractor(is);
|
||||||
|
} catch (Exception e) {
|
||||||
|
throw new RuntimeException(e);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testSimple() {
|
||||||
|
OldExcelExtractor extractor = createExtractor("testEXCEL_4.xls");
|
||||||
|
|
||||||
|
// Check we can call getText without error
|
||||||
|
String text = extractor.getText();
|
||||||
|
|
||||||
|
// Check we find a few words we expect in there
|
||||||
|
assertTrue(text, text.contains("Size"));
|
||||||
|
assertTrue(text, text.contains("Returns"));
|
||||||
|
}
|
||||||
|
|
||||||
|
// TODO Rest of the tests
|
||||||
|
}
|
BIN
test-data/spreadsheet/testEXCEL_4.xls
Normal file
BIN
test-data/spreadsheet/testEXCEL_4.xls
Normal file
Binary file not shown.
Loading…
x
Reference in New Issue
Block a user