Begin adding Excel 5 support to OldExcelExtractor for TIKA-1490
git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@1642548 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
41ba513d11
commit
101f66c789
@ -17,6 +17,8 @@
|
|||||||
|
|
||||||
package org.apache.poi.hssf.extractor;
|
package org.apache.poi.hssf.extractor;
|
||||||
|
|
||||||
|
import java.io.BufferedInputStream;
|
||||||
|
import java.io.Closeable;
|
||||||
import java.io.File;
|
import java.io.File;
|
||||||
import java.io.FileInputStream;
|
import java.io.FileInputStream;
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
@ -28,11 +30,15 @@ import org.apache.poi.hssf.record.OldLabelRecord;
|
|||||||
import org.apache.poi.hssf.record.OldStringRecord;
|
import org.apache.poi.hssf.record.OldStringRecord;
|
||||||
import org.apache.poi.hssf.record.RKRecord;
|
import org.apache.poi.hssf.record.RKRecord;
|
||||||
import org.apache.poi.hssf.record.RecordInputStream;
|
import org.apache.poi.hssf.record.RecordInputStream;
|
||||||
|
import org.apache.poi.poifs.filesystem.DirectoryNode;
|
||||||
|
import org.apache.poi.poifs.filesystem.DocumentNode;
|
||||||
|
import org.apache.poi.poifs.filesystem.NPOIFSFileSystem;
|
||||||
import org.apache.poi.ss.usermodel.Cell;
|
import org.apache.poi.ss.usermodel.Cell;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* A text extractor for very old (pre-OLE2) Excel files,
|
* A text extractor for old Excel files, which are too old for
|
||||||
* such as Excel 4 files.
|
* HSSFWorkbook to handle. This includes Excel 95, and very old
|
||||||
|
* (pre-OLE2) Excel files, such as Excel 4 files.
|
||||||
* <p>
|
* <p>
|
||||||
* Returns much (but not all) of the textual content of the file,
|
* Returns much (but not all) of the textual content of the file,
|
||||||
* suitable for indexing by something like Apache Lucene, or used
|
* suitable for indexing by something like Apache Lucene, or used
|
||||||
@ -40,13 +46,47 @@ import org.apache.poi.ss.usermodel.Cell;
|
|||||||
* </p>
|
* </p>
|
||||||
*/
|
*/
|
||||||
public class OldExcelExtractor {
|
public class OldExcelExtractor {
|
||||||
private InputStream input;
|
private RecordInputStream ris;
|
||||||
|
private Closeable input;
|
||||||
|
|
||||||
public OldExcelExtractor(InputStream input) {
|
public OldExcelExtractor(InputStream input) throws IOException {
|
||||||
this.input = input;
|
BufferedInputStream bstream = new BufferedInputStream(input, 8);
|
||||||
|
if (NPOIFSFileSystem.hasPOIFSHeader(bstream)) {
|
||||||
|
open(new NPOIFSFileSystem(bstream));
|
||||||
|
} else {
|
||||||
|
open(bstream);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
public OldExcelExtractor(File f) throws IOException {
|
public OldExcelExtractor(File f) throws IOException {
|
||||||
this.input = new FileInputStream(f);
|
InputStream input = new FileInputStream(f);
|
||||||
|
if (NPOIFSFileSystem.hasPOIFSHeader(input)) {
|
||||||
|
open(new NPOIFSFileSystem(f));
|
||||||
|
} else {
|
||||||
|
open(input);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
public OldExcelExtractor(NPOIFSFileSystem fs) throws IOException {
|
||||||
|
open(fs);
|
||||||
|
}
|
||||||
|
public OldExcelExtractor(DirectoryNode directory) throws IOException {
|
||||||
|
open(directory);
|
||||||
|
}
|
||||||
|
|
||||||
|
private void open(InputStream biffStream) {
|
||||||
|
input = biffStream;
|
||||||
|
ris = new RecordInputStream(biffStream);
|
||||||
|
}
|
||||||
|
private void open(NPOIFSFileSystem fs) throws IOException {
|
||||||
|
input = fs;
|
||||||
|
open(fs.getRoot());
|
||||||
|
}
|
||||||
|
private void open(DirectoryNode directory) throws IOException {
|
||||||
|
DocumentNode book = (DocumentNode)directory.getEntry("Book");
|
||||||
|
if (book == null) {
|
||||||
|
throw new IOException("No Excel 5/95 Book stream found");
|
||||||
|
}
|
||||||
|
|
||||||
|
ris = new RecordInputStream(directory.createDocumentInputStream(book));
|
||||||
}
|
}
|
||||||
|
|
||||||
public static void main(String[] args) throws Exception {
|
public static void main(String[] args) throws Exception {
|
||||||
@ -66,7 +106,6 @@ public class OldExcelExtractor {
|
|||||||
public String getText() {
|
public String getText() {
|
||||||
StringBuffer text = new StringBuffer();
|
StringBuffer text = new StringBuffer();
|
||||||
|
|
||||||
RecordInputStream ris = new RecordInputStream(input);
|
|
||||||
while (ris.hasNextRecord()) {
|
while (ris.hasNextRecord()) {
|
||||||
int sid = ris.getNextSid();
|
int sid = ris.getNextSid();
|
||||||
ris.nextRecord();
|
ris.nextRecord();
|
||||||
@ -109,6 +148,14 @@ public class OldExcelExtractor {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (input != null) {
|
||||||
|
try {
|
||||||
|
input.close();
|
||||||
|
} catch (IOException e) {}
|
||||||
|
input = null;
|
||||||
|
}
|
||||||
|
ris = null;
|
||||||
|
|
||||||
return text.toString();
|
return text.toString();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -38,7 +38,9 @@ public class TestBiffViewer extends BaseXLSIteratingTest {
|
|||||||
SILENT_EXCLUDED.add("46904.xls");
|
SILENT_EXCLUDED.add("46904.xls");
|
||||||
SILENT_EXCLUDED.add("35897-type4.xls"); // unsupported crypto api header
|
SILENT_EXCLUDED.add("35897-type4.xls"); // unsupported crypto api header
|
||||||
SILENT_EXCLUDED.add("xor-encryption-abc.xls"); // unsupported XOR-encryption
|
SILENT_EXCLUDED.add("xor-encryption-abc.xls"); // unsupported XOR-encryption
|
||||||
SILENT_EXCLUDED.add("testEXCEL_4.xls"); // Biff 4 / Excel 4, pre-OLE2
|
SILENT_EXCLUDED.add("testEXCEL_4.xls"); // Biff 4 / Excel 4, pre-OLE2
|
||||||
|
SILENT_EXCLUDED.add("testEXCEL_5.xls"); // Biff 5 / Excel 5
|
||||||
|
SILENT_EXCLUDED.add("testEXCEL_95.xls"); // Biff 5 / Excel 95
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
|
@ -24,7 +24,8 @@ import junit.framework.TestCase;
|
|||||||
import org.apache.poi.hssf.HSSFTestDataSamples;
|
import org.apache.poi.hssf.HSSFTestDataSamples;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Unit tests for the Excel 4 (and older) text extractor
|
* Unit tests for the Excel 5/95 and Excel 4 (and older) text
|
||||||
|
* extractor
|
||||||
*/
|
*/
|
||||||
public final class TestOldExcelExtractor extends TestCase {
|
public final class TestOldExcelExtractor extends TestCase {
|
||||||
private static OldExcelExtractor createExtractor(String sampleFileName) {
|
private static OldExcelExtractor createExtractor(String sampleFileName) {
|
||||||
@ -37,7 +38,7 @@ public final class TestOldExcelExtractor extends TestCase {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
public void testSimple() {
|
public void testSimpleExcel4() {
|
||||||
OldExcelExtractor extractor = createExtractor("testEXCEL_4.xls");
|
OldExcelExtractor extractor = createExtractor("testEXCEL_4.xls");
|
||||||
|
|
||||||
// Check we can call getText without error
|
// Check we can call getText without error
|
||||||
@ -51,6 +52,22 @@ public final class TestOldExcelExtractor extends TestCase {
|
|||||||
assertTrue(text, text.contains("11"));
|
assertTrue(text, text.contains("11"));
|
||||||
assertTrue(text, text.contains("784"));
|
assertTrue(text, text.contains("784"));
|
||||||
}
|
}
|
||||||
|
public void DISABLEDtestSimpleExcel5() {
|
||||||
|
for (String ver : new String[] {"5", "95"}) {
|
||||||
|
OldExcelExtractor extractor = createExtractor("testEXCEL_"+ver+".xls");
|
||||||
|
|
||||||
|
// Check we can call getText without error
|
||||||
|
String text = extractor.getText();
|
||||||
|
|
||||||
|
// Check we find a few words we expect in there
|
||||||
|
assertTrue(text, text.contains("Sample Excel"));
|
||||||
|
assertTrue(text, text.contains("Written and saved"));
|
||||||
|
|
||||||
|
// Check we find a few numbers we expect in there
|
||||||
|
assertTrue(text, text.contains("15"));
|
||||||
|
assertTrue(text, text.contains("169"));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
public void testStrings() {
|
public void testStrings() {
|
||||||
OldExcelExtractor extractor = createExtractor("testEXCEL_4.xls");
|
OldExcelExtractor extractor = createExtractor("testEXCEL_4.xls");
|
||||||
@ -71,7 +88,7 @@ public final class TestOldExcelExtractor extends TestCase {
|
|||||||
// TODO Find some then test
|
// TODO Find some then test
|
||||||
}
|
}
|
||||||
|
|
||||||
public void testFormattedNumbers() {
|
public void testFormattedNumbersExcel4() {
|
||||||
OldExcelExtractor extractor = createExtractor("testEXCEL_4.xls");
|
OldExcelExtractor extractor = createExtractor("testEXCEL_4.xls");
|
||||||
String text = extractor.getText();
|
String text = extractor.getText();
|
||||||
|
|
||||||
@ -88,4 +105,17 @@ public final class TestOldExcelExtractor extends TestCase {
|
|||||||
// assertTrue(text, text.contains("55,624"));
|
// assertTrue(text, text.contains("55,624"));
|
||||||
// assertTrue(text, text.contains("11,743,477"));
|
// assertTrue(text, text.contains("11,743,477"));
|
||||||
}
|
}
|
||||||
|
public void DISABLEDtestFormattedNumbersExcel5() {
|
||||||
|
for (String ver : new String[] {"5", "95"}) {
|
||||||
|
OldExcelExtractor extractor = createExtractor("testEXCEL_"+ver+".xls");
|
||||||
|
String text = extractor.getText();
|
||||||
|
|
||||||
|
// Simple numbers
|
||||||
|
assertTrue(text, text.contains("1"));
|
||||||
|
|
||||||
|
// Numbers which come from formulas
|
||||||
|
assertTrue(text, text.contains("13"));
|
||||||
|
assertTrue(text, text.contains("169"));
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
BIN
test-data/spreadsheet/testEXCEL_5.xls
Normal file
BIN
test-data/spreadsheet/testEXCEL_5.xls
Normal file
Binary file not shown.
BIN
test-data/spreadsheet/testEXCEL_95.xls
Normal file
BIN
test-data/spreadsheet/testEXCEL_95.xls
Normal file
Binary file not shown.
Loading…
Reference in New Issue
Block a user