Improve the xlsx text extraction, and have proper tests for it

git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@607063 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Nick Burch 2007-12-27 13:02:17 +00:00
parent 59f37853cd
commit 1f1575e1be
3 changed files with 125 additions and 6 deletions

View File

@ -20,6 +20,7 @@ import java.io.IOException;
import org.apache.poi.POIXMLTextExtractor;
import org.apache.poi.hssf.HSSFXML;
import org.apache.poi.hssf.usermodel.HSSFXMLCell;
import org.apache.poi.hssf.usermodel.HSSFXMLWorkbook;
import org.apache.xmlbeans.XmlException;
import org.openxml4j.exceptions.OpenXML4JException;
@ -89,16 +90,18 @@ public class HXFExcelExtractor extends POIXMLTextExtractor {
text.append("\t");
}
boolean done = false;
// Is it a formula one?
if(cell.getF() != null) {
if(formulasNotResults) {
text.append(cell.getF().getStringValue());
} else {
text.append(cell.getV());
done = true;
}
} else {
// Probably just want the v value
text.append(cell.getV());
}
if(!done) {
HSSFXMLCell uCell = new HSSFXMLCell(cell);
text.append(uCell.getStringValue());
}
}
text.append("\n");

View File

@ -0,0 +1,48 @@
/* ====================================================================
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==================================================================== */
package org.apache.poi.hssf.usermodel;
import org.openxmlformats.schemas.spreadsheetml.x2006.main.CTCell;
/**
* User facing wrapper around an underlying cell object
*/
public class HSSFXMLCell {
    /** The raw XML cell bean this wrapper delegates to. */
    private CTCell cell;

    /**
     * Wraps the supplied raw cell.
     *
     * @param rawCell the underlying cell bean to read from
     */
    public HSSFXMLCell(CTCell rawCell) {
        this.cell = rawCell;
    }

    /**
     * Returns the cell's contents as a string.
     * <p>
     * The lookup order is: the cell's <code>v</code> value if present,
     * otherwise the text of its <code>is</code> child if present,
     * otherwise the cell's <code>s</code> attribute rendered as a number.
     *
     * @return the textual form of the cell
     */
    public String getStringValue() {
        String rawValue = cell.getV();
        if (rawValue != null) {
            return rawValue;
        }

        if (cell.getIs() == null) {
            // TODO: Formatting
            // NOTE(review): "s" is presumably the style index, so this
            //  fallback may not be meaningful text - confirm intent.
            return Long.toString(cell.getS());
        }
        return cell.getIs().getT();
    }

    /**
     * Renders the cell as its <code>r</code> attribute, a separator,
     * and its string value.
     */
    public String toString() {
        String reference = cell.getR();
        String contents = getStringValue();
        return reference + " - " + contents;
    }
}

View File

@ -66,10 +66,78 @@ public class TestHXFExcelExtractor extends TestCase {
String text = extractor.getText();
assertTrue(text.length() > 0);
System.err.println(text);
// Check sheet names
assertTrue(text.startsWith("Sheet1"));
assertTrue(text.endsWith("Sheet3\n"));
// Now without, will have text
extractor.setIncludeSheetNames(false);
text = extractor.getText();
assertEquals(
"0\t111\n" +
"1\t222\n" +
"2\t333\n" +
"3\t444\n" +
"4\t555\n" +
"5\t666\n" +
"6\t777\n" +
"7\t888\n" +
"8\t999\n" +
"9\t4995\n" +
"\n\n", text);
// Now get formulas not their values
extractor.setFormulasNotResults(true);
text = extractor.getText();
assertEquals(
"0\t111\n" +
"1\t222\n" +
"2\t333\n" +
"3\t444\n" +
"4\t555\n" +
"5\t666\n" +
"6\t777\n" +
"7\t888\n" +
"8\t999\n" +
"9\tSUM(B1:B9)\n" +
"\n\n", text);
// With sheet names too
extractor.setIncludeSheetNames(true);
text = extractor.getText();
assertEquals(
"Sheet1\n" +
"0\t111\n" +
"1\t222\n" +
"2\t333\n" +
"3\t444\n" +
"4\t555\n" +
"5\t666\n" +
"6\t777\n" +
"7\t888\n" +
"8\t999\n" +
"9\tSUM(B1:B9)\n\n" +
"Sheet2\n\n" +
"Sheet3\n"
, text);
}
public void testGetComplexText() throws Exception {
    // Both constructor forms are invoked; the instances are discarded
    new HXFExcelExtractor(xmlB.getPackage());
    new HXFExcelExtractor(new HSSFXMLWorkbook(xmlB));

    HXFExcelExtractor complexExtractor =
        new HXFExcelExtractor(xmlB.getPackage());

    // First call's result is discarded; the second call's result is checked
    complexExtractor.getText();
    String extracted = complexExtractor.getText();
    assertTrue(extracted.length() > 0);

    // Might not have all formatting it should do!
    String expectedPrefix =
        "Avgtxfull\n" +
        "3\t13\t3\t2\t2\t3\t2\t";
    assertTrue(extracted.startsWith(expectedPrefix));
}
}