Improve the xlsx text extraction, and have proper tests for it
git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@607063 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
59f37853cd
commit
1f1575e1be
@ -20,6 +20,7 @@ import java.io.IOException;
|
||||
|
||||
import org.apache.poi.POIXMLTextExtractor;
|
||||
import org.apache.poi.hssf.HSSFXML;
|
||||
import org.apache.poi.hssf.usermodel.HSSFXMLCell;
|
||||
import org.apache.poi.hssf.usermodel.HSSFXMLWorkbook;
|
||||
import org.apache.xmlbeans.XmlException;
|
||||
import org.openxml4j.exceptions.OpenXML4JException;
|
||||
@ -89,16 +90,18 @@ public class HXFExcelExtractor extends POIXMLTextExtractor {
|
||||
text.append("\t");
|
||||
}
|
||||
|
||||
boolean done = false;
|
||||
|
||||
// Is it a formula one?
|
||||
if(cell.getF() != null) {
|
||||
if(formulasNotResults) {
|
||||
text.append(cell.getF().getStringValue());
|
||||
} else {
|
||||
text.append(cell.getV());
|
||||
done = true;
|
||||
}
|
||||
} else {
|
||||
// Probably just want the v value
|
||||
text.append(cell.getV());
|
||||
}
|
||||
if(!done) {
|
||||
HSSFXMLCell uCell = new HSSFXMLCell(cell);
|
||||
text.append(uCell.getStringValue());
|
||||
}
|
||||
}
|
||||
text.append("\n");
|
||||
|
@ -0,0 +1,48 @@
|
||||
/* ====================================================================
|
||||
Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
contributor license agreements. See the NOTICE file distributed with
|
||||
this work for additional information regarding copyright ownership.
|
||||
The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
(the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
==================================================================== */
|
||||
package org.apache.poi.hssf.usermodel;
|
||||
|
||||
import org.openxmlformats.schemas.spreadsheetml.x2006.main.CTCell;
|
||||
|
||||
/**
|
||||
* User facing wrapper around an underlying cell object
|
||||
*/
|
||||
public class HSSFXMLCell {
|
||||
private CTCell cell;
|
||||
public HSSFXMLCell(CTCell rawCell) {
|
||||
this.cell = rawCell;
|
||||
}
|
||||
|
||||
/**
|
||||
* Formats the cell's contents, based on its type,
|
||||
* and returns it as a string.
|
||||
*/
|
||||
public String getStringValue() {
|
||||
if(cell.getV() != null) {
|
||||
return cell.getV();
|
||||
}
|
||||
if(cell.getIs() != null) {
|
||||
return cell.getIs().getT();
|
||||
}
|
||||
// TODO: Formatting
|
||||
return Long.toString(cell.getS());
|
||||
}
|
||||
|
||||
public String toString() {
|
||||
return cell.getR() + " - " + getStringValue();
|
||||
}
|
||||
}
|
@ -66,10 +66,78 @@ public class TestHXFExcelExtractor extends TestCase {
|
||||
|
||||
String text = extractor.getText();
|
||||
assertTrue(text.length() > 0);
|
||||
System.err.println(text);
|
||||
|
||||
// Check sheet names
|
||||
assertTrue(text.startsWith("Sheet1"));
|
||||
assertTrue(text.endsWith("Sheet3\n"));
|
||||
|
||||
// Now without, will have text
|
||||
extractor.setIncludeSheetNames(false);
|
||||
text = extractor.getText();
|
||||
assertEquals(
|
||||
"0\t111\n" +
|
||||
"1\t222\n" +
|
||||
"2\t333\n" +
|
||||
"3\t444\n" +
|
||||
"4\t555\n" +
|
||||
"5\t666\n" +
|
||||
"6\t777\n" +
|
||||
"7\t888\n" +
|
||||
"8\t999\n" +
|
||||
"9\t4995\n" +
|
||||
"\n\n", text);
|
||||
|
||||
// Now get formulas not their values
|
||||
extractor.setFormulasNotResults(true);
|
||||
text = extractor.getText();
|
||||
assertEquals(
|
||||
"0\t111\n" +
|
||||
"1\t222\n" +
|
||||
"2\t333\n" +
|
||||
"3\t444\n" +
|
||||
"4\t555\n" +
|
||||
"5\t666\n" +
|
||||
"6\t777\n" +
|
||||
"7\t888\n" +
|
||||
"8\t999\n" +
|
||||
"9\tSUM(B1:B9)\n" +
|
||||
"\n\n", text);
|
||||
|
||||
// With sheet names too
|
||||
extractor.setIncludeSheetNames(true);
|
||||
text = extractor.getText();
|
||||
assertEquals(
|
||||
"Sheet1\n" +
|
||||
"0\t111\n" +
|
||||
"1\t222\n" +
|
||||
"2\t333\n" +
|
||||
"3\t444\n" +
|
||||
"4\t555\n" +
|
||||
"5\t666\n" +
|
||||
"6\t777\n" +
|
||||
"7\t888\n" +
|
||||
"8\t999\n" +
|
||||
"9\tSUM(B1:B9)\n\n" +
|
||||
"Sheet2\n\n" +
|
||||
"Sheet3\n"
|
||||
, text);
|
||||
}
|
||||
|
||||
public void testGetComplexText() throws Exception {
|
||||
new HXFExcelExtractor(xmlB.getPackage());
|
||||
new HXFExcelExtractor(new HSSFXMLWorkbook(xmlB));
|
||||
|
||||
HXFExcelExtractor extractor =
|
||||
new HXFExcelExtractor(xmlB.getPackage());
|
||||
extractor.getText();
|
||||
|
||||
String text = extractor.getText();
|
||||
assertTrue(text.length() > 0);
|
||||
|
||||
// Might not have all formatting it should do!
|
||||
assertTrue(text.startsWith(
|
||||
"Avgtxfull\n" +
|
||||
"3\t13\t3\t2\t2\t3\t2\t"
|
||||
));
|
||||
}
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user