Patch from Ugo from bug #44185 - add support for getting shared strings from OOXML Excel files, and add further tests for the OOXML Excel text extraction
git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@610074 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
d6a4075aa7
commit
7096e660c3
@ -18,6 +18,7 @@ package org.apache.poi.hssf;
|
|||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
|
|
||||||
|
import org.apache.poi.hssf.model.SharedStringsTable;
|
||||||
import org.apache.poi.hxf.HXFDocument;
|
import org.apache.poi.hxf.HXFDocument;
|
||||||
import org.apache.xmlbeans.XmlException;
|
import org.apache.xmlbeans.XmlException;
|
||||||
import org.openxml4j.exceptions.OpenXML4JException;
|
import org.openxml4j.exceptions.OpenXML4JException;
|
||||||
@ -45,14 +46,24 @@ public class HSSFXML extends HXFDocument {
|
|||||||
public static final String MAIN_CONTENT_TYPE = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet.main+xml";
|
public static final String MAIN_CONTENT_TYPE = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet.main+xml";
|
||||||
public static final String SHEET_CONTENT_TYPE = "application/vnd.openxmlformats-officedocument.spreadsheetml.worksheet+xml";
|
public static final String SHEET_CONTENT_TYPE = "application/vnd.openxmlformats-officedocument.spreadsheetml.worksheet+xml";
|
||||||
public static final String SHARED_STRINGS_CONTENT_TYPE = "application/vnd.openxmlformats-officedocument.spreadsheetml.sharedStrings+xml";
|
public static final String SHARED_STRINGS_CONTENT_TYPE = "application/vnd.openxmlformats-officedocument.spreadsheetml.sharedStrings+xml";
|
||||||
|
public static final String SHARED_STRINGS_RELATION_TYPE = "http://schemas.openxmlformats.org/officeDocument/2006/relationships/sharedStrings";
|
||||||
|
|
||||||
private WorkbookDocument workbookDoc;
|
private WorkbookDocument workbookDoc;
|
||||||
|
|
||||||
|
private SharedStringsTable sharedStrings;
|
||||||
|
|
||||||
public HSSFXML(Package container) throws OpenXML4JException, IOException, XmlException {
|
public HSSFXML(Package container) throws OpenXML4JException, IOException, XmlException {
|
||||||
super(container, MAIN_CONTENT_TYPE);
|
super(container, MAIN_CONTENT_TYPE);
|
||||||
|
|
||||||
workbookDoc =
|
workbookDoc =
|
||||||
WorkbookDocument.Factory.parse(basePart.getInputStream());
|
WorkbookDocument.Factory.parse(basePart.getInputStream());
|
||||||
|
|
||||||
|
PackagePart ssPart = getSinglePartByRelationType(SHARED_STRINGS_RELATION_TYPE, basePart);
|
||||||
|
if (ssPart != null) {
|
||||||
|
sharedStrings = new SharedStringsTable(ssPart);
|
||||||
|
} else {
|
||||||
|
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@ -81,4 +92,8 @@ public class HSSFXML extends HXFDocument {
|
|||||||
WorksheetDocument.Factory.parse(sheetPart.getInputStream());
|
WorksheetDocument.Factory.parse(sheetPart.getInputStream());
|
||||||
return sheetDoc.getWorksheet();
|
return sheetDoc.getWorksheet();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public String getSharedString(int index) {
|
||||||
|
return this.sharedStrings.get(index);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
@ -117,7 +117,7 @@ public class HXFExcelExtractor extends POIXMLTextExtractor {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
if(!done) {
|
if(!done) {
|
||||||
HSSFXMLCell uCell = new HSSFXMLCell(cell);
|
HSSFXMLCell uCell = new HSSFXMLCell(cell, workbook);
|
||||||
text.append(uCell.getStringValue());
|
text.append(uCell.getStringValue());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -17,32 +17,40 @@
|
|||||||
package org.apache.poi.hssf.usermodel;
|
package org.apache.poi.hssf.usermodel;
|
||||||
|
|
||||||
import org.openxmlformats.schemas.spreadsheetml.x2006.main.CTCell;
|
import org.openxmlformats.schemas.spreadsheetml.x2006.main.CTCell;
|
||||||
|
import org.openxmlformats.schemas.spreadsheetml.x2006.main.STCellType;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* User facing wrapper around an underlying cell object
|
* User facing wrapper around an underlying cell object
|
||||||
*/
|
*/
|
||||||
public class HSSFXMLCell {
|
public class HSSFXMLCell {
|
||||||
private CTCell cell;
|
private CTCell cell;
|
||||||
public HSSFXMLCell(CTCell rawCell) {
|
|
||||||
this.cell = rawCell;
|
/** The workbook to which this cell belongs */
|
||||||
}
|
private final HSSFXMLWorkbook workbook;
|
||||||
|
|
||||||
/**
|
public HSSFXMLCell(CTCell rawCell, HSSFXMLWorkbook workbook) {
|
||||||
* Formats the cell's contents, based on its type,
|
this.cell = rawCell;
|
||||||
* and returns it as a string.
|
this.workbook = workbook;
|
||||||
*/
|
}
|
||||||
public String getStringValue() {
|
|
||||||
if(cell.getV() != null) {
|
/**
|
||||||
return cell.getV();
|
* Formats the cell's contents, based on its type,
|
||||||
}
|
* and returns it as a string.
|
||||||
if(cell.getIs() != null) {
|
*/
|
||||||
return cell.getIs().getT();
|
public String getStringValue() {
|
||||||
}
|
|
||||||
// TODO: Formatting
|
switch (cell.getT().intValue()) {
|
||||||
return Long.toString(cell.getS());
|
case STCellType.INT_S:
|
||||||
}
|
return this.workbook.getSharedString(Integer.valueOf(cell.getV()));
|
||||||
|
case STCellType.INT_N:
|
||||||
public String toString() {
|
return cell.getV();
|
||||||
return cell.getR() + " - " + getStringValue();
|
// TODO: support other types
|
||||||
}
|
default:
|
||||||
|
return "UNSUPPORTED CELL TYPE: '" + cell.getT() + "'";
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public String toString() {
|
||||||
|
return cell.getR() + " - " + getStringValue();
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
@ -36,4 +36,8 @@ public class HSSFXMLWorkbook extends POIXMLDocument {
|
|||||||
public HSSFXML _getHSSFXML() {
|
public HSSFXML _getHSSFXML() {
|
||||||
return hssfXML;
|
return hssfXML;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public String getSharedString(int index) {
|
||||||
|
return hssfXML.getSharedString(index);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
@ -102,6 +102,27 @@ public abstract class HXFDocument {
|
|||||||
return parts.get(0);
|
return parts.get(0);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Fetches the (single) PackagePart which is defined as
|
||||||
|
* the supplied relation content type of the specified part,
|
||||||
|
* or null if none found.
|
||||||
|
* @param relationType The relation content type to search for
|
||||||
|
* @throws IllegalArgumentException If we find more than one part of that type
|
||||||
|
* TODO: Remove this duplication by making Package and PackagePart implement a common interface that defines getRelationshipsByType and related methods
|
||||||
|
*/
|
||||||
|
protected PackagePart getSinglePartByRelationType(String relationType, PackagePart part) throws IllegalArgumentException, OpenXML4JException {
|
||||||
|
PackageRelationshipCollection rels =
|
||||||
|
part.getRelationshipsByType(relationType);
|
||||||
|
if(rels.size() == 0) {
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
if(rels.size() > 1) {
|
||||||
|
throw new IllegalArgumentException("Found " + rels.size() + " relations for the type " + relationType + ", should only ever be one!");
|
||||||
|
}
|
||||||
|
PackageRelationship rel = rels.getRelationship(0);
|
||||||
|
return getPackagePart(rel);
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Fetches the (single) PackagePart which is defined as
|
* Fetches the (single) PackagePart which is defined as
|
||||||
* the supplied relation content type of the base
|
* the supplied relation content type of the base
|
||||||
@ -109,7 +130,7 @@ public abstract class HXFDocument {
|
|||||||
* @param relationType The relation content type to search for
|
* @param relationType The relation content type to search for
|
||||||
* @throws IllegalArgumentException If we find more than one part of that type
|
* @throws IllegalArgumentException If we find more than one part of that type
|
||||||
*/
|
*/
|
||||||
private PackagePart getSinglePartByRelationType(String relationType) throws IllegalArgumentException, OpenXML4JException {
|
protected PackagePart getSinglePartByRelationType(String relationType) throws IllegalArgumentException, OpenXML4JException {
|
||||||
PackageRelationshipCollection rels =
|
PackageRelationshipCollection rels =
|
||||||
container.getRelationshipsByType(relationType);
|
container.getRelationshipsByType(relationType);
|
||||||
if(rels.size() == 0) {
|
if(rels.size() == 0) {
|
||||||
|
@ -18,6 +18,8 @@ package org.apache.poi.hssf.extractor;
|
|||||||
|
|
||||||
import java.io.File;
|
import java.io.File;
|
||||||
import java.io.FileInputStream;
|
import java.io.FileInputStream;
|
||||||
|
import java.util.regex.Matcher;
|
||||||
|
import java.util.regex.Pattern;
|
||||||
|
|
||||||
import junit.framework.TestCase;
|
import junit.framework.TestCase;
|
||||||
|
|
||||||
@ -170,7 +172,7 @@ public class TestHXFExcelExtractor extends TestCase {
|
|||||||
* ExcelExtractor does, when we're both passed
|
* ExcelExtractor does, when we're both passed
|
||||||
* the same file, just saved as xls and xlsx
|
* the same file, just saved as xls and xlsx
|
||||||
*/
|
*/
|
||||||
public void BROKENtestComparedToOLE2() throws Exception {
|
public void testComparedToOLE2() throws Exception {
|
||||||
HXFExcelExtractor ooxmlExtractor =
|
HXFExcelExtractor ooxmlExtractor =
|
||||||
new HXFExcelExtractor(simpleXLSX.getPackage());
|
new HXFExcelExtractor(simpleXLSX.getPackage());
|
||||||
ExcelExtractor ole2Extractor =
|
ExcelExtractor ole2Extractor =
|
||||||
@ -181,14 +183,13 @@ public class TestHXFExcelExtractor extends TestCase {
|
|||||||
for (int i = 0; i < extractors.length; i++) {
|
for (int i = 0; i < extractors.length; i++) {
|
||||||
POITextExtractor extractor = extractors[i];
|
POITextExtractor extractor = extractors[i];
|
||||||
|
|
||||||
String text = extractor.getText().replace("\r", "");
|
String text = extractor.getText().replaceAll("[\r\t]", "");
|
||||||
System.out.println(text.length());
|
System.out.println(text.length());
|
||||||
System.out.println(text);
|
System.out.println(text);
|
||||||
assertTrue(text.startsWith("First Sheet\nTest spreadsheet\t\n2nd row\t2nd row 2nd column\n"));
|
assertTrue(text.startsWith("First Sheet\nTest spreadsheet\n2nd row2nd row 2nd column\n"));
|
||||||
assertTrue(text.endsWith("13.0\nSheet3\n"));
|
Pattern pattern = Pattern.compile(".*13(\\.0+)?\\s+Sheet3.*", Pattern.DOTALL);
|
||||||
|
Matcher m = pattern.matcher(text);
|
||||||
assertTrue(text.length() >= 214);
|
assertTrue(m.matches());
|
||||||
assertTrue(text.length() <= 214);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user