Patch from Ugo from bug #44185 - support getting shared strings for ooxml excel files, and further tests for the ooxml excel text extraction

git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@610074 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Nick Burch 2008-01-08 17:28:39 +00:00
parent d6a4075aa7
commit 7096e660c3
6 changed files with 81 additions and 32 deletions

View File

@ -18,6 +18,7 @@ package org.apache.poi.hssf;
import java.io.IOException; import java.io.IOException;
import org.apache.poi.hssf.model.SharedStringsTable;
import org.apache.poi.hxf.HXFDocument; import org.apache.poi.hxf.HXFDocument;
import org.apache.xmlbeans.XmlException; import org.apache.xmlbeans.XmlException;
import org.openxml4j.exceptions.OpenXML4JException; import org.openxml4j.exceptions.OpenXML4JException;
@ -45,14 +46,24 @@ public class HSSFXML extends HXFDocument {
public static final String MAIN_CONTENT_TYPE = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet.main+xml"; public static final String MAIN_CONTENT_TYPE = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet.main+xml";
public static final String SHEET_CONTENT_TYPE = "application/vnd.openxmlformats-officedocument.spreadsheetml.worksheet+xml"; public static final String SHEET_CONTENT_TYPE = "application/vnd.openxmlformats-officedocument.spreadsheetml.worksheet+xml";
public static final String SHARED_STRINGS_CONTENT_TYPE = "application/vnd.openxmlformats-officedocument.spreadsheetml.sharedStrings+xml"; public static final String SHARED_STRINGS_CONTENT_TYPE = "application/vnd.openxmlformats-officedocument.spreadsheetml.sharedStrings+xml";
public static final String SHARED_STRINGS_RELATION_TYPE = "http://schemas.openxmlformats.org/officeDocument/2006/relationships/sharedStrings";
private WorkbookDocument workbookDoc; private WorkbookDocument workbookDoc;
private SharedStringsTable sharedStrings;
/**
 * Builds the document wrapper for the given package: parses the stream of
 * basePart into workbookDoc, then loads the shared strings table when the
 * base part has a relationship of type SHARED_STRINGS_RELATION_TYPE.
 */
public HSSFXML(Package container) throws OpenXML4JException, IOException, XmlException { public HSSFXML(Package container) throws OpenXML4JException, IOException, XmlException {
super(container, MAIN_CONTENT_TYPE); super(container, MAIN_CONTENT_TYPE);
workbookDoc = workbookDoc =
WorkbookDocument.Factory.parse(basePart.getInputStream()); WorkbookDocument.Factory.parse(basePart.getInputStream());
// Look the shared strings part up through the base part's relationships;
// the two-argument helper returns null when no such relationship exists.
PackagePart ssPart = getSinglePartByRelationType(SHARED_STRINGS_RELATION_TYPE, basePart);
if (ssPart != null) {
sharedStrings = new SharedStringsTable(ssPart);
} else {
// No shared strings relationship on the base part: nothing is assigned
// here, so the sharedStrings field keeps its default value (null).
}
} }
/** /**
@ -81,4 +92,8 @@ public class HSSFXML extends HXFDocument {
WorksheetDocument.Factory.parse(sheetPart.getInputStream()); WorksheetDocument.Factory.parse(sheetPart.getInputStream());
return sheetDoc.getWorksheet(); return sheetDoc.getWorksheet();
} }
/**
 * Returns the entry stored at the given index of this workbook's shared
 * strings table.
 *
 * @param index position of the entry, handed unchanged to the table's get(int)
 * @return the value the shared strings table returns for that index
 * @throws IllegalStateException if the workbook has no shared strings table
 */
public String getSharedString(int index) {
    // The constructor assigns sharedStrings only when the workbook part has a
    // shared strings relationship, so the field may still be null here. Fail
    // with a descriptive error instead of a bare NullPointerException.
    if (this.sharedStrings == null) {
        throw new IllegalStateException("No shared strings table is available, cannot look up shared string " + index);
    }
    return this.sharedStrings.get(index);
}
} }

View File

@ -117,7 +117,7 @@ public class HXFExcelExtractor extends POIXMLTextExtractor {
} }
} }
if(!done) { if(!done) {
HSSFXMLCell uCell = new HSSFXMLCell(cell); HSSFXMLCell uCell = new HSSFXMLCell(cell, workbook);
text.append(uCell.getStringValue()); text.append(uCell.getStringValue());
} }
} }

View File

@ -17,32 +17,40 @@
package org.apache.poi.hssf.usermodel; package org.apache.poi.hssf.usermodel;
import org.openxmlformats.schemas.spreadsheetml.x2006.main.CTCell; import org.openxmlformats.schemas.spreadsheetml.x2006.main.CTCell;
import org.openxmlformats.schemas.spreadsheetml.x2006.main.STCellType;
/** /**
* User facing wrapper around an underlying cell object * User facing wrapper around an underlying cell object
*/ */
// NOTE(review): the lines of this class carry two renderings side by side
// (the text before the change, then the text after it); the notes added
// below say which rendering they refer to.
public class HSSFXMLCell { public class HSSFXMLCell {
private CTCell cell; private CTCell cell;
// Constructors: the earlier form stores only the raw cell; the later form
// also stores the owning workbook, which the later getStringValue() uses
// for shared string lookups.
public HSSFXMLCell(CTCell rawCell) {
this.cell = rawCell; /** The workbook to which this cell belongs */
} private final HSSFXMLWorkbook workbook;
/** public HSSFXMLCell(CTCell rawCell, HSSFXMLWorkbook workbook) {
* Formats the cell's contents, based on its type, this.cell = rawCell;
* and returns it as a string. this.workbook = workbook;
*/ }
// Earlier getStringValue(): returns getV() when it is non-null, otherwise
// the text of getIs() when that is non-null, otherwise Long.toString(getS()).
public String getStringValue() {
if(cell.getV() != null) { /**
return cell.getV(); * Formats the cell's contents, based on its type,
} * and returns it as a string.
if(cell.getIs() != null) { */
return cell.getIs().getT(); public String getStringValue() {
}
// Later getStringValue(): switches on the cell type. INT_S parses getV()
// as an index and asks the workbook for that shared string, INT_N returns
// getV() unchanged, and every other type yields the UNSUPPORTED CELL TYPE
// marker string.
// TODO: Formatting switch (cell.getT().intValue()) {
return Long.toString(cell.getS()); case STCellType.INT_S:
// NOTE(review): Integer.valueOf(String) throws NumberFormatException for a
// null or non-numeric getV(); Integer.parseInt would also skip the boxing.
} return this.workbook.getSharedString(Integer.valueOf(cell.getV()));
case STCellType.INT_N:
public String toString() { return cell.getV();
return cell.getR() + " - " + getStringValue(); // TODO: support other types
// NOTE(review): the later switch has no branch that reads getIs(), so a
// cell the earlier code rendered from getIs().getT() now reaches default;
// confirm that dropping this path is intended.
} default:
return "UNSUPPORTED CELL TYPE: '" + cell.getT() + "'";
}
}
// Joins getR() and getStringValue() with a dash separator.
public String toString() {
return cell.getR() + " - " + getStringValue();
}
} }

View File

@ -36,4 +36,8 @@ public class HSSFXMLWorkbook extends POIXMLDocument {
public HSSFXML _getHSSFXML() { public HSSFXML _getHSSFXML() {
return hssfXML; return hssfXML;
} }
/**
 * Looks up a shared string by delegating to the underlying HSSFXML document.
 *
 * @param index index passed unchanged to the document's getSharedString(int)
 * @return whatever the underlying document returns for that index
 */
public String getSharedString(int index) {
    final String sharedString = hssfXML.getSharedString(index);
    return sharedString;
}
} }

View File

@ -102,6 +102,27 @@ public abstract class HXFDocument {
return parts.get(0); return parts.get(0);
} }
/**
 * Resolves the one PackagePart that the supplied part points at through a
 * relationship of the requested type.
 *
 * @param relationType the relationship type to search for
 * @param part the part whose relationships are searched
 * @return the related part, or null if the part has no relationship of that type
 * @throws IllegalArgumentException if more than one relationship of that type is found
 * @throws OpenXML4JException if an underlying package call fails (which call
 *         raises it is not visible from this method)
 * TODO: Make Package and PackagePart implement a common interface that defines
 *       getRelationshipsByType and friends, so this overload is not needed.
 */
protected PackagePart getSinglePartByRelationType(String relationType, PackagePart part) throws IllegalArgumentException, OpenXML4JException {
    final PackageRelationshipCollection matches = part.getRelationshipsByType(relationType);
    final int count = matches.size();

    // More than one match is treated as a caller/document error.
    if (count > 1) {
        throw new IllegalArgumentException("Found " + count + " relations for the type " + relationType + ", should only ever be one!");
    }

    // Zero matches means "not present"; otherwise resolve the single relationship.
    return (count == 0) ? null : getPackagePart(matches.getRelationship(0));
}
/** /**
* Fetches the (single) PackagePart which is defined as * Fetches the (single) PackagePart which is defined as
* the supplied relation content type of the base * the supplied relation content type of the base
@ -109,7 +130,7 @@ public abstract class HXFDocument {
* @param relationType The relation content type to search for * @param relationType The relation content type to search for
* @throws IllegalArgumentException If we find more than one part of that type * @throws IllegalArgumentException If we find more than one part of that type
*/ */
private PackagePart getSinglePartByRelationType(String relationType) throws IllegalArgumentException, OpenXML4JException { protected PackagePart getSinglePartByRelationType(String relationType) throws IllegalArgumentException, OpenXML4JException {
PackageRelationshipCollection rels = PackageRelationshipCollection rels =
container.getRelationshipsByType(relationType); container.getRelationshipsByType(relationType);
if(rels.size() == 0) { if(rels.size() == 0) {

View File

@ -18,6 +18,8 @@ package org.apache.poi.hssf.extractor;
import java.io.File; import java.io.File;
import java.io.FileInputStream; import java.io.FileInputStream;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import junit.framework.TestCase; import junit.framework.TestCase;
@ -170,7 +172,7 @@ public class TestHXFExcelExtractor extends TestCase {
* ExcelExtractor does, when we're both passed * ExcelExtractor does, when we're both passed
* the same file, just saved as xls and xlsx * the same file, just saved as xls and xlsx
*/ */
public void BROKENtestComparedToOLE2() throws Exception { public void testComparedToOLE2() throws Exception {
HXFExcelExtractor ooxmlExtractor = HXFExcelExtractor ooxmlExtractor =
new HXFExcelExtractor(simpleXLSX.getPackage()); new HXFExcelExtractor(simpleXLSX.getPackage());
ExcelExtractor ole2Extractor = ExcelExtractor ole2Extractor =
@ -181,14 +183,13 @@ public class TestHXFExcelExtractor extends TestCase {
for (int i = 0; i < extractors.length; i++) { for (int i = 0; i < extractors.length; i++) {
POITextExtractor extractor = extractors[i]; POITextExtractor extractor = extractors[i];
String text = extractor.getText().replace("\r", ""); String text = extractor.getText().replaceAll("[\r\t]", "");
System.out.println(text.length()); System.out.println(text.length());
System.out.println(text); System.out.println(text);
assertTrue(text.startsWith("First Sheet\nTest spreadsheet\t\n2nd row\t2nd row 2nd column\n")); assertTrue(text.startsWith("First Sheet\nTest spreadsheet\n2nd row2nd row 2nd column\n"));
assertTrue(text.endsWith("13.0\nSheet3\n")); Pattern pattern = Pattern.compile(".*13(\\.0+)?\\s+Sheet3.*", Pattern.DOTALL);
Matcher m = pattern.matcher(text);
assertTrue(text.length() >= 214); assertTrue(m.matches());
assertTrue(text.length() <= 214);
} }
} }
} }