From 7096e660c340279220f7e087e97a52d61d518f2e Mon Sep 17 00:00:00 2001 From: Nick Burch Date: Tue, 8 Jan 2008 17:28:39 +0000 Subject: [PATCH] Patch from Ugo from bug #44185 - support getting shared strings for ooxml excel files, and further tests for the ooxml excel text extraction git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@610074 13f79535-47bb-0310-9956-ffa450edef68 --- .../org/apache/poi/hssf/HSSFXML.java | 15 ++++++ .../poi/hssf/extractor/HXFExcelExtractor.java | 2 +- .../poi/hssf/usermodel/HSSFXMLCell.java | 54 +++++++++++-------- .../poi/hssf/usermodel/HSSFXMLWorkbook.java | 4 ++ .../org/apache/poi/hxf/HXFDocument.java | 23 +++++++- .../hssf/extractor/TestHXFExcelExtractor.java | 15 +++--- 6 files changed, 81 insertions(+), 32 deletions(-) diff --git a/src/scratchpad/ooxml-src/org/apache/poi/hssf/HSSFXML.java b/src/scratchpad/ooxml-src/org/apache/poi/hssf/HSSFXML.java index bf2b1b113..bb476c1e6 100644 --- a/src/scratchpad/ooxml-src/org/apache/poi/hssf/HSSFXML.java +++ b/src/scratchpad/ooxml-src/org/apache/poi/hssf/HSSFXML.java @@ -18,6 +18,7 @@ package org.apache.poi.hssf; import java.io.IOException; +import org.apache.poi.hssf.model.SharedStringsTable; import org.apache.poi.hxf.HXFDocument; import org.apache.xmlbeans.XmlException; import org.openxml4j.exceptions.OpenXML4JException; @@ -45,14 +46,24 @@ public class HSSFXML extends HXFDocument { public static final String MAIN_CONTENT_TYPE = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet.main+xml"; public static final String SHEET_CONTENT_TYPE = "application/vnd.openxmlformats-officedocument.spreadsheetml.worksheet+xml"; public static final String SHARED_STRINGS_CONTENT_TYPE = "application/vnd.openxmlformats-officedocument.spreadsheetml.sharedStrings+xml"; + public static final String SHARED_STRINGS_RELATION_TYPE = "http://schemas.openxmlformats.org/officeDocument/2006/relationships/sharedStrings"; private WorkbookDocument workbookDoc; + private SharedStringsTable sharedStrings; + public HSSFXML(Package container) throws OpenXML4JException, IOException, XmlException { super(container, MAIN_CONTENT_TYPE); workbookDoc = WorkbookDocument.Factory.parse(basePart.getInputStream()); + + PackagePart ssPart = getSinglePartByRelationType(SHARED_STRINGS_RELATION_TYPE, basePart); + if (ssPart != null) { + sharedStrings = new SharedStringsTable(ssPart); + } else { + + } } /** @@ -81,4 +92,8 @@ public class HSSFXML extends HXFDocument { WorksheetDocument.Factory.parse(sheetPart.getInputStream()); return sheetDoc.getWorksheet(); } + + public String getSharedString(int index) { + return this.sharedStrings.get(index); + } } diff --git a/src/scratchpad/ooxml-src/org/apache/poi/hssf/extractor/HXFExcelExtractor.java b/src/scratchpad/ooxml-src/org/apache/poi/hssf/extractor/HXFExcelExtractor.java index 59f83d6d0..34ae06800 100644 --- a/src/scratchpad/ooxml-src/org/apache/poi/hssf/extractor/HXFExcelExtractor.java +++ b/src/scratchpad/ooxml-src/org/apache/poi/hssf/extractor/HXFExcelExtractor.java @@ -117,7 +117,7 @@ public class HXFExcelExtractor extends POIXMLTextExtractor { } } if(!done) { - HSSFXMLCell uCell = new HSSFXMLCell(cell); + HSSFXMLCell uCell = new HSSFXMLCell(cell, workbook); text.append(uCell.getStringValue()); } } diff --git a/src/scratchpad/ooxml-src/org/apache/poi/hssf/usermodel/HSSFXMLCell.java b/src/scratchpad/ooxml-src/org/apache/poi/hssf/usermodel/HSSFXMLCell.java index b1b38283b..549f32eaa 100644 --- a/src/scratchpad/ooxml-src/org/apache/poi/hssf/usermodel/HSSFXMLCell.java +++ b/src/scratchpad/ooxml-src/org/apache/poi/hssf/usermodel/HSSFXMLCell.java @@ -17,32 +17,40 @@ package org.apache.poi.hssf.usermodel; import org.openxmlformats.schemas.spreadsheetml.x2006.main.CTCell; +import org.openxmlformats.schemas.spreadsheetml.x2006.main.STCellType; /** * User facing wrapper around an underlying cell object */ public class HSSFXMLCell { - private CTCell cell; - public HSSFXMLCell(CTCell rawCell) { - this.cell = rawCell; - } - - /** - * Formats the cell's contents, based on its type, - * and returns it as a string. - */ - public String getStringValue() { - if(cell.getV() != null) { - return cell.getV(); - } - if(cell.getIs() != null) { - return cell.getIs().getT(); - } - // TODO: Formatting - return Long.toString(cell.getS()); - } - - public String toString() { - return cell.getR() + " - " + getStringValue(); - } + private CTCell cell; + + /** The workbook to which this cell belongs */ + private final HSSFXMLWorkbook workbook; + + public HSSFXMLCell(CTCell rawCell, HSSFXMLWorkbook workbook) { + this.cell = rawCell; + this.workbook = workbook; + } + + /** + * Formats the cell's contents, based on its type, + * and returns it as a string. + */ + public String getStringValue() { + + switch (cell.getT().intValue()) { + case STCellType.INT_S: + return this.workbook.getSharedString(Integer.valueOf(cell.getV())); + case STCellType.INT_N: + return cell.getV(); + // TODO: support other types + default: + return "UNSUPPORTED CELL TYPE: '" + cell.getT() + "'"; + } + } + + public String toString() { + return cell.getR() + " - " + getStringValue(); + } } diff --git a/src/scratchpad/ooxml-src/org/apache/poi/hssf/usermodel/HSSFXMLWorkbook.java b/src/scratchpad/ooxml-src/org/apache/poi/hssf/usermodel/HSSFXMLWorkbook.java index 16b93f61f..023b80f4d 100644 --- a/src/scratchpad/ooxml-src/org/apache/poi/hssf/usermodel/HSSFXMLWorkbook.java +++ b/src/scratchpad/ooxml-src/org/apache/poi/hssf/usermodel/HSSFXMLWorkbook.java @@ -36,4 +36,8 @@ public class HSSFXMLWorkbook extends POIXMLDocument { public HSSFXML _getHSSFXML() { return hssfXML; } + + public String getSharedString(int index) { + return hssfXML.getSharedString(index); + } } diff --git a/src/scratchpad/ooxml-src/org/apache/poi/hxf/HXFDocument.java b/src/scratchpad/ooxml-src/org/apache/poi/hxf/HXFDocument.java index 36a890b97..c2b2aa6d9 100644 --- a/src/scratchpad/ooxml-src/org/apache/poi/hxf/HXFDocument.java +++ b/src/scratchpad/ooxml-src/org/apache/poi/hxf/HXFDocument.java @@ -102,6 +102,27 @@ public abstract class HXFDocument { return parts.get(0); } + /** + * Fetches the (single) PackagePart which is defined as + * the supplied relation content type of the specified part, + * or null if none found. + * @param relationType The relation content type to search for + * @throws IllegalArgumentException If we find more than one part of that type + * TODO: this sucks! Make Package and PackagePart implement common intf that defines getRelationshipsByType & friends + */ + protected PackagePart getSinglePartByRelationType(String relationType, PackagePart part) throws IllegalArgumentException, OpenXML4JException { + PackageRelationshipCollection rels = + part.getRelationshipsByType(relationType); + if(rels.size() == 0) { + return null; + } + if(rels.size() > 1) { + throw new IllegalArgumentException("Found " + rels.size() + " relations for the type " + relationType + ", should only ever be one!"); + } + PackageRelationship rel = rels.getRelationship(0); + return getPackagePart(rel); + } + /** * Fetches the (single) PackagePart which is defined as * the supplied relation content type of the base @@ -109,7 +130,7 @@ public abstract class HXFDocument { * @param relationType The relation content type to search for * @throws IllegalArgumentException If we find more than one part of that type */ - private PackagePart getSinglePartByRelationType(String relationType) throws IllegalArgumentException, OpenXML4JException { + protected PackagePart getSinglePartByRelationType(String relationType) throws IllegalArgumentException, OpenXML4JException { PackageRelationshipCollection rels = container.getRelationshipsByType(relationType); if(rels.size() == 0) { diff --git a/src/scratchpad/ooxml-testcases/org/apache/poi/hssf/extractor/TestHXFExcelExtractor.java b/src/scratchpad/ooxml-testcases/org/apache/poi/hssf/extractor/TestHXFExcelExtractor.java index d7fc4dc3a..f47639bf5 100644 --- a/src/scratchpad/ooxml-testcases/org/apache/poi/hssf/extractor/TestHXFExcelExtractor.java +++ b/src/scratchpad/ooxml-testcases/org/apache/poi/hssf/extractor/TestHXFExcelExtractor.java @@ -18,6 +18,8 @@ package org.apache.poi.hssf.extractor; import java.io.File; import java.io.FileInputStream; +import java.util.regex.Matcher; +import java.util.regex.Pattern; import junit.framework.TestCase; @@ -170,7 +172,7 @@ public class TestHXFExcelExtractor extends TestCase { * ExcelExtractor does, when we're both passed * the same file, just saved as xls and xlsx */ - public void BROKENtestComparedToOLE2() throws Exception { + public void testComparedToOLE2() throws Exception { HXFExcelExtractor ooxmlExtractor = new HXFExcelExtractor(simpleXLSX.getPackage()); ExcelExtractor ole2Extractor = @@ -181,14 +183,13 @@ public class TestHXFExcelExtractor extends TestCase { for (int i = 0; i < extractors.length; i++) { POITextExtractor extractor = extractors[i]; - String text = extractor.getText().replace("\r", ""); + String text = extractor.getText().replaceAll("[\r\t]", ""); System.out.println(text.length()); System.out.println(text); - assertTrue(text.startsWith("First Sheet\nTest spreadsheet\t\n2nd row\t2nd row 2nd column\n")); - assertTrue(text.endsWith("13.0\nSheet3\n")); - - assertTrue(text.length() >= 214); - assertTrue(text.length() <= 214); + assertTrue(text.startsWith("First Sheet\nTest spreadsheet\n2nd row2nd row 2nd column\n")); + Pattern pattern = Pattern.compile(".*13(\\.0+)?\\s+Sheet3.*", Pattern.DOTALL); + Matcher m = pattern.matcher(text); + assertTrue(m.matches()); } } }