diff --git a/src/ooxml/java/org/apache/poi/xssf/eventusermodel/ReadOnlySharedStringsTable.java b/src/ooxml/java/org/apache/poi/xssf/eventusermodel/ReadOnlySharedStringsTable.java index 4de27401d..47865a86e 100644 --- a/src/ooxml/java/org/apache/poi/xssf/eventusermodel/ReadOnlySharedStringsTable.java +++ b/src/ooxml/java/org/apache/poi/xssf/eventusermodel/ReadOnlySharedStringsTable.java @@ -18,13 +18,14 @@ package org.apache.poi.xssf.eventusermodel; import static org.apache.poi.xssf.usermodel.XSSFRelation.NS_SPREADSHEETML; +import javax.xml.parsers.ParserConfigurationException; import java.io.IOException; import java.io.InputStream; import java.io.PushbackInputStream; import java.util.ArrayList; +import java.util.HashMap; import java.util.List; - -import javax.xml.parsers.ParserConfigurationException; +import java.util.Map; import org.apache.poi.openxml4j.opc.OPCPackage; import org.apache.poi.openxml4j.opc.PackagePart; @@ -95,6 +96,12 @@ public class ReadOnlySharedStringsTable extends DefaultHandler { */ private List strings; + /** + * Map of phonetic strings (if they exist) indexed + * with the integer matching the index in strings + */ + private Map phoneticStrings; + /** * @param pkg The {@link OPCPackage} to use as basis for the shared-strings table. * @throws IOException If reading the data from the package fails. @@ -177,6 +184,22 @@ public class ReadOnlySharedStringsTable extends DefaultHandler { return strings.get(idx); } + /** + * Return the phonetic string at a given index. + * Returns null if no phonetic string + * exists at that index. + * @param idx index of the entry in the shared strings list + * @return the phonetic string recorded for that entry, or null if there is none + */ + public String getPhoneticStringAt(int idx) { + //avoid an NPE. 
If the parser hasn't + //yet hit the sst element, phoneticStrings could be null + if (phoneticStrings == null) { + return null; + } + return phoneticStrings.get(idx); + } + public List getItems() { return strings; } @@ -184,14 +207,16 @@ public class ReadOnlySharedStringsTable extends DefaultHandler { //// ContentHandler methods //// private StringBuffer characters; + private StringBuffer rphCharacters; private boolean tIsOpen; + private boolean inRPh; public void startElement(String uri, String localName, String name, Attributes attributes) throws SAXException { if (uri != null && ! uri.equals(NS_SPREADSHEETML)) { return; } - + if ("sst".equals(localName)) { String count = attributes.getValue("count"); if(count != null) this.count = Integer.parseInt(count); @@ -199,12 +224,15 @@ public class ReadOnlySharedStringsTable extends DefaultHandler { if(uniqueCount != null) this.uniqueCount = Integer.parseInt(uniqueCount); this.strings = new ArrayList(this.uniqueCount); - + this.phoneticStrings = new HashMap(); characters = new StringBuffer(); + rphCharacters = new StringBuffer(); } else if ("si".equals(localName)) { characters.setLength(0); } else if ("t".equals(localName)) { tIsOpen = true; + } else if ("rPh".equals(localName)) { + inRPh = true; } } @@ -213,11 +241,17 @@ public class ReadOnlySharedStringsTable extends DefaultHandler { if (uri != null && ! 
uri.equals(NS_SPREADSHEETML)) { return; } - + if ("si".equals(localName)) { strings.add(characters.toString()); + if (rphCharacters.length() > 0) { + phoneticStrings.put(strings.size()-1, rphCharacters.toString()); + rphCharacters.setLength(0); + } } else if ("t".equals(localName)) { - tIsOpen = false; + tIsOpen = false; + } else if ("rPh".equals(localName)) { + inRPh = false; } } @@ -226,8 +260,12 @@ public class ReadOnlySharedStringsTable extends DefaultHandler { */ public void characters(char[] ch, int start, int length) throws SAXException { - if (tIsOpen) - characters.append(ch, start, length); + if (tIsOpen) { + if (inRPh) { + rphCharacters.append(ch, start, length); + } else { + characters.append(ch, start, length); + } + } } - } diff --git a/src/ooxml/testcases/org/apache/poi/xssf/eventusermodel/TestReadOnlySharedStringsTable.java b/src/ooxml/testcases/org/apache/poi/xssf/eventusermodel/TestReadOnlySharedStringsTable.java index 98b51b131..060c5a80d 100644 --- a/src/ooxml/testcases/org/apache/poi/xssf/eventusermodel/TestReadOnlySharedStringsTable.java +++ b/src/ooxml/testcases/org/apache/poi/xssf/eventusermodel/TestReadOnlySharedStringsTable.java @@ -19,8 +19,11 @@ package org.apache.poi.xssf.eventusermodel; -import junit.framework.TestCase; +import java.io.IOException; +import java.util.List; +import java.util.regex.Pattern; +import junit.framework.TestCase; import org.apache.poi.POIDataSamples; import org.apache.poi.openxml4j.opc.OPCPackage; import org.apache.poi.openxml4j.opc.PackagePart; @@ -29,10 +32,6 @@ import org.apache.poi.xssf.usermodel.XSSFWorkbook; import org.openxmlformats.schemas.spreadsheetml.x2006.main.CTRst; import org.xml.sax.SAXException; -import java.io.IOException; -import java.util.List; -import java.util.regex.Pattern; - /** * Tests for {@link org.apache.poi.xssf.eventusermodel.XSSFReader} */ @@ -59,7 +58,22 @@ public final class TestReadOnlySharedStringsTable extends TestCase { } } - + + public void testPhoneticRuns() throws Exception 
{ + OPCPackage pkg = OPCPackage.open(_ssTests.openResourceAsStream("51519.xlsx")); + List parts = pkg.getPartsByName(Pattern.compile("/xl/sharedStrings.xml")); + assertEquals(1, parts.size()); + + ReadOnlySharedStringsTable rtbl = new ReadOnlySharedStringsTable(parts.get(0)); + List strings = rtbl.getItems(); + assertEquals(49, strings.size()); + + assertEquals("\u30B3\u30E1\u30F3\u30C8", rtbl.getEntryAt(0)); + assertNull(rtbl.getPhoneticStringAt(0)); + assertEquals("\u65E5\u672C\u30AA\u30E9\u30AF\u30EB", rtbl.getEntryAt(3)); + assertEquals("\u30CB\u30DB\u30F3", rtbl.getPhoneticStringAt(3)); + } + public void testEmptySSTOnPackageObtainedViaWorkbook() throws Exception { XSSFWorkbook wb = new XSSFWorkbook(_ssTests.openResourceAsStream("noSharedStringTable.xlsx")); OPCPackage pkg = wb.getPackage(); diff --git a/src/ooxml/testcases/org/apache/poi/xssf/extractor/TestXSSFExcelExtractor.java b/src/ooxml/testcases/org/apache/poi/xssf/extractor/TestXSSFExcelExtractor.java index 89a4e4047..d82ac6205 100644 --- a/src/ooxml/testcases/org/apache/poi/xssf/extractor/TestXSSFExcelExtractor.java +++ b/src/ooxml/testcases/org/apache/poi/xssf/extractor/TestXSSFExcelExtractor.java @@ -22,7 +22,6 @@ import java.util.regex.Matcher; import java.util.regex.Pattern; import junit.framework.TestCase; - import org.apache.poi.POITextExtractor; import org.apache.poi.hssf.HSSFTestDataSamples; import org.apache.poi.hssf.extractor.ExcelExtractor; @@ -226,4 +225,18 @@ public class TestXSSFExcelExtractor extends TestCase { extractor.close(); } } + + public void testPhoneticRuns() throws Exception { + XSSFExcelExtractor extractor = getExtractor("51519.xlsx"); + try { + String text = extractor.getText(); + assertTrue(text.contains("\u8C4A\u7530")); + //this shows up only as a phonetic run and should not appear + //in the extracted text + assertFalse(text.contains("\u30CB\u30DB\u30F3")); + } finally { + extractor.close(); + } + + } } diff --git a/test-data/spreadsheet/51519.xlsx 
b/test-data/spreadsheet/51519.xlsx new file mode 100644 index 000000000..ba56dc610 Binary files /dev/null and b/test-data/spreadsheet/51519.xlsx differ