diff --git a/src/ooxml/java/org/apache/poi/xssf/eventusermodel/ReadOnlySharedStringsTable.java b/src/ooxml/java/org/apache/poi/xssf/eventusermodel/ReadOnlySharedStringsTable.java index 47865a86e..a482049b6 100644 --- a/src/ooxml/java/org/apache/poi/xssf/eventusermodel/ReadOnlySharedStringsTable.java +++ b/src/ooxml/java/org/apache/poi/xssf/eventusermodel/ReadOnlySharedStringsTable.java @@ -78,6 +78,8 @@ import org.xml.sax.helpers.DefaultHandler; * */ public class ReadOnlySharedStringsTable extends DefaultHandler { + + private final boolean includePhoneticRuns; /** * An integer representing the total count of strings in the workbook. This count does not * include any numbers, it counts only the total of text strings in the workbook. @@ -103,12 +105,29 @@ public class ReadOnlySharedStringsTable extends DefaultHandler { private Map phoneticStrings; /** + * Calls {@link #ReadOnlySharedStringsTable(OPCPackage, boolean)} with + * a value of true for including phonetic runs + * * @param pkg The {@link OPCPackage} to use as basis for the shared-strings table. * @throws IOException If reading the data from the package fails. * @throws SAXException if parsing the XML data fails. */ public ReadOnlySharedStringsTable(OPCPackage pkg) throws IOException, SAXException { + this(pkg, true); + } + + /** + * Like {@link #ReadOnlySharedStringsTable(OPCPackage)}, but lets the caller choose whether phonetic runs are included. + * @param pkg The {@link OPCPackage} to use as basis for the shared-strings table. + * @param includePhoneticRuns whether or not to concatenate phoneticRuns onto the shared string + * @since POI 3.14-Beta3 + * @throws IOException If reading the data from the package fails. + * @throws SAXException if parsing the XML data fails. 
+ */ + public ReadOnlySharedStringsTable(OPCPackage pkg, boolean includePhoneticRuns) + throws IOException, SAXException { + this.includePhoneticRuns = includePhoneticRuns; ArrayList parts = pkg.getPartsByContentType(XSSFRelation.SHARED_STRINGS.getContentType()); @@ -121,10 +140,24 @@ public class ReadOnlySharedStringsTable extends DefaultHandler { /** * Like POIXMLDocumentPart constructor - * + * + * Calls {@link #ReadOnlySharedStringsTable(PackagePart, boolean)}, with a + * value of true to include phonetic runs. + * * @since POI 3.14-Beta1 */ public ReadOnlySharedStringsTable(PackagePart part) throws IOException, SAXException { + this(part, true); + } + + /** + * Like POIXMLDocumentPart constructor + * @param includePhoneticRuns whether or not to concatenate phoneticRuns onto the shared string + * @since POI 3.14-Beta3 + */ + public ReadOnlySharedStringsTable(PackagePart part, boolean includePhoneticRuns) + throws IOException, SAXException { + this.includePhoneticRuns = includePhoneticRuns; readFrom(part.getInputStream()); } @@ -184,22 +217,6 @@ public class ReadOnlySharedStringsTable extends DefaultHandler { return strings.get(idx); } - /** - * Return the phonetic string at a given index. - * Returns null if no phonetic string - * exists at that index. - * @param idx - * @return - */ - public String getPhoneticStringAt(int idx) { - //avoid an NPE. 
If the parser hasn't - //yet hit phoneticStrings could be null - if (phoneticStrings == null) { - return null; - } - return phoneticStrings.get(idx); - } - public List getItems() { return strings; } @@ -207,7 +224,6 @@ public class ReadOnlySharedStringsTable extends DefaultHandler { //// ContentHandler methods //// private StringBuffer characters; - private StringBuffer rphCharacters; private boolean tIsOpen; private boolean inRPh; @@ -226,13 +242,16 @@ public class ReadOnlySharedStringsTable extends DefaultHandler { this.strings = new ArrayList(this.uniqueCount); this.phoneticStrings = new HashMap(); characters = new StringBuffer(); - rphCharacters = new StringBuffer(); } else if ("si".equals(localName)) { characters.setLength(0); } else if ("t".equals(localName)) { tIsOpen = true; } else if ("rPh".equals(localName)) { inRPh = true; + //append space...this assumes that rPh always comes after the regular text + if (includePhoneticRuns && characters.length() > 0) { + characters.append(" "); + } } } @@ -244,10 +263,6 @@ public class ReadOnlySharedStringsTable extends DefaultHandler { if ("si".equals(localName)) { strings.add(characters.toString()); - if (rphCharacters.length() > 0) { - phoneticStrings.put(strings.size()-1, rphCharacters.toString()); - rphCharacters.setLength(0); - } } else if ("t".equals(localName)) { tIsOpen = false; } else if ("rPh".equals(localName)) { @@ -261,9 +276,9 @@ public class ReadOnlySharedStringsTable extends DefaultHandler { public void characters(char[] ch, int start, int length) throws SAXException { if (tIsOpen) { - if (inRPh) { - rphCharacters.append(ch, start, length); - } else { + if (inRPh && includePhoneticRuns) { + characters.append(ch, start, length); + } else if (! 
inRPh){ characters.append(ch, start, length); } } diff --git a/src/ooxml/java/org/apache/poi/xssf/extractor/XSSFEventBasedExcelExtractor.java b/src/ooxml/java/org/apache/poi/xssf/extractor/XSSFEventBasedExcelExtractor.java index 18db97f43..e49c11c2e 100644 --- a/src/ooxml/java/org/apache/poi/xssf/extractor/XSSFEventBasedExcelExtractor.java +++ b/src/ooxml/java/org/apache/poi/xssf/extractor/XSSFEventBasedExcelExtractor.java @@ -16,6 +16,7 @@ ==================================================================== */ package org.apache.poi.xssf.extractor; +import javax.xml.parsers.ParserConfigurationException; import java.io.IOException; import java.io.InputStream; import java.util.HashMap; @@ -23,8 +24,6 @@ import java.util.List; import java.util.Locale; import java.util.Map; -import javax.xml.parsers.ParserConfigurationException; - import org.apache.poi.POIXMLProperties; import org.apache.poi.POIXMLProperties.CoreProperties; import org.apache.poi.POIXMLProperties.CustomProperties; @@ -64,6 +63,7 @@ public class XSSFEventBasedExcelExtractor extends POIXMLTextExtractor private boolean includeCellComments = false; private boolean includeHeadersFooters = true; private boolean formulasNotResults = false; + private boolean concatenatePhoneticRuns = true; public XSSFEventBasedExcelExtractor(String path) throws XmlException, OpenXML4JException, IOException { this(OPCPackage.open(path)); @@ -120,6 +120,14 @@ public class XSSFEventBasedExcelExtractor extends POIXMLTextExtractor this.includeCellComments = includeCellComments; } + /** + * Concatenate text from {@code <rPh>} text elements in SharedStringsTable. + * Default is true. + * @param concatenatePhoneticRuns whether phonetic-run text is appended to the shared strings + */ + public void setConcatenatePhoneticRuns(boolean concatenatePhoneticRuns) { + this.concatenatePhoneticRuns = concatenatePhoneticRuns; + } public void setLocale(Locale locale) { this.locale = locale; } @@ -189,7 +197,7 @@ public class XSSFEventBasedExcelExtractor extends POIXMLTextExtractor */ public String getText() { try { 
- ReadOnlySharedStringsTable strings = new ReadOnlySharedStringsTable(container); + ReadOnlySharedStringsTable strings = new ReadOnlySharedStringsTable(container, concatenatePhoneticRuns); XSSFReader xssfReader = new XSSFReader(container); StylesTable styles = xssfReader.getStylesTable(); XSSFReader.SheetIterator iter = (XSSFReader.SheetIterator) xssfReader.getSheetsData(); diff --git a/src/ooxml/testcases/org/apache/poi/xssf/eventusermodel/TestReadOnlySharedStringsTable.java b/src/ooxml/testcases/org/apache/poi/xssf/eventusermodel/TestReadOnlySharedStringsTable.java index 060c5a80d..4b5e62cc5 100644 --- a/src/ooxml/testcases/org/apache/poi/xssf/eventusermodel/TestReadOnlySharedStringsTable.java +++ b/src/ooxml/testcases/org/apache/poi/xssf/eventusermodel/TestReadOnlySharedStringsTable.java @@ -59,19 +59,27 @@ public final class TestReadOnlySharedStringsTable extends TestCase { } + //51519 public void testPhoneticRuns() throws Exception { OPCPackage pkg = OPCPackage.open(_ssTests.openResourceAsStream("51519.xlsx")); List parts = pkg.getPartsByName(Pattern.compile("/xl/sharedStrings.xml")); assertEquals(1, parts.size()); - ReadOnlySharedStringsTable rtbl = new ReadOnlySharedStringsTable(parts.get(0)); + ReadOnlySharedStringsTable rtbl = new ReadOnlySharedStringsTable(parts.get(0), true); List strings = rtbl.getItems(); assertEquals(49, strings.size()); assertEquals("\u30B3\u30E1\u30F3\u30C8", rtbl.getEntryAt(0)); - assertNull(rtbl.getPhoneticStringAt(0)); + assertEquals("\u65E5\u672C\u30AA\u30E9\u30AF\u30EB \u30CB\u30DB\u30F3", rtbl.getEntryAt(3)); + + //now do not include phonetic runs + rtbl = new ReadOnlySharedStringsTable(parts.get(0), false); + strings = rtbl.getItems(); + assertEquals(49, strings.size()); + + assertEquals("\u30B3\u30E1\u30F3\u30C8", rtbl.getEntryAt(0)); assertEquals("\u65E5\u672C\u30AA\u30E9\u30AF\u30EB", rtbl.getEntryAt(3)); - assertEquals("\u30CB\u30DB\u30F3", rtbl.getPhoneticStringAt(3)); + } public void 
testEmptySSTOnPackageObtainedViaWorkbook() throws Exception { diff --git a/src/ooxml/testcases/org/apache/poi/xssf/extractor/TestXSSFEventBasedExcelExtractor.java b/src/ooxml/testcases/org/apache/poi/xssf/extractor/TestXSSFEventBasedExcelExtractor.java index 73d22aeeb..b6d411d50 100644 --- a/src/ooxml/testcases/org/apache/poi/xssf/extractor/TestXSSFEventBasedExcelExtractor.java +++ b/src/ooxml/testcases/org/apache/poi/xssf/extractor/TestXSSFEventBasedExcelExtractor.java @@ -18,6 +18,7 @@ package org.apache.poi.xssf.extractor; import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertFalse; import static org.junit.Assert.assertNotNull; import static org.junit.Assert.assertTrue; @@ -359,4 +360,25 @@ public class TestXSSFEventBasedExcelExtractor { assertTrue("can't find 10/02/2016", text.contains("10/02/2016")); ex.close(); } + + @Test + public void test51519() throws Exception { + //default behavior: include phonetic runs + XSSFEventBasedExcelExtractor ex = + new XSSFEventBasedExcelExtractor( + XSSFTestDataSamples.openSamplePackage("51519.xlsx")); + String text = ex.getText(); + assertTrue("can't find appended phonetic run", text.contains("\u65E5\u672C\u30AA\u30E9\u30AF\u30EB \u30CB\u30DB\u30F3")); + ex.close(); + + //now try turning them off + ex = + new XSSFEventBasedExcelExtractor( + XSSFTestDataSamples.openSamplePackage("51519.xlsx")); + ex.setConcatenatePhoneticRuns(false); + text = ex.getText(); + assertFalse("should not be able to find appended phonetic run", text.contains("\u65E5\u672C\u30AA\u30E9\u30AF\u30EB \u30CB\u30DB\u30F3")); + ex.close(); + + } }