51519 -- follow on, make concatenation of rPh configurable
git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@1786021 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
5bb1195606
commit
b442671274
@ -78,6 +78,8 @@ import org.xml.sax.helpers.DefaultHandler;
|
|||||||
*
|
*
|
||||||
*/
|
*/
|
||||||
public class ReadOnlySharedStringsTable extends DefaultHandler {
|
public class ReadOnlySharedStringsTable extends DefaultHandler {
|
||||||
|
|
||||||
|
private final boolean includePhoneticRuns;
|
||||||
/**
|
/**
|
||||||
* An integer representing the total count of strings in the workbook. This count does not
|
* An integer representing the total count of strings in the workbook. This count does not
|
||||||
* include any numbers, it counts only the total of text strings in the workbook.
|
* include any numbers, it counts only the total of text strings in the workbook.
|
||||||
@ -103,12 +105,29 @@ public class ReadOnlySharedStringsTable extends DefaultHandler {
|
|||||||
private Map<Integer, String> phoneticStrings;
|
private Map<Integer, String> phoneticStrings;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
* Calls {@link #ReadOnlySharedStringsTable(OPCPackage, boolean)} with
|
||||||
|
* a value of <code>true</code> for including phonetic runs
|
||||||
|
*
|
||||||
* @param pkg The {@link OPCPackage} to use as basis for the shared-strings table.
|
* @param pkg The {@link OPCPackage} to use as basis for the shared-strings table.
|
||||||
* @throws IOException If reading the data from the package fails.
|
* @throws IOException If reading the data from the package fails.
|
||||||
* @throws SAXException if parsing the XML data fails.
|
* @throws SAXException if parsing the XML data fails.
|
||||||
*/
|
*/
|
||||||
public ReadOnlySharedStringsTable(OPCPackage pkg)
|
public ReadOnlySharedStringsTable(OPCPackage pkg)
|
||||||
throws IOException, SAXException {
|
throws IOException, SAXException {
|
||||||
|
this(pkg, true);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
*
|
||||||
|
* @param pkg The {@link OPCPackage} to use as basis for the shared-strings table.
|
||||||
|
* @param includePhoneticRuns whether or not to concatenate phoneticRuns onto the shared string
|
||||||
|
* @since POI 3.14-Beta3
|
||||||
|
* @throws IOException If reading the data from the package fails.
|
||||||
|
* @throws SAXException if parsing the XML data fails.
|
||||||
|
*/
|
||||||
|
public ReadOnlySharedStringsTable(OPCPackage pkg, boolean includePhoneticRuns)
|
||||||
|
throws IOException, SAXException {
|
||||||
|
this.includePhoneticRuns = includePhoneticRuns;
|
||||||
ArrayList<PackagePart> parts =
|
ArrayList<PackagePart> parts =
|
||||||
pkg.getPartsByContentType(XSSFRelation.SHARED_STRINGS.getContentType());
|
pkg.getPartsByContentType(XSSFRelation.SHARED_STRINGS.getContentType());
|
||||||
|
|
||||||
@ -122,9 +141,23 @@ public class ReadOnlySharedStringsTable extends DefaultHandler {
|
|||||||
/**
|
/**
|
||||||
* Like POIXMLDocumentPart constructor
|
* Like POIXMLDocumentPart constructor
|
||||||
*
|
*
|
||||||
|
* Calls {@link #ReadOnlySharedStringsTable(PackagePart, boolean)}, with a
|
||||||
|
* value of <code>true</code> to include phonetic runs.
|
||||||
|
*
|
||||||
* @since POI 3.14-Beta1
|
* @since POI 3.14-Beta1
|
||||||
*/
|
*/
|
||||||
public ReadOnlySharedStringsTable(PackagePart part) throws IOException, SAXException {
|
public ReadOnlySharedStringsTable(PackagePart part) throws IOException, SAXException {
|
||||||
|
this(part, true);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Like POIXMLDocumentPart constructor
|
||||||
|
*
|
||||||
|
* @since POI 3.14-Beta3
|
||||||
|
*/
|
||||||
|
public ReadOnlySharedStringsTable(PackagePart part, boolean includePhoneticRuns)
|
||||||
|
throws IOException, SAXException {
|
||||||
|
this.includePhoneticRuns = includePhoneticRuns;
|
||||||
readFrom(part.getInputStream());
|
readFrom(part.getInputStream());
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -184,22 +217,6 @@ public class ReadOnlySharedStringsTable extends DefaultHandler {
|
|||||||
return strings.get(idx);
|
return strings.get(idx);
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
|
||||||
* Return the phonetic string at a given index.
|
|
||||||
* Returns <code>null</code> if no phonetic string
|
|
||||||
* exists at that index.
|
|
||||||
* @param idx
|
|
||||||
* @return
|
|
||||||
*/
|
|
||||||
public String getPhoneticStringAt(int idx) {
|
|
||||||
//avoid an NPE. If the parser hasn't
|
|
||||||
//yet hit <sst/> phoneticStrings could be null
|
|
||||||
if (phoneticStrings == null) {
|
|
||||||
return null;
|
|
||||||
}
|
|
||||||
return phoneticStrings.get(idx);
|
|
||||||
}
|
|
||||||
|
|
||||||
public List<String> getItems() {
|
public List<String> getItems() {
|
||||||
return strings;
|
return strings;
|
||||||
}
|
}
|
||||||
@ -207,7 +224,6 @@ public class ReadOnlySharedStringsTable extends DefaultHandler {
|
|||||||
//// ContentHandler methods ////
|
//// ContentHandler methods ////
|
||||||
|
|
||||||
private StringBuffer characters;
|
private StringBuffer characters;
|
||||||
private StringBuffer rphCharacters;
|
|
||||||
private boolean tIsOpen;
|
private boolean tIsOpen;
|
||||||
private boolean inRPh;
|
private boolean inRPh;
|
||||||
|
|
||||||
@ -226,13 +242,16 @@ public class ReadOnlySharedStringsTable extends DefaultHandler {
|
|||||||
this.strings = new ArrayList<String>(this.uniqueCount);
|
this.strings = new ArrayList<String>(this.uniqueCount);
|
||||||
this.phoneticStrings = new HashMap<Integer, String>();
|
this.phoneticStrings = new HashMap<Integer, String>();
|
||||||
characters = new StringBuffer();
|
characters = new StringBuffer();
|
||||||
rphCharacters = new StringBuffer();
|
|
||||||
} else if ("si".equals(localName)) {
|
} else if ("si".equals(localName)) {
|
||||||
characters.setLength(0);
|
characters.setLength(0);
|
||||||
} else if ("t".equals(localName)) {
|
} else if ("t".equals(localName)) {
|
||||||
tIsOpen = true;
|
tIsOpen = true;
|
||||||
} else if ("rPh".equals(localName)) {
|
} else if ("rPh".equals(localName)) {
|
||||||
inRPh = true;
|
inRPh = true;
|
||||||
|
//append space...this assumes that rPh always comes after regular <t>
|
||||||
|
if (includePhoneticRuns && characters.length() > 0) {
|
||||||
|
characters.append(" ");
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -244,10 +263,6 @@ public class ReadOnlySharedStringsTable extends DefaultHandler {
|
|||||||
|
|
||||||
if ("si".equals(localName)) {
|
if ("si".equals(localName)) {
|
||||||
strings.add(characters.toString());
|
strings.add(characters.toString());
|
||||||
if (rphCharacters.length() > 0) {
|
|
||||||
phoneticStrings.put(strings.size()-1, rphCharacters.toString());
|
|
||||||
rphCharacters.setLength(0);
|
|
||||||
}
|
|
||||||
} else if ("t".equals(localName)) {
|
} else if ("t".equals(localName)) {
|
||||||
tIsOpen = false;
|
tIsOpen = false;
|
||||||
} else if ("rPh".equals(localName)) {
|
} else if ("rPh".equals(localName)) {
|
||||||
@ -261,9 +276,9 @@ public class ReadOnlySharedStringsTable extends DefaultHandler {
|
|||||||
public void characters(char[] ch, int start, int length)
|
public void characters(char[] ch, int start, int length)
|
||||||
throws SAXException {
|
throws SAXException {
|
||||||
if (tIsOpen) {
|
if (tIsOpen) {
|
||||||
if (inRPh) {
|
if (inRPh && includePhoneticRuns) {
|
||||||
rphCharacters.append(ch, start, length);
|
characters.append(ch, start, length);
|
||||||
} else {
|
} else if (! inRPh){
|
||||||
characters.append(ch, start, length);
|
characters.append(ch, start, length);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -16,6 +16,7 @@
|
|||||||
==================================================================== */
|
==================================================================== */
|
||||||
package org.apache.poi.xssf.extractor;
|
package org.apache.poi.xssf.extractor;
|
||||||
|
|
||||||
|
import javax.xml.parsers.ParserConfigurationException;
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.io.InputStream;
|
import java.io.InputStream;
|
||||||
import java.util.HashMap;
|
import java.util.HashMap;
|
||||||
@ -23,8 +24,6 @@ import java.util.List;
|
|||||||
import java.util.Locale;
|
import java.util.Locale;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
|
|
||||||
import javax.xml.parsers.ParserConfigurationException;
|
|
||||||
|
|
||||||
import org.apache.poi.POIXMLProperties;
|
import org.apache.poi.POIXMLProperties;
|
||||||
import org.apache.poi.POIXMLProperties.CoreProperties;
|
import org.apache.poi.POIXMLProperties.CoreProperties;
|
||||||
import org.apache.poi.POIXMLProperties.CustomProperties;
|
import org.apache.poi.POIXMLProperties.CustomProperties;
|
||||||
@ -64,6 +63,7 @@ public class XSSFEventBasedExcelExtractor extends POIXMLTextExtractor
|
|||||||
private boolean includeCellComments = false;
|
private boolean includeCellComments = false;
|
||||||
private boolean includeHeadersFooters = true;
|
private boolean includeHeadersFooters = true;
|
||||||
private boolean formulasNotResults = false;
|
private boolean formulasNotResults = false;
|
||||||
|
private boolean concatenatePhoneticRuns = true;
|
||||||
|
|
||||||
public XSSFEventBasedExcelExtractor(String path) throws XmlException, OpenXML4JException, IOException {
|
public XSSFEventBasedExcelExtractor(String path) throws XmlException, OpenXML4JException, IOException {
|
||||||
this(OPCPackage.open(path));
|
this(OPCPackage.open(path));
|
||||||
@ -120,6 +120,14 @@ public class XSSFEventBasedExcelExtractor extends POIXMLTextExtractor
|
|||||||
this.includeCellComments = includeCellComments;
|
this.includeCellComments = includeCellComments;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Concatenate text from {@code <rPh>} text elements in SharedStringsTable.
|
||||||
|
* Default is true.
|
||||||
|
* @param concatenatePhoneticRuns whether or not to concatenate phonetic runs onto the shared string
|
||||||
|
*/
|
||||||
|
public void setConcatenatePhoneticRuns(boolean concatenatePhoneticRuns) {
|
||||||
|
this.concatenatePhoneticRuns = concatenatePhoneticRuns;
|
||||||
|
}
|
||||||
public void setLocale(Locale locale) {
|
public void setLocale(Locale locale) {
|
||||||
this.locale = locale;
|
this.locale = locale;
|
||||||
}
|
}
|
||||||
@ -189,7 +197,7 @@ public class XSSFEventBasedExcelExtractor extends POIXMLTextExtractor
|
|||||||
*/
|
*/
|
||||||
public String getText() {
|
public String getText() {
|
||||||
try {
|
try {
|
||||||
ReadOnlySharedStringsTable strings = new ReadOnlySharedStringsTable(container);
|
ReadOnlySharedStringsTable strings = new ReadOnlySharedStringsTable(container, concatenatePhoneticRuns);
|
||||||
XSSFReader xssfReader = new XSSFReader(container);
|
XSSFReader xssfReader = new XSSFReader(container);
|
||||||
StylesTable styles = xssfReader.getStylesTable();
|
StylesTable styles = xssfReader.getStylesTable();
|
||||||
XSSFReader.SheetIterator iter = (XSSFReader.SheetIterator) xssfReader.getSheetsData();
|
XSSFReader.SheetIterator iter = (XSSFReader.SheetIterator) xssfReader.getSheetsData();
|
||||||
|
@ -59,19 +59,27 @@ public final class TestReadOnlySharedStringsTable extends TestCase {
|
|||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
//51519
|
||||||
public void testPhoneticRuns() throws Exception {
|
public void testPhoneticRuns() throws Exception {
|
||||||
OPCPackage pkg = OPCPackage.open(_ssTests.openResourceAsStream("51519.xlsx"));
|
OPCPackage pkg = OPCPackage.open(_ssTests.openResourceAsStream("51519.xlsx"));
|
||||||
List<PackagePart> parts = pkg.getPartsByName(Pattern.compile("/xl/sharedStrings.xml"));
|
List<PackagePart> parts = pkg.getPartsByName(Pattern.compile("/xl/sharedStrings.xml"));
|
||||||
assertEquals(1, parts.size());
|
assertEquals(1, parts.size());
|
||||||
|
|
||||||
ReadOnlySharedStringsTable rtbl = new ReadOnlySharedStringsTable(parts.get(0));
|
ReadOnlySharedStringsTable rtbl = new ReadOnlySharedStringsTable(parts.get(0), true);
|
||||||
List<String> strings = rtbl.getItems();
|
List<String> strings = rtbl.getItems();
|
||||||
assertEquals(49, strings.size());
|
assertEquals(49, strings.size());
|
||||||
|
|
||||||
assertEquals("\u30B3\u30E1\u30F3\u30C8", rtbl.getEntryAt(0));
|
assertEquals("\u30B3\u30E1\u30F3\u30C8", rtbl.getEntryAt(0));
|
||||||
assertNull(rtbl.getPhoneticStringAt(0));
|
assertEquals("\u65E5\u672C\u30AA\u30E9\u30AF\u30EB \u30CB\u30DB\u30F3", rtbl.getEntryAt(3));
|
||||||
|
|
||||||
|
//now do not include phonetic runs
|
||||||
|
rtbl = new ReadOnlySharedStringsTable(parts.get(0), false);
|
||||||
|
strings = rtbl.getItems();
|
||||||
|
assertEquals(49, strings.size());
|
||||||
|
|
||||||
|
assertEquals("\u30B3\u30E1\u30F3\u30C8", rtbl.getEntryAt(0));
|
||||||
assertEquals("\u65E5\u672C\u30AA\u30E9\u30AF\u30EB", rtbl.getEntryAt(3));
|
assertEquals("\u65E5\u672C\u30AA\u30E9\u30AF\u30EB", rtbl.getEntryAt(3));
|
||||||
assertEquals("\u30CB\u30DB\u30F3", rtbl.getPhoneticStringAt(3));
|
|
||||||
}
|
}
|
||||||
|
|
||||||
public void testEmptySSTOnPackageObtainedViaWorkbook() throws Exception {
|
public void testEmptySSTOnPackageObtainedViaWorkbook() throws Exception {
|
||||||
|
@ -18,6 +18,7 @@
|
|||||||
package org.apache.poi.xssf.extractor;
|
package org.apache.poi.xssf.extractor;
|
||||||
|
|
||||||
import static org.junit.Assert.assertEquals;
|
import static org.junit.Assert.assertEquals;
|
||||||
|
import static org.junit.Assert.assertFalse;
|
||||||
import static org.junit.Assert.assertNotNull;
|
import static org.junit.Assert.assertNotNull;
|
||||||
import static org.junit.Assert.assertTrue;
|
import static org.junit.Assert.assertTrue;
|
||||||
|
|
||||||
@ -359,4 +360,25 @@ public class TestXSSFEventBasedExcelExtractor {
|
|||||||
assertTrue("can't find 10/02/2016", text.contains("10/02/2016"));
|
assertTrue("can't find 10/02/2016", text.contains("10/02/2016"));
|
||||||
ex.close();
|
ex.close();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void test51519() throws Exception {
|
||||||
|
//default behavior: include phonetic runs
|
||||||
|
XSSFEventBasedExcelExtractor ex =
|
||||||
|
new XSSFEventBasedExcelExtractor(
|
||||||
|
XSSFTestDataSamples.openSamplePackage("51519.xlsx"));
|
||||||
|
String text = ex.getText();
|
||||||
|
assertTrue("can't find appended phonetic run", text.contains("\u65E5\u672C\u30AA\u30E9\u30AF\u30EB \u30CB\u30DB\u30F3"));
|
||||||
|
ex.close();
|
||||||
|
|
||||||
|
//now try turning them off
|
||||||
|
ex =
|
||||||
|
new XSSFEventBasedExcelExtractor(
|
||||||
|
XSSFTestDataSamples.openSamplePackage("51519.xlsx"));
|
||||||
|
ex.setConcatenatePhoneticRuns(false);
|
||||||
|
text = ex.getText();
|
||||||
|
assertFalse("should not be able to find appended phonetic run", text.contains("\u65E5\u672C\u30AA\u30E9\u30AF\u30EB \u30CB\u30DB\u30F3"));
|
||||||
|
ex.close();
|
||||||
|
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user