51519 -- allow users to ignore or include the <rPh> (phonetic run) element in the ReadOnlySharedStringsTable used in the SAX/streaming xlsx reader.
git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@1785965 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
8dd5f7b887
commit
5bb1195606
@ -18,13 +18,14 @@ package org.apache.poi.xssf.eventusermodel;
|
|||||||
|
|
||||||
import static org.apache.poi.xssf.usermodel.XSSFRelation.NS_SPREADSHEETML;
|
import static org.apache.poi.xssf.usermodel.XSSFRelation.NS_SPREADSHEETML;
|
||||||
|
|
||||||
|
import javax.xml.parsers.ParserConfigurationException;
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.io.InputStream;
|
import java.io.InputStream;
|
||||||
import java.io.PushbackInputStream;
|
import java.io.PushbackInputStream;
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
|
import java.util.HashMap;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
import java.util.Map;
|
||||||
import javax.xml.parsers.ParserConfigurationException;
|
|
||||||
|
|
||||||
import org.apache.poi.openxml4j.opc.OPCPackage;
|
import org.apache.poi.openxml4j.opc.OPCPackage;
|
||||||
import org.apache.poi.openxml4j.opc.PackagePart;
|
import org.apache.poi.openxml4j.opc.PackagePart;
|
||||||
@ -95,6 +96,12 @@ public class ReadOnlySharedStringsTable extends DefaultHandler {
|
|||||||
*/
|
*/
|
||||||
private List<String> strings;
|
private List<String> strings;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Map of phonetic strings (if they exist) indexed
|
||||||
|
* with the integer matching the index in strings
|
||||||
|
*/
|
||||||
|
private Map<Integer, String> phoneticStrings;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @param pkg The {@link OPCPackage} to use as basis for the shared-strings table.
|
* @param pkg The {@link OPCPackage} to use as basis for the shared-strings table.
|
||||||
* @throws IOException If reading the data from the package fails.
|
* @throws IOException If reading the data from the package fails.
|
||||||
@ -177,6 +184,22 @@ public class ReadOnlySharedStringsTable extends DefaultHandler {
|
|||||||
return strings.get(idx);
|
return strings.get(idx);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Return the phonetic string at a given index.
|
||||||
|
* Returns <code>null</code> if no phonetic string
|
||||||
|
* exists at that index.
|
||||||
|
* @param idx index of the corresponding entry in the shared strings list
|
||||||
|
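* @return the phonetic string stored for that index, or <code>null</code> if there is none or the table has not been parsed yet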
|
||||||
|
*/
|
||||||
|
public String getPhoneticStringAt(int idx) {
|
||||||
|
//avoid an NPE. If the parser hasn't
|
||||||
|
//yet hit <sst/> phoneticStrings could be null
|
||||||
|
if (phoneticStrings == null) {
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
return phoneticStrings.get(idx);
|
||||||
|
}
|
||||||
|
|
||||||
public List<String> getItems() {
|
public List<String> getItems() {
|
||||||
return strings;
|
return strings;
|
||||||
}
|
}
|
||||||
@ -184,7 +207,9 @@ public class ReadOnlySharedStringsTable extends DefaultHandler {
|
|||||||
//// ContentHandler methods ////
|
//// ContentHandler methods ////
|
||||||
|
|
||||||
private StringBuffer characters;
|
private StringBuffer characters;
|
||||||
|
private StringBuffer rphCharacters;
|
||||||
private boolean tIsOpen;
|
private boolean tIsOpen;
|
||||||
|
private boolean inRPh;
|
||||||
|
|
||||||
public void startElement(String uri, String localName, String name,
|
public void startElement(String uri, String localName, String name,
|
||||||
Attributes attributes) throws SAXException {
|
Attributes attributes) throws SAXException {
|
||||||
@ -199,12 +224,15 @@ public class ReadOnlySharedStringsTable extends DefaultHandler {
|
|||||||
if(uniqueCount != null) this.uniqueCount = Integer.parseInt(uniqueCount);
|
if(uniqueCount != null) this.uniqueCount = Integer.parseInt(uniqueCount);
|
||||||
|
|
||||||
this.strings = new ArrayList<String>(this.uniqueCount);
|
this.strings = new ArrayList<String>(this.uniqueCount);
|
||||||
|
this.phoneticStrings = new HashMap<Integer, String>();
|
||||||
characters = new StringBuffer();
|
characters = new StringBuffer();
|
||||||
|
rphCharacters = new StringBuffer();
|
||||||
} else if ("si".equals(localName)) {
|
} else if ("si".equals(localName)) {
|
||||||
characters.setLength(0);
|
characters.setLength(0);
|
||||||
} else if ("t".equals(localName)) {
|
} else if ("t".equals(localName)) {
|
||||||
tIsOpen = true;
|
tIsOpen = true;
|
||||||
|
} else if ("rPh".equals(localName)) {
|
||||||
|
inRPh = true;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -216,8 +244,14 @@ public class ReadOnlySharedStringsTable extends DefaultHandler {
|
|||||||
|
|
||||||
if ("si".equals(localName)) {
|
if ("si".equals(localName)) {
|
||||||
strings.add(characters.toString());
|
strings.add(characters.toString());
|
||||||
|
if (rphCharacters.length() > 0) {
|
||||||
|
phoneticStrings.put(strings.size()-1, rphCharacters.toString());
|
||||||
|
rphCharacters.setLength(0);
|
||||||
|
}
|
||||||
} else if ("t".equals(localName)) {
|
} else if ("t".equals(localName)) {
|
||||||
tIsOpen = false;
|
tIsOpen = false;
|
||||||
|
} else if ("rPh".equals(localName)) {
|
||||||
|
inRPh = false;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -226,8 +260,12 @@ public class ReadOnlySharedStringsTable extends DefaultHandler {
|
|||||||
*/
|
*/
|
||||||
public void characters(char[] ch, int start, int length)
|
public void characters(char[] ch, int start, int length)
|
||||||
throws SAXException {
|
throws SAXException {
|
||||||
if (tIsOpen)
|
if (tIsOpen) {
|
||||||
|
if (inRPh) {
|
||||||
|
rphCharacters.append(ch, start, length);
|
||||||
|
} else {
|
||||||
characters.append(ch, start, length);
|
characters.append(ch, start, length);
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
@ -19,8 +19,11 @@
|
|||||||
|
|
||||||
package org.apache.poi.xssf.eventusermodel;
|
package org.apache.poi.xssf.eventusermodel;
|
||||||
|
|
||||||
import junit.framework.TestCase;
|
import java.io.IOException;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.regex.Pattern;
|
||||||
|
|
||||||
|
import junit.framework.TestCase;
|
||||||
import org.apache.poi.POIDataSamples;
|
import org.apache.poi.POIDataSamples;
|
||||||
import org.apache.poi.openxml4j.opc.OPCPackage;
|
import org.apache.poi.openxml4j.opc.OPCPackage;
|
||||||
import org.apache.poi.openxml4j.opc.PackagePart;
|
import org.apache.poi.openxml4j.opc.PackagePart;
|
||||||
@ -29,10 +32,6 @@ import org.apache.poi.xssf.usermodel.XSSFWorkbook;
|
|||||||
import org.openxmlformats.schemas.spreadsheetml.x2006.main.CTRst;
|
import org.openxmlformats.schemas.spreadsheetml.x2006.main.CTRst;
|
||||||
import org.xml.sax.SAXException;
|
import org.xml.sax.SAXException;
|
||||||
|
|
||||||
import java.io.IOException;
|
|
||||||
import java.util.List;
|
|
||||||
import java.util.regex.Pattern;
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Tests for {@link org.apache.poi.xssf.eventusermodel.ReadOnlySharedStringsTable}
|
* Tests for {@link org.apache.poi.xssf.eventusermodel.ReadOnlySharedStringsTable}
|
||||||
*/
|
*/
|
||||||
@ -60,6 +59,21 @@ public final class TestReadOnlySharedStringsTable extends TestCase {
|
|||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public void testPhoneticRuns() throws Exception {
|
||||||
|
OPCPackage pkg = OPCPackage.open(_ssTests.openResourceAsStream("51519.xlsx"));
|
||||||
|
List<PackagePart> parts = pkg.getPartsByName(Pattern.compile("/xl/sharedStrings.xml"));
|
||||||
|
assertEquals(1, parts.size());
|
||||||
|
|
||||||
|
ReadOnlySharedStringsTable rtbl = new ReadOnlySharedStringsTable(parts.get(0));
|
||||||
|
List<String> strings = rtbl.getItems();
|
||||||
|
assertEquals(49, strings.size());
|
||||||
|
|
||||||
|
assertEquals("\u30B3\u30E1\u30F3\u30C8", rtbl.getEntryAt(0));
|
||||||
|
assertNull(rtbl.getPhoneticStringAt(0));
|
||||||
|
assertEquals("\u65E5\u672C\u30AA\u30E9\u30AF\u30EB", rtbl.getEntryAt(3));
|
||||||
|
assertEquals("\u30CB\u30DB\u30F3", rtbl.getPhoneticStringAt(3));
|
||||||
|
}
|
||||||
|
|
||||||
public void testEmptySSTOnPackageObtainedViaWorkbook() throws Exception {
|
public void testEmptySSTOnPackageObtainedViaWorkbook() throws Exception {
|
||||||
XSSFWorkbook wb = new XSSFWorkbook(_ssTests.openResourceAsStream("noSharedStringTable.xlsx"));
|
XSSFWorkbook wb = new XSSFWorkbook(_ssTests.openResourceAsStream("noSharedStringTable.xlsx"));
|
||||||
OPCPackage pkg = wb.getPackage();
|
OPCPackage pkg = wb.getPackage();
|
||||||
|
@ -22,7 +22,6 @@ import java.util.regex.Matcher;
|
|||||||
import java.util.regex.Pattern;
|
import java.util.regex.Pattern;
|
||||||
|
|
||||||
import junit.framework.TestCase;
|
import junit.framework.TestCase;
|
||||||
|
|
||||||
import org.apache.poi.POITextExtractor;
|
import org.apache.poi.POITextExtractor;
|
||||||
import org.apache.poi.hssf.HSSFTestDataSamples;
|
import org.apache.poi.hssf.HSSFTestDataSamples;
|
||||||
import org.apache.poi.hssf.extractor.ExcelExtractor;
|
import org.apache.poi.hssf.extractor.ExcelExtractor;
|
||||||
@ -226,4 +225,18 @@ public class TestXSSFExcelExtractor extends TestCase {
|
|||||||
extractor.close();
|
extractor.close();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public void testPhoneticRuns() throws Exception {
|
||||||
|
XSSFExcelExtractor extractor = getExtractor("51519.xlsx");
|
||||||
|
try {
|
||||||
|
String text = extractor.getText();
|
||||||
|
assertTrue(text.contains("\u8C4A\u7530"));
|
||||||
|
//this shows up only as a phonetic run and should not appear
|
||||||
|
//in the extracted text
|
||||||
|
assertFalse(text.contains("\u30CB\u30DB\u30F3"));
|
||||||
|
} finally {
|
||||||
|
extractor.close();
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
BIN
test-data/spreadsheet/51519.xlsx
Normal file
BIN
test-data/spreadsheet/51519.xlsx
Normal file
Binary file not shown.
Loading…
Reference in New Issue
Block a user