Support for escaped unicode characters in Shared String Table, see bug #49653

git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@979952 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Yegor Kozlov 2010-07-28 05:48:27 +00:00
parent 0cc825e8dc
commit f03598469a
3 changed files with 57 additions and 5 deletions

View File

@ -34,10 +34,11 @@
<changes>
<release version="3.7-beta2" date="2010-??-??">
<action dev="POI-DEVELOPERS" type="add">49579 - prevent ArrayIndexOutOfBoundsException in UnknownEscherRecord</action>
<action dev="POI-DEVELOPERS" type="add">49593 - preserve leading and trailing white spaces in XWPFRun</action>
<action dev="POI-DEVELOPERS" type="fix">49653 - Support for escaped unicode characters in Shared String Table</action>
<action dev="POI-DEVELOPERS" type="fix">49579 - prevent ArrayIndexOutOfBoundsException in UnknownEscherRecord</action>
<action dev="POI-DEVELOPERS" type="fix">49593 - preserve leading and trailing white spaces in XWPFRun</action>
<action dev="POI-DEVELOPERS" type="add">49455 - Insert the content of fldSimple fields into the XWPFWordTextExtractor output</action>
<action dev="POI-DEVELOPERS" type="add">49640 - Fixed parsing formulas containing defined names beginning with an underscore</action>
<action dev="POI-DEVELOPERS" type="fix">49640 - Fixed parsing formulas containing defined names beginning with an underscore</action>
<action dev="POI-DEVELOPERS" type="add">49538 - Added implementation for POISSON()</action>
<action dev="POI-DEVELOPERS" type="add">49524 - Support for setting cell text to be vertically rotated, via style.setRotation(0xff)</action>
<action dev="POI-DEVELOPERS" type="fix">49609 - Case insensitive matching of OOXML part names</action>

View File

@ -18,6 +18,8 @@
package org.apache.poi.xssf.usermodel;
import java.util.ArrayList;
import java.util.regex.Pattern;
import java.util.regex.Matcher;
import javax.xml.namespace.QName;
@ -75,6 +77,8 @@ import org.openxmlformats.schemas.spreadsheetml.x2006.main.STXstring;
* @author Yegor Kozlov
*/
public class XSSFRichTextString implements RichTextString {
private static final Pattern utfPtrn = Pattern.compile("_x([0-9A-F]{4})_");
private CTRst st;
private StylesTable styles;
@ -337,13 +341,13 @@ public class XSSFRichTextString implements RichTextString {
*/
public String getString() {
// NOTE(review): each pair of consecutive return lines below looks like a removed line
// followed by its replacement from the commit diff; the second of each pair passes the
// text through utfDecode before returning it.
// No run entries (sizeOfRArray() == 0): the text comes from st.getT() alone.
if(st.sizeOfRArray() == 0) {
return st.getT();
return utfDecode(st.getT());
}
// Otherwise concatenate getT() of every element of st.getRList(), in list order.
StringBuffer buf = new StringBuffer();
for(CTRElt r : st.getRList()){
buf.append(r.getT());
}
return buf.toString();
return utfDecode(buf.toString());
}
/**
@ -490,4 +494,39 @@ public class XSSFRichTextString implements RichTextString {
c.dispose();
}
}
/**
 * Decodes the <code>_xHHHH_</code> escape sequences contained in a string.
 * <p>
 * For all characters which cannot be represented in XML as defined by the XML 1.0 specification,
 * the characters are escaped using the Unicode numerical character representation escape character
 * format _xHHHH_, where H represents a hexadecimal character in the character's value.
 * </p>
 * <p>
 * Example: The Unicode character 0D is invalid in an XML 1.0 document,
 * so it shall be escaped as <code>_x000D_</code>.
 * </p>
 * See section 3.18.9 in the OOXML spec.
 *
 * @param value the string to decode, may be <code>null</code>
 * @return the decoded string, or <code>null</code> if <code>value</code> is <code>null</code>
 */
static String utfDecode(String value){
    if(value == null) return null;

    // the buffer never leaves this method, so the unsynchronized StringBuilder is sufficient
    StringBuilder buf = new StringBuilder(value.length());
    Matcher m = utfPtrn.matcher(value);
    int idx = 0;
    while(m.find()) {
        // copy the literal text between the previous escape (or the start) and this one
        buf.append(value, idx, m.start());
        // group(1) holds the hex digits captured by utfPtrn; parse them as a base-16 code unit
        buf.append((char)Integer.parseInt(m.group(1), 16));
        idx = m.end();
    }
    // copy whatever follows the last escape (the whole string when nothing matched)
    buf.append(value, idx, value.length());
    return buf.toString();
}
}

View File

@ -130,4 +130,16 @@ public final class TestXSSFRichTextString extends TestCase {
assertEquals("<xml-fragment xml:space=\"preserve\"> Apache</xml-fragment>", xs.xmlText());
}
/**
 * Verifies that the escaped unicode representation _xHHHH_ is decoded
 * when the text of a rich string is read back.
 */
public void testUtfDecode() {
    CTRst ctRst = CTRst.Factory.newInstance();
    ctRst.setT("abc_x000D_2ef_x000D_");

    // every _x000D_ escape must come back as a carriage return
    String decoded = new XSSFRichTextString(ctRst).getString();
    assertEquals("abc\r2ef\r", decoded);
}
}