From 5bb119560641d9bcdf6a6c79dd485fef33014500 Mon Sep 17 00:00:00 2001 From: Tim Allison Date: Wed, 8 Mar 2017 13:41:07 +0000 Subject: [PATCH] 51519 -- allow users to ignore or include the (phonetic run) element in the ReadOnlySharedStringsTable used in the SAX/streaming xlsx reader. git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@1785965 13f79535-47bb-0310-9956-ffa450edef68 --- .../ReadOnlySharedStringsTable.java | 56 +++++++++++++++--- .../TestReadOnlySharedStringsTable.java | 26 ++++++-- .../extractor/TestXSSFExcelExtractor.java | 15 ++++- test-data/spreadsheet/51519.xlsx | Bin 0 -> 10210 bytes 4 files changed, 81 insertions(+), 16 deletions(-) create mode 100644 test-data/spreadsheet/51519.xlsx diff --git a/src/ooxml/java/org/apache/poi/xssf/eventusermodel/ReadOnlySharedStringsTable.java b/src/ooxml/java/org/apache/poi/xssf/eventusermodel/ReadOnlySharedStringsTable.java index 4de27401d..47865a86e 100644 --- a/src/ooxml/java/org/apache/poi/xssf/eventusermodel/ReadOnlySharedStringsTable.java +++ b/src/ooxml/java/org/apache/poi/xssf/eventusermodel/ReadOnlySharedStringsTable.java @@ -18,13 +18,14 @@ package org.apache.poi.xssf.eventusermodel; import static org.apache.poi.xssf.usermodel.XSSFRelation.NS_SPREADSHEETML; +import javax.xml.parsers.ParserConfigurationException; import java.io.IOException; import java.io.InputStream; import java.io.PushbackInputStream; import java.util.ArrayList; +import java.util.HashMap; import java.util.List; - -import javax.xml.parsers.ParserConfigurationException; +import java.util.Map; import org.apache.poi.openxml4j.opc.OPCPackage; import org.apache.poi.openxml4j.opc.PackagePart; @@ -95,6 +96,12 @@ public class ReadOnlySharedStringsTable extends DefaultHandler { */ private List strings; + /** + * Map of phonetic strings (if they exist) indexed + * with the integer matching the index in strings + */ + private Map phoneticStrings; + /** * @param pkg The {@link OPCPackage} to use as basis for the shared-strings table. * @throws IOException If reading the data from the package fails. @@ -177,6 +184,22 @@ public class ReadOnlySharedStringsTable extends DefaultHandler { return strings.get(idx); } + /** + * Return the phonetic string at a given index. + * Returns null if no phonetic string + * exists at that index. + * @param idx + * @return + */ + public String getPhoneticStringAt(int idx) { + //avoid an NPE. If the parser hasn't + //yet hit phoneticStrings could be null + if (phoneticStrings == null) { + return null; + } + return phoneticStrings.get(idx); + } + public List getItems() { return strings; } @@ -184,14 +207,16 @@ public class ReadOnlySharedStringsTable extends DefaultHandler { //// ContentHandler methods //// private StringBuffer characters; + private StringBuffer rphCharacters; private boolean tIsOpen; + private boolean inRPh; public void startElement(String uri, String localName, String name, Attributes attributes) throws SAXException { if (uri != null && ! uri.equals(NS_SPREADSHEETML)) { return; } - + if ("sst".equals(localName)) { String count = attributes.getValue("count"); if(count != null) this.count = Integer.parseInt(count); @@ -199,12 +224,15 @@ public class ReadOnlySharedStringsTable extends DefaultHandler { if(uniqueCount != null) this.uniqueCount = Integer.parseInt(uniqueCount); this.strings = new ArrayList(this.uniqueCount); - + this.phoneticStrings = new HashMap(); characters = new StringBuffer(); + rphCharacters = new StringBuffer(); } else if ("si".equals(localName)) { characters.setLength(0); } else if ("t".equals(localName)) { tIsOpen = true; + } else if ("rPh".equals(localName)) { + inRPh = true; } } @@ -213,11 +241,17 @@ public class ReadOnlySharedStringsTable extends DefaultHandler { if (uri != null && ! uri.equals(NS_SPREADSHEETML)) { return; } - + if ("si".equals(localName)) { strings.add(characters.toString()); + if (rphCharacters.length() > 0) { + phoneticStrings.put(strings.size()-1, rphCharacters.toString()); + rphCharacters.setLength(0); + } } else if ("t".equals(localName)) { - tIsOpen = false; + tIsOpen = false; + } else if ("rPh".equals(localName)) { + inRPh = false; } } @@ -226,8 +260,12 @@ public class ReadOnlySharedStringsTable extends DefaultHandler { */ public void characters(char[] ch, int start, int length) throws SAXException { - if (tIsOpen) - characters.append(ch, start, length); + if (tIsOpen) { + if (inRPh) { + rphCharacters.append(ch, start, length); + } else { + characters.append(ch, start, length); + } + } } - } diff --git a/src/ooxml/testcases/org/apache/poi/xssf/eventusermodel/TestReadOnlySharedStringsTable.java b/src/ooxml/testcases/org/apache/poi/xssf/eventusermodel/TestReadOnlySharedStringsTable.java index 98b51b131..060c5a80d 100644 --- a/src/ooxml/testcases/org/apache/poi/xssf/eventusermodel/TestReadOnlySharedStringsTable.java +++ b/src/ooxml/testcases/org/apache/poi/xssf/eventusermodel/TestReadOnlySharedStringsTable.java @@ -19,8 +19,11 @@ package org.apache.poi.xssf.eventusermodel; -import junit.framework.TestCase; +import java.io.IOException; +import java.util.List; +import java.util.regex.Pattern; +import junit.framework.TestCase; import org.apache.poi.POIDataSamples; import org.apache.poi.openxml4j.opc.OPCPackage; import org.apache.poi.openxml4j.opc.PackagePart; @@ -29,10 +32,6 @@ import org.apache.poi.xssf.usermodel.XSSFWorkbook; import org.openxmlformats.schemas.spreadsheetml.x2006.main.CTRst; import org.xml.sax.SAXException; -import java.io.IOException; -import java.util.List; -import java.util.regex.Pattern; - /** * Tests for {@link org.apache.poi.xssf.eventusermodel.XSSFReader} */ @@ -59,7 +58,22 @@ public final class TestReadOnlySharedStringsTable extends TestCase { } } - + + public void testPhoneticRuns() throws Exception { + OPCPackage pkg = OPCPackage.open(_ssTests.openResourceAsStream("51519.xlsx")); + List parts = pkg.getPartsByName(Pattern.compile("/xl/sharedStrings.xml")); + assertEquals(1, parts.size()); + + ReadOnlySharedStringsTable rtbl = new ReadOnlySharedStringsTable(parts.get(0)); + List strings = rtbl.getItems(); + assertEquals(49, strings.size()); + + assertEquals("\u30B3\u30E1\u30F3\u30C8", rtbl.getEntryAt(0)); + assertNull(rtbl.getPhoneticStringAt(0)); + assertEquals("\u65E5\u672C\u30AA\u30E9\u30AF\u30EB", rtbl.getEntryAt(3)); + assertEquals("\u30CB\u30DB\u30F3", rtbl.getPhoneticStringAt(3)); + } + public void testEmptySSTOnPackageObtainedViaWorkbook() throws Exception { XSSFWorkbook wb = new XSSFWorkbook(_ssTests.openResourceAsStream("noSharedStringTable.xlsx")); OPCPackage pkg = wb.getPackage(); diff --git a/src/ooxml/testcases/org/apache/poi/xssf/extractor/TestXSSFExcelExtractor.java b/src/ooxml/testcases/org/apache/poi/xssf/extractor/TestXSSFExcelExtractor.java index 89a4e4047..d82ac6205 100644 --- a/src/ooxml/testcases/org/apache/poi/xssf/extractor/TestXSSFExcelExtractor.java +++ b/src/ooxml/testcases/org/apache/poi/xssf/extractor/TestXSSFExcelExtractor.java @@ -22,7 +22,6 @@ import java.util.regex.Matcher; import java.util.regex.Pattern; import junit.framework.TestCase; - import org.apache.poi.POITextExtractor; import org.apache.poi.hssf.HSSFTestDataSamples; import org.apache.poi.hssf.extractor.ExcelExtractor; @@ -226,4 +225,18 @@ public class TestXSSFExcelExtractor extends TestCase { extractor.close(); } } + + public void testPhoneticRuns() throws Exception { + XSSFExcelExtractor extractor = getExtractor("51519.xlsx"); + try { + String text = extractor.getText(); + assertTrue(text.contains("\u8C4A\u7530")); + //this shows up only as a phonetic run and should not appear + //in the extracted text + assertFalse(text.contains("\u30CB\u30DB\u30F3")); + } finally { + extractor.close(); + } + + } } diff --git a/test-data/spreadsheet/51519.xlsx b/test-data/spreadsheet/51519.xlsx new file mode 100644 index 0000000000000000000000000000000000000000..ba56dc610e29dd3e189871e5f4e29866a4366bee GIT binary patch literal 10210 zcmeHtbx>Ssv;N>7BoJ(Hhu|(DXfntUG(kdex4|8PJHd5;;10o^;O-6+EJ$z-4&jpB z@4LI3-CK3*_vih6XKK!=sXG0fseYgCeoptRD2ITE4?qT>0ssIifbAne76=>w@DT|B zcnpBSX-V2xI~Z9z=%~6vjO?}9T&yf9vu5EKKLX$%?*Gs6f6D_yabSf`4xE4s&=;zc zCv?_f{y63fNYCX85P@xTu{}iwTI6-rbtW-m=!(Ir6G*92N<4=xSH^qW9$LwU2yiu3 zf}|!i3Gd?-%$kPgcLycihVjBXnDw9m(0#zXhm#kd=e%p1e-Z0uN%;6OB(l<9R(DfG z{aU47F$9#M*WVJwQh_K+98p&9z~>ubR>C`Hc`XI!`#S2WRPcu^YKm4~krM(>}8qKLoLew?4AZWF&$~9t|}%o5AfarE*f+yo+roZY}Q-|NKGxW#piDjKO8$ zEh+X>X}}o}VNsdkd6`bl&8FQPcpD!0JnD|PtXP;YhW}!B4*>wUzefNl{w4Nm)HrC) z9tb7>5Ly_35jY(?BTIXBwx4IptbzZ;-2a(a3y&Fn2s6x{Lr}Bi`pu3t2sJaeNgmWh zrAppLdq7bWkxeUhaqf@cD{BB7KmRuRTx;oI(@d2DsL!s@*~lnf!L9E;Ywl8-@f!Q1 zKDCJ%elJNRQ18v+lBJGIPF$+~>`g&sSQtrT+%1hh57{exjN?7K+-MG^0X@_4SmFYs zFL4{s0zZ>0vWiT%o1>z*qZEl=i(FOK@qley*f%}(6BWY6(jN;CI&DZAV-$|U3Q{Bn zW&8+Ep_91%H`E3Q>GTV4Uwyx?6|kk23mF5goQf=m+KcV(`mkk~7GAcDha}-ctLi)~+uymqgm%XW|4?V99wLDRfC}eg$^JXtoo(za^lfY`epZ@)Q6KRq^&js2 z|NCf;9X0=rG=439%2P^>~E#=&S5u~F4|8%Rbjz7+$*rJMY~NE1h)K2`U1zWcM8I`ix6|+S^Oz#zeIX@$lnQAY0~!g(}#3XmUpi62lglGR+4ra%5lJW_4>Bn zPYVYnNi>$2&%E<>)g%ZEVhi*_$Ba0Oxm!y)2fjjMKQLhGbzTU9F6tj+{1byTau{oL z9#~=fkS(zvsPVv{KXWHsS^Q@yYo<9y8FWsWveGO_QI&YVg+!&aP!+--q1GE|8C3n* zB&{G5_0f7|)6%h_am>b!I@1Z-j7+wQ5P5gp`;mw!w~UF4;mKiEx)FQUdM*-WH~5!t z=Fg@pdC!mZhI}Y}oS0Z!$_bU$Pw=L$lJtLDkRE=XixXq3tmy*pF zNO|G~oXcyq$w7zFrcW}G%C~A#e_V+;A+2^c?rolntI!Y&WUR&0yzpcSU$2kVOybKW z+S~q8wJnmFv(G2Yq|f zrC`!7>mYKo%#K$b2wKJ>dD(cqv673UDRKKPUG8(h^McnY}1V6~Z| zX=Az$o^!=;8cKI}u1ryJN$>=0Aozl)xkq*fE+fQ;9t(YMmmUcQ}h-1a?o7(T*pXEf2=oz;5dWOt3+ zHpNvoyX)v)15Jn1T}A6`|+YQcKqr4y)~|V>l&rfrc3y{GrDB0G=={lYrvRTAVEZQ3WK4qXYg0H$QQo)!vGt+ zw@DR}t#3&tnxnoWfmLBQ9PLrK7;fe1-uF9KD&KdmV%LI3BPo73+-{8@cT%vjnyj)# z`hD$4T=qKuHcOHHJl*T`C31WQdwH-sANZ|4%~sZbPP1MtpuGoDkeyee4Nm>)6vORN(GIL>L1xgIKL4) zW<+?GvFL0_=C7Y;iuR3EZyj(UU_!Pt3dMj_U0*T?qnqz;bE_`8&p1Ifk& zp6Wl3QbAP;KW||6JGAAO{4RV6*UPf#nTXfBm`~^{%mb0qL<~+(M%$NToKm=pFuir| zPEso9;X_^`SgWnW(o)fo-AiX|FNWN=cQpB#9E>nyn}O)ok-&bsgJ9u&P5wW!g? zS1)L3bp7O6u^p2g4?gkd;#0JLVJ$n!AUF54$CGGkcL*+eJt&XXzqh(xxL!=R3K^|T zG>aMNFW^Jov}s1|BQJx3NB6d4egrddV#Z}_GX5Dc-}mJ98HA8&athe0<1;83mK~Sz zAllFsw?TCo8Z>_XIzEoE;7(}BT(f}NzLd!(11n{i`CZ*t7=d{p}B zyxP87l?co|EOaYGI#q475liDMD7Gjk2x3GmS#^7kjK*YMX>6@#t%eRH zKlSvPo+azN#;vDB&{9)TW@X=X2%uh};% zTV(ZLGdStM9PK7_KAC60B$4;)b->|?($D``vT7qWU(!6thw+Cm(I4``-qgs*!JhrM z3)j!O6&~IGQ>zF5rqzYPy*y$nFf_B2mG}NQkWtP2keM!}ubeLK6AGzO^?SzO>Yl^A zi>B*Hy9j#>`)4^=WO25%c1jK{E?t(th`a}tc;q9WsCQ!1tMksjnfCe0MlAItc|`>) zQ1EH4e5rW8vN)XQYaZ!*f&7X@CMng$sdZ#mtpw)@rrlMLdXF4YPK}i&yLZshBpFF1D@gT$Vl!n{m7Ao5m6$GI` z^d3ij<*i98YB(JdSk)C)0EHk>t>SPXas(yOM%oT_hMVkrhIw%}r7j>sHSo|_n|8f_ z+@~Hs9|T5lGP zcu3U0GTk?;6A;_43a}Vn2SG&QQ*Jr&Wg{a=D@;$6JtjDg2P0glpMY06ZMoy1@-Bm$`gIZro}QhV>JA;4$S>CFhNnOfQ%0hCD}`G}Sp8-*(Jd7b8Uq9Ms;d7ax=Ar~ zRfz0SF7;}^Eu{=N{M{==>#pIcQG+%zOz93Pro$35ijmqM&lD#@cGrB?GibycIB$1@ z-fbC9LahpyC2@lNpA~w&G;fQ}Vj!blFoyM34PWb9wAdzizNxAlZTjwxgA7DX)JDC5 z?nBO-5rP?9=5c{`NX`P<-@db&unQUTifMIYS(qhGT8+H3V;(;1NZZ(B@>m^}%D3zN zLX3zd1S57L`(8n1v)mcE5{A9}g-|FLgUd>DnAE1Lss}ns^G$GUvoAbQ?JQ}{-!5gN8ok{9T#;TTh{A|x z%o)14W+^QwFHF+vf(Tu;;iqa*Z8E(S>N*3+e=ySjB)F`!>o3VZeAGd1k_t_Ti&K$x z0KHz#n=u6m!_6b0grc$9hH^z8he#s1vJZ333cByT+7Cih8su@BE153w(`N@3AfxVA zz5;d)lV}6e=5s&4$op99@Um#9yg`hFeLsB?(1N_~!dEJ`g;J&*hHO%58RoVct5NEP zWRmu}nc3@@ua~^IV}=+rT{#@Lvful@p)}J(W$Xf0n35Nbmge~a_DDQH(tfP$7%H;5 zE}E|b{W8ht+m^c#sh?j#JWLrKu#meR7@%KOVP3u|m{vtd9;_)7iEId0Xu~?Z5`mvu zRKucD+9Ma;ebaDWOmKuz2RTthv$-jJ{B9FExcYKZ2m8@yMa|$d@G(%zD)d@QUNQ@! zTW*Dg83vzzF72f`v|g@`fv*~t%bHF<^A#7d>x660bmoHq+ax=8A>EA7O0L%+B#bEqc5Lj1LtO!e4T1TJ$#T^pn}I^i$59T*n-#65mxo-D--E`(F)V3$puI~A|+tY1izgvItxGH_hY6?-~$!5xQr2rhR@WkHDvoC#AMA1JQ&B=haDYzG} z*CRY$X;ZEotXnrBPO<3Tlk!ICa=3MBkmVDO5Set_IEG=t4|_4w^c`n)AB4tBE_OT7 zESC(kn3c0ag>F`J*1V1Fr)uqO8hM36PK^2-{+VK zx@=3s&Q%_z^)^gXNG}eVe&>1*h}Qw41o8-iS@+KumQ?wT$Sd+&)u8v8E2>cg{i76G zpQQ`}H8aJhYFwTgCoyRs>qLyZ=W{*Cmu_YWj|9pa4I-K9 z%MQ%Oq>A@Y&BE3+Bt9*s_Q$r2zW+yI0{Q|zfjx*48bkm9>tAW!!PU~}xBOnMvRE|5 zfzwLp5ftVAsul_RrJ4J|6GVd5k`=33Fieb#so%lO6N%jud%3M{`~otuSKGvoBGqj6 zS>AZ_W|N_N%@9kNiK5ys#>8281b?b*d@TOmuix@1tM@6P&y1F%_ zaB}GxSZcVfm9acIENfZJ<|a@R(6&sUKAf4_$>rS|tUO~m`wdGepMB+~oI1NFWipe> z4;-toKKp{{6%U@!0&27Qg2T08?^TU0_+v8zYYPTRHB3Au*SSfbv|~N;0=tjmv<+c8 znBF~2r*T4?ZX~;#g?ub_VyG4eM92N689w#<7zXNsui1sEKu|kUNq&XXa(g zo~CFjLxvOQ?vsV@sXeC46O`{FdkKeFIW zl5|(WLvOSd^M915zqM7u6PIlkIBzr7u@vQl^5#$M2tIv(Ba$EpYDs50(iDG8;!{IB6CwgN= zB}+!RCEY@|Gz&+1r#}{Vs8he4Gy0=bN-UOjW-I3#;n!R#scHzvFev#|CW7OcE*F4V z=TpI!CeR|T(B3r-RU>J#1=PV!Z~eZz);^@fD>*uIBR`(3A^fAd-mg=EsncZuFa@J& zmlv66C7nqa$3n@_%ySG|vzBjSMlhQ!{02PXJ~5)LSgTx*nZOKPGfXNigO8HN4a_6d z1V#;yRGhHqudJNLM?XnV9j-qAynkU>d7O*pq@U3ew^Q8gf#1Dapu5(;rGV0Bgf?@; zUw~c!0wSxOn~Sn{E0DEA-~z>eOfYNQ?IZ}j+~wMSyE54zzj^PJ;-qjiFE~6bsx@Ae zFL8>sPpqM7Z=7CeS!KRX_g=!+(S5){>(d7QN#btzOZ5Oyqtv%yhsMy6sO;v)k7;|< z<)dAKq$9gz*k80wAc&Kryw~^Za3l0fesA?brg~}?mG*+SH<;riJp$2VKk*JaU6?_= z3USxeEQq?HeE2$>3bWt@5Xd&=Dp@PrR@<9&vHCQyob^+W=YH!f4s!K6peJqr!&3JX zBun>jm+LSkG&yoW^BZy9B140_ANiZ*vnJCrtLj2M!W1q@*u4K@@7aq-iZr6eF) zlA-l`he|Ebv~HaQh<#uA>=i(W?EhUHzV?xSFcY2(mmuEa6;23wq z)Lq*xPR;rV&jvrJ{c|rIzWR*KqwN4GEny;MU;JWPGhlHYME3-1mBUX`$KU_TkBdGY zR3UpZP~jG4;xbTaJrrX{`{`gK$dFxn$N$oIv}C=%L_)_Z*Y{=*R!4KIfCDiAepreB zB3HQ7x<%@AKWK1D_)qUhev+YV`VTqY@}YN)`!M}5v@uY$vw_&N8`#(x{nYW4S^pV& zJmf_{c+8&$(xSFszN3`T#seIs>1abk=QPfFB0$hSCz+;P=T+2_dN8|hY&8|Z(>e9Z zt?ghgc%i$ln8|&r7M+kvecFtr+s-;Hdd^!QGh?{zF4-Ihb!($%!x$1 zAG53?J<`5<%v4+=t|qC9ayYM!^sySg*rE=nD6djnJiqAN7O8>17jt1yMi3J2Zw>fJlK+*g|fy9c@@6Hgjj zU$n}Kf7$zrZdj?u)%BS!%$nZpiV~O_Q5y@5GlIC+C1tv-#J173$^~L@a=m^=@q2j|bk)Pq3& z!^@wH{QZE&ioaTW*eG!bx}l(6E}ZnmDFV_Uh6y~)Qr0@RgvuDKE<~FIYkED4mCD_& zbf1UzoTjIXeqh)!t~=AfK-0zyNK&wi7XMV?pr;aiwzGPxX$Yc^>*)mq%pru5-mS^A zDqB)UBMb>p>r$C#8MDzXkkleGJVkk3lvEpdnZ(ESI=%`*v48UfzF#9IdcZMh9tGs#-HYVo*p1-*hD(nygojpOW6C6D2Lqp`BcQE~X!Tx>y zhy6^7a(@N*>o$yk8~(g{JVeExHf8*3_}7{CpAAhPW@G>Jg!@;VUniS?AQe4q()wl2 z`K$4-qqske`OyDyF!w9Kul>6}0Jd@d`{{pclm72fB=G(b<+q;TuPDE^y#DwvsSi!C z2T%O10ro4xuWwU-Aovphj_}{_R==A5D#8CS4SpDN{%aZj?1