From e52778847c7bafc17a9673bbce7348f985ebd311 Mon Sep 17 00:00:00 2001 From: Maxim Valyanskiy Date: Wed, 4 Aug 2010 12:43:58 +0000 Subject: [PATCH] hwpf: append any character data before paragraphs to first paragraph (workaround for bug#48075) git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@982238 13f79535-47bb-0310-9956-ffa450edef68 --- .../apache/poi/hwpf/usermodel/Paragraph.java | 8 ++++++++ .../org/apache/poi/hwpf/usermodel/Range.java | 6 +++++- .../poi/hwpf/extractor/TestWordExtractor.java | 10 ++++++++++ test-data/document/MBD001D0B89.doc | Bin 0 -> 35840 bytes 4 files changed, 23 insertions(+), 1 deletion(-) create mode 100644 test-data/document/MBD001D0B89.doc diff --git a/src/scratchpad/src/org/apache/poi/hwpf/usermodel/Paragraph.java b/src/scratchpad/src/org/apache/poi/hwpf/usermodel/Paragraph.java index a1462ffe0..087c6c5c1 100644 --- a/src/scratchpad/src/org/apache/poi/hwpf/usermodel/Paragraph.java +++ b/src/scratchpad/src/org/apache/poi/hwpf/usermodel/Paragraph.java @@ -101,6 +101,14 @@ public class Paragraph extends Range implements Cloneable { _istd = papx.getIstd(); } + protected Paragraph(PAPX papx, Range parent, int start) + { + super(Math.max(parent._start, start), Math.min(parent._end, papx.getEnd()), parent); + _props = papx.getParagraphProperties(_doc.getStyleSheet()); + _papx = papx.getSprmBuf(); + _istd = papx.getIstd(); + } + public short getStyleIndex() { return _istd; diff --git a/src/scratchpad/src/org/apache/poi/hwpf/usermodel/Range.java b/src/scratchpad/src/org/apache/poi/hwpf/usermodel/Range.java index 7c9b541d7..df9cb0c46 100644 --- a/src/scratchpad/src/org/apache/poi/hwpf/usermodel/Range.java +++ b/src/scratchpad/src/org/apache/poi/hwpf/usermodel/Range.java @@ -830,7 +830,11 @@ public class Range { // TODO -instantiable superclass if (props.getIlfo() > 0) { pap = new ListEntry(papx, this, _doc.getListTables()); } else { - pap = new Paragraph(papx, this); + if (((index + _parStart)==0) && papx.getStart()>0) { + pap = new Paragraph(papx, this, 0); + } else { + pap = new Paragraph(papx, this); + } } return pap; diff --git a/src/scratchpad/testcases/org/apache/poi/hwpf/extractor/TestWordExtractor.java b/src/scratchpad/testcases/org/apache/poi/hwpf/extractor/TestWordExtractor.java index 22eaf0ade..ea69824da 100644 --- a/src/scratchpad/testcases/org/apache/poi/hwpf/extractor/TestWordExtractor.java +++ b/src/scratchpad/testcases/org/apache/poi/hwpf/extractor/TestWordExtractor.java @@ -298,4 +298,14 @@ public final class TestWordExtractor extends TestCase { assertTrue(text.contains("\u0425\u0425\u0425\u0425\u0425")); assertTrue(text.contains("\u0423\u0423\u0423\u0423\u0423")); } + + public void testFirstParagraphFix() throws Exception { + extractor = new WordExtractor( + POIDataSamples.getDocumentInstance().openResourceAsStream("MBD001D0B89.doc") + ); + + String text = extractor.getText(); + + assertTrue(text.startsWith("\u041f\u0440\u0438\u043b\u043e\u0436\u0435\u043d\u0438\u0435")); + } } diff --git a/test-data/document/MBD001D0B89.doc b/test-data/document/MBD001D0B89.doc new file mode 100644 index 0000000000000000000000000000000000000000..386b5305be0e58421d03606390392dec480e8c0b GIT binary patch literal 35840 zcmeHQ2V7Lg)}LMO(#wi~G<8KpZ1g5b7euhdny67hnu?%+EjCm%s8OuZSh2U*5?kzD zv1=4HYQ&Be?3%pD_doY?VSxqF_}=%-WB8q&xl_)ZIp@sGnYlBUGpDO$FWP8+gOzYz zOpU$E*JA2Q@e+i6#b-HVfe2IFyZrooif#%(u+n`-9C);BA5%*);~2{`+(SZS43dS@ zWX#2aF?|-%JEHf)qYsbrvBiGOI1^UC31j0I3(#DvSaPX{{CtDLxU#}xhOO);L&jot zglEwgQ&ss@bQv>Jl~6|9c;L21+FJOLN_o!fGj^BM`fob(8jMLl0>^gn{|)N5MERvC zPi?RD5qLgzk-ebj3XsmN$(Ttw$Tz+xy_}-|mZUn+i-{1#2FP{&#|Y#-R2@NkP2ZD; zlCLA5CwwynW7-r^;A$%by>*B0AV$KE!Lyyx^GrPV#&bUW57Bca6e@#Np%^QfN;xn${5rvRU>=OhE0`nyV@x@F zJlEtDc&dtDIcJH#27I}iz$Uc8m8UR_(?Cgoo_|gSH2NZka$CbtQgwSY82y1f@V*>`U%>5rS z92zI+oCdQ7=He2ez9&e;fop`)%DPM1Ofn{&cSViLQWm92y{br?1Z^ie6u{&TEpSC@ z$(HzV)OLUHmwG1gSIF;#+}=tlL>u+90N@IM{!kyL-bL;ADi|542a7$AXcbCOtkAog zpcVxaN;vajk`VP;Y8TZQz?Y}KMC~QLCfPf1q`jmKq{+Uhk7#h^Yi-Om#E3(6&@WL* z`a|Oig{=$5AsVCTkH!t+u%ckxqPU_o(lSTBY-3)2wNSSY>MhB$q7kViuY~;6%94^0 z%aA*&Eoy_}*!RIMN$0zCLjj6GlQoPn?nFz=af2MEXGTQ63|U_&}OQGYa)g zk}YY96XfZ|wL+^L_&G(;DqkL>KjMPHd79s-W&S+J2n*?H6~uTKXui-!qE^sFl6Y}` z2rY8r=PT+J0uFIR5_==*sG#YB=6Jl9PgLUH%_o5-k`Ijw#rvc$_~VaJj&zJ9C(hZV zPp!cxK_i9vx*E^Jl3G$!M&P1r!K_bsMcD`%M|w(QB+bXfpCSu_#wn8AhguNCYZ@g4 zP4t7_2_qAYfz$$$b=jktvX+s=l}DP!{OCgclX{XJdb!4DUo}KfWkf(fBLc$uz$B@#7Hb zJdHh~rBHJARh~CU64VPx9v@d;pv{prR!Q66$4*)<{MCf6&>Dey5NSx!*eF`SKF}i? z^+gMzqz;MsRLv>o;1{h;yul69SPf>(&;BB($zJv;7~h4lTX}@7)bhVScaa@Ynvcq} zsbIa)m@Qg$BxwhJ?xdO18*_Ne}O+DEof)z&Gpq3ZS&9ov`u;!2` z&4X9=e`57T^5WE&F3vR2_V9*|6hwK`d2#2Ktmsz! zI)T=;0)DcrX*WgL))ZDWtzpS}@;$g{eMsXi;ir+cXupAGQ)N3xRUV>2;2FvEV|E-q zT`lGBIaI73(vphpI@n?N=pW_qw#gh|3Gf572Yd;L07L?!0nY%h0H#$Ks}HaPi~ytn zMgvj-ivi04>UeASuQxC99_8G)bo#*djVou29}v^E6}>7h#~GQ#K5jRYxoVCI43)F4 znxn$uF>~YCA3Kg_D$_o^gpF}*M&F#`*jk6Gu`d9qu^T+TRwFUrz#+R>V5kvd@w%n3 zG{qLssj9TxB=)rZQz3^b;bn*^EK?X$B2P8r*hO(?L7PygrN9D9Rz+3JIok!Gt1yqQ zSeD|Mx=HMWMjLs*tmnmDRu!fjk-W44ae#h+rGVXl-vNrMj9CG?0=fZi1MUJWjTx%} zNCpf9JiB-K)ZWc2=1ojVh-??)-jD>Xnv06!yp=PNh4111e{ng|+d?h*v>N;2I{v{W zp9ybI!p+dWncEkB(P5>7`?(JWX9^AWg^TyT77tWpW9?^g1$rxpg zT%H$<&@{@@6Dy46pFSoQWmb&9m@1b2SgrfWa{6@Ihvt59uBUmP=Jx=AILE(MmR+%j zT;-H|$`7ZN9}buK&`CL^0^xYZ`tXN!OsplB(B`75QK8~rmNHk`|NHwz@m~=+| zNOgBbHa9|zL$WfYaN2TurUP1HyjB*+q)cjxL9O zOT(?DF_FA^pefpLOCfJf%=z(vodBBq-wK=%ejoqc2j%jW;eS5|a3rIIHpN7ev4Rtw zg(}9>MH{aoCzyDe?@|vuhM>wGcvhmBmB!Sm2X4{+R{o_)y1Ba|LuF6AFQtA%PV9*( zoUNJ=^S-z&ol;Bo|DrfmBLQ(uq_~)GkyU1r zrI17=OG5Rslgb=PniSPrL~DgZ60+|D0HTdQ3}LeKi}n3H;45_F3quHuQGWPF`Qg7f zTzZ^U?h`Sj;BGi5SW>|?W{`Fl&Mzs@YT~`9z2vK&U-;RDj{KE3rw}U2^2z2DLdID6 zZDlfXO=p7HkJfdwmE|@~`JtHHl@tD9;qp1gYsVidwWGPR_3?RV2U8Q*@&S-Nt>-J{ zUjg~=h7DJGbl9u**^LfNP29twJpfwoSIWNv@*mBqlK;}rZ+2j6;vP@1J!3Zkw9iva z2hx>?_hJwK&iso72)Ovi~I5 zYml%WGGQfB6b~z`LQ>WhZ9wuZ_{B@%62Y?Yk+_>*z4&99s{`EG5v@)N6+lsl6Zdns zL+-SvTdeO_+JB5;FGX2r`2O#pQM&*08&I8K0w}2rEqqI=Ooj@T$>2(>jH>14Csz=r zChp^Jg6ze8;Ieg~m_jswHkILcF8zvqveH&kvkxC7E82fF#s2e&*HES8Z=kYeOFl#U zpQC;MGXQb#pZ4{Ot^JEV6qEnhGKBxRM(fH8c3CIzyR5XkD(s^2+B^%kKxu8h{`tvV zS(_PW%XFt4lvdktNNNhGx#3$~#fHQFwKhzUS#5Ba)}9h8v7QpoIl`yTRq6g|9O%YX zXNI`NuOl&_mj#usk^_|-sN}#W<3IpjPd?3;vH*tZwZLiM{-q<%Z3<@&8)GvC`>D7? z9~f%Nx`#-(aQJ;g%(x^#G9VP=lLUD}m2l`Vx?4hbdxSei!kKeN;L%53y!cYLmiXC_ z^{xxNwm4|S;iMPGS^y8>YVlrNFo^LH3@PIgNvaA*x`fkUeM7WBoEFEr;wveHt?PTS zIRl#kaswOxMpwf8Y^2PRum;rE3Jf*jkC_qTezACuzcLwN8Bi-f*2GBXkYRKilpUoFTjw)kS)YTE^#j`*^ zNL3QAguzS&ujzcUCx2#HI1ek_mkQudQPUj-eCq+nE7Yvj^bSLj+@%w+ZRBpvJglLX zYM_Y@@zLSJCagM|XDDf+#6w+XRSdguI-2fz66IC+(t3Sc!k~_tYnh4JL#c7&%DN2;cVT z0M7N7cq-x1!R=KK{tUYx^u!-yo)>@C8z+YtnjBDHO1(XwMYDFGVYDP#sqDwKnZ_A8 zqzv9xNj)U(0pNEqz+)Un?Z`TU#;KByO7)l(oN+ICBU9+P&gIju+b=eSrC?jt0$xNX@N!1a@)trCszlLxh0lUWpKlz}j+ zO3`3}y`Yf#$bLzR63&|F5k-0}hAKWHk_;`GVWb?pYDP;l5GE}UTSJ;9P7}!()uonv z$HXUS(1GbgN!Vn;E&}fx@=177L;McBD1NJtJUEa84}+BOMJ7D*CH5lU6y%!bW1o0Io{{@~@VK4A}7{s}-6I_zY4(NduG=&Q_{JZPZzUEf7U%79xt$1fnQS zLL^~1c~nX|O{}#D4Yl7O)m5iTYKwfH09PH9bJgjAXF4w;5WuFCRZ^y3SfflIC1v^o z>1qWfxiU*$kQbJ`AVKY zw}L+iep}3)eOWk*M?P2JkusS~B9&;UOJ(8`SxGT5xWKi;T-%pLpky4LobCA9b<{a2 zxiV|_XNjmkoG;_`!DVo}i+u%Z8S}MERYPlav^lASuaDTIiXECrx1V0Jc`~~@NGE}Ip zCgpOaEaM>&Rp3O515qRKRp66?wZwBWXu!<`#Y!e$b$_c$uj>#nw#c7HO z!gNAFZr^tQ&O+$Jev1@Z^^O(C&!*J7MWD%36?|B|AD)}){7*V@M5}0>ADW-=iaf1o z9sdjYoUoCO>5A=z-E#(z4ag%K2m_5Apa9Ui)DO@G&;!s9kOG(jD61UdwS~H0!pM~a zP$2Ke;Q3!K=KME4m!c*nf7%D&H?ZU)Y&q_~MH47a{06hM2IPqVppsxYZmkjB<7Bu(L1+^Afg+~A)0Rx!~dy7$6@kL<; zsLEk`EYE{aP|qW4ObSyK6Fk4SfQg^?>DiCfcuH$K;Y;=TEF~p{;VPpX`)ks?X-JSW z%{(%5`9dHDFKhA~@NB@iJX=0qOXcEF%Cz}%3}vvt&*z5;fqge2UP2ASnxGxi$)nSF zdr*O#9j9veO$bfKsnOcVS54S0tCB|rC^?_sqtL*%ay%C4@mO{b;R2_Osw zHL!E1QIKCaoPeUmoDSMA2P^_C2b=}qJR*AzSc}R8Eat$X4EfjwD{{0WNcqT#v=O&N zC(tUeS1K|Y8>2;G!e$8Wm~qdd+jq5qPhm4a$*_QvmQiGatOx7`>;s$wOhxs(0nY){ zvUC8o=rH)9g@wKhWZ46RR>{z+@_-CM6y#NgHgqKb4c6|94DaJQ(`hFMZmw_5W}Jf{;$907uaIPkkom>OpUIZ(@MvC$KDR{aAZOGvdU z{4RkB@ySu~$=%xz=@*sswcX&paX(Hy-ur-qJm~eFte;dZ(wlXk}g>^k`d~?V62}Ys(`}{V=q{y7XHa`li#Lx4++^TkP1#;kQ<ihFy(A~)&`@Vh5Q#U z(Qj({X|PE5+uX6c#+)7g#km8fbq3lmn-XZhWH`HFJ2KYF;)=z?E3J>HYwyrZU7GuP zmG`@K6W`vLaZh0~XSMo)n}NQ^V{Z)f-ZOG^=Gdhx>S%Tu+;7~Hp;tC{-`VEuk|rk| zEtZ9BahPdnkaoHC&bI3UBF|>Z&xHnEafz@VxngSn?iXv$USO8(*5cQfCT%ae_gI&{ zGBf|@0}dw^wB9#zeZxCJT)g{?7QJpqELv%QtKF_CE2CDQuW@GcwzpY-scdMUA>W&f zK?5fM=uJg(Y*gPUd!^r&;ql=yQHi7+x+hv{A5h5sZVX>EXv$^xo;z%VB7Qo)ZpSMY zIM3YwNXEH^S1#VUHOBqab)yS&!}PvtI8Q@wLexpSw5@r=8m_Qd`CEwPp@wlcju_v) zKiD*L)1<1`n;%X3VtDNfQ^u~{d`*$Hxxs~6xec#%`^CDq_rzwMa(;bX?`F`$1~%GK z57oqH>@+>R8KcY?yk{{{MZ6=C{phRH@fJa*dul(`U$SfK#;RVEmUwh{bV+YhuO*in zJ?Ou)Ms6$T9Y4=8tNU_oXnKgEslVk)uB(Sa(RSjtO}W`Vff;67v z>7V&wq3$~#>x9*qrhR1i ziNP%%{GJw)`|QDv+#j=}jHjh3GScrjtsAg?1QR@=?Jd;LbIf@UACF`+*fcjWqaBh7uyYGzoBk-Qwgb;sc+T<4 zOxrzTi?5HVZ*|`qHJj|vxV~`zy(8nc>}i)}lxRJ-MaI`B9@I#C@_d)wFKe0{+|WL6 z@spV+7P@cU?)2@fc2>(LFFJX4^U3a^vkYGhx;v>wl@~Sv#~+72KKDoJ9*KkN#6Ful zpX@kf|7o@%cg&rRvznX^9lGl3(ZP59UOjA>8@;OY>TPc<^T(Ra?=vZQN$0Qu8(Zid z9kKRA_N>r}GdB;>Nboc1K7U^)^FAlwTddkBo5(NEny&gHK>gCUO%rae ztZSPked@ID7x#ggvz)^^yG`4jF@E*X+`6U{qL1a?nV#F-XGp64>zXy6<7e9Un>u+X!S@-F{VLuPiarJ2M(>L=UZ4X(g zeL{OiuHoHnPqk~*w=~}P=w{lT>Lc{lYzWCb74l|C*Rc9&$$qBI*7{8SDedk!HZ;k9 z&xA!|jK>Y1KFO+DJ)>*&ZVu5)TF`9W^ZC(fOO|S0S=4{_k==C%DS{k-ySZ9(cjh(i zv!~@6mNhM&beZ9C!{--^t4l{MIp^a4V(y|d8&k6$CugS}9(i%H+1L?rN!kDU&CzA= zz3x5@kN)Vsxyt%|3);8WvS{{OZvSmdT5in@TDbeI_n_(qw;D-%`RsVQv+pknUDmx` znDEPy3(tF5Jl?c5;L3%hG1aG?JZd@rn}`~14tw@Kc+|DmxM;0EgZJo7oRi+T&9uz% z^|fyF`R!iyu8Iwd{70`bSgGw*t5uI|yPU!L>vu{&LBj2jr0KB>d- zj17PJM{LggbKnqv`qmDveVom`zdqy8enaa6dzQ=I zoYiY&rkl3@*}hY2W3OL6vT5eCMOKTezIxpzUGI?FxV(q$WkdgHzrVxS`V4xoYVMe^yCT9)k zqz)Kh(DnS#9p46o4EVL>Ikz7Dnms=~yj6ph)pBap?Vixe;D<-Mwy3|mpLxW@+4Rn< zaOX3T?-s87a>JU=-vlmrylY6{i;4GVcgS2VO}(>abmz{m*00=ewzcz>jnNGktdBnC zZxqvL+@Var8GHXSIG5ief64xsSF?0?EgJaN?L|kkjlWv^RMwpNHowOtzl!uoI@Eac;032zc|Usd+q&ux zd#61*YdCh9|G*87qa62)zB6*!&0SvdoB`W%YdY0w>9jAQmGnT)Wyb;RP(mH^a~Jkt z8Y^v%=$)Oia0^l_Fo)MIG0_R82|r^i&D8~_N=xni4o0c3M-S3WNceTTe5%d0v@>hZ ztEJv{)pPT;{^HSNy>$&W?Y<4){X?%Ev*u@3?dkNpW9({kP0#x?`_I|eciW@Mer-)! zjlb*_xuM7C$qDX{jbo;U*|xN?Zfk4jI6G0lZs_!RXC}0No3hShNqWcIR!f%qAKh^# zc=@Mxp>lNPzEh5iK5uFehB1DKG zD|??ZuPqQ;D_tcADmhTefl3Zka-fm}l^m$#KqUt%IZ(-gN)A+VpppasY!1*}hXnq1 zU=R)$B;%CDkPm9G36m9m83Ow>6>|Q_l|mNOR5{G&}#_6MeYKpkQQV|?Ak?Eejkh3X&pCP;= z#Z>0eM*U*l6>UF0@j_=%cqhZb6yiGFcOls~2lu0SnNUnJZfqnJ96Ay5m;ALYsGp9$ zRJ4A=ai&07WThnkVf1)>MEmKOL5qqiD-m0cBi;K@{+EO9gOmPl{qN9YaJi{ey~LjE zu?wHQZ5=CLGgXr{sQdF}Iu11-35M_~eKvqf;@^%i>I0Vm)DLd}NFSa8Xcw0bbKqnS zQ$r!*eKq)WmZ%PV+Sj#*Pp22W;Nun}YX%>;7+E{`bPT9Fe7Xi11E1730Y1?%2tFMU znF*ihxW&-{D>^cBEbq?koM%sNWKUczl#m$7=RzVmWnHXDxIFnwFPywE&(1L`srgibe9|n!PL94MK{<#mq)8{U@A0zcGZJa6269l~a+_)7QB;6Qour(sZ){Ao<4 z{!BfS`Z0~g)Q{;fWks%-Q?Df*QoyeTumaQuSOe++Yyfouwt#wo`hW(2hJZ!@I{=wa w4gfkxNPXNH-~w<3xB&=1oeORZ@C0}P$mFB*m_7hX_bc!N3Vezq_+N7W2P_{Jn*aa+ literal 0 HcmV?d00001