From f9961331ff0417ce2a27c36ed1092eec3958598c Mon Sep 17 00:00:00 2001 From: Dominik Stadler Date: Sun, 22 Mar 2015 13:33:43 +0000 Subject: [PATCH] Bug 47304: use fixed encoding when extracting text in WordDocument git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@1668367 13f79535-47bb-0310-9956-ffa450edef68 --- .../apache/poi/stress/HWPFFileHandler.java | 48 +++++++++++++++++- .../poi/hdf/extractor/WordDocument.java | 2 +- .../poi/hdf/extractor/TestWordDocument.java | 36 +++++++++++++ test-data/document/47304.doc | Bin 0 -> 22016 bytes 4 files changed, 84 insertions(+), 2 deletions(-) create mode 100644 test-data/document/47304.doc diff --git a/src/integrationtest/org/apache/poi/stress/HWPFFileHandler.java b/src/integrationtest/org/apache/poi/stress/HWPFFileHandler.java index 1b6d4646c..5f24337fb 100644 --- a/src/integrationtest/org/apache/poi/stress/HWPFFileHandler.java +++ b/src/integrationtest/org/apache/poi/stress/HWPFFileHandler.java @@ -18,12 +18,21 @@ package org.apache.poi.stress; import static org.junit.Assert.assertNotNull; +import java.io.ByteArrayInputStream; +import java.io.ByteArrayOutputStream; +import java.io.File; import java.io.FileInputStream; +import java.io.IOException; import java.io.InputStream; +import java.io.PrintWriter; +import java.io.StringWriter; +import org.apache.poi.hdf.extractor.WordDocument; import org.apache.poi.hwpf.HWPFDocument; +import org.apache.poi.hwpf.extractor.WordExtractor; import org.junit.Test; +@SuppressWarnings("deprecation") public class HWPFFileHandler extends POIFSFileHandler { @Override public void handleFile(InputStream stream) throws Exception { @@ -33,16 +42,53 @@ public class HWPFFileHandler extends POIFSFileHandler { assertNotNull(doc.getEndnotes()); handlePOIDocument(doc); + + // fails for many documents, but is deprecated anyway... + // handleWordDocument(doc); } + protected void handleWordDocument(HWPFDocument doc) throws IOException { + ByteArrayOutputStream outStream = new ByteArrayOutputStream(); + doc.write(outStream); + + WordDocument wordDoc = new WordDocument(new ByteArrayInputStream(outStream.toByteArray())); + + StringWriter docTextWriter = new StringWriter(); + PrintWriter out = new PrintWriter(docTextWriter); + try { + wordDoc.writeAllText(out); + } finally { + out.close(); + } + docTextWriter.close(); + } + + + // a test-case to test this locally without executing the full TestAllFiles @Test public void test() throws Exception { - InputStream stream = new FileInputStream("test-data/document/HeaderFooterUnicode.doc"); + File file = new File("test-data/document/47304.doc"); + + InputStream stream = new FileInputStream(file); try { handleFile(stream); } finally { stream.close(); } + + handleExtracting(file); + + stream = new FileInputStream(file); + try { + WordExtractor extractor = new WordExtractor(stream); + try { + assertNotNull(extractor.getText()); + } finally { + extractor.close(); + } + } finally { + stream.close(); + } } } \ No newline at end of file diff --git a/src/scratchpad/src/org/apache/poi/hdf/extractor/WordDocument.java b/src/scratchpad/src/org/apache/poi/hdf/extractor/WordDocument.java index 929de311b..ff5330032 100644 --- a/src/scratchpad/src/org/apache/poi/hdf/extractor/WordDocument.java +++ b/src/scratchpad/src/org/apache/poi/hdf/extractor/WordDocument.java @@ -177,7 +177,7 @@ public final class WordDocument { } else { - String sText = new String(_header, start, end-start); + String sText = new String(_header, start, end-start, "windows-1252"); out.write(sText); } } diff --git a/src/scratchpad/testcases/org/apache/poi/hdf/extractor/TestWordDocument.java b/src/scratchpad/testcases/org/apache/poi/hdf/extractor/TestWordDocument.java index 1cf29f437..f0941674f 100644 --- a/src/scratchpad/testcases/org/apache/poi/hdf/extractor/TestWordDocument.java +++ b/src/scratchpad/testcases/org/apache/poi/hdf/extractor/TestWordDocument.java @@ -17,6 +17,15 @@ package org.apache.poi.hdf.extractor; +import static org.junit.Assert.*; + +import java.io.IOException; +import java.io.PrintWriter; +import java.io.StringWriter; + +import org.apache.poi.hwpf.HWPFDocument; +import org.apache.poi.hwpf.HWPFTestDataSamples; +import org.apache.poi.hwpf.extractor.WordExtractor; import org.junit.Test; @@ -31,4 +40,31 @@ public class TestWordDocument { //WordDocument.main(new String[] {"test-data/document/Word6.doc", "/tmp/test.doc"}); WordDocument.main(new String[] {"test-data/document/53446.doc", "/tmp/test.doc"}); } + + @SuppressWarnings("deprecation") + @Test + public void test47304() throws IOException { + HWPFDocument doc = HWPFTestDataSamples.openSampleFile("47304.doc"); + assertNotNull(doc); + + WordExtractor extractor = new WordExtractor(doc); + String text = extractor.getText(); + //System.out.println(text); + assertTrue("Had: " + text, text.contains("Just a \u201Ctest\u201D")); + extractor.close(); + + WordDocument wordDoc = new WordDocument("test-data/document/47304.doc"); + + StringWriter docTextWriter = new StringWriter(); + PrintWriter out = new PrintWriter(docTextWriter); + try { + wordDoc.writeAllText(out); + } finally { + out.close(); + } + docTextWriter.close(); + + //System.out.println(docTextWriter.toString()); + assertTrue("Had: " + docTextWriter.toString(), docTextWriter.toString().contains("Just a \u201Ctest\u201D")); + } } diff --git a/test-data/document/47304.doc b/test-data/document/47304.doc new file mode 100644 index 0000000000000000000000000000000000000000..d59d8d7ee1e8768039ca509bc1132f86ba260f1e GIT binary patch literal 22016 zcmeHP2|Sfq8$b76%OxUPmX_;MX_9Lx3@KU4WKBg2>EdR0Z|+5EipVrA2Bj1xOp1(T zl$Jr!GNmYOBzsMh8dJ1Mi~Bw2@|H9v^Zn+VZM<=JoLTe9RsPfT%G1jV#cfBv)lXM)W4E)pV2NOw3wmFc~ZdoTCH1fMVSmN{6Q zC~#B|$rJ5OW6GBAjsF%^AgTk6=_`NlH>ebpdjk2G66He`5qbz|eqX(h(OZ2=2U2B_ zLk2mjKyer<^-@CU8>lF%g3zqKdVL}Ua)=9I2SP8wCn&-1PobMp6SNmoqCKzs+E2(M zC~@C}Je;nF1OplaO+-Y!f%G=WlZ5<%at5ZcefkGNPJiA|;>7)ZOdcVhXa_-uqT|=f z5b_4*apKmuH{7Yf6Qdvy5QHy`}!R zD`6ijf7Bl*_D`2YjE{lz_cnx>A29WX%0#~sa{A|E-oSjK+&_)txp^Z@pi^P`H-S+v zj-*I3NE%}Jts)G6Z41l^IGDBIzS^KLsSzmt^lEt1!u_00I7sC*r5@Ie~<;f5|Kf2;O;BHnvD#5KKKv& zhcG3eoq*jRfgx-VeGFkRYCeX+WK4&;gXR4%_&IRDgk0?967j+4u_2ThSl1VxJR873 z{=YF@0K;vl_pJ|OO%?2226d_dEr1RH-p-H(&;YssbAS_I8Gr+b0we$itB2)k!8p4K zebNS?LD~OC&cD5$G1zV);01sR_EZMA0M-I%&`pkWD98yg7vNz-5(%=A5D5lw0XzT{ zLFUkHoqRG-DcCF#ELC6++5xdq2ns?>6bNNN98yM>bCd;Am_V_l{!*YozhH%C(jN0O zkrk49>)j|piiMVg#88mQgZ4Q=JOtn)xN?v$@MeMDV93S68Tp|n{6j#cN(tdoXg<{N z1PLrC{eCM)p^?vFa89WaLnKK`U~fbeI84kYqhTOK3kA@g^2_l~`rDR?3*^v{9q8i< zbdv@9lQ5=jDIyAB4LmoiBipZ^E5pQ%Mu9o-R^<{)b})|>%=4HZ!C`UTbyfxi<|SVA zKdmck-4)vtQ+?!S$oE=%Z(jN)-hJ%mrxdY>z|FTb)a@-Eca1h3o+M{6vXPp#>v&hg z+~ZC0F}Ij@m*(wtjmT8o`Ydiw{?+u<U|#^nWT7X{3&Vzg8qJ#gse z%Hq+Y2I=G2N|z1tMr`1bo^L5HjB^`*&wPKl`q!g+k{&49)Tr<$)YHSrx=)&Gn!y}= zJ`A?^47Rr>`(7F37=qET@jXIVfn0_TzOIkI42F{0@F=!4=E|D6S5GUC3uEj}uw?96 zgIcvBed%iTYCqRIo)eQSrmoL=)V1HNCx3HyYtlWMQcAAa>D!hwF8a2HnVpP`u87Ub z9!FiWlJmu$)%8UlB~DlNOsmpU+iP<~ce|X-mvxRM&iNMJS1M$$&atka;;9*#ow&lI zcI?jGDlLX~*Pbak*BUL$kI$*#A3CjDx!bWk@}TxzYms21Bs;$!Ju`C{cjlEQ(A=BfNgVdub93^86(y$69Sn*OrKn8klrEaBscAKHRL+?=3i*>Fx#Y)wWNW9a zwW3Xx?fK!iSKLWLic>b!n7&%gZhBQZw}WOL`%DW|q%XN}mhQ7S*^#H&jSN-{mS%8ESR-%u7ubNQrx;lc!UDX)lKKAN%N z6dk#|mbp?R>FeUn*{76tD2;R-DQ9;L?KLJ99TlUf%3Z(qNlcYRL~6NLZq^v;t~UQ0 zXA*U9wq@)a5u&}sF2k}?l~gI;xo6C$E+o%|pRBw>RjTqRD2NCpv46C%t4cUQkIM$Uk1sb~6aF^scQwnGNJnu!(*&8oww#FFIHhb(kRhzoxKuxY7 z@%5?j{JeuT?j@wNTlR_1^BrC7Frjv-g;?4zD_aAQ$CAocc?ECFUGA$kksNR2x!HD@ z>6oBu^Dh+b6IFe@qb$$gR$6|Tk+U1IW!)mYHduT*HS3;*zQUAwaV;@YmezJpOFw_! zc+tnPy5ddtgs#YG5gjc*_Q<#-Y81s>V=j1b)+oKFvrXgat8E88roC>6PhRkv(H!~u ziDP>AI;ZLK+})9#FKV;n(<8ep*v-?QmK_y&=-O(QBD*notH$BW@|?f!b@9k29tvg{41jixyjyDtA6 z6w*e|{6zPOatAOm-qC!*ES?hg?ai*cq%5sjtKGU=iW?5d zzL8o{cE6I-oOwo~tFE!daJ0#IdpWCw8Nc=B?U& z^oql(Nr&A_OTSC?JY?;#{9%H4(s%i)1|gOy_J!+1=?}^>b{J$^I`7_DsJ!;4HI3!c zqK5i5oM-9_Rv7)D5?Bzh&P6}z_=8L$b&t2ZO+5C-WZR9ybGesY4EF7ExmW4(y_8;Bn6#L! zA(Illte&iW@Km>(qqMkurA=1E(E#=cCHdHs6ReKc)+8#=KJ&$SmHdK52H#C`KJO-L zJ}1H=e7d1+P1NB-ak_qbrjb8s9(U4JUzFUq-`~@(=*YrNU#XVAyl{0+lttRqXws<9 zT79<8tj=mGWE!qt{k4-sYLQRPHxsXJJoj@?UT*p1(_5lfTrie*4)Ri|S59$_$XS@{ zQ}~>5;Cg7G`xc7fT#wV6?oPEyjI5HR7O5@PS59zR5aX_LrJ?RziTvY4`ZsE`XXjj= zsGWt_pRgStG?YRA#u7)c)Y_KWh?`ddTZJemO$)*@ji|&IgRLHze6+ zA6OT;>W=4Tl{Ay0MG z!r}nanxu2KlVVrMp7o1o7BpP;=D5c^ol(#;cZB{c=1fM8*$J6a#qx0S#j(+OC$frF zy%T~Hp1Y2HZWdNOewzzxe^vPOlp@dXPPD}NfBI-*VG--enOAAv$*Ef5CfQ1it?g;d z4FRTx+x)+5zb_Y?IdjEuy>)sXS-E55R`SP)tWwLrH@kA}tgR~3cA1=izP=)Vdi)RF zj^lrc4mjkH&YZG!?p!aSS$d&;X6N>G ztE4Yn@BDV+#a-mb`{ZS&TT4pK-j;v=WI@|aJEMuk-4o#T{^0^Z`bt9j5y(-4Jf1IEd%;iLH3fTbQRWjMTb+_>}%i4CXR3XviiVv%K9ItN*m75oZJ43biOuK zX9M%ZF2CaBuPfxe^qcg2b5*G(_qVS|DGxf<{*}42lEcP2Q}2Rh(OM6VW22$DK zl)^vs##nWu4`Zy7d(YgN+`3p%t#5v)Y|%)~wG+9pyj$dv$dl3F>teY7e&JVE z$xgQK3mCXgd{`I3g88#18`EiA9y8dR8ORQ1nb9LyT>6aZ(vs7dCSHLd_Dl{91O#(Um}Yce9*<+f zU~s))k;>%iusN(?$Xm`131adf8REn64q=9a;GjT;zOJqzBZ%o2Os9GJ=$X+2d3tmj zZzaUM10d$*p>{_8yAIFH~w4fv1vXbCQauY3!fHb-MRaT|LN zdhtMvgFBMdTyOTkb4qZsgAdwC&0bim63m6#4NK#M_TLd=_ zm$v{a*T*LTADe0(8rCreq9;LtEV_u*AU<5FNQ5E*@fj3+XbFtifh*t`HYElQn1WBy zcf~0aNaIqtG;a4@01@zhINzL1G#B@mIo1jnKGq3U7r?ea0Dz6c;flFf2oZWq^;(Ne z*o==Y+iO3pbhcOmh$554D54^wWQqvv3A`7u9vtSwHb4+;FK`jwU|@kh8f-zZ1)Cwk zkUtgVkU_GbX9YEg3n2kTVa_}h0d+i~Cf-6Z=+!<ey>&!H zp~Yg9!RuH;d910oCW7{gpib(SRMb&TMhI_yBBZBKGfN+UtAtL=Z*xZTsJH z3644i2?i697E}~ohG+q*c!sDi$(Ixrr9TA^YZbw9mcCO)LzFRiG3a6>O@G|&=qvkeA{9-VaHtNOTmj}e z{G!772>{&PI{|q5$^zi2tpI?hqY?m~b zXAUbch!sM!4dt*xLZPn-d3t@a@r%;l&ntw@WiRK^T-hPsG(CMCSOo#!9Drt>*1*IZ zUwXn0;nRSExRpsQj$>Ov(*-sI@&TOFC!f^zT5_SG4Ns1~{E zR1(!9l0OWoQ*$HvBz5Ya8`l86&m#E}0gD3`dnt#P1}x^Lc`5sR>hmc#t*oy->b+&C z+%)!TZrbW5_UcGJH!ZxBn|AX5(Bl7>rG$F_{Lv}2xzOhSwlN@<122k5J|!DoGYG$C zgh6Nlp#_8%5L!TJ0igx{Mi%%j|DSxL>PDK5n(Ve@@c*^D^Ra(>9e}?9VE=U)0Q>pA z0KA#Z0bsv;C14~V3V_Rg3BZ1J0s#B(c)tLjt;_&me;+RJVMfCfMlfb-t~;FG=} zj?);z@V~JDK1#+G{Qk{a@JqK4I41Ucf)WxX5iP@GNK68cCWOiaSHVB9>W#tG`==9v z|BN&d;!%BEy?(!->tp>cG8{%Pp&!$sRHi~9;=d}n!rh0!0Vpe=UU0MrAGITX@i!D% zs)$U&&kWIqq0|wS!4D4vQ8<`~wc_74V%vWP+LsI3ggHDoe20I5IC#(k{n+C8*&XWr z3;w~>Tm1v?{P(rT81xhOK2-hi%dXzZ0|*IO!W{;|-k~EL8T7|FzbnXremsZaIcuo; zv4Umz0-;wc3;Lc1zW~M0LSS#k5U)yb2OqnCqr1K1o}fb%J>+5^)Bjt7u I{M9Y+A4}7$pa1{> literal 0 HcmV?d00001