Bug 47304: use fixed encoding when extracting text in WordDocument

git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@1668367 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Dominik Stadler 2015-03-22 13:33:43 +00:00
parent 0216a881f7
commit f9961331ff
4 changed files with 84 additions and 2 deletions

View File

@ -18,12 +18,21 @@ package org.apache.poi.stress;
import static org.junit.Assert.assertNotNull; import static org.junit.Assert.assertNotNull;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileInputStream; import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream; import java.io.InputStream;
import java.io.PrintWriter;
import java.io.StringWriter;
import org.apache.poi.hdf.extractor.WordDocument;
import org.apache.poi.hwpf.HWPFDocument; import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.extractor.WordExtractor;
import org.junit.Test; import org.junit.Test;
@SuppressWarnings("deprecation")
public class HWPFFileHandler extends POIFSFileHandler { public class HWPFFileHandler extends POIFSFileHandler {
@Override @Override
public void handleFile(InputStream stream) throws Exception { public void handleFile(InputStream stream) throws Exception {
@ -33,16 +42,53 @@ public class HWPFFileHandler extends POIFSFileHandler {
assertNotNull(doc.getEndnotes()); assertNotNull(doc.getEndnotes());
handlePOIDocument(doc); handlePOIDocument(doc);
// fails for many documents, but is deprecated anyway...
// handleWordDocument(doc);
} }
/**
 * Feeds the given document through the old HDF {@link WordDocument} text
 * extraction to verify that it completes without throwing.
 *
 * The document is first written to an in-memory buffer, re-read from that
 * buffer as a {@link WordDocument}, and its text is then written to an
 * in-memory writer. The extracted text itself is discarded.
 *
 * @param doc the already loaded document to serialize and re-read
 * @throws IOException if writing the document or extracting its text fails
 */
protected void handleWordDocument(HWPFDocument doc) throws IOException {
    // serialize to memory so the old extractor can read it back as a stream
    ByteArrayOutputStream serialized = new ByteArrayOutputStream();
    doc.write(serialized);
    byte[] docBytes = serialized.toByteArray();

    WordDocument legacyDoc = new WordDocument(new ByteArrayInputStream(docBytes));

    // the text is only collected in memory; nothing is asserted on it here
    StringWriter textSink = new StringWriter();
    PrintWriter printer = new PrintWriter(textSink);
    try {
        legacyDoc.writeAllText(printer);
    } finally {
        printer.close();
    }
    textSink.close();
}
// a test-case to test this locally without executing the full TestAllFiles // a test-case to test this locally without executing the full TestAllFiles
@Test @Test
public void test() throws Exception { public void test() throws Exception {
InputStream stream = new FileInputStream("test-data/document/HeaderFooterUnicode.doc"); File file = new File("test-data/document/47304.doc");
InputStream stream = new FileInputStream(file);
try { try {
handleFile(stream); handleFile(stream);
} finally { } finally {
stream.close(); stream.close();
} }
handleExtracting(file);
stream = new FileInputStream(file);
try {
WordExtractor extractor = new WordExtractor(stream);
try {
assertNotNull(extractor.getText());
} finally {
extractor.close();
}
} finally {
stream.close();
}
} }
} }

View File

@ -177,7 +177,7 @@ public final class WordDocument {
} }
else else
{ {
String sText = new String(_header, start, end-start); String sText = new String(_header, start, end-start, "windows-1252");
out.write(sText); out.write(sText);
} }
} }

View File

@ -17,6 +17,15 @@
package org.apache.poi.hdf.extractor; package org.apache.poi.hdf.extractor;
import static org.junit.Assert.*;
import java.io.IOException;
import java.io.PrintWriter;
import java.io.StringWriter;
import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.HWPFTestDataSamples;
import org.apache.poi.hwpf.extractor.WordExtractor;
import org.junit.Test; import org.junit.Test;
@ -31,4 +40,31 @@ public class TestWordDocument {
//WordDocument.main(new String[] {"test-data/document/Word6.doc", "/tmp/test.doc"}); //WordDocument.main(new String[] {"test-data/document/Word6.doc", "/tmp/test.doc"});
WordDocument.main(new String[] {"test-data/document/53446.doc", "/tmp/test.doc"}); WordDocument.main(new String[] {"test-data/document/53446.doc", "/tmp/test.doc"});
} }
/**
 * Regression test for bug 47304: the curly quotes in the sample document
 * must be present in the text returned by {@link WordExtractor} as well as
 * in the text written by the old {@link WordDocument#writeAllText}.
 *
 * @throws IOException if the sample file cannot be read or extracted
 */
@SuppressWarnings("deprecation")
@Test
public void test47304() throws IOException {
    final String expected = "Just a \u201Ctest\u201D";

    HWPFDocument doc = HWPFTestDataSamples.openSampleFile("47304.doc");
    assertNotNull(doc);

    WordExtractor extractor = new WordExtractor(doc);
    try {
        String text = extractor.getText();
        assertTrue("Had: " + text, text.contains(expected));
    } finally {
        // close even when the assertion above fails, so the extractor is not leaked
        extractor.close();
    }

    WordDocument wordDoc = new WordDocument("test-data/document/47304.doc");
    StringWriter docTextWriter = new StringWriter();
    PrintWriter out = new PrintWriter(docTextWriter);
    try {
        wordDoc.writeAllText(out);
    } finally {
        out.close();
    }
    docTextWriter.close();

    // take the text once so the failure message shows exactly what was checked
    String legacyText = docTextWriter.toString();
    assertTrue("Had: " + legacyText, legacyText.contains(expected));
}
} }

Binary file not shown.