resolved old bugzilla issues, added unit tests
git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@1139204 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
aac4cf50a9
commit
bc467bb8c1
@ -4,6 +4,10 @@ import java.io.File;
|
|||||||
import java.io.FileInputStream;
|
import java.io.FileInputStream;
|
||||||
import java.io.FilenameFilter;
|
import java.io.FilenameFilter;
|
||||||
import java.io.StringWriter;
|
import java.io.StringWriter;
|
||||||
|
import java.util.Arrays;
|
||||||
|
import java.util.Collections;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.Set;
|
||||||
|
|
||||||
import javax.xml.parsers.DocumentBuilderFactory;
|
import javax.xml.parsers.DocumentBuilderFactory;
|
||||||
import javax.xml.transform.OutputKeys;
|
import javax.xml.transform.OutputKeys;
|
||||||
@ -24,28 +28,29 @@ import org.apache.poi.hwpf.HWPFDocument;
|
|||||||
|
|
||||||
public class TestWordToFoExtractorSuite
|
public class TestWordToFoExtractorSuite
|
||||||
{
|
{
|
||||||
public static Test suite()
|
/**
|
||||||
{
|
* YK: a quick hack to exclude failing documents from the suite.
|
||||||
|
*
|
||||||
|
* WordToFoExtractor stumbles on Bug33519.doc with a NPE
|
||||||
|
*/
|
||||||
|
private static List<String> failingFiles = Arrays.asList("Bug33519.doc");
|
||||||
|
|
||||||
|
public static Test suite() {
|
||||||
TestSuite suite = new TestSuite();
|
TestSuite suite = new TestSuite();
|
||||||
|
|
||||||
File directory = POIDataSamples.getDocumentInstance().getFile(
|
File directory = POIDataSamples.getDocumentInstance().getFile(
|
||||||
"../document" );
|
"../document");
|
||||||
for ( final File child : directory.listFiles( new FilenameFilter()
|
for (final File child : directory.listFiles(new FilenameFilter() {
|
||||||
{
|
public boolean accept(File dir, String name) {
|
||||||
public boolean accept( File dir, String name )
|
return name.endsWith(".doc") && !failingFiles.contains(name);
|
||||||
{
|
|
||||||
return name.endsWith( ".doc" );
|
|
||||||
}
|
}
|
||||||
} ) )
|
})) {
|
||||||
{
|
|
||||||
final String name = child.getName();
|
final String name = child.getName();
|
||||||
suite.addTest( new TestCase( name )
|
suite.addTest(new TestCase(name) {
|
||||||
{
|
public void runTest() throws Exception {
|
||||||
public void runTest() throws Exception
|
test(child);
|
||||||
{
|
|
||||||
test( child );
|
|
||||||
}
|
}
|
||||||
} );
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
return suite;
|
return suite;
|
||||||
|
@ -17,12 +17,21 @@
|
|||||||
|
|
||||||
package org.apache.poi.hwpf.usermodel;
|
package org.apache.poi.hwpf.usermodel;
|
||||||
|
|
||||||
|
import junit.framework.AssertionFailedError;
|
||||||
|
import org.apache.commons.codec.digest.DigestUtils;
|
||||||
import org.apache.poi.EncryptedDocumentException;
|
import org.apache.poi.EncryptedDocumentException;
|
||||||
|
import org.apache.poi.POIDataSamples;
|
||||||
import org.apache.poi.hwpf.HWPFDocument;
|
import org.apache.poi.hwpf.HWPFDocument;
|
||||||
|
import org.apache.poi.hwpf.HWPFOldDocument;
|
||||||
import org.apache.poi.hwpf.HWPFTestCase;
|
import org.apache.poi.hwpf.HWPFTestCase;
|
||||||
import org.apache.poi.hwpf.HWPFTestDataSamples;
|
import org.apache.poi.hwpf.HWPFTestDataSamples;
|
||||||
|
import org.apache.poi.hwpf.extractor.Word6Extractor;
|
||||||
import org.apache.poi.hwpf.extractor.WordExtractor;
|
import org.apache.poi.hwpf.extractor.WordExtractor;
|
||||||
import org.apache.poi.hwpf.model.StyleSheet;
|
import org.apache.poi.hwpf.model.StyleSheet;
|
||||||
|
import org.apache.poi.util.IOUtils;
|
||||||
|
|
||||||
|
import java.io.InputStream;
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Test various problem documents
|
* Test various problem documents
|
||||||
@ -418,4 +427,268 @@ public final class TestProblems extends HWPFTestCase {
|
|||||||
assertEquals(119, cell.getEndOffset());
|
assertEquals(119, cell.getEndOffset());
|
||||||
assertEquals("Row 3/Cell 3\u0007", cell.text());
|
assertEquals("Row 3/Cell 3\u0007", cell.text());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static void fixed(String bugzillaId) {
|
||||||
|
fail("Bug " + bugzillaId + " seems to be fixed. " +
|
||||||
|
"Please resolve the issue in Bugzilla and remove fail() from the test");
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Bug 33519 - HWPF fails to read a file
|
||||||
|
*/
|
||||||
|
public void test33519() {
|
||||||
|
HWPFDocument doc = HWPFTestDataSamples.openSampleFile("Bug33519.doc");
|
||||||
|
WordExtractor extractor = new WordExtractor(doc);
|
||||||
|
String text = extractor.getText();
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Bug 34898 - WordExtractor doesn't read the whole string from the file
|
||||||
|
*/
|
||||||
|
public void test34898() {
|
||||||
|
HWPFDocument doc = HWPFTestDataSamples.openSampleFile("Bug34898.doc");
|
||||||
|
WordExtractor extractor = new WordExtractor(doc);
|
||||||
|
assertEquals("\u30c7\u30a3\u30ec\u30af\u30c8\u30ea", extractor.getText().trim());
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* [FAILING] Bug 44431 - Output is corrupted
|
||||||
|
*/
|
||||||
|
public void test44431() {
|
||||||
|
HWPFDocument doc1 = HWPFTestDataSamples.openSampleFile("Bug44431.doc");
|
||||||
|
WordExtractor extractor1 = new WordExtractor(doc1);
|
||||||
|
|
||||||
|
HWPFDocument doc2 = HWPFTestDataSamples.writeOutAndReadBack(doc1);
|
||||||
|
WordExtractor extractor2 = new WordExtractor(doc2);
|
||||||
|
try {
|
||||||
|
assertEquals(extractor1.getFooterText(), extractor2.getFooterText());
|
||||||
|
assertEquals(extractor1.getHeaderText(), extractor2.getHeaderText());
|
||||||
|
assertEquals(extractor1.getParagraphText(), extractor2.getParagraphText());
|
||||||
|
|
||||||
|
assertEquals(extractor1.getText(), extractor2.getText());
|
||||||
|
|
||||||
|
fixed("44431");
|
||||||
|
} catch (AssertionFailedError e) {
|
||||||
|
// expected exception
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* [FAILING] Bug 46817 - Text from tables is not extracted
|
||||||
|
*/
|
||||||
|
public void test46817() {
|
||||||
|
HWPFDocument doc = HWPFTestDataSamples.openSampleFile("Bug46817.doc");
|
||||||
|
WordExtractor extractor = new WordExtractor(doc);
|
||||||
|
String text = extractor.getText().trim();
|
||||||
|
try {
|
||||||
|
assertTrue(text.contains("Nazwa wykonawcy"));
|
||||||
|
assertTrue(text.contains("kujawsko-pomorskie"));
|
||||||
|
assertTrue(text.contains("ekomel@ekomel.com.pl"));
|
||||||
|
|
||||||
|
fixed("46817");
|
||||||
|
} catch (AssertionFailedError e) {
|
||||||
|
// expected exception
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Bug 46220 - images are not properly extracted
|
||||||
|
*/
|
||||||
|
public void test46220() {
|
||||||
|
HWPFDocument doc = HWPFTestDataSamples.openSampleFile("Bug46220.doc");
|
||||||
|
// reference checksums as in Bugzilla
|
||||||
|
String[] md5 = {
|
||||||
|
"851be142bce6d01848e730cb6903f39e",
|
||||||
|
"7fc6d8fb58b09ababd036d10a0e8c039",
|
||||||
|
"a7dc644c40bc2fbf17b2b62d07f99248",
|
||||||
|
"72d07b8db5fad7099d90bc4c304b4666"
|
||||||
|
};
|
||||||
|
List<Picture> pics = doc.getPicturesTable().getAllPictures();
|
||||||
|
assertEquals(4, pics.size());
|
||||||
|
for (int i = 0; i < pics.size(); i++) {
|
||||||
|
Picture pic = pics.get(i);
|
||||||
|
byte[] data = pic.getRawContent();
|
||||||
|
// use Apache Commons Codec utils to compute md5
|
||||||
|
assertEquals(md5[i], DigestUtils.md5Hex(data));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Bug 45473 - HWPF cannot read file after save
|
||||||
|
*/
|
||||||
|
public void test45473() {
|
||||||
|
HWPFDocument doc1 = HWPFTestDataSamples.openSampleFile("Bug45473.doc");
|
||||||
|
String text1 = new WordExtractor(doc1).getText().trim();
|
||||||
|
|
||||||
|
HWPFDocument doc2 = HWPFTestDataSamples.writeOutAndReadBack(doc1);
|
||||||
|
String text2 = new WordExtractor(doc2).getText().trim();
|
||||||
|
|
||||||
|
// the text in the saved document has some differences in line separators but we tolerate that
|
||||||
|
assertEquals(text1.replaceAll("\n", ""), text2.replaceAll("\n", ""));
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* [FAILING] Bug 47287 - StringIndexOutOfBoundsException in CharacterRun.replaceText()
|
||||||
|
*/
|
||||||
|
public void test47287() {
|
||||||
|
HWPFDocument doc = HWPFTestDataSamples.openSampleFile("Bug47287.doc");
|
||||||
|
String[] values = {
|
||||||
|
"1-1",
|
||||||
|
"1-2",
|
||||||
|
"1-3",
|
||||||
|
"1-4",
|
||||||
|
"1-5",
|
||||||
|
"1-6",
|
||||||
|
"1-7",
|
||||||
|
"1-8",
|
||||||
|
"1-9",
|
||||||
|
"1-10",
|
||||||
|
"1-11",
|
||||||
|
"1-12",
|
||||||
|
"1-13",
|
||||||
|
"1-14",
|
||||||
|
"1-15",
|
||||||
|
};
|
||||||
|
int usedVal = 0;
|
||||||
|
try {
|
||||||
|
String PLACEHOLDER = "\u2002\u2002\u2002\u2002\u2002";
|
||||||
|
Range r = doc.getRange();
|
||||||
|
for (int x = 0; x < r.numSections(); x++) {
|
||||||
|
Section s = r.getSection(x);
|
||||||
|
for (int y = 0; y < s.numParagraphs(); y++) {
|
||||||
|
Paragraph p = s.getParagraph(y);
|
||||||
|
|
||||||
|
for (int z = 0; z < p.numCharacterRuns(); z++) {
|
||||||
|
boolean isFound = false;
|
||||||
|
|
||||||
|
//character run
|
||||||
|
CharacterRun run = p.getCharacterRun(z);
|
||||||
|
//character run text
|
||||||
|
String text = run.text();
|
||||||
|
String oldText = text;
|
||||||
|
int c = text.indexOf("FORMTEXT ");
|
||||||
|
if (c < 0) {
|
||||||
|
int k = text.indexOf(PLACEHOLDER);
|
||||||
|
if (k >= 0) {
|
||||||
|
text = text.substring(0, k) + values[usedVal] + text.substring(k + PLACEHOLDER.length());
|
||||||
|
usedVal++;
|
||||||
|
isFound = true;
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
for (; c >= 0; c = text.indexOf("FORMTEXT ", c + "FORMTEXT ".length())) {
|
||||||
|
int k = text.indexOf(PLACEHOLDER, c);
|
||||||
|
if (k >= 0) {
|
||||||
|
text = text.substring(0, k) + values[usedVal] + text.substring(k + PLACEHOLDER.length());
|
||||||
|
usedVal++;
|
||||||
|
isFound = true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (isFound) {
|
||||||
|
run.replaceText(oldText, text, 0);
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
fixed("47287");
|
||||||
|
} catch (StringIndexOutOfBoundsException e) {
|
||||||
|
// expected exception
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private static void insertTable(int rows, int columns) {
|
||||||
|
// POI apparently can't create a document from scratch,
|
||||||
|
// so we need an existing empty dummy document
|
||||||
|
HWPFDocument doc = HWPFTestDataSamples.openSampleFile("empty.doc");
|
||||||
|
|
||||||
|
Range range = doc.getRange();
|
||||||
|
Table table = range.insertBefore(new TableProperties(columns), rows);
|
||||||
|
|
||||||
|
for (int rowIdx = 0; rowIdx < table.numRows(); rowIdx++) {
|
||||||
|
TableRow row = table.getRow(rowIdx);
|
||||||
|
for (int colIdx = 0; colIdx < row.numCells(); colIdx++) {
|
||||||
|
TableCell cell = row.getCell(colIdx);
|
||||||
|
Paragraph par = cell.getParagraph(0);
|
||||||
|
par.insertBefore("" + (rowIdx * row.numCells() + colIdx));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* [FAILING] Bug 47563 - HWPF failing while creating tables
|
||||||
|
*/
|
||||||
|
public void test47563() {
|
||||||
|
try {
|
||||||
|
insertTable(1, 5);
|
||||||
|
insertTable(1, 6);
|
||||||
|
insertTable(5, 1);
|
||||||
|
insertTable(6, 1);
|
||||||
|
insertTable(2, 2);
|
||||||
|
insertTable(3, 2);
|
||||||
|
insertTable(2, 3);
|
||||||
|
insertTable(3, 3);
|
||||||
|
|
||||||
|
fixed("47563");
|
||||||
|
} catch (Exception e) {
|
||||||
|
// expected exception
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Bug 47742 - text extracted by WordExtractor is broken
|
||||||
|
*/
|
||||||
|
public void test47742() throws Exception {
|
||||||
|
|
||||||
|
// (1) extract text from MS Word document via POI
|
||||||
|
HWPFDocument doc = HWPFTestDataSamples.openSampleFile("Bug47742.doc");
|
||||||
|
String foundText = new WordExtractor(doc).getText();
|
||||||
|
|
||||||
|
// (2) read text from text document (retrieved by saving the word
|
||||||
|
// document as text file using encoding UTF-8)
|
||||||
|
InputStream is = POIDataSamples.getDocumentInstance().openResourceAsStream("Bug47742-text.txt");
|
||||||
|
byte[] expectedBytes = IOUtils.toByteArray(is);
|
||||||
|
String expectedText = new String(expectedBytes, "utf-8").substring(1); // strip-off the unicode marker
|
||||||
|
|
||||||
|
assertEquals(expectedText, foundText);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* [FAILING] Bug 47958 - Exception during Escher walk of pictures
|
||||||
|
*/
|
||||||
|
public void test47958() {
|
||||||
|
HWPFDocument doc = HWPFTestDataSamples.openSampleFile("Bug47958.doc");
|
||||||
|
try {
|
||||||
|
for (Picture pic : doc.getPicturesTable().getAllPictures()) {
|
||||||
|
System.out.println(pic.suggestFullFileName());
|
||||||
|
}
|
||||||
|
fixed("47958");
|
||||||
|
} catch (Exception e) {
|
||||||
|
// expected exception
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Bug 50936 - HWPF fails to read a file
|
||||||
|
*/
|
||||||
|
public void test50936() {
|
||||||
|
HWPFDocument doc = HWPFTestDataSamples.openSampleFile("Bug50936.doc");
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* [FAILING] Bug 50955 - error while retrieving the text file
|
||||||
|
*/
|
||||||
|
public void test50955() {
|
||||||
|
try {
|
||||||
|
HWPFOldDocument doc = HWPFTestDataSamples.openOldSampleFile("Bug50955.doc");
|
||||||
|
Word6Extractor extractor = new Word6Extractor(doc);
|
||||||
|
String text = extractor.getText();
|
||||||
|
fixed("50955");
|
||||||
|
} catch (Exception e) {
|
||||||
|
// expected exception
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
BIN
test-data/document/Bug33519.doc
Normal file
BIN
test-data/document/Bug33519.doc
Normal file
Binary file not shown.
BIN
test-data/document/Bug34898.doc
Normal file
BIN
test-data/document/Bug34898.doc
Normal file
Binary file not shown.
BIN
test-data/document/Bug44431.doc
Normal file
BIN
test-data/document/Bug44431.doc
Normal file
Binary file not shown.
BIN
test-data/document/Bug45473.doc
Normal file
BIN
test-data/document/Bug45473.doc
Normal file
Binary file not shown.
BIN
test-data/document/Bug46220.doc
Normal file
BIN
test-data/document/Bug46220.doc
Normal file
Binary file not shown.
BIN
test-data/document/Bug46817.doc
Normal file
BIN
test-data/document/Bug46817.doc
Normal file
Binary file not shown.
BIN
test-data/document/Bug47287.doc
Normal file
BIN
test-data/document/Bug47287.doc
Normal file
Binary file not shown.
35
test-data/document/Bug47742-text.txt
Executable file
35
test-data/document/Bug47742-text.txt
Executable file
@ -0,0 +1,35 @@
|
|||||||
|
{0>Der Aaa Satz.<}0{>The Aaa phrase.<0} {0>Der Bbb Satz.<}0{>The Bbb phrase.<0} {0>Der Ccc Satz.<}0{>The Ccc phrase.<0} {0>Der Ddd Satz.<}0{>The Ddd phrase.<0}
|
||||||
|
{0>Der Eee Satz.<}0{>The Eee phrase.<0} {0>Der Fff Satz.<}0{>The Fff phrase.<0}
|
||||||
|
{0>Der Ggg Satz .<}0{>The Ggg phrase .<0}
|
||||||
|
{0>Der Gggggg Satz .<}0{>The Gggggg phrase .<0}
|
||||||
|
{0>Ein Zeil
|
||||||
|
enumbruch mittendrin.<}0{>A soft
|
||||||
|
return in the center.<0}
|
||||||
|
{0>Ein Satz
|
||||||
|
mit soft return.<}0{>A sentence
|
||||||
|
with soft return.<0}
|
||||||
|
|
||||||
|
{0>Ein-Strich.<}0{>One-hyphen.<0}
|
||||||
|
{0>Die G-20 Staaten.<}0{>The G-20 states.<0}
|
||||||
|
{0>Ein—Geviertstrich hier.<}0{>An—EMdash here.<0}
|
||||||
|
{0>Ein/Schrägstrich hier.<}0{>A/slash here.<0}
|
||||||
|
{0>Senkrechter|Strich hier.<}0{>Vertical|line here.<0}
|
||||||
|
{0>Umgekehrter\Schrägstrich hier.<}0{>A\backslash here.<0}
|
||||||
|
{0>C'est la vie.<}0{>Such is life.<0}
|
||||||
|
{0>Das sind 10'000 Euros.<}0{>These are 10'000 Euros.<0}
|
||||||
|
{0>Eine Komma,Trennung hier.<}0{>A comma,separation here.<0}
|
||||||
|
{0>Eine Semikolon;Trennung hier.<}0{>A semicolon;separation here.<0}
|
||||||
|
{0>Das sind 77,mehr hier.<}0{>There are 77,more here.<0}
|
||||||
|
{0>Das ist sein (Netto)Gehalt<}0{>This is his (net)salary.<0}
|
||||||
|
{0>Das sind 50$ hier.<}0{>That is 50$ here.<0}
|
||||||
|
{0>Das sind 3%Rabatt.<}0{>That is 3% discount.<0}
|
||||||
|
{0>Es sind 25°C heute.<}0{>It is 25°C today.<0}
|
||||||
|
{0>Es gilt Yen<Dollar.<}0{>It is Yen<Dollar.<0}
|
||||||
|
{0>Keine Trennung® bei Sonderzeichen.<}0{>No separation® here..<0}
|
||||||
|
{0>Ich zahle 7 Euro.<}0{>I pay 7 Euros.<0}
|
||||||
|
{0>Die Disk ist 6 min lang.<}0{>The disk is 6 min long.<0}
|
||||||
|
{0>Ein Satz, mit Komma.<}0{>A sentence, with comma.<0}
|
||||||
|
{0>Ein Hochkomma hier.<}0{>An apostrophe here.<0}
|
||||||
|
{0>Ein Satz mit verschiedenen Pausen.<}0{>A sentence with different blanks.<0}
|
||||||
|
{0>Ein Satz mit geschützten Pausen.<}0{>A sentence with non-breaking blanks.<0}
|
||||||
|
{0>Ein Satz mit speziellen Pausen.<}0{>A sentence with special blanks.<0}
|
BIN
test-data/document/Bug47742.doc
Normal file
BIN
test-data/document/Bug47742.doc
Normal file
Binary file not shown.
BIN
test-data/document/Bug47958.doc
Normal file
BIN
test-data/document/Bug47958.doc
Normal file
Binary file not shown.
BIN
test-data/document/Bug50936.doc
Normal file
BIN
test-data/document/Bug50936.doc
Normal file
Binary file not shown.
BIN
test-data/document/Bug50955.doc
Normal file
BIN
test-data/document/Bug50955.doc
Normal file
Binary file not shown.
Loading…
Reference in New Issue
Block a user