Integration tests: Expect an exception for old Word documents and still run the text extraction for them. Also execute HPSFPropertiesExtractor where possible.

git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@1668483 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Dominik Stadler 2015-03-22 21:47:44 +00:00
parent e6d6cbcbd1
commit ca9f22c289
4 changed files with 50 additions and 12 deletions

View File

@ -31,6 +31,7 @@ import java.util.List;
import java.util.Map; import java.util.Map;
import java.util.Set; import java.util.Set;
import org.apache.poi.hwpf.OldWordFileFormatException;
import org.apache.poi.stress.*; import org.apache.poi.stress.*;
import org.apache.tools.ant.DirectoryScanner; import org.apache.tools.ant.DirectoryScanner;
import org.junit.Test; import org.junit.Test;
@ -162,6 +163,20 @@ public class TestAllFiles {
HANDLERS.put("spreadsheet/test_properties1", new NullFileHandler()); HANDLERS.put("spreadsheet/test_properties1", new NullFileHandler());
} }
// Old Word Documents where we can at least extract some text
private static final Set<String> OLD_FILES = new HashSet<String>();
static {
OLD_FILES.add("document/Bug49933.doc");
OLD_FILES.add("document/Bug51944.doc");
OLD_FILES.add("document/Word6.doc");
OLD_FILES.add("document/Word6_sections.doc");
OLD_FILES.add("document/Word6_sections2.doc");
OLD_FILES.add("document/Word95.doc");
OLD_FILES.add("document/word95err.doc");
OLD_FILES.add("hpsf/TestMickey.doc");
OLD_FILES.add("document/52117.doc");
}
private static final Set<String> EXPECTED_FAILURES = new HashSet<String>(); private static final Set<String> EXPECTED_FAILURES = new HashSet<String>();
static { static {
// password protected files // password protected files
@ -202,15 +217,7 @@ public class TestAllFiles {
EXPECTED_FAILURES.add("spreadsheet/43493.xls"); EXPECTED_FAILURES.add("spreadsheet/43493.xls");
EXPECTED_FAILURES.add("spreadsheet/46904.xls"); EXPECTED_FAILURES.add("spreadsheet/46904.xls");
EXPECTED_FAILURES.add("document/56880.doc"); EXPECTED_FAILURES.add("document/56880.doc");
EXPECTED_FAILURES.add("document/Bug49933.doc");
EXPECTED_FAILURES.add("document/Bug50955.doc"); EXPECTED_FAILURES.add("document/Bug50955.doc");
EXPECTED_FAILURES.add("document/Bug51944.doc");
EXPECTED_FAILURES.add("document/Word6.doc");
EXPECTED_FAILURES.add("document/Word6_sections.doc");
EXPECTED_FAILURES.add("document/Word6_sections2.doc");
EXPECTED_FAILURES.add("document/Word95.doc");
EXPECTED_FAILURES.add("document/word95err.doc");
EXPECTED_FAILURES.add("hpsf/TestMickey.doc");
EXPECTED_FAILURES.add("slideshow/PPT95.ppt"); EXPECTED_FAILURES.add("slideshow/PPT95.ppt");
EXPECTED_FAILURES.add("openxml4j/OPCCompliance_CoreProperties_DCTermsNamespaceLimitedUseFAIL.docx"); EXPECTED_FAILURES.add("openxml4j/OPCCompliance_CoreProperties_DCTermsNamespaceLimitedUseFAIL.docx");
EXPECTED_FAILURES.add("openxml4j/OPCCompliance_CoreProperties_DoNotUseCompatibilityMarkupFAIL.docx"); EXPECTED_FAILURES.add("openxml4j/OPCCompliance_CoreProperties_DoNotUseCompatibilityMarkupFAIL.docx");
@ -269,17 +276,29 @@ public class TestAllFiles {
File inputFile = new File(ROOT_DIR, file); File inputFile = new File(ROOT_DIR, file);
try { try {
InputStream stream = new BufferedInputStream(new FileInputStream(inputFile),100); InputStream stream = new BufferedInputStream(new FileInputStream(inputFile), 64*1024);
try { try {
handler.handleFile(stream); handler.handleFile(stream);
assertFalse("Expected to fail for file " + file + " and handler " + handler + ", but did not fail!", assertFalse("Expected to fail for file " + file + " and handler " + handler + ", but did not fail!",
EXPECTED_FAILURES.contains(file)); EXPECTED_FAILURES.contains(file));
assertFalse("Expected to fail for file " + file + " and handler " + handler + ", but did not fail!",
OLD_FILES.contains(file));
} finally { } finally {
stream.close(); stream.close();
} }
handler.handleExtracting(inputFile); handler.handleExtracting(inputFile);
} catch (OldWordFileFormatException e) {
// for old word files we should still support extracting text
if(OLD_FILES.contains(file)) {
handler.handleExtracting(inputFile);
} else {
// check if we expect failure for this file
if(!EXPECTED_FAILURES.contains(file) && !AbstractFileHandler.EXPECTED_EXTRACTOR_FAILURES.contains(file)) {
throw new Exception("While handling " + file, e);
}
}
} catch (Exception e) { } catch (Exception e) {
// check if we expect failure for this file // check if we expect failure for this file
if(!EXPECTED_FAILURES.contains(file) && !AbstractFileHandler.EXPECTED_EXTRACTOR_FAILURES.contains(file)) { if(!EXPECTED_FAILURES.contains(file) && !AbstractFileHandler.EXPECTED_EXTRACTOR_FAILURES.contains(file)) {

View File

@ -28,8 +28,10 @@ import java.io.InputStream;
import java.util.HashSet; import java.util.HashSet;
import java.util.Set; import java.util.Set;
import org.apache.poi.POIOLE2TextExtractor;
import org.apache.poi.POITextExtractor; import org.apache.poi.POITextExtractor;
import org.apache.poi.extractor.ExtractorFactory; import org.apache.poi.extractor.ExtractorFactory;
import org.apache.poi.hpsf.extractor.HPSFPropertiesExtractor;
import org.apache.poi.openxml4j.exceptions.InvalidFormatException; import org.apache.poi.openxml4j.exceptions.InvalidFormatException;
import org.apache.poi.openxml4j.exceptions.OpenXML4JException; import org.apache.poi.openxml4j.exceptions.OpenXML4JException;
import org.apache.xmlbeans.XmlException; import org.apache.xmlbeans.XmlException;
@ -89,6 +91,19 @@ public abstract class AbstractFileHandler implements FileHandler {
assertEquals("File should not be modified by extractor", modified, file.lastModified()); assertEquals("File should not be modified by extractor", modified, file.lastModified());
handleExtractingAsStream(file); handleExtractingAsStream(file);
if(extractor instanceof POIOLE2TextExtractor) {
HPSFPropertiesExtractor hpsfExtractor = new HPSFPropertiesExtractor((POIOLE2TextExtractor)extractor);
try {
assertNotNull(hpsfExtractor.getDocumentSummaryInformationText());
assertNotNull(hpsfExtractor.getSummaryInformationText());
String text = hpsfExtractor.getText();
//System.out.println(text);
assertNotNull(text);
} finally {
hpsfExtractor.close();
}
}
} catch (IllegalArgumentException e) { } catch (IllegalArgumentException e) {
if(!EXPECTED_EXTRACTOR_FAILURES.contains(file)) { if(!EXPECTED_EXTRACTOR_FAILURES.contains(file)) {
throw new Exception("While handling " + file, e); throw new Exception("While handling " + file, e);

View File

@ -63,12 +63,10 @@ public class HWPFFileHandler extends POIFSFileHandler {
docTextWriter.close(); docTextWriter.close();
} }
// a test-case to test this locally without executing the full TestAllFiles // a test-case to test this locally without executing the full TestAllFiles
@Test @Test
public void test() throws Exception { public void test() throws Exception {
File file = new File("test-data/document/51921-Word-Crash067.doc"); File file = new File("test-data/document/52117.doc");
InputStream stream = new FileInputStream(file); InputStream stream = new FileInputStream(file);
try { try {
@ -91,4 +89,10 @@ public class HWPFFileHandler extends POIFSFileHandler {
stream.close(); stream.close();
} }
} }
/**
 * Runs only the extraction step on a single document by calling
 * handleExtracting() directly, without going through handleFile().
 *
 * The path is relative, so it resolves against the working directory of
 * the test run.
 * NOTE(review): the method name suggests 52117.doc is an old-format Word
 * file; it is also listed in TestAllFiles.OLD_FILES - confirm.
 *
 * @throws Exception if handleExtracting() fails for the file
 */
@Test
public void testExtractingOld() throws Exception {
File file = new File("test-data/document/52117.doc");
handleExtracting(file);
}
} }

Binary file not shown.