Enable Word6Extractor in ExtractorFactory
git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@959360 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
30848a80aa
commit
6ee6d9095f
@@ -34,6 +34,7 @@
|
|||||||
|
|
||||||
<changes>
|
<changes>
|
||||||
<release version="3.7-beta2" date="2010-??-??">
|
<release version="3.7-beta2" date="2010-??-??">
|
||||||
|
<action dev="POI-DEVELOPERS" type="add">Text Extraction support for older Word 6 and Word 95 files via HWPF</action>
|
||||||
<action dev="POI-DEVELOPERS" type="add">49508 - Allow the addition of paragraphs to XWPF Table Cells</action>
|
<action dev="POI-DEVELOPERS" type="add">49508 - Allow the addition of paragraphs to XWPF Table Cells</action>
|
||||||
<action dev="POI-DEVELOPERS" type="fix">49446 - Don't consider 17.16.23 field codes as properly part of the paragraph's text</action>
|
<action dev="POI-DEVELOPERS" type="fix">49446 - Don't consider 17.16.23 field codes as properly part of the paragraph's text</action>
|
||||||
<action dev="POI-DEVELOPERS" type="fix">XSLFSlideShow shouldn't break on .thmx (theme) files. Support for them is still very limited though</action>
|
<action dev="POI-DEVELOPERS" type="fix">XSLFSlideShow shouldn't break on .thmx (theme) files. Support for them is still very limited though</action>
|
||||||
|
@@ -38,6 +38,8 @@ import org.apache.poi.hsmf.datatypes.AttachmentChunks;
|
|||||||
import org.apache.poi.hsmf.extractor.OutlookTextExtactor;
|
import org.apache.poi.hsmf.extractor.OutlookTextExtactor;
|
||||||
import org.apache.poi.hssf.extractor.EventBasedExcelExtractor;
|
import org.apache.poi.hssf.extractor.EventBasedExcelExtractor;
|
||||||
import org.apache.poi.hssf.extractor.ExcelExtractor;
|
import org.apache.poi.hssf.extractor.ExcelExtractor;
|
||||||
|
import org.apache.poi.hwpf.OldWordFileFormatException;
|
||||||
|
import org.apache.poi.hwpf.extractor.Word6Extractor;
|
||||||
import org.apache.poi.hwpf.extractor.WordExtractor;
|
import org.apache.poi.hwpf.extractor.WordExtractor;
|
||||||
import org.apache.poi.openxml4j.exceptions.InvalidFormatException;
|
import org.apache.poi.openxml4j.exceptions.InvalidFormatException;
|
||||||
import org.apache.poi.openxml4j.exceptions.OpenXML4JException;
|
import org.apache.poi.openxml4j.exceptions.OpenXML4JException;
|
||||||
@@ -218,7 +220,12 @@ public class ExtractorFactory {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
if(entry.getName().equals("WordDocument")) {
|
if(entry.getName().equals("WordDocument")) {
|
||||||
return new WordExtractor(poifsDir, fs);
|
// Old or new style word document?
|
||||||
|
try {
|
||||||
|
return new WordExtractor(poifsDir, fs);
|
||||||
|
} catch(OldWordFileFormatException e) {
|
||||||
|
return new Word6Extractor(poifsDir, fs);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
if(entry.getName().equals("PowerPoint Document")) {
|
if(entry.getName().equals("PowerPoint Document")) {
|
||||||
return new PowerPointExtractor(poifsDir, fs);
|
return new PowerPointExtractor(poifsDir, fs);
|
||||||
@@ -230,12 +237,12 @@ public class ExtractorFactory {
|
|||||||
return new PublisherTextExtractor(poifsDir, fs);
|
return new PublisherTextExtractor(poifsDir, fs);
|
||||||
}
|
}
|
||||||
if(
|
if(
|
||||||
entry.getName().equals("__substg1.0_1000001E") ||
|
entry.getName().equals("__substg1.0_1000001E") ||
|
||||||
entry.getName().equals("__substg1.0_1000001F") ||
|
entry.getName().equals("__substg1.0_1000001F") ||
|
||||||
entry.getName().equals("__substg1.0_0047001E") ||
|
entry.getName().equals("__substg1.0_0047001E") ||
|
||||||
entry.getName().equals("__substg1.0_0047001F") ||
|
entry.getName().equals("__substg1.0_0047001F") ||
|
||||||
entry.getName().equals("__substg1.0_0037001E") ||
|
entry.getName().equals("__substg1.0_0037001E") ||
|
||||||
entry.getName().equals("__substg1.0_0037001F")
|
entry.getName().equals("__substg1.0_0037001F")
|
||||||
) {
|
) {
|
||||||
return new OutlookTextExtactor(poifsDir, fs);
|
return new OutlookTextExtactor(poifsDir, fs);
|
||||||
}
|
}
|
||||||
|
@@ -29,6 +29,7 @@ import org.apache.poi.hslf.extractor.PowerPointExtractor;
|
|||||||
import org.apache.poi.hsmf.extractor.OutlookTextExtactor;
|
import org.apache.poi.hsmf.extractor.OutlookTextExtactor;
|
||||||
import org.apache.poi.hssf.extractor.EventBasedExcelExtractor;
|
import org.apache.poi.hssf.extractor.EventBasedExcelExtractor;
|
||||||
import org.apache.poi.hssf.extractor.ExcelExtractor;
|
import org.apache.poi.hssf.extractor.ExcelExtractor;
|
||||||
|
import org.apache.poi.hwpf.extractor.Word6Extractor;
|
||||||
import org.apache.poi.hwpf.extractor.WordExtractor;
|
import org.apache.poi.hwpf.extractor.WordExtractor;
|
||||||
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
|
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
|
||||||
import org.apache.poi.xslf.extractor.XSLFPowerPointExtractor;
|
import org.apache.poi.xslf.extractor.XSLFPowerPointExtractor;
|
||||||
@@ -54,6 +55,8 @@ public class TestExtractorFactory extends TestCase {
|
|||||||
private File xlsEmb;
|
private File xlsEmb;
|
||||||
|
|
||||||
private File doc;
|
private File doc;
|
||||||
|
private File doc6;
|
||||||
|
private File doc95;
|
||||||
private File docx;
|
private File docx;
|
||||||
private File dotx;
|
private File dotx;
|
||||||
private File docEmb;
|
private File docEmb;
|
||||||
@@ -79,6 +82,8 @@ public class TestExtractorFactory extends TestCase {
|
|||||||
|
|
||||||
POIDataSamples wpTests = POIDataSamples.getDocumentInstance();
|
POIDataSamples wpTests = POIDataSamples.getDocumentInstance();
|
||||||
doc = wpTests.getFile("SampleDoc.doc");
|
doc = wpTests.getFile("SampleDoc.doc");
|
||||||
|
doc6 = wpTests.getFile("Word6.doc");
|
||||||
|
doc95 = wpTests.getFile("Word95.doc");
|
||||||
docx = wpTests.getFile("SampleDoc.docx");
|
docx = wpTests.getFile("SampleDoc.docx");
|
||||||
dotx = wpTests.getFile("test.dotx");
|
dotx = wpTests.getFile("test.dotx");
|
||||||
docEmb = wpTests.getFile("word_with_embeded.doc");
|
docEmb = wpTests.getFile("word_with_embeded.doc");
|
||||||
@@ -135,6 +140,23 @@ public class TestExtractorFactory extends TestCase {
|
|||||||
ExtractorFactory.createExtractor(doc).getText().length() > 120
|
ExtractorFactory.createExtractor(doc).getText().length() > 120
|
||||||
);
|
);
|
||||||
|
|
||||||
|
assertTrue(
|
||||||
|
ExtractorFactory.createExtractor(doc6)
|
||||||
|
instanceof Word6Extractor
|
||||||
|
);
|
||||||
|
assertTrue(
|
||||||
|
ExtractorFactory.createExtractor(doc6).getText().length() > 20
|
||||||
|
);
|
||||||
|
|
||||||
|
assertTrue(
|
||||||
|
ExtractorFactory.createExtractor(doc95)
|
||||||
|
instanceof Word6Extractor
|
||||||
|
);
|
||||||
|
assertTrue(
|
||||||
|
ExtractorFactory.createExtractor(doc95).getText().length() > 120
|
||||||
|
);
|
||||||
|
|
||||||
|
|
||||||
assertTrue(
|
assertTrue(
|
||||||
ExtractorFactory.createExtractor(docx)
|
ExtractorFactory.createExtractor(docx)
|
||||||
instanceof XWPFWordExtractor
|
instanceof XWPFWordExtractor
|
||||||
@@ -231,6 +253,22 @@ public class TestExtractorFactory extends TestCase {
|
|||||||
ExtractorFactory.createExtractor(new FileInputStream(doc)).getText().length() > 120
|
ExtractorFactory.createExtractor(new FileInputStream(doc)).getText().length() > 120
|
||||||
);
|
);
|
||||||
|
|
||||||
|
assertTrue(
|
||||||
|
ExtractorFactory.createExtractor(new FileInputStream(doc6))
|
||||||
|
instanceof Word6Extractor
|
||||||
|
);
|
||||||
|
assertTrue(
|
||||||
|
ExtractorFactory.createExtractor(new FileInputStream(doc6)).getText().length() > 20
|
||||||
|
);
|
||||||
|
|
||||||
|
assertTrue(
|
||||||
|
ExtractorFactory.createExtractor(new FileInputStream(doc95))
|
||||||
|
instanceof Word6Extractor
|
||||||
|
);
|
||||||
|
assertTrue(
|
||||||
|
ExtractorFactory.createExtractor(new FileInputStream(doc95)).getText().length() > 120
|
||||||
|
);
|
||||||
|
|
||||||
assertTrue(
|
assertTrue(
|
||||||
ExtractorFactory.createExtractor(new FileInputStream(docx))
|
ExtractorFactory.createExtractor(new FileInputStream(docx))
|
||||||
instanceof XWPFWordExtractor
|
instanceof XWPFWordExtractor
|
||||||
@@ -311,6 +349,22 @@ public class TestExtractorFactory extends TestCase {
|
|||||||
ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(doc))).getText().length() > 120
|
ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(doc))).getText().length() > 120
|
||||||
);
|
);
|
||||||
|
|
||||||
|
assertTrue(
|
||||||
|
ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(doc6)))
|
||||||
|
instanceof Word6Extractor
|
||||||
|
);
|
||||||
|
assertTrue(
|
||||||
|
ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(doc6))).getText().length() > 20
|
||||||
|
);
|
||||||
|
|
||||||
|
assertTrue(
|
||||||
|
ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(doc95)))
|
||||||
|
instanceof Word6Extractor
|
||||||
|
);
|
||||||
|
assertTrue(
|
||||||
|
ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(doc95))).getText().length() > 120
|
||||||
|
);
|
||||||
|
|
||||||
// PowerPoint
|
// PowerPoint
|
||||||
assertTrue(
|
assertTrue(
|
||||||
ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(ppt)))
|
ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(ppt)))
|
||||||
|
@@ -169,7 +169,7 @@ public final class HWPFDocument extends HWPFDocumentCore
|
|||||||
|
|
||||||
// Is this document too old for us?
|
// Is this document too old for us?
|
||||||
if(_fib.getNFib() < 106) {
|
if(_fib.getNFib() < 106) {
|
||||||
throw new OldWordFileFormatException("The document is too old (Word 95 or older) ");
|
throw new OldWordFileFormatException("The document is too old - Word 95 or older. Try HWPFOldDocument instead?");
|
||||||
}
|
}
|
||||||
|
|
||||||
// use the fib to determine the name of the table stream.
|
// use the fib to determine the name of the table stream.
|
||||||
|
Loading…
Reference in New Issue
Block a user