Inside ExtractorFactory, support finding embedded OOXML documents and providing extractors for them
git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@1049802 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
326cf56aa7
commit
f751e3cdd3
@ -191,10 +191,11 @@ public class ExtractorFactory {
|
||||
throw new IllegalArgumentException("No supported documents found in the OOXML package (found "+corePart.getContentType()+")");
|
||||
}
|
||||
|
||||
public static POIOLE2TextExtractor createExtractor(POIFSFileSystem fs) throws IOException {
|
||||
return createExtractor(fs.getRoot(), fs);
|
||||
public static POIOLE2TextExtractor createExtractor(POIFSFileSystem fs) throws IOException, InvalidFormatException, OpenXML4JException, XmlException {
|
||||
// Only ever an OLE2 one from the root of the FS
|
||||
return (POIOLE2TextExtractor)createExtractor(fs.getRoot(), fs);
|
||||
}
|
||||
public static POIOLE2TextExtractor createExtractor(DirectoryNode poifsDir, POIFSFileSystem fs) throws IOException {
|
||||
public static POITextExtractor createExtractor(DirectoryNode poifsDir, POIFSFileSystem fs) throws IOException, InvalidFormatException, OpenXML4JException, XmlException {
|
||||
// Look for certain entries in the stream, to figure it
|
||||
// out from
|
||||
for(Iterator<Entry> entries = poifsDir.getEntries(); entries.hasNext(); ) {
|
||||
@ -234,6 +235,12 @@ public class ExtractorFactory {
|
||||
) {
|
||||
return new OutlookTextExtactor(poifsDir, fs);
|
||||
}
|
||||
if(entry.getName().equals("Package")) {
|
||||
OPCPackage pkg = OPCPackage.open(
|
||||
poifsDir.createDocumentInputStream(entry.getName())
|
||||
);
|
||||
return createExtractor(pkg);
|
||||
}
|
||||
}
|
||||
throw new IllegalArgumentException("No supported documents found in the OLE2 stream");
|
||||
}
|
||||
@ -246,7 +253,7 @@ public class ExtractorFactory {
|
||||
* empty array. Otherwise, you'll get one open
|
||||
* {@link POITextExtractor} for each embedded file.
|
||||
*/
|
||||
public static POITextExtractor[] getEmbededDocsTextExtractors(POIOLE2TextExtractor ext) throws IOException {
|
||||
public static POITextExtractor[] getEmbededDocsTextExtractors(POIOLE2TextExtractor ext) throws IOException, InvalidFormatException, OpenXML4JException, XmlException {
|
||||
// All the embedded directories we spotted
|
||||
ArrayList<Entry> dirs = new ArrayList<Entry>();
|
||||
// For anything else not directly held in as a POIFS directory
|
||||
|
@ -60,6 +60,7 @@ public class TestExtractorFactory extends TestCase {
|
||||
private File docx;
|
||||
private File dotx;
|
||||
private File docEmb;
|
||||
private File docEmbOOXML;
|
||||
|
||||
private File ppt;
|
||||
private File pptx;
|
||||
@ -88,6 +89,7 @@ public class TestExtractorFactory extends TestCase {
|
||||
docx = wpTests.getFile("SampleDoc.docx");
|
||||
dotx = wpTests.getFile("test.dotx");
|
||||
docEmb = wpTests.getFile("word_with_embeded.doc");
|
||||
docEmbOOXML = wpTests.getFile("word_with_embeded_ooxml.doc");
|
||||
|
||||
POIDataSamples slTests = POIDataSamples.getSlideShowInstance();
|
||||
ppt = slTests.getFile("SampleShow.ppt");
|
||||
@ -536,7 +538,7 @@ public class TestExtractorFactory extends TestCase {
|
||||
embeds = ExtractorFactory.getEmbededDocsTextExtractors(ext);
|
||||
|
||||
assertEquals(6, embeds.length);
|
||||
int numWord = 0, numXls = 0, numPpt = 0, numMsg = 0;
|
||||
int numWord = 0, numXls = 0, numPpt = 0, numMsg = 0, numWordX;
|
||||
for(int i=0; i<embeds.length; i++) {
|
||||
assertTrue(embeds[i].getText().length() > 20);
|
||||
|
||||
@ -569,6 +571,27 @@ public class TestExtractorFactory extends TestCase {
|
||||
assertEquals(1, numWord);
|
||||
assertEquals(0, numMsg);
|
||||
|
||||
// Word which contains an OOXML file
|
||||
ext = (POIOLE2TextExtractor)
|
||||
ExtractorFactory.createExtractor(docEmbOOXML);
|
||||
embeds = ExtractorFactory.getEmbededDocsTextExtractors(ext);
|
||||
|
||||
numWord = 0; numXls = 0; numPpt = 0; numMsg = 0; numWordX = 0;
|
||||
assertEquals(3, embeds.length);
|
||||
for(int i=0; i<embeds.length; i++) {
|
||||
assertTrue(embeds[i].getText().length() > 20);
|
||||
if(embeds[i] instanceof PowerPointExtractor) numPpt++;
|
||||
else if(embeds[i] instanceof ExcelExtractor) numXls++;
|
||||
else if(embeds[i] instanceof WordExtractor) numWord++;
|
||||
else if(embeds[i] instanceof OutlookTextExtactor) numMsg++;
|
||||
else if(embeds[i] instanceof XWPFWordExtractor) numWordX++;
|
||||
}
|
||||
assertEquals(1, numPpt);
|
||||
assertEquals(1, numXls);
|
||||
assertEquals(0, numWord);
|
||||
assertEquals(1, numWordX);
|
||||
assertEquals(0, numMsg);
|
||||
|
||||
// Outlook
|
||||
ext = (OutlookTextExtactor)
|
||||
ExtractorFactory.createExtractor(msgEmb);
|
||||
|
BIN
test-data/document/word_with_embeded_ooxml.doc
Normal file
BIN
test-data/document/word_with_embeded_ooxml.doc
Normal file
Binary file not shown.
Loading…
Reference in New Issue
Block a user