Inside ExtractorFactory, support finding embedded OOXML documents and providing extractors for them

git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@1049802 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Nick Burch 2010-12-16 07:39:21 +00:00
parent 326cf56aa7
commit f751e3cdd3
3 changed files with 35 additions and 5 deletions

View File

@ -191,10 +191,11 @@ public class ExtractorFactory {
throw new IllegalArgumentException("No supported documents found in the OOXML package (found "+corePart.getContentType()+")"); throw new IllegalArgumentException("No supported documents found in the OOXML package (found "+corePart.getContentType()+")");
} }
public static POIOLE2TextExtractor createExtractor(POIFSFileSystem fs) throws IOException { public static POIOLE2TextExtractor createExtractor(POIFSFileSystem fs) throws IOException, InvalidFormatException, OpenXML4JException, XmlException {
return createExtractor(fs.getRoot(), fs); // Only ever an OLE2 one from the root of the FS
return (POIOLE2TextExtractor)createExtractor(fs.getRoot(), fs);
} }
public static POIOLE2TextExtractor createExtractor(DirectoryNode poifsDir, POIFSFileSystem fs) throws IOException { public static POITextExtractor createExtractor(DirectoryNode poifsDir, POIFSFileSystem fs) throws IOException, InvalidFormatException, OpenXML4JException, XmlException {
// Look for certain entries in the stream, to figure it // Look for certain entries in the stream, to figure it
// out from // out from
for(Iterator<Entry> entries = poifsDir.getEntries(); entries.hasNext(); ) { for(Iterator<Entry> entries = poifsDir.getEntries(); entries.hasNext(); ) {
@ -234,6 +235,12 @@ public class ExtractorFactory {
) { ) {
return new OutlookTextExtactor(poifsDir, fs); return new OutlookTextExtactor(poifsDir, fs);
} }
if(entry.getName().equals("Package")) {
OPCPackage pkg = OPCPackage.open(
poifsDir.createDocumentInputStream(entry.getName())
);
return createExtractor(pkg);
}
} }
throw new IllegalArgumentException("No supported documents found in the OLE2 stream"); throw new IllegalArgumentException("No supported documents found in the OLE2 stream");
} }
@ -246,7 +253,7 @@ public class ExtractorFactory {
* empty array. Otherwise, you'll get one open * empty array. Otherwise, you'll get one open
* {@link POITextExtractor} for each embeded file. * {@link POITextExtractor} for each embeded file.
*/ */
public static POITextExtractor[] getEmbededDocsTextExtractors(POIOLE2TextExtractor ext) throws IOException { public static POITextExtractor[] getEmbededDocsTextExtractors(POIOLE2TextExtractor ext) throws IOException, InvalidFormatException, OpenXML4JException, XmlException {
// All the embded directories we spotted // All the embded directories we spotted
ArrayList<Entry> dirs = new ArrayList<Entry>(); ArrayList<Entry> dirs = new ArrayList<Entry>();
// For anything else not directly held in as a POIFS directory // For anything else not directly held in as a POIFS directory

View File

@ -60,6 +60,7 @@ public class TestExtractorFactory extends TestCase {
private File docx; private File docx;
private File dotx; private File dotx;
private File docEmb; private File docEmb;
private File docEmbOOXML;
private File ppt; private File ppt;
private File pptx; private File pptx;
@ -88,6 +89,7 @@ public class TestExtractorFactory extends TestCase {
docx = wpTests.getFile("SampleDoc.docx"); docx = wpTests.getFile("SampleDoc.docx");
dotx = wpTests.getFile("test.dotx"); dotx = wpTests.getFile("test.dotx");
docEmb = wpTests.getFile("word_with_embeded.doc"); docEmb = wpTests.getFile("word_with_embeded.doc");
docEmbOOXML = wpTests.getFile("word_with_embeded_ooxml.doc");
POIDataSamples slTests = POIDataSamples.getSlideShowInstance(); POIDataSamples slTests = POIDataSamples.getSlideShowInstance();
ppt = slTests.getFile("SampleShow.ppt"); ppt = slTests.getFile("SampleShow.ppt");
@ -536,7 +538,7 @@ public class TestExtractorFactory extends TestCase {
embeds = ExtractorFactory.getEmbededDocsTextExtractors(ext); embeds = ExtractorFactory.getEmbededDocsTextExtractors(ext);
assertEquals(6, embeds.length); assertEquals(6, embeds.length);
int numWord = 0, numXls = 0, numPpt = 0, numMsg = 0; int numWord = 0, numXls = 0, numPpt = 0, numMsg = 0, numWordX;
for(int i=0; i<embeds.length; i++) { for(int i=0; i<embeds.length; i++) {
assertTrue(embeds[i].getText().length() > 20); assertTrue(embeds[i].getText().length() > 20);
@ -569,6 +571,27 @@ public class TestExtractorFactory extends TestCase {
assertEquals(1, numWord); assertEquals(1, numWord);
assertEquals(0, numMsg); assertEquals(0, numMsg);
// Word which contains an OOXML file
ext = (POIOLE2TextExtractor)
ExtractorFactory.createExtractor(docEmbOOXML);
embeds = ExtractorFactory.getEmbededDocsTextExtractors(ext);
numWord = 0; numXls = 0; numPpt = 0; numMsg = 0; numWordX = 0;
assertEquals(3, embeds.length);
for(int i=0; i<embeds.length; i++) {
assertTrue(embeds[i].getText().length() > 20);
if(embeds[i] instanceof PowerPointExtractor) numPpt++;
else if(embeds[i] instanceof ExcelExtractor) numXls++;
else if(embeds[i] instanceof WordExtractor) numWord++;
else if(embeds[i] instanceof OutlookTextExtactor) numMsg++;
else if(embeds[i] instanceof XWPFWordExtractor) numWordX++;
}
assertEquals(1, numPpt);
assertEquals(1, numXls);
assertEquals(0, numWord);
assertEquals(1, numWordX);
assertEquals(0, numMsg);
// Outlook // Outlook
ext = (OutlookTextExtactor) ext = (OutlookTextExtactor)
ExtractorFactory.createExtractor(msgEmb); ExtractorFactory.createExtractor(msgEmb);

Binary file not shown.