Inside ExtractorFactory, support finding embedded OOXML documents and providing extractors for them
git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@1049802 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
326cf56aa7
commit
f751e3cdd3
@ -191,10 +191,11 @@ public class ExtractorFactory {
|
|||||||
throw new IllegalArgumentException("No supported documents found in the OOXML package (found "+corePart.getContentType()+")");
|
throw new IllegalArgumentException("No supported documents found in the OOXML package (found "+corePart.getContentType()+")");
|
||||||
}
|
}
|
||||||
|
|
||||||
public static POIOLE2TextExtractor createExtractor(POIFSFileSystem fs) throws IOException {
|
public static POIOLE2TextExtractor createExtractor(POIFSFileSystem fs) throws IOException, InvalidFormatException, OpenXML4JException, XmlException {
|
||||||
return createExtractor(fs.getRoot(), fs);
|
// Only ever an OLE2 one from the root of the FS
|
||||||
|
return (POIOLE2TextExtractor)createExtractor(fs.getRoot(), fs);
|
||||||
}
|
}
|
||||||
public static POIOLE2TextExtractor createExtractor(DirectoryNode poifsDir, POIFSFileSystem fs) throws IOException {
|
public static POITextExtractor createExtractor(DirectoryNode poifsDir, POIFSFileSystem fs) throws IOException, InvalidFormatException, OpenXML4JException, XmlException {
|
||||||
// Look for certain entries in the stream, to figure it
|
// Look for certain entries in the stream, to figure it
|
||||||
// out from
|
// out from
|
||||||
for(Iterator<Entry> entries = poifsDir.getEntries(); entries.hasNext(); ) {
|
for(Iterator<Entry> entries = poifsDir.getEntries(); entries.hasNext(); ) {
|
||||||
@ -234,6 +235,12 @@ public class ExtractorFactory {
|
|||||||
) {
|
) {
|
||||||
return new OutlookTextExtactor(poifsDir, fs);
|
return new OutlookTextExtactor(poifsDir, fs);
|
||||||
}
|
}
|
||||||
|
if(entry.getName().equals("Package")) {
|
||||||
|
OPCPackage pkg = OPCPackage.open(
|
||||||
|
poifsDir.createDocumentInputStream(entry.getName())
|
||||||
|
);
|
||||||
|
return createExtractor(pkg);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
throw new IllegalArgumentException("No supported documents found in the OLE2 stream");
|
throw new IllegalArgumentException("No supported documents found in the OLE2 stream");
|
||||||
}
|
}
|
||||||
@ -246,7 +253,7 @@ public class ExtractorFactory {
|
|||||||
* empty array. Otherwise, you'll get one open
|
* empty array. Otherwise, you'll get one open
|
||||||
* {@link POITextExtractor} for each embedded file.
|
* {@link POITextExtractor} for each embedded file.
|
||||||
*/
|
*/
|
||||||
public static POITextExtractor[] getEmbededDocsTextExtractors(POIOLE2TextExtractor ext) throws IOException {
|
public static POITextExtractor[] getEmbededDocsTextExtractors(POIOLE2TextExtractor ext) throws IOException, InvalidFormatException, OpenXML4JException, XmlException {
|
||||||
// All the embedded directories we spotted
|
// All the embedded directories we spotted
|
||||||
ArrayList<Entry> dirs = new ArrayList<Entry>();
|
ArrayList<Entry> dirs = new ArrayList<Entry>();
|
||||||
// For anything else not directly held in as a POIFS directory
|
// For anything else not directly held in as a POIFS directory
|
||||||
|
@ -60,6 +60,7 @@ public class TestExtractorFactory extends TestCase {
|
|||||||
private File docx;
|
private File docx;
|
||||||
private File dotx;
|
private File dotx;
|
||||||
private File docEmb;
|
private File docEmb;
|
||||||
|
private File docEmbOOXML;
|
||||||
|
|
||||||
private File ppt;
|
private File ppt;
|
||||||
private File pptx;
|
private File pptx;
|
||||||
@ -88,6 +89,7 @@ public class TestExtractorFactory extends TestCase {
|
|||||||
docx = wpTests.getFile("SampleDoc.docx");
|
docx = wpTests.getFile("SampleDoc.docx");
|
||||||
dotx = wpTests.getFile("test.dotx");
|
dotx = wpTests.getFile("test.dotx");
|
||||||
docEmb = wpTests.getFile("word_with_embeded.doc");
|
docEmb = wpTests.getFile("word_with_embeded.doc");
|
||||||
|
docEmbOOXML = wpTests.getFile("word_with_embeded_ooxml.doc");
|
||||||
|
|
||||||
POIDataSamples slTests = POIDataSamples.getSlideShowInstance();
|
POIDataSamples slTests = POIDataSamples.getSlideShowInstance();
|
||||||
ppt = slTests.getFile("SampleShow.ppt");
|
ppt = slTests.getFile("SampleShow.ppt");
|
||||||
@ -536,7 +538,7 @@ public class TestExtractorFactory extends TestCase {
|
|||||||
embeds = ExtractorFactory.getEmbededDocsTextExtractors(ext);
|
embeds = ExtractorFactory.getEmbededDocsTextExtractors(ext);
|
||||||
|
|
||||||
assertEquals(6, embeds.length);
|
assertEquals(6, embeds.length);
|
||||||
int numWord = 0, numXls = 0, numPpt = 0, numMsg = 0;
|
int numWord = 0, numXls = 0, numPpt = 0, numMsg = 0, numWordX;
|
||||||
for(int i=0; i<embeds.length; i++) {
|
for(int i=0; i<embeds.length; i++) {
|
||||||
assertTrue(embeds[i].getText().length() > 20);
|
assertTrue(embeds[i].getText().length() > 20);
|
||||||
|
|
||||||
@ -569,6 +571,27 @@ public class TestExtractorFactory extends TestCase {
|
|||||||
assertEquals(1, numWord);
|
assertEquals(1, numWord);
|
||||||
assertEquals(0, numMsg);
|
assertEquals(0, numMsg);
|
||||||
|
|
||||||
|
// Word which contains an OOXML file
|
||||||
|
ext = (POIOLE2TextExtractor)
|
||||||
|
ExtractorFactory.createExtractor(docEmbOOXML);
|
||||||
|
embeds = ExtractorFactory.getEmbededDocsTextExtractors(ext);
|
||||||
|
|
||||||
|
numWord = 0; numXls = 0; numPpt = 0; numMsg = 0; numWordX = 0;
|
||||||
|
assertEquals(3, embeds.length);
|
||||||
|
for(int i=0; i<embeds.length; i++) {
|
||||||
|
assertTrue(embeds[i].getText().length() > 20);
|
||||||
|
if(embeds[i] instanceof PowerPointExtractor) numPpt++;
|
||||||
|
else if(embeds[i] instanceof ExcelExtractor) numXls++;
|
||||||
|
else if(embeds[i] instanceof WordExtractor) numWord++;
|
||||||
|
else if(embeds[i] instanceof OutlookTextExtactor) numMsg++;
|
||||||
|
else if(embeds[i] instanceof XWPFWordExtractor) numWordX++;
|
||||||
|
}
|
||||||
|
assertEquals(1, numPpt);
|
||||||
|
assertEquals(1, numXls);
|
||||||
|
assertEquals(0, numWord);
|
||||||
|
assertEquals(1, numWordX);
|
||||||
|
assertEquals(0, numMsg);
|
||||||
|
|
||||||
// Outlook
|
// Outlook
|
||||||
ext = (OutlookTextExtactor)
|
ext = (OutlookTextExtactor)
|
||||||
ExtractorFactory.createExtractor(msgEmb);
|
ExtractorFactory.createExtractor(msgEmb);
|
||||||
|
BIN
test-data/document/word_with_embeded_ooxml.doc
Normal file
BIN
test-data/document/word_with_embeded_ooxml.doc
Normal file
Binary file not shown.
Loading…
Reference in New Issue
Block a user