diff --git a/src/documentation/content/xdocs/changes.xml b/src/documentation/content/xdocs/changes.xml
index fc7206925..14dcd3b4f 100644
--- a/src/documentation/content/xdocs/changes.xml
+++ b/src/documentation/content/xdocs/changes.xml
@@ -41,6 +41,7 @@
-->
+ Initial ExtractorFactory support for building TextExtractors for embedded documents
Support stripping XSSF header and footer fields (eg page number) out of header and footer text if required
Add POIXMLPropertiesTextExtractor, which provides to the OOXML file formats a similar function to HPSF's HPSFPropertiesExtractor
45539 - Improve XWPFWordExtractor to extract headers and footers
diff --git a/src/documentation/content/xdocs/status.xml b/src/documentation/content/xdocs/status.xml
index 94d1a4720..e92ca9b70 100644
--- a/src/documentation/content/xdocs/status.xml
+++ b/src/documentation/content/xdocs/status.xml
@@ -38,6 +38,7 @@
-->
+ Initial ExtractorFactory support for building TextExtractors for embedded documents
Support stripping XSSF header and footer fields (eg page number) out of header and footer text if required
Add POIXMLPropertiesTextExtractor, which provides to the OOXML file formats a similar function to HPSF's HPSFPropertiesExtractor
45539 - Improve XWPFWordExtractor to extract headers and footers
diff --git a/src/java/org/apache/poi/POIOLE2TextExtractor.java b/src/java/org/apache/poi/POIOLE2TextExtractor.java
index d46c7e4aa..f198c1933 100644
--- a/src/java/org/apache/poi/POIOLE2TextExtractor.java
+++ b/src/java/org/apache/poi/POIOLE2TextExtractor.java
@@ -19,6 +19,7 @@ package org.apache.poi;
import org.apache.poi.hpsf.DocumentSummaryInformation;
import org.apache.poi.hpsf.SummaryInformation;
import org.apache.poi.hpsf.extractor.HPSFPropertiesExtractor;
+import org.apache.poi.poifs.filesystem.POIFSFileSystem;
/**
* Common Parent for OLE2 based Text Extractors
@@ -59,4 +60,12 @@ public abstract class POIOLE2TextExtractor extends POITextExtractor {
public POITextExtractor getMetadataTextExtractor() {
return new HPSFPropertiesExtractor(this);
}
+
+ /**
+ * Return the underlying POIFS FileSystem of
+ * this document, as held in the {@code filesystem}
+ * field of this extractor's {@code document}.
+ *
+ * @return the document's {@link POIFSFileSystem}.
+ *  NOTE(review): ExtractorFactory.getEmbededDocsTextExtractors
+ *  null-checks this value, so it can presumably be null when
+ *  the document was not built from a POIFS - confirm.
+ */
+ public POIFSFileSystem getFileSystem() {
+ return document.filesystem;
+ }
}
diff --git a/src/java/org/apache/poi/hssf/extractor/ExcelExtractor.java b/src/java/org/apache/poi/hssf/extractor/ExcelExtractor.java
index 0381a0457..b9750fc58 100644
--- a/src/java/org/apache/poi/hssf/extractor/ExcelExtractor.java
+++ b/src/java/org/apache/poi/hssf/extractor/ExcelExtractor.java
@@ -26,6 +26,7 @@ import org.apache.poi.hssf.usermodel.HSSFRichTextString;
import org.apache.poi.hssf.usermodel.HSSFRow;
import org.apache.poi.hssf.usermodel.HSSFSheet;
import org.apache.poi.hssf.usermodel.HSSFWorkbook;
+import org.apache.poi.poifs.filesystem.DirectoryNode;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
/**
@@ -48,7 +49,10 @@ public class ExcelExtractor extends POIOLE2TextExtractor implements org.apache.p
this.wb = wb;
}
public ExcelExtractor(POIFSFileSystem fs) throws IOException {
- this(new HSSFWorkbook(fs));
+ this(fs.getRoot(), fs);
+ }
+ /**
+ * Creates an ExcelExtractor by building an {@link HSSFWorkbook}
+ * from the given directory of the given filesystem. The
+ * filesystem-only constructor delegates here with the root directory.
+ * NOTE(review): the third HSSFWorkbook argument is hard-coded to
+ * {@code true} - presumably "preserve nodes"; confirm against HSSFWorkbook.
+ *
+ * @param dir the directory to build the workbook from
+ * @param fs the filesystem that contains {@code dir}
+ * @throws IOException if the HSSFWorkbook constructor fails to read the data
+ */
+ public ExcelExtractor(DirectoryNode dir, POIFSFileSystem fs) throws IOException {
+ this(new HSSFWorkbook(dir, fs, true));
}
diff --git a/src/ooxml/java/org/apache/poi/extractor/ExtractorFactory.java b/src/ooxml/java/org/apache/poi/extractor/ExtractorFactory.java
index 55a47065b..8e07afd1f 100644
--- a/src/ooxml/java/org/apache/poi/extractor/ExtractorFactory.java
+++ b/src/ooxml/java/org/apache/poi/extractor/ExtractorFactory.java
@@ -18,9 +18,11 @@ package org.apache.poi.extractor;
import java.io.File;
import java.io.FileInputStream;
+import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.io.PushbackInputStream;
+import java.util.ArrayList;
import java.util.Iterator;
import org.apache.poi.POIOLE2TextExtractor;
@@ -31,6 +33,8 @@ import org.apache.poi.hdgf.extractor.VisioTextExtractor;
import org.apache.poi.hslf.extractor.PowerPointExtractor;
import org.apache.poi.hssf.extractor.ExcelExtractor;
import org.apache.poi.hwpf.extractor.WordExtractor;
+import org.apache.poi.poifs.filesystem.DirectoryEntry;
+import org.apache.poi.poifs.filesystem.DirectoryNode;
import org.apache.poi.poifs.filesystem.Entry;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
import org.apache.poi.xslf.XSLFSlideShow;
@@ -105,24 +109,95 @@ public class ExtractorFactory {
}
+ /**
+ * Creates an extractor for the document held in the root
+ * directory of the given filesystem.
+ */
public static POIOLE2TextExtractor createExtractor(POIFSFileSystem fs) throws IOException {
+ return createExtractor(fs.getRoot(), fs);
+ }
+ /**
+ * Creates an extractor for the document held in the given directory.
+ * The directory's entries are scanned in iteration order, and the
+ * first entry named "Workbook", "WordDocument", "PowerPoint Document"
+ * or "VisioDocument" selects an ExcelExtractor, WordExtractor,
+ * PowerPointExtractor or VisioTextExtractor respectively.
+ *
+ * @param poifsDir the directory whose entries are inspected
+ * @param fs the filesystem passed on to the chosen extractor
+ * @throws IllegalArgumentException if no entry has one of those names
+ * @throws IOException if the chosen extractor's constructor fails
+ */
+ public static POIOLE2TextExtractor createExtractor(DirectoryNode poifsDir, POIFSFileSystem fs) throws IOException {
// Look for certain entries in the stream, to figure it
// out from
- for(Iterator entries = fs.getRoot().getEntries(); entries.hasNext(); ) {
+ for(Iterator entries = poifsDir.getEntries(); entries.hasNext(); ) {
Entry entry = (Entry)entries.next();
if(entry.getName().equals("Workbook")) {
- return new ExcelExtractor(fs);
+ return new ExcelExtractor(poifsDir, fs);
}
if(entry.getName().equals("WordDocument")) {
- return new WordExtractor(fs);
+ return new WordExtractor(poifsDir, fs);
}
if(entry.getName().equals("PowerPoint Document")) {
- return new PowerPointExtractor(fs);
+ return new PowerPointExtractor(poifsDir, fs);
}
if(entry.getName().equals("VisioDocument")) {
- return new VisioTextExtractor(fs);
+ return new VisioTextExtractor(poifsDir, fs);
}
}
throw new IllegalArgumentException("No supported documents found in the OLE2 stream");
}
+
+
+ /**
+ * Returns an array of text extractors, one for each of
+ * the embedded documents in the file (if there are any).
+ * If there are no embedded documents, you'll get back an
+ * empty array. Otherwise, you'll get one open
+ * {@link POITextExtractor} for each embedded file.
+ */
+ public static POITextExtractor[] getEmbededDocsTextExtractors(POIOLE2TextExtractor ext) throws IOException {
+ // Find all the embedded directories
+ ArrayList dirs = new ArrayList();
+ POIFSFileSystem fs = ext.getFileSystem();
+ if(fs == null) {
+ throw new IllegalStateException("The extractor didn't know which POIFS it came from!");
+ }
+
+ if(ext instanceof ExcelExtractor) {
+ // These are in MBD... under the root
+ Iterator it = fs.getRoot().getEntries();
+ while(it.hasNext()) {
+ Entry entry = (Entry)it.next();
+ if(entry.getName().startsWith("MBD")) {
+ dirs.add(entry);
+ }
+ }
+ } else if(ext instanceof WordExtractor) {
+ // These are in ObjectPool -> _... under the root
+ try {
+ DirectoryEntry op = (DirectoryEntry)
+ fs.getRoot().getEntry("ObjectPool");
+ Iterator it = op.getEntries();
+ while(it.hasNext()) {
+ Entry entry = (Entry)it.next();
+ if(entry.getName().startsWith("_")) {
+ dirs.add(entry);
+ }
+ }
+ } catch(FileNotFoundException e) {}
+ } else if(ext instanceof PowerPointExtractor) {
+ // Tricky, not stored directly in poifs
+ // TODO
+ }
+
+ // Create the extractors
+ if(dirs == null || dirs.size() == 0) {
+ return new POITextExtractor[0];
+ }
+
+ POITextExtractor[] te = new POITextExtractor[dirs.size()];
+ for(int i=0; i 20);
+ }
+
+ // Word
+ f = new File(poifs_dir, "word_with_embeded.doc");
+ ext = (POIOLE2TextExtractor)
+ ExtractorFactory.createExtractor(f);
+ embeds = ExtractorFactory.getEmbededDocsTextExtractors(ext);
+
+ assertEquals(4, embeds.length);
+ assertTrue(embeds[0] instanceof WordExtractor);
+ assertTrue(embeds[1] instanceof ExcelExtractor);
+ assertTrue(embeds[2] instanceof ExcelExtractor);
+ assertTrue(embeds[3] instanceof PowerPointExtractor);
+ for(int i=0; i 20);
+ }
+
+ // TODO - PowerPoint
+ // TODO - Visio
+ }
}
diff --git a/src/scratchpad/src/org/apache/poi/hdgf/chunks/ChunkFactory.java b/src/scratchpad/src/org/apache/poi/hdgf/chunks/ChunkFactory.java
index bb58933de..34a2f4f89 100644
--- a/src/scratchpad/src/org/apache/poi/hdgf/chunks/ChunkFactory.java
+++ b/src/scratchpad/src/org/apache/poi/hdgf/chunks/ChunkFactory.java
@@ -64,6 +64,10 @@ public class ChunkFactory {
private void processChunkParseCommands() throws IOException {
String line;
InputStream cpd = ChunkFactory.class.getResourceAsStream(chunkTableName);
+ if(cpd == null) {
+ throw new IllegalStateException("Unable to find HDGF chunk definition on the classpath - " + chunkTableName);
+ }
+
BufferedReader inp = new BufferedReader(new InputStreamReader(cpd));
while( (line = inp.readLine()) != null ) {
if(line.startsWith("#")) continue;
diff --git a/src/scratchpad/src/org/apache/poi/hdgf/extractor/VisioTextExtractor.java b/src/scratchpad/src/org/apache/poi/hdgf/extractor/VisioTextExtractor.java
index 9b1307cee..9b861d6d6 100644
--- a/src/scratchpad/src/org/apache/poi/hdgf/extractor/VisioTextExtractor.java
+++ b/src/scratchpad/src/org/apache/poi/hdgf/extractor/VisioTextExtractor.java
@@ -28,6 +28,7 @@ import org.apache.poi.hdgf.chunks.Chunk.Command;
import org.apache.poi.hdgf.streams.ChunkStream;
import org.apache.poi.hdgf.streams.PointerContainingStream;
import org.apache.poi.hdgf.streams.Stream;
+import org.apache.poi.poifs.filesystem.DirectoryNode;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
/**
@@ -44,7 +45,10 @@ public class VisioTextExtractor extends POIOLE2TextExtractor {
this.hdgf = hdgf;
}
public VisioTextExtractor(POIFSFileSystem fs) throws IOException {
- this(new HDGFDiagram(fs));
+ this(fs.getRoot(), fs);
+ }
+ /**
+ * Creates a VisioTextExtractor by building an {@link HDGFDiagram}
+ * from the given directory of the given filesystem, and records
+ * the filesystem in {@code this.fs}. The filesystem-only
+ * constructor delegates here with the root directory.
+ *
+ * @param dir the directory to build the diagram from
+ * @param fs the filesystem that contains {@code dir}
+ * @throws IOException if the HDGFDiagram constructor fails to read the data
+ */
+ public VisioTextExtractor(DirectoryNode dir, POIFSFileSystem fs) throws IOException {
+ this(new HDGFDiagram(dir, fs));
this.fs = fs;
}
public VisioTextExtractor(InputStream inp) throws IOException {
diff --git a/src/scratchpad/src/org/apache/poi/hslf/extractor/PowerPointExtractor.java b/src/scratchpad/src/org/apache/poi/hslf/extractor/PowerPointExtractor.java
index 841bd38f9..cc0205647 100644
--- a/src/scratchpad/src/org/apache/poi/hslf/extractor/PowerPointExtractor.java
+++ b/src/scratchpad/src/org/apache/poi/hslf/extractor/PowerPointExtractor.java
@@ -30,6 +30,7 @@ import org.apache.poi.hslf.model.Notes;
import org.apache.poi.hslf.model.Slide;
import org.apache.poi.hslf.model.TextRun;
import org.apache.poi.hslf.usermodel.SlideShow;
+import org.apache.poi.poifs.filesystem.DirectoryNode;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
/**
@@ -96,6 +97,9 @@ public final class PowerPointExtractor extends POIOLE2TextExtractor {
public PowerPointExtractor(POIFSFileSystem fs) throws IOException {
this(new HSLFSlideShow(fs));
}
+ /**
+ * Creates a PowerPointExtractor by building an {@link HSLFSlideShow}
+ * from the given directory of the given filesystem.
+ * NOTE(review): the POIFSFileSystem-only constructor still builds
+ * {@code new HSLFSlideShow(fs)} itself rather than delegating here
+ * with {@code fs.getRoot()}, unlike ExcelExtractor and
+ * VisioTextExtractor - consider aligning them.
+ *
+ * @param dir the directory to build the slideshow from
+ * @param fs the filesystem that contains {@code dir}
+ * @throws IOException if the HSLFSlideShow constructor fails to read the data
+ */
+ public PowerPointExtractor(DirectoryNode dir, POIFSFileSystem fs) throws IOException {
+ this(new HSLFSlideShow(dir, fs));
+ }
/**
* Creates a PowerPointExtractor, from a HSLFSlideShow
diff --git a/src/scratchpad/src/org/apache/poi/hwpf/extractor/WordExtractor.java b/src/scratchpad/src/org/apache/poi/hwpf/extractor/WordExtractor.java
index deaffc79a..816b2c122 100644
--- a/src/scratchpad/src/org/apache/poi/hwpf/extractor/WordExtractor.java
+++ b/src/scratchpad/src/org/apache/poi/hwpf/extractor/WordExtractor.java
@@ -28,6 +28,7 @@ import org.apache.poi.hwpf.model.TextPiece;
import org.apache.poi.hwpf.usermodel.HeaderStories;
import org.apache.poi.hwpf.usermodel.Paragraph;
import org.apache.poi.hwpf.usermodel.Range;
+import org.apache.poi.poifs.filesystem.DirectoryNode;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
/**
@@ -58,6 +59,10 @@ public class WordExtractor extends POIOLE2TextExtractor {
this(new HWPFDocument(fs));
this.fs = fs;
}
+ /**
+ * Creates a WordExtractor by building an {@link HWPFDocument}
+ * from the given directory of the given filesystem, and records
+ * the filesystem in {@code this.fs}.
+ *
+ * @param dir the directory to build the document from
+ * @param fs the filesystem that contains {@code dir}
+ * @throws IOException if the HWPFDocument constructor fails to read the data
+ */
+ public WordExtractor(DirectoryNode dir, POIFSFileSystem fs) throws IOException {
+ this(new HWPFDocument(dir, fs));
+ this.fs = fs;
+ }
/**
* Create a new Word Extractor
diff --git a/src/testcases/org/apache/poi/hssf/data/WithEmbeded.xlsx b/src/testcases/org/apache/poi/hssf/data/WithEmbeded.xlsx
new file mode 100755
index 000000000..411de4a08
Binary files /dev/null and b/src/testcases/org/apache/poi/hssf/data/WithEmbeded.xlsx differ