diff --git a/build.xml b/build.xml
index b4e10a008..4201e8155 100644
--- a/build.xml
+++ b/build.xml
@@ -521,6 +521,8 @@ under the License.
file="${main.src.test}/org/apache/poi/hwpf/data"/>
+
@@ -556,6 +558,8 @@ under the License.
file="${main.src.test}/org/apache/poi/hpsf/data"/>
+
@@ -585,6 +589,7 @@ under the License.
+
@@ -601,6 +606,7 @@ under the License.
+
@@ -639,6 +645,7 @@ under the License.
+
@@ -673,6 +680,7 @@ under the License.
+
diff --git a/src/documentation/content/xdocs/changes.xml b/src/documentation/content/xdocs/changes.xml
index c3b1f72e8..2bfb46d0e 100644
--- a/src/documentation/content/xdocs/changes.xml
+++ b/src/documentation/content/xdocs/changes.xml
@@ -37,6 +37,7 @@
+ Improve how POIFS works with directory entries, and update HWPFDocument to support reading an embedded word document
Initial support for getting and changing chart and series titles
Implement a proxy HSSFListener which tracks the format records, and lets you lookup the format string for a given cell. Convert the xls to csv example to use it
44792 - fixed encode/decode problems in ExternalNameRecord and CRNRecord.
diff --git a/src/documentation/content/xdocs/status.xml b/src/documentation/content/xdocs/status.xml
index 2ceb4ea9e..9d50f53ed 100644
--- a/src/documentation/content/xdocs/status.xml
+++ b/src/documentation/content/xdocs/status.xml
@@ -34,6 +34,7 @@
+ Improve how POIFS works with directory entries, and update HWPFDocument to support reading an embedded word document
Initial support for getting and changing chart and series titles
Implement a proxy HSSFListener which tracks the format records, and lets you lookup the format string for a given cell. Convert the xls to csv example to use it
44792 - fixed encode/decode problems in ExternalNameRecord and CRNRecord.
diff --git a/src/java/org/apache/poi/POIDocument.java b/src/java/org/apache/poi/POIDocument.java
index 075fa4538..01e50231c 100644
--- a/src/java/org/apache/poi/POIDocument.java
+++ b/src/java/org/apache/poi/POIDocument.java
@@ -29,6 +29,7 @@ import org.apache.poi.hpsf.PropertySet;
import org.apache.poi.hpsf.PropertySetFactory;
import org.apache.poi.hpsf.SummaryInformation;
import org.apache.poi.poifs.filesystem.DirectoryEntry;
+import org.apache.poi.poifs.filesystem.DirectoryNode;
import org.apache.poi.poifs.filesystem.DocumentEntry;
import org.apache.poi.poifs.filesystem.DocumentInputStream;
import org.apache.poi.poifs.filesystem.Entry;
@@ -50,12 +51,23 @@ public abstract class POIDocument {
protected DocumentSummaryInformation dsInf;
/** The open POIFS FileSystem that contains our document */
protected POIFSFileSystem filesystem;
+ /** The directory that our document lives in */
+ protected DirectoryNode directory;
/** For our own logging use */
protected POILogger logger = POILogFactory.getLogger(this.getClass());
/* Have the property streams been read yet? (Only done on-demand) */
protected boolean initialized = false;
+
+
+ protected POIDocument(DirectoryNode dir, POIFSFileSystem fs) {
+ this.filesystem = fs;
+ this.directory = dir;
+ }
+ protected POIDocument(POIFSFileSystem fs) {
+ this(fs.getRoot(), fs);
+ }
/**
* Fetch the Document Summary Information of the document
@@ -110,7 +122,7 @@ public abstract class POIDocument {
DocumentInputStream dis;
try {
// Find the entry, and get an input stream for it
- dis = filesystem.createDocumentInputStream(setName);
+ dis = directory.createDocumentInputStream(setName);
} catch(IOException ie) {
// Oh well, doesn't exist
logger.log(POILogger.WARN, "Error getting property set with name " + setName + "\n" + ie);
diff --git a/src/java/org/apache/poi/hssf/usermodel/HSSFWorkbook.java b/src/java/org/apache/poi/hssf/usermodel/HSSFWorkbook.java
index 3838e634d..1eb4f7c8c 100644
--- a/src/java/org/apache/poi/hssf/usermodel/HSSFWorkbook.java
+++ b/src/java/org/apache/poi/hssf/usermodel/HSSFWorkbook.java
@@ -139,6 +139,7 @@ public class HSSFWorkbook extends POIDocument
protected HSSFWorkbook( Workbook book )
{
+ super(null, null);
workbook = book;
sheets = new ArrayList( INITIAL_CAPACITY );
names = new ArrayList( INITIAL_CAPACITY );
@@ -164,8 +165,8 @@ public class HSSFWorkbook extends POIDocument
public HSSFWorkbook(POIFSFileSystem fs, boolean preserveNodes)
throws IOException
{
+ super(fs);
this.preserveNodes = preserveNodes;
- this.filesystem = fs;
// If we're not preserving nodes, don't track the
// POIFS any more
diff --git a/src/java/org/apache/poi/poifs/filesystem/DirectoryNode.java b/src/java/org/apache/poi/poifs/filesystem/DirectoryNode.java
index cb8039033..6805e5197 100644
--- a/src/java/org/apache/poi/poifs/filesystem/DirectoryNode.java
+++ b/src/java/org/apache/poi/poifs/filesystem/DirectoryNode.java
@@ -105,6 +105,31 @@ public class DirectoryNode
{
return _path;
}
+
+ /**
+ * Opens the named document from this directory's list of entries.
+ *
+ * @param documentName the name of the document to be opened
+ *
+ * @return a newly opened DocumentInputStream
+ *
+ * @exception IOException if the document does not exist or the
+ * named entry is not a DocumentEntry
+ */
+
+ public DocumentInputStream createDocumentInputStream(
+ final String documentName)
+ throws IOException
+ {
+ Entry document = getEntry(documentName);
+
+ if (!document.isDocumentEntry())
+ {
+ throw new IOException("Entry '" + documentName
+ + "' is not a DocumentEntry");
+ }
+ return new DocumentInputStream(( DocumentEntry ) document);
+ }
/**
* create a new DocumentEntry
diff --git a/src/java/org/apache/poi/poifs/filesystem/POIFSFileSystem.java b/src/java/org/apache/poi/poifs/filesystem/POIFSFileSystem.java
index 61774dc67..7c693a5de 100644
--- a/src/java/org/apache/poi/poifs/filesystem/POIFSFileSystem.java
+++ b/src/java/org/apache/poi/poifs/filesystem/POIFSFileSystem.java
@@ -287,7 +287,7 @@ public class POIFSFileSystem
{
return getRoot().createDirectory(name);
}
-
+
/**
* Write the filesystem out
*
@@ -422,7 +422,7 @@ public class POIFSFileSystem
* @return the root entry
*/
- public DirectoryEntry getRoot()
+ public DirectoryNode getRoot()
{
if (_root == null)
{
@@ -446,14 +446,7 @@ public class POIFSFileSystem
final String documentName)
throws IOException
{
- Entry document = getRoot().getEntry(documentName);
-
- if (!document.isDocumentEntry())
- {
- throw new IOException("Entry '" + documentName
- + "' is not a DocumentEntry");
- }
- return new DocumentInputStream(( DocumentEntry ) document);
+ return getRoot().createDocumentInputStream(documentName);
}
/**
diff --git a/src/scratchpad/src/org/apache/poi/hdgf/HDGFDiagram.java b/src/scratchpad/src/org/apache/poi/hdgf/HDGFDiagram.java
index 955cbc5ab..af6616307 100644
--- a/src/scratchpad/src/org/apache/poi/hdgf/HDGFDiagram.java
+++ b/src/scratchpad/src/org/apache/poi/hdgf/HDGFDiagram.java
@@ -53,7 +53,7 @@ public class HDGFDiagram extends POIDocument {
private PointerFactory ptrFactory;
public HDGFDiagram(POIFSFileSystem fs) throws IOException {
- filesystem = fs;
+ super(fs);
DocumentEntry docProps =
(DocumentEntry)filesystem.getRoot().getEntry("VisioDocument");
diff --git a/src/scratchpad/src/org/apache/poi/hslf/HSLFSlideShow.java b/src/scratchpad/src/org/apache/poi/hslf/HSLFSlideShow.java
index 12afcc49f..2c523c70a 100644
--- a/src/scratchpad/src/org/apache/poi/hslf/HSLFSlideShow.java
+++ b/src/scratchpad/src/org/apache/poi/hslf/HSLFSlideShow.java
@@ -124,7 +124,7 @@ public class HSLFSlideShow extends POIDocument
*/
public HSLFSlideShow(POIFSFileSystem filesystem) throws IOException
{
- this.filesystem = filesystem;
+ super(filesystem);
// First up, grab the "Current User" stream
// We need this before we can detect Encrypted Documents
diff --git a/src/scratchpad/src/org/apache/poi/hwpf/HWPFDocument.java b/src/scratchpad/src/org/apache/poi/hwpf/HWPFDocument.java
index 557060aa5..a54e50de4 100644
--- a/src/scratchpad/src/org/apache/poi/hwpf/HWPFDocument.java
+++ b/src/scratchpad/src/org/apache/poi/hwpf/HWPFDocument.java
@@ -29,6 +29,7 @@ import java.io.ByteArrayInputStream;
import java.util.Iterator;
import org.apache.poi.POIDocument;
+import org.apache.poi.poifs.filesystem.DirectoryNode;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
import org.apache.poi.poifs.filesystem.DocumentEntry;
import org.apache.poi.poifs.common.POIFSConstants;
@@ -95,7 +96,7 @@ public class HWPFDocument extends POIDocument
protected HWPFDocument()
{
-
+ super(null, null);
}
/**
@@ -132,7 +133,7 @@ public class HWPFDocument extends POIDocument
//do Ole stuff
this( verifyAndBuildPOIFS(istream) );
}
-
+
/**
* This constructor loads a Word document from a POIFSFileSystem
*
@@ -141,16 +142,31 @@ public class HWPFDocument extends POIDocument
* in POIFSFileSystem.
*/
public HWPFDocument(POIFSFileSystem pfilesystem) throws IOException
+ {
+ this(pfilesystem.getRoot(), pfilesystem);
+ }
+
+ /**
+ * This constructor loads a Word document from a specific directory
+ * in a POIFSFileSystem, which need not be the root directory.
+ * Typically used to open embedded documents.
+ *
+ * @param directory The directory in the filesystem that holds the Word document.
+ * @param pfilesystem The POIFSFileSystem that contains the Word document.
+ * @throws IOException If there is an unexpected IOException from the passed in POIFSFileSystem.
+ */
+ public HWPFDocument(DirectoryNode directory, POIFSFileSystem pfilesystem) throws IOException
{
// Sort out the hpsf properties
- filesystem = pfilesystem;
+ super(directory, pfilesystem);
readProperties();
// read in the main stream.
- DocumentEntry documentProps =
- (DocumentEntry)filesystem.getRoot().getEntry("WordDocument");
+ DocumentEntry documentProps = (DocumentEntry)
+ directory.getEntry("WordDocument");
_mainStream = new byte[documentProps.getSize()];
- filesystem.createDocumentInputStream("WordDocument").read(_mainStream);
+
+ directory.createDocumentInputStream("WordDocument").read(_mainStream);
// use the fib to determine the name of the table stream.
_fib = new FileInformationBlock(_mainStream);
@@ -165,14 +181,14 @@ public class HWPFDocument extends POIDocument
DocumentEntry tableProps;
try {
tableProps =
- (DocumentEntry)filesystem.getRoot().getEntry(name);
+ (DocumentEntry)directory.getEntry(name);
} catch(FileNotFoundException fnfe) {
throw new IllegalStateException("Table Stream '" + name + "' wasn't found - Either the document is corrupt, or is Word95 (or earlier)");
}
// read in the table stream.
_tableStream = new byte[tableProps.getSize()];
- filesystem.createDocumentInputStream(name).read(_tableStream);
+ directory.createDocumentInputStream(name).read(_tableStream);
_fib.fillVariableFields(_mainStream, _tableStream);
@@ -180,7 +196,8 @@ public class HWPFDocument extends POIDocument
try
{
DocumentEntry dataProps =
- (DocumentEntry) filesystem.getRoot().getEntry("Data");
+ (DocumentEntry)directory.getEntry("Data");
_dataStream = new byte[dataProps.getSize()];
- filesystem.createDocumentInputStream("Data").read(_dataStream);
+ // Read from the same directory the entry was looked up in, not the filesystem root
+ directory.createDocumentInputStream("Data").read(_dataStream);
}
diff --git a/src/scratchpad/testcases/org/apache/poi/hwpf/extractor/TestWordExtractor.java b/src/scratchpad/testcases/org/apache/poi/hwpf/extractor/TestWordExtractor.java
index cda33675f..c78ccfa32 100644
--- a/src/scratchpad/testcases/org/apache/poi/hwpf/extractor/TestWordExtractor.java
+++ b/src/scratchpad/testcases/org/apache/poi/hwpf/extractor/TestWordExtractor.java
@@ -23,6 +23,8 @@ import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.model.TextPiece;
import org.apache.poi.hwpf.usermodel.Paragraph;
import org.apache.poi.hwpf.usermodel.Range;
+import org.apache.poi.poifs.filesystem.DirectoryNode;
+import org.apache.poi.poifs.filesystem.POIFSFileSystem;
import junit.framework.TestCase;
@@ -54,12 +56,16 @@ public class TestWordExtractor extends TestCase {
private WordExtractor extractor;
// Corrupted document - can't do paragraph based stuff
private WordExtractor extractor2;
+ // A word doc embedded in an excel file
+ private String filename3;
protected void setUp() throws Exception {
String dirname = System.getProperty("HWPF.testdata.path");
+ String pdirname = System.getProperty("POIFS.testdata.path");
String filename = dirname + "/test2.doc";
String filename2 = dirname + "/test.doc";
+ filename3 = pdirname + "/excel_with_embeded.xls";
extractor = new WordExtractor(new FileInputStream(filename));
extractor2 = new WordExtractor(new FileInputStream(filename2));
@@ -101,4 +107,25 @@ public class TestWordExtractor extends TestCase {
String text = extractor.getTextFromPieces();
assertEquals(p_text1_block, text);
}
+
+
+ /**
+ * Test that we can get data from an
+ * embedded word document
+ * @throws Exception
+ */
+ public void testExtractFromEmbeded() throws Exception {
+ POIFSFileSystem fs = new POIFSFileSystem(new FileInputStream(filename3));
+ DirectoryNode dir = (DirectoryNode)
+ fs.getRoot().getEntry("MBD03F25D8D");
+ // Should have WordDocument and 1Table
+ assertNotNull(dir.getEntry("1Table"));
+ assertNotNull(dir.getEntry("WordDocument"));
+
+ HWPFDocument doc = new HWPFDocument(dir, fs);
+ WordExtractor extractor3 = new WordExtractor(doc);
+
+ assertNotNull(extractor3.getText());
+ assertTrue(extractor3.getText().length() > 20);
+ }
}