Improve how POIFS works with directory entries, and update HWPFDocument to support reading an embedded word document

git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@646870 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Nick Burch 2008-04-10 16:59:10 +00:00
parent f8c6a52a2c
commit beed69a11e
11 changed files with 107 additions and 23 deletions

View File

@ -521,6 +521,8 @@ under the License.
file="${main.src.test}/org/apache/poi/hwpf/data"/>
<sysproperty key="HPSF.testdata.path"
file="${main.src.test}/org/apache/poi/hpsf/data"/>
<sysproperty key="POIFS.testdata.path"
file="${main.src.test}/org/apache/poi/poifs/data"/>
<sysproperty key="java.awt.headless" value="true"/>
<formatter type="plain"/>
<formatter type="xml"/>
@ -556,6 +558,8 @@ under the License.
file="${main.src.test}/org/apache/poi/hpsf/data"/>
<sysproperty key="HWPF.testdata.path"
file="${scratchpad.src.test}/org/apache/poi/hwpf/data"/>
<sysproperty key="POIFS.testdata.path"
file="${main.src.test}/org/apache/poi/poifs/data"/>
<sysproperty key="java.awt.headless" value="true"/>
<formatter type="plain" usefile="no"/>
<batchtest todir="${main.reports.test}">
@ -585,6 +589,7 @@ under the License.
<sysproperty key="HWPF.testdata.path" file="${scratchpad.src.test}/org/apache/poi/hwpf/data"/>
<sysproperty key="HSMF.testdata.path" file="${scratchpad.src.test}/org/apache/poi/hsmf/data"/>
<sysproperty key="HDGF.testdata.path" file="${scratchpad.src.test}/org/apache/poi/hdgf/data"/>
<sysproperty key="POIFS.testdata.path" file="${main.src.test}/org/apache/poi/poifs/data"/>
<sysproperty key="java.awt.headless" value="true"/>
<formatter type="plain" usefile="no"/>
<formatter type="xml"/>
@ -601,6 +606,7 @@ under the License.
<classpath refid="test.classpath"/>
<sysproperty key="HSSF.testdata.path" file="${main.src.test}/org/apache/poi/hssf/data"/>
<sysproperty key="HPSF.testdata.path" file="${main.src.test}/org/apache/poi/hpsf/data"/>
<sysproperty key="POIFS.testdata.path" file="${main.src.test}/org/apache/poi/poifs/data"/>
<sysproperty key="java.awt.headless" value="true"/>
<formatter type="plain" usefile="no"/>
<test name="${testcase}"/>
@ -639,6 +645,7 @@ under the License.
<sysproperty key="HSLF.testdata.path" file="${scratchpad.src.test}/org/apache/poi/hslf/data"/>
<sysproperty key="HSMF.testdata.path" file="${scratchpad.src.test}/org/apache/poi/hsmf/data"/>
<sysproperty key="HDGF.testdata.path" file="${scratchpad.src.test}/org/apache/poi/hdgf/data"/>
<sysproperty key="POIFS.testdata.path" file="${main.src.test}/org/apache/poi/poifs/data"/>
<sysproperty key="java.awt.headless" value="true"/>
<formatter type="plain"/>
<formatter type="xml"/>
@ -673,6 +680,7 @@ under the License.
<sysproperty key="HSLF.testdata.path" file="${scratchpad.src.test}/org/apache/poi/hslf/data"/>
<sysproperty key="HSMF.testdata.path" file="${scratchpad.src.test}/org/apache/poi/hsmf/data"/>
<sysproperty key="HDGF.testdata.path" file="${scratchpad.src.test}/org/apache/poi/hdgf/data"/>
<sysproperty key="POIFS.testdata.path" file="${main.src.test}/org/apache/poi/poifs/data"/>
<sysproperty key="java.awt.headless" value="true"/>
<sysproperty key="java.awt.headless" value="true"/>
<formatter type="plain" usefile="no"/>

View File

@ -37,6 +37,7 @@
<!-- Don't forget to update status.xml too! -->
<release version="3.0.3-beta1" date="2008-04-??">
<action dev="POI-DEVELOPERS" type="add">Improve how POIFS works with directory entries, and update HWPFDocument to support reading an embedded word document</action>
<action dev="POI-DEVELOPERS" type="add">Initial support for getting and changing chart and series titles</action>
<action dev="POI-DEVELOPERS" type="add">Implement a proxy HSSFListener which tracks the format records, and lets you lookup the format string for a given cell. Convert the xls to csv example to use it</action>
<action dev="POI-DEVELOPERS" type="fix">44792 - fixed encode/decode problems in ExternalNameRecord and CRNRecord.</action>

View File

@ -34,6 +34,7 @@
<!-- Don't forget to update changes.xml too! -->
<changes>
<release version="3.0.3-beta1" date="2008-04-??">
<action dev="POI-DEVELOPERS" type="add">Improve how POIFS works with directory entries, and update HWPFDocument to support reading an embedded word document</action>
<action dev="POI-DEVELOPERS" type="add">Initial support for getting and changing chart and series titles</action>
<action dev="POI-DEVELOPERS" type="add">Implement a proxy HSSFListener which tracks the format records, and lets you lookup the format string for a given cell. Convert the xls to csv example to use it</action>
<action dev="POI-DEVELOPERS" type="fix">44792 - fixed encode/decode problems in ExternalNameRecord and CRNRecord.</action>

View File

@ -29,6 +29,7 @@ import org.apache.poi.hpsf.PropertySet;
import org.apache.poi.hpsf.PropertySetFactory;
import org.apache.poi.hpsf.SummaryInformation;
import org.apache.poi.poifs.filesystem.DirectoryEntry;
import org.apache.poi.poifs.filesystem.DirectoryNode;
import org.apache.poi.poifs.filesystem.DocumentEntry;
import org.apache.poi.poifs.filesystem.DocumentInputStream;
import org.apache.poi.poifs.filesystem.Entry;
@ -50,6 +51,8 @@ public abstract class POIDocument {
protected DocumentSummaryInformation dsInf;
/** The open POIFS FileSystem that contains our document */
protected POIFSFileSystem filesystem;
/** The directory that our document lives in */
protected DirectoryNode directory;
/** For our own logging use */
protected POILogger logger = POILogFactory.getLogger(this.getClass());
@ -57,6 +60,15 @@ public abstract class POIDocument {
/* Have the property streams been read yet? (Only done on-demand) */
protected boolean initialized = false;
/**
 * Creates a document, remembering both the filesystem that
 *  contains it and the directory that it lives in.
 *
 * @param dir the directory that our document lives in
 * @param fs the open POIFS FileSystem that contains our document
 */
protected POIDocument(DirectoryNode dir, POIFSFileSystem fs) {
    directory = dir;
    filesystem = fs;
}
/**
 * Creates a document that lives in the root directory of
 *  the supplied filesystem.
 *
 * @param fs the open POIFS FileSystem that contains our document
 */
protected POIDocument(POIFSFileSystem fs) {
this(fs.getRoot(), fs);
}
/**
* Fetch the Document Summary Information of the document
*/
@ -110,7 +122,7 @@ public abstract class POIDocument {
DocumentInputStream dis;
try {
// Find the entry, and get an input stream for it
dis = filesystem.createDocumentInputStream(setName);
dis = directory.createDocumentInputStream(setName);
} catch(IOException ie) {
// Oh well, doesn't exist
logger.log(POILogger.WARN, "Error getting property set with name " + setName + "\n" + ie);

View File

@ -139,6 +139,7 @@ public class HSSFWorkbook extends POIDocument
protected HSSFWorkbook( Workbook book )
{
super(null, null);
workbook = book;
sheets = new ArrayList( INITIAL_CAPACITY );
names = new ArrayList( INITIAL_CAPACITY );
@ -164,8 +165,8 @@ public class HSSFWorkbook extends POIDocument
public HSSFWorkbook(POIFSFileSystem fs, boolean preserveNodes)
throws IOException
{
super(fs);
this.preserveNodes = preserveNodes;
this.filesystem = fs;
// If we're not preserving nodes, don't track the
// POIFS any more

View File

@ -106,6 +106,31 @@ public class DirectoryNode
return _path;
}
/**
 * Opens one of the documents held directly within this directory.
 *
 * @param documentName the name of the document to be opened
 *
 * @return a newly opened DocumentInputStream over that document
 *
 * @exception IOException if the document does not exist or the
 *            name is that of a DirectoryEntry
 */
public DocumentInputStream createDocumentInputStream(
        final String documentName)
    throws IOException
{
    final Entry entry = getEntry(documentName);

    // Only document entries can be streamed; anything else is rejected
    if (entry.isDocumentEntry())
    {
        return new DocumentInputStream(( DocumentEntry ) entry);
    }
    throw new IOException("Entry '" + documentName
                          + "' is not a DocumentEntry");
}
/**
* create a new DocumentEntry
*

View File

@ -422,7 +422,7 @@ public class POIFSFileSystem
* @return the root entry
*/
public DirectoryEntry getRoot()
public DirectoryNode getRoot()
{
if (_root == null)
{
@ -446,14 +446,7 @@ public class POIFSFileSystem
final String documentName)
throws IOException
{
Entry document = getRoot().getEntry(documentName);
if (!document.isDocumentEntry())
{
throw new IOException("Entry '" + documentName
+ "' is not a DocumentEntry");
}
return new DocumentInputStream(( DocumentEntry ) document);
return getRoot().createDocumentInputStream(documentName);
}
/**

View File

@ -53,7 +53,7 @@ public class HDGFDiagram extends POIDocument {
private PointerFactory ptrFactory;
public HDGFDiagram(POIFSFileSystem fs) throws IOException {
filesystem = fs;
super(fs);
DocumentEntry docProps =
(DocumentEntry)filesystem.getRoot().getEntry("VisioDocument");

View File

@ -124,7 +124,7 @@ public class HSLFSlideShow extends POIDocument
*/
public HSLFSlideShow(POIFSFileSystem filesystem) throws IOException
{
this.filesystem = filesystem;
super(filesystem);
// First up, grab the "Current User" stream
// We need this before we can detect Encrypted Documents

View File

@ -29,6 +29,7 @@ import java.io.ByteArrayInputStream;
import java.util.Iterator;
import org.apache.poi.POIDocument;
import org.apache.poi.poifs.filesystem.DirectoryNode;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
import org.apache.poi.poifs.filesystem.DocumentEntry;
import org.apache.poi.poifs.common.POIFSConstants;
@ -95,7 +96,7 @@ public class HWPFDocument extends POIDocument
protected HWPFDocument()
{
super(null, null);
}
/**
@ -141,16 +142,31 @@ public class HWPFDocument extends POIDocument
* in POIFSFileSystem.
*/
public HWPFDocument(POIFSFileSystem pfilesystem) throws IOException
{
this(pfilesystem.getRoot(), pfilesystem);
}
/**
* This constructor loads a Word document from a specific point
* in a POIFSFileSystem, probably not the default.
* Used typically to open embedded documents.
*
* @param directory The directory within the POIFSFileSystem that holds the Word document's streams.
* @param pfilesystem The POIFSFileSystem that contains the Word document.
* @throws IOException If there is an unexpected IOException from the passed
* in POIFSFileSystem.
*/
public HWPFDocument(DirectoryNode directory, POIFSFileSystem pfilesystem) throws IOException
{
// Sort out the hpsf properties
filesystem = pfilesystem;
super(directory, pfilesystem);
readProperties();
// read in the main stream.
DocumentEntry documentProps =
(DocumentEntry)filesystem.getRoot().getEntry("WordDocument");
DocumentEntry documentProps = (DocumentEntry)
directory.getEntry("WordDocument");
_mainStream = new byte[documentProps.getSize()];
filesystem.createDocumentInputStream("WordDocument").read(_mainStream);
directory.createDocumentInputStream("WordDocument").read(_mainStream);
// use the fib to determine the name of the table stream.
_fib = new FileInformationBlock(_mainStream);
@ -165,14 +181,14 @@ public class HWPFDocument extends POIDocument
DocumentEntry tableProps;
try {
tableProps =
(DocumentEntry)filesystem.getRoot().getEntry(name);
(DocumentEntry)directory.getEntry(name);
} catch(FileNotFoundException fnfe) {
throw new IllegalStateException("Table Stream '" + name + "' wasn't found - Either the document is corrupt, or is Word95 (or earlier)");
}
// read in the table stream.
_tableStream = new byte[tableProps.getSize()];
filesystem.createDocumentInputStream(name).read(_tableStream);
directory.createDocumentInputStream(name).read(_tableStream);
_fib.fillVariableFields(_mainStream, _tableStream);
@ -180,7 +196,7 @@ public class HWPFDocument extends POIDocument
try
{
DocumentEntry dataProps =
(DocumentEntry) filesystem.getRoot().getEntry("Data");
(DocumentEntry)directory.getEntry("Data");
_dataStream = new byte[dataProps.getSize()];
filesystem.createDocumentInputStream("Data").read(_dataStream);
}

View File

@ -23,6 +23,8 @@ import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.model.TextPiece;
import org.apache.poi.hwpf.usermodel.Paragraph;
import org.apache.poi.hwpf.usermodel.Range;
import org.apache.poi.poifs.filesystem.DirectoryNode;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
import junit.framework.TestCase;
@ -54,12 +56,16 @@ public class TestWordExtractor extends TestCase {
private WordExtractor extractor;
// Corrupted document - can't do paragraph based stuff
private WordExtractor extractor2;
// A word doc embedded in an excel file
private String filename3;
protected void setUp() throws Exception {
String dirname = System.getProperty("HWPF.testdata.path");
String pdirname = System.getProperty("POIFS.testdata.path");
String filename = dirname + "/test2.doc";
String filename2 = dirname + "/test.doc";
filename3 = pdirname + "/excel_with_embeded.xls";
extractor = new WordExtractor(new FileInputStream(filename));
extractor2 = new WordExtractor(new FileInputStream(filename2));
@ -101,4 +107,25 @@ public class TestWordExtractor extends TestCase {
String text = extractor.getTextFromPieces();
assertEquals(p_text1_block, text);
}
/**
 * Test that we can get data from an
 *  embedded word document
 * @throws Exception
 */
public void testExtractFromEmbeded() throws Exception {
    // Keep hold of the file stream so that it can be closed again
    //  once we're finished, rather than being leaked
    FileInputStream in = new FileInputStream(filename3);
    try {
        POIFSFileSystem fs = new POIFSFileSystem(in);
        DirectoryNode dir = (DirectoryNode)
            fs.getRoot().getEntry("MBD03F25D8D");

        // Should have WordDocument and 1Table
        assertNotNull(dir.getEntry("1Table"));
        assertNotNull(dir.getEntry("WordDocument"));

        // Open the embedded document from its own directory
        HWPFDocument doc = new HWPFDocument(dir, fs);
        WordExtractor extractor3 = new WordExtractor(doc);

        String text = extractor3.getText();
        assertNotNull(text);
        assertTrue(text.length() > 20);
    } finally {
        in.close();
    }
}
}