diff --git a/src/documentation/content/xdocs/changes.xml b/src/documentation/content/xdocs/changes.xml index 2bfb46d0e..1cd4891fd 100644 --- a/src/documentation/content/xdocs/changes.xml +++ b/src/documentation/content/xdocs/changes.xml @@ -37,6 +37,7 @@ + Update HSLFSlideShow and HSSFWorkbook to take advantage of POIFS updates, and allow reading embedded documents Improve how POIFS works with directory entries, and update HWPFDocument to support reading an embeded word document Initial support for getting and changing chart and series titles Implement a proxy HSSFListener which tracks the format records, and lets you lookup the format string for a given cell. Convert the xls to csv example to use it diff --git a/src/documentation/content/xdocs/status.xml b/src/documentation/content/xdocs/status.xml index 9d50f53ed..33bfce8ce 100644 --- a/src/documentation/content/xdocs/status.xml +++ b/src/documentation/content/xdocs/status.xml @@ -34,6 +34,7 @@ + Update HSLFSlideShow and HSSFWorkbook to take advantage of POIFS updates, and allow reading embedded documents Improve how POIFS works with directory entries, and update HWPFDocument to support reading an embeded word document Initial support for getting and changing chart and series titles Implement a proxy HSSFListener which tracks the format records, and lets you lookup the format string for a given cell. 
Convert the xls to csv example to use it diff --git a/src/java/org/apache/poi/hssf/usermodel/HSSFWorkbook.java b/src/java/org/apache/poi/hssf/usermodel/HSSFWorkbook.java index 1eb4f7c8c..ee5cfdb9f 100644 --- a/src/java/org/apache/poi/hssf/usermodel/HSSFWorkbook.java +++ b/src/java/org/apache/poi/hssf/usermodel/HSSFWorkbook.java @@ -161,17 +161,37 @@ public class HSSFWorkbook extends POIDocument * @see org.apache.poi.poifs.filesystem.POIFSFileSystem * @exception IOException if the stream cannot be read */ - public HSSFWorkbook(POIFSFileSystem fs, boolean preserveNodes) throws IOException { - super(fs); + this(fs.getRoot(), fs, preserveNodes); + } + + /** + * given a POI POIFSFileSystem object, and a specific directory + * within it, read in its Workbook and populate the high and + * low level models. If you're reading in a workbook...start here. + * + * @param directory the POI filesystem directory to process from + * @param fs the POI filesystem that contains the Workbook stream. + * @param preserveNodes whether to preserve other nodes, such as + * macros. This takes more memory, so only say yes if you + * need to. If set, will store all of the POIFSFileSystem + * in memory + * @see org.apache.poi.poifs.filesystem.POIFSFileSystem + * @exception IOException if the stream cannot be read + */ + public HSSFWorkbook(DirectoryNode directory, POIFSFileSystem fs, boolean preserveNodes) + throws IOException + { + super(directory, fs); this.preserveNodes = preserveNodes; // If we're not preserving nodes, don't track the // POIFS any more if(! 
preserveNodes) { this.filesystem = null; + this.directory = null; } sheets = new ArrayList(INITIAL_CAPACITY); @@ -182,13 +202,13 @@ public class HSSFWorkbook extends POIDocument // put theirs in one called "WORKBOOK" String workbookName = "Workbook"; try { - fs.getRoot().getEntry(workbookName); + directory.getEntry(workbookName); // Is the default name } catch(FileNotFoundException fe) { // Try the upper case form try { workbookName = "WORKBOOK"; - fs.getRoot().getEntry(workbookName); + directory.getEntry(workbookName); } catch(FileNotFoundException wfe) { // Doesn't contain it in either form throw new IllegalArgumentException("The supplied POIFSFileSystem contained neither a 'Workbook' entry, nor a 'WORKBOOK' entry. Is it really an excel file?"); @@ -198,7 +218,7 @@ public class HSSFWorkbook extends POIDocument // Grab the data from the workbook stream, however // it happens to be spelt. - InputStream stream = fs.createDocumentInputStream(workbookName); + InputStream stream = directory.createDocumentInputStream(workbookName); EventRecordFactory factory = new EventRecordFactory(); diff --git a/src/java/org/apache/poi/poifs/dev/POIFSLister.java b/src/java/org/apache/poi/poifs/dev/POIFSLister.java new file mode 100644 index 000000000..c9fa349d6 --- /dev/null +++ b/src/java/org/apache/poi/poifs/dev/POIFSLister.java @@ -0,0 +1,81 @@ +/* ==================================================================== + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. 
You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +==================================================================== */ + +package org.apache.poi.poifs.dev; + +import java.io.FileInputStream; +import java.io.IOException; +import java.util.Iterator; + +import org.apache.poi.poifs.filesystem.DirectoryEntry; +import org.apache.poi.poifs.filesystem.DirectoryNode; +import org.apache.poi.poifs.filesystem.DocumentEntry; +import org.apache.poi.poifs.filesystem.DocumentNode; +import org.apache.poi.poifs.filesystem.POIFSFileSystem; + +/** + * A lister of the entries in POIFS files. + * + * Much simpler than {@link POIFSViewer} + */ +public class POIFSLister { + /** + * Display the entries of multiple POIFS files + * + * @param args the names of the files to be displayed + */ + public static void main(final String args[]) throws IOException { + if (args.length == 0) + { + System.err.println("Must specify at least one file to view"); + System.exit(1); + } + + for (int j = 0; j < args.length; j++) + { + viewFile(args[ j ]); + } + } + + public static void viewFile(final String filename) throws IOException + { + POIFSFileSystem fs = new POIFSFileSystem( + new FileInputStream(filename) + ); + displayDirectory(fs.getRoot(), ""); + } + + public static void displayDirectory(DirectoryNode dir, String indent) { + System.out.println(indent + dir.getName() + " -"); + String newIndent = indent + " "; + + for(Iterator it = dir.getEntries(); it.hasNext(); ) { + Object entry = it.next(); + if(entry instanceof DirectoryNode) { + displayDirectory((DirectoryNode)entry, newIndent); + } else { + DocumentNode doc = (DocumentNode)entry; + String 
name = doc.getName(); + if(name.charAt(0) < 10) { + String altname = "(0x0" + (int)name.charAt(0) + ")" + name.substring(1); + name = name.substring(1) + " <" + altname + ">"; + } + System.out.println(newIndent + name); + } + } + } +} \ No newline at end of file diff --git a/src/scratchpad/src/org/apache/poi/hslf/HSLFSlideShow.java b/src/scratchpad/src/org/apache/poi/hslf/HSLFSlideShow.java index 2c523c70a..716651d1c 100644 --- a/src/scratchpad/src/org/apache/poi/hslf/HSLFSlideShow.java +++ b/src/scratchpad/src/org/apache/poi/hslf/HSLFSlideShow.java @@ -45,6 +45,7 @@ import org.apache.poi.hslf.record.Record; import org.apache.poi.hslf.record.UserEditAtom; import org.apache.poi.hslf.usermodel.ObjectData; import org.apache.poi.hslf.usermodel.PictureData; +import org.apache.poi.poifs.filesystem.DirectoryNode; import org.apache.poi.poifs.filesystem.DocumentEntry; import org.apache.poi.poifs.filesystem.DocumentInputStream; import org.apache.poi.poifs.filesystem.POIFSFileSystem; @@ -124,7 +125,21 @@ public class HSLFSlideShow extends POIDocument */ public HSLFSlideShow(POIFSFileSystem filesystem) throws IOException { - super(filesystem); + this(filesystem.getRoot(), filesystem); + } + + /** + * Constructs a Powerpoint document from a specific point in a + * POIFS Filesystem. Parses the document and places all the + * important stuff into data structures. + * + * @param dir the POIFS directory to read from + * @param filesystem the POIFS FileSystem to read from + * @throws IOException if there is a problem while parsing the document. 
+ */ + public HSLFSlideShow(DirectoryNode dir, POIFSFileSystem filesystem) throws IOException + { + super(dir, filesystem); // First up, grab the "Current User" stream // We need this before we can detect Encrypted Documents @@ -186,11 +201,11 @@ public class HSLFSlideShow extends POIDocument { // Get the main document stream DocumentEntry docProps = - (DocumentEntry)filesystem.getRoot().getEntry("PowerPoint Document"); + (DocumentEntry)directory.getEntry("PowerPoint Document"); // Grab the document stream _docstream = new byte[docProps.getSize()]; - filesystem.createDocumentInputStream("PowerPoint Document").read(_docstream); + directory.createDocumentInputStream("PowerPoint Document").read(_docstream); } /** @@ -272,7 +287,7 @@ public class HSLFSlideShow extends POIDocument */ private void readCurrentUserStream() { try { - currentUser = new CurrentUserAtom(filesystem); + currentUser = new CurrentUserAtom(directory); } catch(IOException ie) { logger.log(POILogger.ERROR, "Error finding Current User Atom:\n" + ie); currentUser = new CurrentUserAtom(); @@ -293,9 +308,9 @@ public class HSLFSlideShow extends POIDocument byte[] pictstream; try { - DocumentEntry entry = (DocumentEntry)filesystem.getRoot().getEntry("Pictures"); + DocumentEntry entry = (DocumentEntry)directory.getEntry("Pictures"); pictstream = new byte[entry.getSize()]; - DocumentInputStream is = filesystem.createDocumentInputStream("Pictures"); + DocumentInputStream is = directory.createDocumentInputStream("Pictures"); is.read(pictstream); } catch (FileNotFoundException e){ // Silently catch exceptions if the presentation doesn't diff --git a/src/scratchpad/src/org/apache/poi/hslf/record/CurrentUserAtom.java b/src/scratchpad/src/org/apache/poi/hslf/record/CurrentUserAtom.java index 91a548523..e0810dbca 100644 --- a/src/scratchpad/src/org/apache/poi/hslf/record/CurrentUserAtom.java +++ b/src/scratchpad/src/org/apache/poi/hslf/record/CurrentUserAtom.java @@ -93,9 +93,15 @@ public class CurrentUserAtom * 
Find the Current User in the filesystem, and create from that */ public CurrentUserAtom(POIFSFileSystem fs) throws IOException { + this(fs.getRoot()); + } + /** + * Find the Current User in the filesystem, and create from that + */ + public CurrentUserAtom(DirectoryNode dir) throws IOException { // Decide how big it is DocumentEntry docProps = - (DocumentEntry)fs.getRoot().getEntry("Current User"); + (DocumentEntry)dir.getEntry("Current User"); _contents = new byte[docProps.getSize()]; // Check it's big enough - if it's not at least 28 bytes long, then @@ -105,7 +111,7 @@ public class CurrentUserAtom } // Grab the contents - InputStream in = fs.createDocumentInputStream("Current User"); + InputStream in = dir.createDocumentInputStream("Current User"); in.read(_contents); // Set everything up diff --git a/src/scratchpad/testcases/org/apache/poi/hslf/extractor/TextExtractor.java b/src/scratchpad/testcases/org/apache/poi/hslf/extractor/TextExtractor.java index f8618ff0b..d6197a598 100644 --- a/src/scratchpad/testcases/org/apache/poi/hslf/extractor/TextExtractor.java +++ b/src/scratchpad/testcases/org/apache/poi/hslf/extractor/TextExtractor.java @@ -21,6 +21,12 @@ package org.apache.poi.hslf.extractor; +import java.io.FileInputStream; + +import org.apache.poi.hslf.HSLFSlideShow; +import org.apache.poi.poifs.filesystem.DirectoryNode; +import org.apache.poi.poifs.filesystem.POIFSFileSystem; + import junit.framework.TestCase; /** @@ -35,6 +41,8 @@ public class TextExtractor extends TestCase { private PowerPointExtractor ppe2; /** Where to go looking for our test files */ private String dirname; + /** Where our embeded files live */ + private String pdirname; public TextExtractor() throws Exception { dirname = System.getProperty("HSLF.testdata.path"); @@ -42,6 +50,8 @@ public class TextExtractor extends TestCase { ppe = new PowerPointExtractor(filename); String filename2 = dirname + "/with_textbox.ppt"; ppe2 = new PowerPointExtractor(filename2); + + pdirname = 
System.getProperty("POIFS.testdata.path"); } public void testReadSheetText() throws Exception { @@ -123,9 +133,87 @@ public class TextExtractor extends TestCase { char[] expC = exp.toCharArray(); char[] actC = act.toCharArray(); for(int i=0; i 20); + assertEquals("I am a sample document\r\nNot much on me\r\nI am document 1\r\n", + extractor3.getText()); + assertEquals("Sample Doc 1", extractor3.getSummaryInformation().getTitle()); + assertEquals("Sample Test", extractor3.getSummaryInformation().getSubject()); + + + doc = new HWPFDocument(dirB, fs); + extractor3 = new WordExtractor(doc); + + assertNotNull(extractor3.getText()); + assertTrue(extractor3.getText().length() > 20); + assertEquals("I am another sample document\r\nNot much on me\r\nI am document 2\r\n", + extractor3.getText()); + assertEquals("Sample Doc 2", extractor3.getSummaryInformation().getTitle()); + assertEquals("Another Sample Test", extractor3.getSummaryInformation().getSubject()); } } diff --git a/src/testcases/org/apache/poi/hssf/extractor/TestExcelExtractor.java b/src/testcases/org/apache/poi/hssf/extractor/TestExcelExtractor.java index 80611e6e6..ad311eb27 100644 --- a/src/testcases/org/apache/poi/hssf/extractor/TestExcelExtractor.java +++ b/src/testcases/org/apache/poi/hssf/extractor/TestExcelExtractor.java @@ -17,12 +17,15 @@ package org.apache.poi.hssf.extractor; +import java.io.FileInputStream; import java.io.IOException; import java.io.InputStream; import junit.framework.TestCase; import org.apache.poi.hssf.HSSFTestDataSamples; +import org.apache.poi.hssf.usermodel.HSSFWorkbook; +import org.apache.poi.poifs.filesystem.DirectoryNode; import org.apache.poi.poifs.filesystem.POIFSFileSystem; /** * @@ -118,4 +121,72 @@ public final class TestExcelExtractor extends TestCase { assertEquals("Sheet1\nUPPER(\"xyz\")\nSheet2\nSheet3\n", extractor.getText()); } + + /** + * Embded in a non-excel file + */ + public void testWithEmbeded() throws Exception { + String pdirname = 
System.getProperty("POIFS.testdata.path"); + String filename = pdirname + "/word_with_embeded.doc"; + POIFSFileSystem fs = new POIFSFileSystem( + new FileInputStream(filename) + ); + + DirectoryNode objPool = (DirectoryNode) + fs.getRoot().getEntry("ObjectPool"); + DirectoryNode dirA = (DirectoryNode) + objPool.getEntry("_1269427460"); + DirectoryNode dirB = (DirectoryNode) + objPool.getEntry("_1269427461"); + + HSSFWorkbook wbA = new HSSFWorkbook(dirA, fs, true); + HSSFWorkbook wbB = new HSSFWorkbook(dirB, fs, true); + + ExcelExtractor exA = new ExcelExtractor(wbA); + ExcelExtractor exB = new ExcelExtractor(wbB); + + assertEquals("Sheet1\nTest excel file\nThis is the first file\nSheet2\nSheet3\n", + exA.getText()); + assertEquals("Sample Excel", exA.getSummaryInformation().getTitle()); + + assertEquals("Sheet1\nAnother excel file\nThis is the second file\nSheet2\nSheet3\n", + exB.getText()); + assertEquals("Sample Excel 2", exB.getSummaryInformation().getTitle()); + } + + /** + * Excel embeded in excel + */ + public void testWithEmbededInOwn() throws Exception { + String pdirname = System.getProperty("POIFS.testdata.path"); + String filename = pdirname + "/excel_with_embeded.xls"; + POIFSFileSystem fs = new POIFSFileSystem( + new FileInputStream(filename) + ); + + DirectoryNode dirA = (DirectoryNode) + fs.getRoot().getEntry("MBD0000A3B5"); + DirectoryNode dirB = (DirectoryNode) + fs.getRoot().getEntry("MBD0000A3B4"); + + HSSFWorkbook wbA = new HSSFWorkbook(dirA, fs, true); + HSSFWorkbook wbB = new HSSFWorkbook(dirB, fs, true); + + ExcelExtractor exA = new ExcelExtractor(wbA); + ExcelExtractor exB = new ExcelExtractor(wbB); + + assertEquals("Sheet1\nTest excel file\nThis is the first file\nSheet2\nSheet3\n", + exA.getText()); + assertEquals("Sample Excel", exA.getSummaryInformation().getTitle()); + + assertEquals("Sheet1\nAnother excel file\nThis is the second file\nSheet2\nSheet3\n", + exB.getText()); + assertEquals("Sample Excel 2", 
exB.getSummaryInformation().getTitle()); + + // And the base file too + ExcelExtractor ex = new ExcelExtractor(fs); + assertEquals("Sheet1\nI have lots of embeded files in me\nSheet2\nSheet3\n", + ex.getText()); + assertEquals("Excel With Embeded", ex.getSummaryInformation().getTitle()); + } } diff --git a/src/testcases/org/apache/poi/poifs/data/ppt_with_embeded.ppt b/src/testcases/org/apache/poi/poifs/data/ppt_with_embeded.ppt index e80fa2f43..588dcefe9 100644 Binary files a/src/testcases/org/apache/poi/poifs/data/ppt_with_embeded.ppt and b/src/testcases/org/apache/poi/poifs/data/ppt_with_embeded.ppt differ