diff --git a/src/documentation/content/xdocs/status.xml b/src/documentation/content/xdocs/status.xml
index dfa706680..3fcb9fedc 100644
--- a/src/documentation/content/xdocs/status.xml
+++ b/src/documentation/content/xdocs/status.xml
@@ -33,6 +33,7 @@
+           47456 - Support for getting OLE object data in PowerPointExtractor
            47411 - Explicitly set the 1900 date system when creating XSSF workbooks
            47400 - Support fo text extraction of footnotes, endnotes and comments in HWPF
            47415 - Fixed PageSettingsBlock to allow multiple PLS records
diff --git a/src/scratchpad/src/org/apache/poi/hslf/extractor/PowerPointExtractor.java b/src/scratchpad/src/org/apache/poi/hslf/extractor/PowerPointExtractor.java
index 5e6cad510..a8dbfa2a9 100644
--- a/src/scratchpad/src/org/apache/poi/hslf/extractor/PowerPointExtractor.java
+++ b/src/scratchpad/src/org/apache/poi/hslf/extractor/PowerPointExtractor.java
@@ -21,14 +21,12 @@ import java.io.FileInputStream;
 import java.io.IOException;
 import java.io.InputStream;
 import java.util.HashSet;
+import java.util.List;
+import java.util.ArrayList;
 
 import org.apache.poi.POIOLE2TextExtractor;
 import org.apache.poi.hslf.HSLFSlideShow;
-import org.apache.poi.hslf.model.Comment;
-import org.apache.poi.hslf.model.HeadersFooters;
-import org.apache.poi.hslf.model.Notes;
-import org.apache.poi.hslf.model.Slide;
-import org.apache.poi.hslf.model.TextRun;
+import org.apache.poi.hslf.model.*;
 import org.apache.poi.hslf.usermodel.SlideShow;
 import org.apache.poi.poifs.filesystem.DirectoryNode;
 import org.apache.poi.poifs.filesystem.POIFSFileSystem;
@@ -151,7 +149,27 @@ public final class PowerPointExtractor extends POIOLE2TextExtractor {
         return getText(false,true);
     }
 
-    /**
+    /**
+     * Collects the OLE shapes found on the slides of this slideshow.
+     *
+     * @return the {@link OLEShape}s of every slide, in slide order and then
+     *         shape order; an empty list if no slide contains one
+     */
+    public List<OLEShape> getOLEShapes() {
+        List<OLEShape> list = new ArrayList<OLEShape>();
+
+        for (Slide slide : _slides) {
+            for (Shape shape : slide.getShapes()) {
+                if (shape instanceof OLEShape) {
+                    list.add((OLEShape) shape);
+                }
+            }
+        }
+
+        return list;
+    }
+
+    /**
      * Fetches text from the slideshow, be it slide text or note text.
      * Because the final block of text in a TextRun normally have their
      * last \n stripped, we add it back
diff --git a/src/scratchpad/testcases/org/apache/poi/hslf/extractor/TestExtractor.java b/src/scratchpad/testcases/org/apache/poi/hslf/extractor/TestExtractor.java
index 9c213d477..769638bf0 100644
--- a/src/scratchpad/testcases/org/apache/poi/hslf/extractor/TestExtractor.java
+++ b/src/scratchpad/testcases/org/apache/poi/hslf/extractor/TestExtractor.java
@@ -18,11 +18,16 @@
 package org.apache.poi.hslf.extractor;
 
 import java.io.FileInputStream;
+import java.io.InputStream;
+import java.util.List;
 
 import org.apache.poi.hslf.HSLFSlideShow;
+import org.apache.poi.hslf.model.OLEShape;
 import org.apache.poi.hslf.usermodel.SlideShow;
 import org.apache.poi.poifs.filesystem.DirectoryNode;
 import org.apache.poi.poifs.filesystem.POIFSFileSystem;
+import org.apache.poi.hssf.usermodel.HSSFWorkbook;
+import org.apache.poi.hwpf.HWPFDocument;
 
 import junit.framework.TestCase;
 
@@ -167,51 +172,35 @@ public final class TestExtractor extends TestCase {
 
     /**
      * A powerpoint file with embeded powerpoint files
-     * TODO - figure out how to handle this, as ppt
-     *  appears to embed not as ole2 streams
      */
-    public void DISABLEDtestExtractFromOwnEmbeded() throws Exception {
-        String filename3 = pdirname + "/ppt_with_embeded.ppt";
-        POIFSFileSystem fs = new POIFSFileSystem(
-            new FileInputStream(filename3)
-        );
-        HSLFSlideShow ss;
-
-        DirectoryNode dirA = (DirectoryNode)
-            fs.getRoot().getEntry("MBD0000A3B6");
-        DirectoryNode dirB = (DirectoryNode)
-            fs.getRoot().getEntry("MBD0000A3B3");
-
-        assertNotNull(dirA.getEntry("PowerPoint Document"));
-        assertNotNull(dirB.getEntry("PowerPoint Document"));
-
-        // Check the first file
-        ss = new HSLFSlideShow(dirA, fs);
-        ppe = new PowerPointExtractor(ss);
-        assertEquals("Sample PowerPoint file\nThis is the 1st file\nNot much too it\n",
-            ppe.getText(true, false)
-        );
-
-        // And the second
-        ss = new HSLFSlideShow(dirB, fs);
-        ppe = new PowerPointExtractor(ss);
-        assertEquals("Sample PowerPoint file\nThis is the 2nd file\nNot much too it either\n",
-            ppe.getText(true, false)
-        );
-
-
-        // Check the master doc two ways
-        ss = new HSLFSlideShow(fs.getRoot(), fs);
-        ppe = new PowerPointExtractor(ss);
-        assertEquals("I have embeded files in me\n",
-            ppe.getText(true, false)
-        );
-
-        ss = new HSLFSlideShow(fs);
-        ppe = new PowerPointExtractor(ss);
-        assertEquals("I have embeded files in me\n",
-            ppe.getText(true, false)
-        );
+    public void testExtractFromOwnEmbeded() throws Exception {
+        String path = pdirname + "/ppt_with_embeded.ppt";
+        ppe = new PowerPointExtractor(path);
+        List<OLEShape> shapes = ppe.getOLEShapes();
+        assertEquals("Expected 6 ole shapes in " + path, 6, shapes.size());
+        int num_ppt = 0, num_doc = 0, num_xls = 0;
+        for (OLEShape ole : shapes) {
+            String name = ole.getInstanceName();
+            InputStream data = ole.getObjectData().getData();
+            try {
+                // Constructing each document checks that the embedded data parses
+                if ("Worksheet".equals(name)) {
+                    new HSSFWorkbook(data);
+                    num_xls++;
+                } else if ("Document".equals(name)) {
+                    new HWPFDocument(data);
+                    num_doc++;
+                } else if ("Presentation".equals(name)) {
+                    new SlideShow(data);
+                    num_ppt++;
+                }
+            } finally {
+                data.close();
+            }
+        }
+        assertEquals("Expected 2 embedded Word Documents", 2, num_doc);
+        assertEquals("Expected 2 embedded Excel Spreadsheets", 2, num_xls);
+        assertEquals("Expected 2 embedded PowerPoint Presentations", 2, num_ppt);
     }
 
     /**