Support for getting OLE object data in PowerPointExtractor, see Bugzilla 47456

git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@791241 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Yegor Kozlov 2009-07-05 13:15:41 +00:00
parent efc5431030
commit 9c5c51ad29
3 changed files with 50 additions and 50 deletions

View File

@ -33,6 +33,7 @@
<changes>
<release version="3.5-beta7" date="2009-??-??">
<action dev="POI-DEVELOPERS" type="add">47456 - Support for getting OLE object data in PowerPointExtractor</action>
<action dev="POI-DEVELOPERS" type="fix">47411 - Explicitly set the 1900 date system when creating XSSF workbooks</action>
<action dev="POI-DEVELOPERS" type="add">47400 - Support for text extraction of footnotes, endnotes and comments in HWPF</action>
<action dev="POI-DEVELOPERS" type="fix">47415 - Fixed PageSettingsBlock to allow multiple PLS records</action>

View File

@ -21,14 +21,12 @@ import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.HashSet;
import java.util.List;
import java.util.ArrayList;
import org.apache.poi.POIOLE2TextExtractor;
import org.apache.poi.hslf.HSLFSlideShow;
import org.apache.poi.hslf.model.Comment;
import org.apache.poi.hslf.model.HeadersFooters;
import org.apache.poi.hslf.model.Notes;
import org.apache.poi.hslf.model.Slide;
import org.apache.poi.hslf.model.TextRun;
import org.apache.poi.hslf.model.*;
import org.apache.poi.hslf.usermodel.SlideShow;
import org.apache.poi.poifs.filesystem.DirectoryNode;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
@ -151,7 +149,24 @@ public final class PowerPointExtractor extends POIOLE2TextExtractor {
return getText(false,true);
}
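/**
* Collects every {@link OLEShape} found among the shapes of the slides
* held in {@code _slides}, in slide order.
*
* @return a new list of the OLE shapes found; empty if there are none
*/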
public List<OLEShape> getOLEShapes() {
    // Matches are accumulated slide by slide, preserving each slide's shape order.
    List<OLEShape> oleShapes = new ArrayList<OLEShape>();
    for (Slide currentSlide : _slides) {
        for (Shape candidate : currentSlide.getShapes()) {
            // Anything that is not an OLEShape is of no interest here.
            if (!(candidate instanceof OLEShape)) {
                continue;
            }
            oleShapes.add((OLEShape) candidate);
        }
    }
    return oleShapes;
}
/**
* Fetches text from the slideshow, be it slide text or note text.
* Because the final block of text in a TextRun normally has its
* last \n stripped, we add it back

View File

@ -18,11 +18,16 @@
package org.apache.poi.hslf.extractor;
import java.io.FileInputStream;
import java.io.InputStream;
import java.util.List;
import org.apache.poi.hslf.HSLFSlideShow;
import org.apache.poi.hslf.model.OLEShape;
import org.apache.poi.hslf.usermodel.SlideShow;
import org.apache.poi.poifs.filesystem.DirectoryNode;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
import org.apache.poi.hssf.usermodel.HSSFWorkbook;
import org.apache.poi.hwpf.HWPFDocument;
import junit.framework.TestCase;
@ -167,51 +172,30 @@ public final class TestExtractor extends TestCase {
/**
* A PowerPoint file with embedded PowerPoint files
* TODO - figure out how to handle this, as PPT
* appears not to embed them as OLE2 streams
*/
// NOTE(review): the method name does not begin with "test" — presumably the
// "DISABLED" prefix keeps JUnit's name-based discovery from running it; confirm.
public void DISABLEDtestExtractFromOwnEmbeded() throws Exception {
String filename3 = pdirname + "/ppt_with_embeded.ppt";
// Open the sample file as a POIFS filesystem so its directory entries can be
// addressed individually. NOTE(review): no close() of this FileInputStream is
// visible in this block.
POIFSFileSystem fs = new POIFSFileSystem(
new FileInputStream(filename3)
);
HSLFSlideShow ss;
// Fetch two child directories of the root by their hard-coded entry names;
// each is expected to contain a "PowerPoint Document" entry (asserted below).
DirectoryNode dirA = (DirectoryNode)
fs.getRoot().getEntry("MBD0000A3B6");
DirectoryNode dirB = (DirectoryNode)
fs.getRoot().getEntry("MBD0000A3B3");
assertNotNull(dirA.getEntry("PowerPoint Document"));
assertNotNull(dirB.getEntry("PowerPoint Document"));
// Check the first file
// (ppe is assigned here but declared outside this block)
ss = new HSLFSlideShow(dirA, fs);
ppe = new PowerPointExtractor(ss);
assertEquals("Sample PowerPoint file\nThis is the 1st file\nNot much too it\n",
ppe.getText(true, false)
);
// And the second
ss = new HSLFSlideShow(dirB, fs);
ppe = new PowerPointExtractor(ss);
assertEquals("Sample PowerPoint file\nThis is the 2nd file\nNot much too it either\n",
ppe.getText(true, false)
);
// Check the master doc two ways
// First: built explicitly from the root directory of the filesystem.
ss = new HSLFSlideShow(fs.getRoot(), fs);
ppe = new PowerPointExtractor(ss);
assertEquals("I have embeded files in me\n",
ppe.getText(true, false)
);
// Second: built from the filesystem alone; the same text is expected.
ss = new HSLFSlideShow(fs);
ppe = new PowerPointExtractor(ss);
assertEquals("I have embeded files in me\n",
ppe.getText(true, false)
);
/**
 * Checks that the extractor reports the six OLE shapes in
 * {@code ppt_with_embeded.ppt} and that the data of each one can be opened
 * by the class matching its instance name: two Excel workbooks, two Word
 * documents and two PowerPoint presentations.
 *
 * @throws Exception if the sample file or any embedded object cannot be read
 */
public void testExtractFromOwnEmbeded() throws Exception {
    String path = pdirname + "/ppt_with_embeded.ppt";
    ppe = new PowerPointExtractor(path);
    List<OLEShape> shapes = ppe.getOLEShapes();
    assertEquals("Expected 6 ole shapes in " + path, 6, shapes.size());

    int numPpt = 0;
    int numDoc = 0;
    int numXls = 0;
    for (OLEShape ole : shapes) {
        String name = ole.getInstanceName();
        InputStream data = ole.getObjectData().getData();
        try {
            // Constructing the document object is the check itself: it throws
            // if the embedded data cannot be parsed, which fails the test.
            if ("Worksheet".equals(name)) {
                new HSSFWorkbook(data);
                numXls++;
            } else if ("Document".equals(name)) {
                new HWPFDocument(data);
                numDoc++;
            } else if ("Presentation".equals(name)) {
                new SlideShow(data);
                numPpt++;
            }
        } finally {
            // Release the per-shape stream whether or not parsing succeeded.
            data.close();
        }
    }
    assertEquals("Expected 2 embedded Word Documents", 2, numDoc);
    assertEquals("Expected 2 embedded Excel Spreadsheets", 2, numXls);
    assertEquals("Expected 2 embedded PowerPoint Presentations", 2, numPpt);
}
/**