Support for getting OLE object data in PowerPointExtractor, see Bugzilla 47456
git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@791241 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
efc5431030
commit
9c5c51ad29
@ -33,6 +33,7 @@
|
|||||||
|
|
||||||
<changes>
|
<changes>
|
||||||
<release version="3.5-beta7" date="2009-??-??">
|
<release version="3.5-beta7" date="2009-??-??">
|
||||||
|
<action dev="POI-DEVELOPERS" type="add">47456 - Support for getting OLE object data in PowerPointExtractor</action>
|
||||||
<action dev="POI-DEVELOPERS" type="fix">47411 - Explicitly set the 1900 date system when creating XSSF workbooks</action>
|
<action dev="POI-DEVELOPERS" type="fix">47411 - Explicitly set the 1900 date system when creating XSSF workbooks</action>
|
||||||
<action dev="POI-DEVELOPERS" type="add">47400 - Support for text extraction of footnotes, endnotes and comments in HWPF</action>
|
<action dev="POI-DEVELOPERS" type="add">47400 - Support for text extraction of footnotes, endnotes and comments in HWPF</action>
|
||||||
<action dev="POI-DEVELOPERS" type="fix">47415 - Fixed PageSettingsBlock to allow multiple PLS records</action>
|
<action dev="POI-DEVELOPERS" type="fix">47415 - Fixed PageSettingsBlock to allow multiple PLS records</action>
|
||||||
|
@ -21,14 +21,12 @@ import java.io.FileInputStream;
|
|||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.io.InputStream;
|
import java.io.InputStream;
|
||||||
import java.util.HashSet;
|
import java.util.HashSet;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.ArrayList;
|
||||||
|
|
||||||
import org.apache.poi.POIOLE2TextExtractor;
|
import org.apache.poi.POIOLE2TextExtractor;
|
||||||
import org.apache.poi.hslf.HSLFSlideShow;
|
import org.apache.poi.hslf.HSLFSlideShow;
|
||||||
import org.apache.poi.hslf.model.Comment;
|
import org.apache.poi.hslf.model.*;
|
||||||
import org.apache.poi.hslf.model.HeadersFooters;
|
|
||||||
import org.apache.poi.hslf.model.Notes;
|
|
||||||
import org.apache.poi.hslf.model.Slide;
|
|
||||||
import org.apache.poi.hslf.model.TextRun;
|
|
||||||
import org.apache.poi.hslf.usermodel.SlideShow;
|
import org.apache.poi.hslf.usermodel.SlideShow;
|
||||||
import org.apache.poi.poifs.filesystem.DirectoryNode;
|
import org.apache.poi.poifs.filesystem.DirectoryNode;
|
||||||
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
|
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
|
||||||
@ -151,7 +149,24 @@ public final class PowerPointExtractor extends POIOLE2TextExtractor {
|
|||||||
return getText(false,true);
|
return getText(false,true);
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
public List<OLEShape> getOLEShapes() {
|
||||||
|
List<OLEShape> list = new ArrayList<OLEShape>();
|
||||||
|
|
||||||
|
for (int i = 0; i < _slides.length; i++) {
|
||||||
|
Slide slide = _slides[i];
|
||||||
|
|
||||||
|
Shape[] shapes = slide.getShapes();
|
||||||
|
for (int j = 0; j < shapes.length; j++) {
|
||||||
|
if (shapes[j] instanceof OLEShape) {
|
||||||
|
list.add((OLEShape) shapes[j]);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return list;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
* Fetches text from the slideshow, be it slide text or note text.
|
* Fetches text from the slideshow, be it slide text or note text.
|
||||||
* Because the final block of text in a TextRun normally has its
|
* Because the final block of text in a TextRun normally has its
|
||||||
* last \n stripped, we add it back
|
* last \n stripped, we add it back
|
||||||
|
@ -18,11 +18,16 @@
|
|||||||
package org.apache.poi.hslf.extractor;
|
package org.apache.poi.hslf.extractor;
|
||||||
|
|
||||||
import java.io.FileInputStream;
|
import java.io.FileInputStream;
|
||||||
|
import java.io.InputStream;
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
import org.apache.poi.hslf.HSLFSlideShow;
|
import org.apache.poi.hslf.HSLFSlideShow;
|
||||||
|
import org.apache.poi.hslf.model.OLEShape;
|
||||||
import org.apache.poi.hslf.usermodel.SlideShow;
|
import org.apache.poi.hslf.usermodel.SlideShow;
|
||||||
import org.apache.poi.poifs.filesystem.DirectoryNode;
|
import org.apache.poi.poifs.filesystem.DirectoryNode;
|
||||||
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
|
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
|
||||||
|
import org.apache.poi.hssf.usermodel.HSSFWorkbook;
|
||||||
|
import org.apache.poi.hwpf.HWPFDocument;
|
||||||
|
|
||||||
import junit.framework.TestCase;
|
import junit.framework.TestCase;
|
||||||
|
|
||||||
@ -167,51 +172,30 @@ public final class TestExtractor extends TestCase {
|
|||||||
|
|
||||||
/**
|
/**
|
||||||
* A PowerPoint file with embedded PowerPoint files
|
* A PowerPoint file with embedded PowerPoint files
|
||||||
* TODO - figure out how to handle this, as ppt
|
|
||||||
* appears to embed not as ole2 streams
|
|
||||||
*/
|
*/
|
||||||
public void DISABLEDtestExtractFromOwnEmbeded() throws Exception {
|
public void testExtractFromOwnEmbeded() throws Exception {
|
||||||
String filename3 = pdirname + "/ppt_with_embeded.ppt";
|
String path = pdirname + "/ppt_with_embeded.ppt";
|
||||||
POIFSFileSystem fs = new POIFSFileSystem(
|
ppe = new PowerPointExtractor(path);
|
||||||
new FileInputStream(filename3)
|
List<OLEShape> shapes = ppe.getOLEShapes();
|
||||||
);
|
assertEquals("Expected 6 ole shapes in " + path, 6, shapes.size());
|
||||||
HSLFSlideShow ss;
|
int num_ppt = 0, num_doc = 0, num_xls = 0;
|
||||||
|
for(OLEShape ole : shapes) {
|
||||||
DirectoryNode dirA = (DirectoryNode)
|
String name = ole.getInstanceName();
|
||||||
fs.getRoot().getEntry("MBD0000A3B6");
|
InputStream data = ole.getObjectData().getData();
|
||||||
DirectoryNode dirB = (DirectoryNode)
|
if ("Worksheet".equals(name)) {
|
||||||
fs.getRoot().getEntry("MBD0000A3B3");
|
HSSFWorkbook wb = new HSSFWorkbook(data);
|
||||||
|
num_xls++;
|
||||||
assertNotNull(dirA.getEntry("PowerPoint Document"));
|
} else if ("Document".equals(name)) {
|
||||||
assertNotNull(dirB.getEntry("PowerPoint Document"));
|
HWPFDocument doc = new HWPFDocument(data);
|
||||||
|
num_doc++;
|
||||||
// Check the first file
|
} else if ("Presentation".equals(name)) {
|
||||||
ss = new HSLFSlideShow(dirA, fs);
|
num_ppt++;
|
||||||
ppe = new PowerPointExtractor(ss);
|
SlideShow ppt = new SlideShow(data);
|
||||||
assertEquals("Sample PowerPoint file\nThis is the 1st file\nNot much too it\n",
|
}
|
||||||
ppe.getText(true, false)
|
}
|
||||||
);
|
assertEquals("Expected 2 embedded Word Documents", 2, num_doc);
|
||||||
|
assertEquals("Expected 2 embedded Excel Spreadsheets", 2, num_xls);
|
||||||
// And the second
|
assertEquals("Expected 2 embedded PowerPoint Presentations", 2, num_ppt);
|
||||||
ss = new HSLFSlideShow(dirB, fs);
|
|
||||||
ppe = new PowerPointExtractor(ss);
|
|
||||||
assertEquals("Sample PowerPoint file\nThis is the 2nd file\nNot much too it either\n",
|
|
||||||
ppe.getText(true, false)
|
|
||||||
);
|
|
||||||
|
|
||||||
|
|
||||||
// Check the master doc two ways
|
|
||||||
ss = new HSLFSlideShow(fs.getRoot(), fs);
|
|
||||||
ppe = new PowerPointExtractor(ss);
|
|
||||||
assertEquals("I have embeded files in me\n",
|
|
||||||
ppe.getText(true, false)
|
|
||||||
);
|
|
||||||
|
|
||||||
ss = new HSLFSlideShow(fs);
|
|
||||||
ppe = new PowerPointExtractor(ss);
|
|
||||||
assertEquals("I have embeded files in me\n",
|
|
||||||
ppe.getText(true, false)
|
|
||||||
);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
Loading…
Reference in New Issue
Block a user