support for text extraction from PPT master slides, see Bugzilla 48161
git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@835271 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
4a57314dcc
commit
e8e2b1ff49
@ -34,6 +34,7 @@
|
|||||||
|
|
||||||
<changes>
|
<changes>
|
||||||
<release version="3.6-beta1" date="2009-??-??">
|
<release version="3.6-beta1" date="2009-??-??">
|
||||||
|
<action dev="POI-DEVELOPERS" type="add">48161 - support for text extraction from PPT master slides</action>
|
||||||
<action dev="POI-DEVELOPERS" type="add">47970 - added a method to set arabic mode in HSSFSheet</action>
|
<action dev="POI-DEVELOPERS" type="add">47970 - added a method to set arabic mode in HSSFSheet</action>
|
||||||
<action dev="POI-DEVELOPERS" type="fix">48134 - release system resources when using Picture.resize()</action>
|
<action dev="POI-DEVELOPERS" type="fix">48134 - release system resources when using Picture.resize()</action>
|
||||||
<action dev="POI-DEVELOPERS" type="fix">48087 - avoid NPE in XSSFChartSheet when calling methods of the superclass</action>
|
<action dev="POI-DEVELOPERS" type="fix">48087 - avoid NPE in XSSFChartSheet when calling methods of the superclass</action>
|
||||||
|
@ -45,6 +45,7 @@ public final class PowerPointExtractor extends POIOLE2TextExtractor {
|
|||||||
private boolean _slidesByDefault = true;
|
private boolean _slidesByDefault = true;
|
||||||
private boolean _notesByDefault = false;
|
private boolean _notesByDefault = false;
|
||||||
private boolean _commentsByDefault = false;
|
private boolean _commentsByDefault = false;
|
||||||
|
private boolean _masterByDefault = false;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Basic extractor. Returns all the text, and optionally all the notes
|
* Basic extractor. Returns all the text, and optionally all the notes
|
||||||
@ -58,6 +59,8 @@ public final class PowerPointExtractor extends POIOLE2TextExtractor {
|
|||||||
|
|
||||||
boolean notes = false;
|
boolean notes = false;
|
||||||
boolean comments = false;
|
boolean comments = false;
|
||||||
|
boolean master = true;
|
||||||
|
|
||||||
String file;
|
String file;
|
||||||
if (args.length > 1) {
|
if (args.length > 1) {
|
||||||
notes = true;
|
notes = true;
|
||||||
@ -70,7 +73,7 @@ public final class PowerPointExtractor extends POIOLE2TextExtractor {
|
|||||||
}
|
}
|
||||||
|
|
||||||
PowerPointExtractor ppe = new PowerPointExtractor(file);
|
PowerPointExtractor ppe = new PowerPointExtractor(file);
|
||||||
System.out.println(ppe.getText(true, notes, comments));
|
System.out.println(ppe.getText(true, notes, comments, master));
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@ -137,12 +140,19 @@ public final class PowerPointExtractor extends POIOLE2TextExtractor {
|
|||||||
this._commentsByDefault = commentsByDefault;
|
this._commentsByDefault = commentsByDefault;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Sets whether getText() (and the overloads that do not take an explicit
* master flag) should also return the text found on the slide masters.
* The default is false, i.e. master text is not included.
* Note: in the getText implementation shown in this change, master text
* is only emitted when slide text is requested as well.
*
* @param masterByDefault true to include slide master text by default
|
||||||
|
*/
|
||||||
|
public void setMasterByDefault(boolean masterByDefault) {
|
||||||
|
this._masterByDefault = masterByDefault;
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Fetches all the slide text from the slideshow, but not the notes, unless
|
* Fetches all the slide text from the slideshow, but not the notes, unless
|
||||||
* you've called setSlidesByDefault() and setNotesByDefault() to change this
|
* you've called setSlidesByDefault() and setNotesByDefault() to change this
|
||||||
*/
|
*/
|
||||||
public String getText() {
|
public String getText() {
|
||||||
return getText(_slidesByDefault, _notesByDefault, _commentsByDefault);
|
return getText(_slidesByDefault, _notesByDefault, _commentsByDefault, _masterByDefault);
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@ -178,14 +188,20 @@ public final class PowerPointExtractor extends POIOLE2TextExtractor {
|
|||||||
* @param getNoteText fetch note text
|
* @param getNoteText fetch note text
|
||||||
*/
|
*/
|
||||||
public String getText(boolean getSlideText, boolean getNoteText) {
|
public String getText(boolean getSlideText, boolean getNoteText) {
|
||||||
return getText(getSlideText, getNoteText, _commentsByDefault);
|
return getText(getSlideText, getNoteText, _commentsByDefault, _masterByDefault);
|
||||||
}
|
}
|
||||||
|
|
||||||
public String getText(boolean getSlideText, boolean getNoteText, boolean getCommentText) {
|
public String getText(boolean getSlideText, boolean getNoteText, boolean getCommentText, boolean getMasterText) {
|
||||||
StringBuffer ret = new StringBuffer();
|
StringBuffer ret = new StringBuffer();
|
||||||
|
|
||||||
if (getSlideText) {
|
if (getSlideText) {
|
||||||
for (int i = 0; i < _slides.length; i++) {
|
if (getMasterText) {
|
||||||
|
for (SlideMaster master : _show.getSlidesMasters()) {
|
||||||
|
textRunsToText(ret, master.getTextRuns());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
for (int i = 0; i < _slides.length; i++) {
|
||||||
Slide slide = _slides[i];
|
Slide slide = _slides[i];
|
||||||
|
|
||||||
// Slide header, if set
|
// Slide header, if set
|
||||||
@ -195,19 +211,9 @@ public final class PowerPointExtractor extends POIOLE2TextExtractor {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Slide text
|
// Slide text
|
||||||
TextRun[] runs = slide.getTextRuns();
|
textRunsToText(ret, slide.getTextRuns());
|
||||||
for (int j = 0; j < runs.length; j++) {
|
|
||||||
TextRun run = runs[j];
|
|
||||||
if (run != null) {
|
|
||||||
String text = run.getText();
|
|
||||||
ret.append(text);
|
|
||||||
if (!text.endsWith("\n")) {
|
|
||||||
ret.append("\n");
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Slide footer, if set
|
// Slide footer, if set
|
||||||
if (hf != null && hf.isFooterVisible() && hf.getFooterText() != null) {
|
if (hf != null && hf.isFooterVisible() && hf.getFooterText() != null) {
|
||||||
ret.append(hf.getFooterText() + "\n");
|
ret.append(hf.getFooterText() + "\n");
|
||||||
}
|
}
|
||||||
@ -249,17 +255,7 @@ public final class PowerPointExtractor extends POIOLE2TextExtractor {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Notes text
|
// Notes text
|
||||||
TextRun[] runs = notes.getTextRuns();
|
textRunsToText(ret, notes.getTextRuns());
|
||||||
if (runs != null && runs.length > 0) {
|
|
||||||
for (int j = 0; j < runs.length; j++) {
|
|
||||||
TextRun run = runs[j];
|
|
||||||
String text = run.getText();
|
|
||||||
ret.append(text);
|
|
||||||
if (!text.endsWith("\n")) {
|
|
||||||
ret.append("\n");
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Repeat the notes footer, if set
|
// Repeat the notes footer, if set
|
||||||
if (hf != null && hf.isFooterVisible() && hf.getFooterText() != null) {
|
if (hf != null && hf.isFooterVisible() && hf.getFooterText() != null) {
|
||||||
@ -270,4 +266,21 @@ public final class PowerPointExtractor extends POIOLE2TextExtractor {
|
|||||||
|
|
||||||
return ret.toString();
|
return ret.toString();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Appends the text of every non-null run in the given array to the buffer,
 * one run after another, making sure each run's text is terminated by a
 * newline. Does nothing when the array itself is null.
 *
 * @param ret  buffer that receives the extracted text
 * @param runs text runs to extract from; may be null and may contain null entries
 */
private void textRunsToText(StringBuffer ret, TextRun[] runs) {
|
||||||
|
// Callers pass getTextRuns() results straight through, so tolerate null here
if (runs==null) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
for (int j = 0; j < runs.length; j++) {
|
||||||
|
TextRun run = runs[j];
|
||||||
|
if (run != null) {
|
||||||
|
String text = run.getText();
|
||||||
|
ret.append(text);
|
||||||
|
// Keep runs on separate lines without doubling an existing trailing newline
if (!text.endsWith("\n")) {
|
||||||
|
ret.append("\n");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
@ -48,7 +48,6 @@ public final class TestExtractor extends TestCase {
|
|||||||
//private String pdirname;
|
//private String pdirname;
|
||||||
|
|
||||||
protected void setUp() throws Exception {
|
protected void setUp() throws Exception {
|
||||||
|
|
||||||
ppe = new PowerPointExtractor(slTests.openResourceAsStream("basic_test_ppt_file.ppt"));
|
ppe = new PowerPointExtractor(slTests.openResourceAsStream("basic_test_ppt_file.ppt"));
|
||||||
ppe2 = new PowerPointExtractor(slTests.openResourceAsStream("with_textbox.ppt"));
|
ppe2 = new PowerPointExtractor(slTests.openResourceAsStream("with_textbox.ppt"));
|
||||||
}
|
}
|
||||||
@ -63,7 +62,7 @@ public final class TestExtractor extends TestCase {
|
|||||||
|
|
||||||
// 1 page example with text boxes
|
// 1 page example with text boxes
|
||||||
sheetText = ppe2.getText();
|
sheetText = ppe2.getText();
|
||||||
expectText = "Hello, World!!!\nI am just a poor boy\nThis is Times New Roman\nPlain Text \n";
|
expectText = "Hello, World!!!\nI am just a poor boy\nThis is Times New Roman\nPlain Text \n";
|
||||||
|
|
||||||
ensureTwoStringsTheSame(expectText, sheetText);
|
ensureTwoStringsTheSame(expectText, sheetText);
|
||||||
}
|
}
|
||||||
@ -112,7 +111,7 @@ public final class TestExtractor extends TestCase {
|
|||||||
*/
|
*/
|
||||||
public void testMissingCoreRecords() throws Exception {
|
public void testMissingCoreRecords() throws Exception {
|
||||||
ppe = new PowerPointExtractor(slTests.openResourceAsStream("missing_core_records.ppt"));
|
ppe = new PowerPointExtractor(slTests.openResourceAsStream("missing_core_records.ppt"));
|
||||||
|
|
||||||
String text = ppe.getText(true, false);
|
String text = ppe.getText(true, false);
|
||||||
String nText = ppe.getNotes();
|
String nText = ppe.getNotes();
|
||||||
|
|
||||||
@ -265,4 +264,13 @@ public final class TestExtractor extends TestCase {
|
|||||||
private static boolean contains(String text, String searchString) {
|
private static boolean contains(String text, String searchString) {
|
||||||
return text.indexOf(searchString) >=0;
|
return text.indexOf(searchString) >=0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Checks that text placed on a slide master is returned by getText()
 * once master extraction has been switched on via setMasterByDefault(true).
 * Uses the master_text.ppt sample file.
 */
public void testMasterText() throws Exception {
|
||||||
|
ppe = new PowerPointExtractor(slTests.openResourceAsStream("master_text.ppt"));
|
||||||
|
// Master text is off by default, so it has to be enabled explicitly
ppe.setMasterByDefault(true);
|
||||||
|
|
||||||
|
String text = ppe.getText();
|
||||||
|
assertTrue(text.contains("Master Header Text"));
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
BIN
test-data/slideshow/master_text.ppt
Normal file
BIN
test-data/slideshow/master_text.ppt
Normal file
Binary file not shown.
Loading…
Reference in New Issue
Block a user