Support for text extraction of footnotes, endnotes and comments in HWPF, see Bugzilla 47400
git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@788949 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
0a044cce25
commit
12806cceaa
@ -33,6 +33,7 @@
|
||||
|
||||
<changes>
|
||||
<release version="3.5-beta7" date="2009-??-??">
|
||||
<action dev="POI-DEVELOPERS" type="add">47400 - Support for text extraction of footnotes, endnotes and comments in HWPF</action>
|
||||
<action dev="POI-DEVELOPERS" type="fix">47415 - Fixed PageSettingsBlock to allow multiple PLS records</action>
|
||||
<action dev="POI-DEVELOPERS" type="fix">47412 - Fixed concurrency issue with EscherProperties.initProps()</action>
|
||||
<action dev="POI-DEVELOPERS" type="fix">47143 - Fixed OOM in HSSFWorkbook#getAllPictures when reading .xls files containing metafiles</action>
|
||||
|
@ -344,6 +344,28 @@ public final class HWPFDocument extends POIDocument
|
||||
);
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the range which covers all the Endnotes.
|
||||
*/
|
||||
public Range getEndnoteRange() {
|
||||
return new Range(
|
||||
_cpSplit.getEndNoteStart(),
|
||||
_cpSplit.getEndNoteEnd(),
|
||||
this
|
||||
);
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the range which covers all the Comments.
|
||||
*/
|
||||
public Range getCommentsRange() {
|
||||
return new Range(
|
||||
_cpSplit.getCommentsStart(),
|
||||
_cpSplit.getCommentsEnd(),
|
||||
this
|
||||
);
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the range which covers all "Header Stories".
|
||||
* A header story contains a header, footer, end note
|
||||
|
@ -22,6 +22,7 @@ import java.io.InputStream;
|
||||
import java.io.FileInputStream;
|
||||
import java.io.UnsupportedEncodingException;
|
||||
import java.util.Iterator;
|
||||
import java.util.Arrays;
|
||||
|
||||
import org.apache.poi.POIOLE2TextExtractor;
|
||||
import org.apache.poi.hwpf.HWPFDocument;
|
||||
@ -95,34 +96,58 @@ public final class WordExtractor extends POIOLE2TextExtractor {
|
||||
* Get the text from the word file, as an array with one String
|
||||
* per paragraph
|
||||
*/
|
||||
public String[] getParagraphText() {
|
||||
String[] ret;
|
||||
public String[] getParagraphText() {
|
||||
String[] ret;
|
||||
|
||||
// Extract using the model code
|
||||
try {
|
||||
Range r = doc.getRange();
|
||||
// Extract using the model code
|
||||
try {
|
||||
Range r = doc.getRange();
|
||||
|
||||
ret = new String[r.numParagraphs()];
|
||||
for(int i=0; i<ret.length; i++) {
|
||||
Paragraph p = r.getParagraph(i);
|
||||
ret[i] = p.text();
|
||||
ret = getParagraphText(r);
|
||||
} catch (Exception e) {
|
||||
// Something's up with turning the text pieces into paragraphs
|
||||
// Fall back to ripping out the text pieces
|
||||
ret = new String[1];
|
||||
ret[0] = getTextFromPieces();
|
||||
}
|
||||
|
||||
// Fix the line ending
|
||||
if(ret[i].endsWith("\r")) {
|
||||
ret[i] = ret[i] + "\n";
|
||||
}
|
||||
}
|
||||
} catch(Exception e) {
|
||||
// Something's up with turning the text pieces into paragraphs
|
||||
// Fall back to ripping out the text pieces
|
||||
ret = new String[1];
|
||||
ret[0] = getTextFromPieces();
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
public String[] getFootnoteText() {
|
||||
Range r = doc.getFootnoteRange();
|
||||
|
||||
/**
|
||||
return getParagraphText(r);
|
||||
}
|
||||
|
||||
public String[] getEndnoteText() {
|
||||
Range r = doc.getEndnoteRange();
|
||||
|
||||
return getParagraphText(r);
|
||||
}
|
||||
|
||||
public String[] getCommentsText() {
|
||||
Range r = doc.getCommentsRange();
|
||||
|
||||
return getParagraphText(r);
|
||||
}
|
||||
|
||||
private String[] getParagraphText(Range r) {
|
||||
String[] ret;
|
||||
ret = new String[r.numParagraphs()];
|
||||
for (int i = 0; i < ret.length; i++) {
|
||||
Paragraph p = r.getParagraph(i);
|
||||
ret[i] = p.text();
|
||||
|
||||
// Fix the line ending
|
||||
if (ret[i].endsWith("\r")) {
|
||||
ret[i] = ret[i] + "\n";
|
||||
}
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
/**
|
||||
* Add the header/footer text, if it's not empty
|
||||
*/
|
||||
private void appendHeaderFooter(String text, StringBuffer out) {
|
||||
|
BIN
src/scratchpad/testcases/org/apache/poi/hwpf/data/footnote.doc
Executable file
BIN
src/scratchpad/testcases/org/apache/poi/hwpf/data/footnote.doc
Executable file
Binary file not shown.
@ -60,6 +60,8 @@ public final class TestWordExtractor extends TestCase {
|
||||
private String filename4;
|
||||
// With unicode header and footer
|
||||
private String filename5;
|
||||
// With footnote, endnote and comments
|
||||
private String filename6;
|
||||
|
||||
protected void setUp() throws Exception {
|
||||
String dirname = System.getProperty("HWPF.testdata.path");
|
||||
@ -70,6 +72,7 @@ public final class TestWordExtractor extends TestCase {
|
||||
filename3 = pdirname + "/excel_with_embeded.xls";
|
||||
filename4 = dirname + "/ThreeColHeadFoot.doc";
|
||||
filename5 = dirname + "/HeaderFooterUnicode.doc";
|
||||
filename6 = dirname + "/footnote.doc";
|
||||
|
||||
extractor = new WordExtractor(new FileInputStream(filename));
|
||||
extractor2 = new WordExtractor(new FileInputStream(filename2));
|
||||
@ -226,4 +229,49 @@ public final class TestWordExtractor extends TestCase {
|
||||
text.indexOf("The footer, with") > -1
|
||||
);
|
||||
}
|
||||
|
||||
public void testFootnote() throws Exception {
|
||||
HWPFDocument doc = new HWPFDocument(
|
||||
new FileInputStream(filename6)
|
||||
);
|
||||
extractor = new WordExtractor(doc);
|
||||
|
||||
String[] text = extractor.getFootnoteText();
|
||||
StringBuffer b = new StringBuffer();
|
||||
for (int i=0; i<text.length; i++) {
|
||||
b.append(text[i]);
|
||||
}
|
||||
|
||||
assertTrue(b.toString().contains("TestFootnote"));
|
||||
}
|
||||
|
||||
public void testEndnote() throws Exception {
|
||||
HWPFDocument doc = new HWPFDocument(
|
||||
new FileInputStream(filename6)
|
||||
);
|
||||
extractor = new WordExtractor(doc);
|
||||
|
||||
String[] text = extractor.getEndnoteText();
|
||||
StringBuffer b = new StringBuffer();
|
||||
for (int i=0; i<text.length; i++) {
|
||||
b.append(text[i]);
|
||||
}
|
||||
|
||||
assertTrue(b.toString().contains("TestEndnote"));
|
||||
}
|
||||
|
||||
public void testComments() throws Exception {
|
||||
HWPFDocument doc = new HWPFDocument(
|
||||
new FileInputStream(filename6)
|
||||
);
|
||||
extractor = new WordExtractor(doc);
|
||||
|
||||
String[] text = extractor.getCommentsText();
|
||||
StringBuffer b = new StringBuffer();
|
||||
for (int i=0; i<text.length; i++) {
|
||||
b.append(text[i]);
|
||||
}
|
||||
|
||||
assertTrue(b.toString().contains("TestComment"));
|
||||
}
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user