Patch from bug #45592 - improve XWPF text extraction

git-svn-id: https://svn.apache.org/repos/asf/poi/branches/ooxml@684219 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Nick Burch 2008-08-09 10:45:19 +00:00
parent beb26b5c76
commit 2d50615c87
4 changed files with 21 additions and 2 deletions

View File

@ -37,6 +37,7 @@
<!-- Don't forget to update status.xml too! --> <!-- Don't forget to update status.xml too! -->
<release version="3.5.1-beta2" date="2008-??-??"> <release version="3.5.1-beta2" date="2008-??-??">
<action dev="POI-DEVELOPERS" type="add">45592 - Improve XWPF text extraction to include tables always, and picture text where possible</action>
<action dev="POI-DEVELOPERS" type="add">45545 - Improve XSLF usermodel support, and include XSLF comments in extracted text</action> <action dev="POI-DEVELOPERS" type="add">45545 - Improve XSLF usermodel support, and include XSLF comments in extracted text</action>
<action dev="POI-DEVELOPERS" type="add">45540 - Fix XSSF header and footer support, and include headers and footers in the output of XSSFExcelExtractor</action> <action dev="POI-DEVELOPERS" type="add">45540 - Fix XSSF header and footer support, and include headers and footers in the output of XSSFExcelExtractor</action>
<action dev="POI-DEVELOPERS" type="add">45431 - Support for .xlsm files, sufficient for simple files to be loaded by excel without warning</action> <action dev="POI-DEVELOPERS" type="add">45431 - Support for .xlsm files, sufficient for simple files to be loaded by excel without warning</action>

View File

@ -34,6 +34,7 @@
<!-- Don't forget to update changes.xml too! --> <!-- Don't forget to update changes.xml too! -->
<changes> <changes>
<release version="3.5.1-beta2" date="2008-??-??"> <release version="3.5.1-beta2" date="2008-??-??">
<action dev="POI-DEVELOPERS" type="add">45592 - Improve XWPF text extraction to include tables always, and picture text where possible</action>
<action dev="POI-DEVELOPERS" type="add">45545 - Improve XSLF usermodel support, and include XSLF comments in extracted text</action> <action dev="POI-DEVELOPERS" type="add">45545 - Improve XSLF usermodel support, and include XSLF comments in extracted text</action>
<action dev="POI-DEVELOPERS" type="add">45540 - Fix XSSF header and footer support, and include headers and footers in the output of XSSFExcelExtractor</action> <action dev="POI-DEVELOPERS" type="add">45540 - Fix XSSF header and footer support, and include headers and footers in the output of XSSFExcelExtractor</action>
<action dev="POI-DEVELOPERS" type="add">45431 - Support for .xlsm files, sufficient for simple files to be loaded by excel without warning</action> <action dev="POI-DEVELOPERS" type="add">45431 - Support for .xlsm files, sufficient for simple files to be loaded by excel without warning</action>

View File

@ -89,6 +89,20 @@ public class XWPFParagraph extends XMLParagraph
* including text from pictures in it. * including text from pictures in it.
*/ */
public String getText() { public String getText() {
return getParagraphText() + getPictureText();
}
/**
* Returns the text of the paragraph, but not
* of any objects in the paragraph
*/
public String getParagraphText() {
return text.toString(); return text.toString();
} }
/**
* Returns any text from any suitable
* pictures in the paragraph
*/
public String getPictureText() {
return pictureText.toString();
}
} }

View File

@ -117,9 +117,12 @@ public class TestXWPFWordExtractor extends TestCase {
assertTrue(text.startsWith( assertTrue(text.startsWith(
" \n(V) ILLUSTRATIVE CASES\n\n" " \n(V) ILLUSTRATIVE CASES\n\n"
)); ));
assertTrue(text.endsWith( assertTrue(text.contains(
"As well as gaining "+euro+"90 from child benefit increases, he will also receive the early childhood supplement of "+euro+"250 per quarter for Vincent for the full four quarters of the year.\n\n\n\n \n\n\n" "As well as gaining "+euro+"90 from child benefit increases, he will also receive the early childhood supplement of "+euro+"250 per quarter for Vincent for the full four quarters of the year.\n\n\n\n \n\n\n"
)); ));
assertTrue(text.endsWith(
"11.4%\t\t90\t\t\t\t\t250\t\t1,310\t\t\n\n"
));
// Check number of paragraphs // Check number of paragraphs
int ps = 0; int ps = 0;
@ -127,7 +130,7 @@ public class TestXWPFWordExtractor extends TestCase {
for (int i = 0; i < t.length; i++) { for (int i = 0; i < t.length; i++) {
if(t[i] == '\n') { ps++; } if(t[i] == '\n') { ps++; }
} }
assertEquals(79, ps); assertEquals(103, ps);
} }
public void testGetWithHyperlinks() throws Exception { public void testGetWithHyperlinks() throws Exception {