diff --git a/src/documentation/content/xdocs/changes.xml b/src/documentation/content/xdocs/changes.xml index 60cfd0f9f..ac0604d77 100644 --- a/src/documentation/content/xdocs/changes.xml +++ b/src/documentation/content/xdocs/changes.xml @@ -37,6 +37,7 @@ + 45592 - Improve XWPF text extraction to include tables always, and picture text where possible 45545 - Improve XSLF usermodel support, and include XSLF comments in extracted text 45540 - Fix XSSF header and footer support, and include headers and footers in the output of XSSFExcelExtractor 45431 - Support for .xlsm files, sufficient for simple files to be loaded by excel without warning diff --git a/src/documentation/content/xdocs/status.xml b/src/documentation/content/xdocs/status.xml index bc6acc93c..89114557f 100644 --- a/src/documentation/content/xdocs/status.xml +++ b/src/documentation/content/xdocs/status.xml @@ -34,6 +34,7 @@ + 45592 - Improve XWPF text extraction to include tables always, and picture text where possible 45545 - Improve XSLF usermodel support, and include XSLF comments in extracted text 45540 - Fix XSSF header and footer support, and include headers and footers in the output of XSSFExcelExtractor 45431 - Support for .xlsm files, sufficient for simple files to be loaded by excel without warning diff --git a/src/ooxml/java/org/apache/poi/xwpf/usermodel/XWPFParagraph.java b/src/ooxml/java/org/apache/poi/xwpf/usermodel/XWPFParagraph.java index 813ae140d..09fe099e8 100644 --- a/src/ooxml/java/org/apache/poi/xwpf/usermodel/XWPFParagraph.java +++ b/src/ooxml/java/org/apache/poi/xwpf/usermodel/XWPFParagraph.java @@ -89,6 +89,20 @@ public class XWPFParagraph extends XMLParagraph * including text from pictures in it. */ public String getText() { + return getParagraphText() + getPictureText(); + } + /** + * Returns the text of the paragraph, but not + * of any objects in the paragraph + */ + public String getParagraphText() { return text.toString(); } + /** + * Returns any text from any suitable + * pictures in the paragraph + */ + public String getPictureText() { + return pictureText.toString(); + } } diff --git a/src/ooxml/testcases/org/apache/poi/xwpf/extractor/TestXWPFWordExtractor.java b/src/ooxml/testcases/org/apache/poi/xwpf/extractor/TestXWPFWordExtractor.java index e62dd66ce..f29d09d02 100644 --- a/src/ooxml/testcases/org/apache/poi/xwpf/extractor/TestXWPFWordExtractor.java +++ b/src/ooxml/testcases/org/apache/poi/xwpf/extractor/TestXWPFWordExtractor.java @@ -117,9 +117,12 @@ public class TestXWPFWordExtractor extends TestCase { assertTrue(text.startsWith( " \n(V) ILLUSTRATIVE CASES\n\n" )); - assertTrue(text.endsWith( + assertTrue(text.contains( "As well as gaining "+euro+"90 from child benefit increases, he will also receive the early childhood supplement of "+euro+"250 per quarter for Vincent for the full four quarters of the year.\n\n\n\n \n\n\n" )); + assertTrue(text.endsWith( + "11.4%\t\t90\t\t\t\t\t250\t\t1,310\t\t\n\n" + )); // Check number of paragraphs int ps = 0; @@ -127,7 +130,7 @@ public class TestXWPFWordExtractor extends TestCase { for (int i = 0; i < t.length; i++) { if(t[i] == '\n') { ps++; } } - assertEquals(79, ps); + assertEquals(103, ps); } public void testGetWithHyperlinks() throws Exception {