From 2d50615c87115bf5626e134742524381797c15b4 Mon Sep 17 00:00:00 2001 From: Nick Burch Date: Sat, 9 Aug 2008 10:45:19 +0000 Subject: [PATCH] Patch from bug #45592 - improve xwpf text extraction git-svn-id: https://svn.apache.org/repos/asf/poi/branches/ooxml@684219 13f79535-47bb-0310-9956-ffa450edef68 --- src/documentation/content/xdocs/changes.xml | 1 + src/documentation/content/xdocs/status.xml | 1 + .../apache/poi/xwpf/usermodel/XWPFParagraph.java | 14 ++++++++++++++ .../poi/xwpf/extractor/TestXWPFWordExtractor.java | 7 +++++-- 4 files changed, 21 insertions(+), 2 deletions(-) diff --git a/src/documentation/content/xdocs/changes.xml b/src/documentation/content/xdocs/changes.xml index 60cfd0f9f..ac0604d77 100644 --- a/src/documentation/content/xdocs/changes.xml +++ b/src/documentation/content/xdocs/changes.xml @@ -37,6 +37,7 @@ + 45592 - Improve XWPF text extraction to include tables always, and picture text where possible 45545 - Improve XSLF usermodel support, and include XSLF comments in extracted text 45540 - Fix XSSF header and footer support, and include headers and footers in the output of XSSFExcelExtractor 45431 - Support for .xlsm files, sufficient for simple files to be loaded by excel without warning diff --git a/src/documentation/content/xdocs/status.xml b/src/documentation/content/xdocs/status.xml index bc6acc93c..89114557f 100644 --- a/src/documentation/content/xdocs/status.xml +++ b/src/documentation/content/xdocs/status.xml @@ -34,6 +34,7 @@ + 45592 - Improve XWPF text extraction to include tables always, and picture text where possible 45545 - Improve XSLF usermodel support, and include XSLF comments in extracted text 45540 - Fix XSSF header and footer support, and include headers and footers in the output of XSSFExcelExtractor 45431 - Support for .xlsm files, sufficient for simple files to be loaded by excel without warning diff --git a/src/ooxml/java/org/apache/poi/xwpf/usermodel/XWPFParagraph.java b/src/ooxml/java/org/apache/poi/xwpf/usermodel/XWPFParagraph.java index 813ae140d..09fe099e8 100644 --- a/src/ooxml/java/org/apache/poi/xwpf/usermodel/XWPFParagraph.java +++ b/src/ooxml/java/org/apache/poi/xwpf/usermodel/XWPFParagraph.java @@ -89,6 +89,20 @@ public class XWPFParagraph extends XMLParagraph * including text from pictures in it. */ public String getText() { + return getParagraphText() + getPictureText(); + } + /** + * Returns the text of the paragraph, but not + * of any objects in the paragraph + */ + public String getParagraphText() { return text.toString(); } + /** + * Returns any text from any suitable + * pictures in the paragraph + */ + public String getPictureText() { + return pictureText.toString(); + } } diff --git a/src/ooxml/testcases/org/apache/poi/xwpf/extractor/TestXWPFWordExtractor.java b/src/ooxml/testcases/org/apache/poi/xwpf/extractor/TestXWPFWordExtractor.java index e62dd66ce..f29d09d02 100644 --- a/src/ooxml/testcases/org/apache/poi/xwpf/extractor/TestXWPFWordExtractor.java +++ b/src/ooxml/testcases/org/apache/poi/xwpf/extractor/TestXWPFWordExtractor.java @@ -117,9 +117,12 @@ public class TestXWPFWordExtractor extends TestCase { assertTrue(text.startsWith( " \n(V) ILLUSTRATIVE CASES\n\n" )); - assertTrue(text.endsWith( + assertTrue(text.contains( "As well as gaining "+euro+"90 from child benefit increases, he will also receive the early childhood supplement of "+euro+"250 per quarter for Vincent for the full four quarters of the year.\n\n\n\n \n\n\n" )); + assertTrue(text.endsWith( + "11.4%\t\t90\t\t\t\t\t250\t\t1,310\t\t\n\n" + )); // Check number of paragraphs int ps = 0; @@ -127,7 +130,7 @@ public class TestXWPFWordExtractor extends TestCase { for (int i = 0; i < t.length; i++) { if(t[i] == '\n') { ps++; } } - assertEquals(79, ps); + assertEquals(103, ps); } public void testGetWithHyperlinks() throws Exception {