[Bug-61354] fix issue with extracting text from Word docs. This closes #66

git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@1803250 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
PJ Fanning 2017-07-28 07:42:23 +00:00
parent e1f37388fc
commit 9f28e36438
4 changed files with 37 additions and 18 deletions

1
.gitignore vendored
View File

@ -45,3 +45,4 @@ sonar/*/target
.ant-targets-build.xml
build
dist
lib/

View File

@ -156,26 +156,34 @@ public class XWPFDocument extends POIXMLDocument implements Document, IBody {
// parse the document with cursor and add
// the XmlObject to its lists
XmlCursor cursor = ctDocument.getBody().newCursor();
cursor.selectPath("./*");
while (cursor.toNextSelection()) {
XmlObject o = cursor.getObject();
if (o instanceof CTP) {
XWPFParagraph p = new XWPFParagraph((CTP) o, this);
XmlCursor docCursor = ctDocument.newCursor();
docCursor.selectPath("./*");
while (docCursor.toNextSelection()) {
XmlObject o = docCursor.getObject();
if (o instanceof CTBody) {
XmlCursor bodyCursor = o.newCursor();
bodyCursor.selectPath("./*");
while (bodyCursor.toNextSelection()) {
XmlObject bodyObj = bodyCursor.getObject();
if (bodyObj instanceof CTP) {
XWPFParagraph p = new XWPFParagraph((CTP) bodyObj,
this);
bodyElements.add(p);
paragraphs.add(p);
} else if (o instanceof CTTbl) {
XWPFTable t = new XWPFTable((CTTbl) o, this);
} else if (bodyObj instanceof CTTbl) {
XWPFTable t = new XWPFTable((CTTbl) bodyObj, this);
bodyElements.add(t);
tables.add(t);
} else if (o instanceof CTSdtBlock) {
XWPFSDT c = new XWPFSDT((CTSdtBlock) o, this);
} else if (bodyObj instanceof CTSdtBlock) {
XWPFSDT c = new XWPFSDT((CTSdtBlock) bodyObj, this);
bodyElements.add(c);
contentControls.add(c);
}
}
cursor.dispose();
bodyCursor.dispose();
}
}
docCursor.dispose();
// Sort out headers and footers
if (doc.getDocument().getBody().getSectPr() != null)
headerFooterPolicy = new XWPFHeaderFooterPolicy(this);

View File

@ -411,4 +411,14 @@ public class TestXWPFWordExtractor extends TestCase {
"In Sequence:\n|X||_||X|\n", extractor.getText());
extractor.close();
}
/**
 * Regression test for Bug 61354: extracts the text of the sample document
 * "MultipleBodyBug.docx" and checks that the three "BODY n" lines are all
 * returned, in order, each terminated by a newline.
 *
 * @throws IOException if opening the sample document or closing the extractor fails
 */
public void testMultipleBodyBug() throws IOException {
    XWPFDocument doc = XWPFTestDataSamples.openSampleDocument("MultipleBodyBug.docx");
    XWPFWordExtractor extractor = new XWPFWordExtractor(doc);
    try {
        assertEquals("START BODY 1 The quick, brown fox jumps over a lazy dog. END BODY 1.\n"
                + "START BODY 2 The quick, brown fox jumps over a lazy dog. END BODY 2.\n"
                + "START BODY 3 The quick, brown fox jumps over a lazy dog. END BODY 3.\n",
                extractor.getText());
    } finally {
        // Close the extractor even when the assertion fails, so a failing
        // test does not leave it open.
        extractor.close();
    }
}
}

Binary file not shown.