[Bug-61354] fix issue with extracting text from Word docs. This closes #66

git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@1803250 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
PJ Fanning 2017-07-28 07:42:23 +00:00
parent e1f37388fc
commit 9f28e36438
4 changed files with 37 additions and 18 deletions

1
.gitignore vendored
View File

@ -45,3 +45,4 @@ sonar/*/target
.ant-targets-build.xml .ant-targets-build.xml
build build
dist dist
lib/

View File

@ -156,26 +156,34 @@ public class XWPFDocument extends POIXMLDocument implements Document, IBody {
// parse the document with cursor and add // parse the document with cursor and add
// the XmlObject to its lists // the XmlObject to its lists
XmlCursor cursor = ctDocument.getBody().newCursor(); XmlCursor docCursor = ctDocument.newCursor();
cursor.selectPath("./*"); docCursor.selectPath("./*");
while (cursor.toNextSelection()) { while (docCursor.toNextSelection()) {
XmlObject o = cursor.getObject(); XmlObject o = docCursor.getObject();
if (o instanceof CTP) { if (o instanceof CTBody) {
XWPFParagraph p = new XWPFParagraph((CTP) o, this); XmlCursor bodyCursor = o.newCursor();
bodyElements.add(p); bodyCursor.selectPath("./*");
paragraphs.add(p); while (bodyCursor.toNextSelection()) {
} else if (o instanceof CTTbl) { XmlObject bodyObj = bodyCursor.getObject();
XWPFTable t = new XWPFTable((CTTbl) o, this); if (bodyObj instanceof CTP) {
bodyElements.add(t); XWPFParagraph p = new XWPFParagraph((CTP) bodyObj,
tables.add(t); this);
} else if (o instanceof CTSdtBlock) { bodyElements.add(p);
XWPFSDT c = new XWPFSDT((CTSdtBlock) o, this); paragraphs.add(p);
bodyElements.add(c); } else if (bodyObj instanceof CTTbl) {
contentControls.add(c); XWPFTable t = new XWPFTable((CTTbl) bodyObj, this);
bodyElements.add(t);
tables.add(t);
} else if (bodyObj instanceof CTSdtBlock) {
XWPFSDT c = new XWPFSDT((CTSdtBlock) bodyObj, this);
bodyElements.add(c);
contentControls.add(c);
}
}
bodyCursor.dispose();
} }
} }
cursor.dispose(); docCursor.dispose();
// Sort out headers and footers // Sort out headers and footers
if (doc.getDocument().getBody().getSectPr() != null) if (doc.getDocument().getBody().getSectPr() != null)
headerFooterPolicy = new XWPFHeaderFooterPolicy(this); headerFooterPolicy = new XWPFHeaderFooterPolicy(this);

View File

@ -411,4 +411,14 @@ public class TestXWPFWordExtractor extends TestCase {
"In Sequence:\n|X||_||X|\n", extractor.getText()); "In Sequence:\n|X||_||X|\n", extractor.getText());
extractor.close(); extractor.close();
} }
/**
 * Regression test for Bug 61354: text extraction from the sample
 * document "MultipleBodyBug.docx".
 *
 * The extracted text is expected to contain all three "BODY" lines, in
 * order, each terminated by a newline.
 *
 * @throws IOException if the sample document cannot be opened or the
 *         extractor cannot be closed
 */
public void testMultipleBodyBug() throws IOException {
    XWPFDocument doc = XWPFTestDataSamples.openSampleDocument("MultipleBodyBug.docx");
    XWPFWordExtractor extractor = new XWPFWordExtractor(doc);
    try {
        assertEquals("START BODY 1 The quick, brown fox jumps over a lazy dog. END BODY 1.\n"
                + "START BODY 2 The quick, brown fox jumps over a lazy dog. END BODY 2.\n"
                + "START BODY 3 The quick, brown fox jumps over a lazy dog. END BODY 3.\n",
                extractor.getText());
    } finally {
        // Close the extractor even when the assertion fails, so a failing
        // test does not leave it open.
        extractor.close();
    }
}
} }

Binary file not shown.