[Bug-61354] Fix text extraction from Word documents that contain multiple body elements. This closes #66
git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@1803250 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
e1f37388fc
commit
9f28e36438
1
.gitignore
vendored
1
.gitignore
vendored
@ -45,3 +45,4 @@ sonar/*/target
|
|||||||
.ant-targets-build.xml
|
.ant-targets-build.xml
|
||||||
build
|
build
|
||||||
dist
|
dist
|
||||||
|
lib/
|
||||||
|
@ -156,26 +156,34 @@ public class XWPFDocument extends POIXMLDocument implements Document, IBody {
|
|||||||
|
|
||||||
// parse the document with cursor and add
|
// parse the document with cursor and add
|
||||||
// the XmlObject to its lists
|
// the XmlObject to its lists
|
||||||
XmlCursor cursor = ctDocument.getBody().newCursor();
|
XmlCursor docCursor = ctDocument.newCursor();
|
||||||
cursor.selectPath("./*");
|
docCursor.selectPath("./*");
|
||||||
while (cursor.toNextSelection()) {
|
while (docCursor.toNextSelection()) {
|
||||||
XmlObject o = cursor.getObject();
|
XmlObject o = docCursor.getObject();
|
||||||
if (o instanceof CTP) {
|
if (o instanceof CTBody) {
|
||||||
XWPFParagraph p = new XWPFParagraph((CTP) o, this);
|
XmlCursor bodyCursor = o.newCursor();
|
||||||
|
bodyCursor.selectPath("./*");
|
||||||
|
while (bodyCursor.toNextSelection()) {
|
||||||
|
XmlObject bodyObj = bodyCursor.getObject();
|
||||||
|
if (bodyObj instanceof CTP) {
|
||||||
|
XWPFParagraph p = new XWPFParagraph((CTP) bodyObj,
|
||||||
|
this);
|
||||||
bodyElements.add(p);
|
bodyElements.add(p);
|
||||||
paragraphs.add(p);
|
paragraphs.add(p);
|
||||||
} else if (o instanceof CTTbl) {
|
} else if (bodyObj instanceof CTTbl) {
|
||||||
XWPFTable t = new XWPFTable((CTTbl) o, this);
|
XWPFTable t = new XWPFTable((CTTbl) bodyObj, this);
|
||||||
bodyElements.add(t);
|
bodyElements.add(t);
|
||||||
tables.add(t);
|
tables.add(t);
|
||||||
} else if (o instanceof CTSdtBlock) {
|
} else if (bodyObj instanceof CTSdtBlock) {
|
||||||
XWPFSDT c = new XWPFSDT((CTSdtBlock) o, this);
|
XWPFSDT c = new XWPFSDT((CTSdtBlock) bodyObj, this);
|
||||||
bodyElements.add(c);
|
bodyElements.add(c);
|
||||||
contentControls.add(c);
|
contentControls.add(c);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
cursor.dispose();
|
bodyCursor.dispose();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
docCursor.dispose();
|
||||||
// Sort out headers and footers
|
// Sort out headers and footers
|
||||||
if (doc.getDocument().getBody().getSectPr() != null)
|
if (doc.getDocument().getBody().getSectPr() != null)
|
||||||
headerFooterPolicy = new XWPFHeaderFooterPolicy(this);
|
headerFooterPolicy = new XWPFHeaderFooterPolicy(this);
|
||||||
|
@ -411,4 +411,14 @@ public class TestXWPFWordExtractor extends TestCase {
|
|||||||
"In Sequence:\n|X||_||X|\n", extractor.getText());
|
"In Sequence:\n|X||_||X|\n", extractor.getText());
|
||||||
extractor.close();
|
extractor.close();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public void testMultipleBodyBug() throws IOException {
|
||||||
|
XWPFDocument doc = XWPFTestDataSamples.openSampleDocument("MultipleBodyBug.docx");
|
||||||
|
XWPFWordExtractor extractor = new XWPFWordExtractor(doc);
|
||||||
|
assertEquals("START BODY 1 The quick, brown fox jumps over a lazy dog. END BODY 1.\n"
|
||||||
|
+ "START BODY 2 The quick, brown fox jumps over a lazy dog. END BODY 2.\n"
|
||||||
|
+ "START BODY 3 The quick, brown fox jumps over a lazy dog. END BODY 3.\n",
|
||||||
|
extractor.getText());
|
||||||
|
extractor.close();
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
BIN
test-data/document/MultipleBodyBug.docx
Normal file
BIN
test-data/document/MultipleBodyBug.docx
Normal file
Binary file not shown.
Loading…
Reference in New Issue
Block a user