diff --git a/src/documentation/content/xdocs/status.xml b/src/documentation/content/xdocs/status.xml index e935ae62c..91b9cdede 100644 --- a/src/documentation/content/xdocs/status.xml +++ b/src/documentation/content/xdocs/status.xml @@ -33,6 +33,7 @@ + 47571 - Fixed XWPFWordExtractor to extract inserted/deleted text 47548 - Fixed RecordFactoryInputStream to properly read continued DrawingRecords 46419 - Fixed compatibility issue with OpenOffice 3.0 47559 - Fixed compatibility issue with Excel 2008 Mac sp2 diff --git a/src/ooxml/java/org/apache/poi/xwpf/usermodel/XWPFParagraph.java b/src/ooxml/java/org/apache/poi/xwpf/usermodel/XWPFParagraph.java index 81a4add75..2d0caf1b5 100644 --- a/src/ooxml/java/org/apache/poi/xwpf/usermodel/XWPFParagraph.java +++ b/src/ooxml/java/org/apache/poi/xwpf/usermodel/XWPFParagraph.java @@ -18,6 +18,7 @@ package org.apache.poi.xwpf.usermodel; import java.math.BigInteger; import java.util.ArrayList; +import java.util.Arrays; import org.apache.xmlbeans.XmlCursor; import org.apache.xmlbeans.XmlObject; @@ -56,23 +57,18 @@ public class XWPFParagraph { // TODO - replace this with some sort of XPath expression // to directly find all the CTRs, in the right order ArrayList rs = new ArrayList(); - CTR[] tmp; + rs.addAll(Arrays.asList(paragraph.getRArray())); - // Get the main text runs - tmp = paragraph.getRArray(); - for (int i = 0; i < tmp.length; i++) { - rs.add(tmp[i]); + for (CTSdtRun sdt : paragraph.getSdtArray()) { + CTSdtContentRun run = sdt.getSdtContent(); + rs.addAll(Arrays.asList(run.getRArray())); + } + for (CTRunTrackChange c : paragraph.getDelArray()) { + rs.addAll(Arrays.asList(c.getRArray())); } - // Not sure quite what these are, but they hold - // more text runs - CTSdtRun[] sdts = paragraph.getSdtArray(); - for (int i = 0; i < sdts.length; i++) { - CTSdtContentRun run = sdts[i].getSdtContent(); - tmp = run.getRArray(); - for (int j = 0; j < tmp.length; j++) { - rs.add(tmp[j]); - } + for (CTRunTrackChange c : paragraph.getInsArray()) { + rs.addAll(Arrays.asList(c.getRArray())); } // Get text of the paragraph diff --git a/src/ooxml/testcases/org/apache/poi/xwpf/extractor/TestXWPFWordExtractor.java b/src/ooxml/testcases/org/apache/poi/xwpf/extractor/TestXWPFWordExtractor.java index e923c40fc..fe80baff1 100644 --- a/src/ooxml/testcases/org/apache/poi/xwpf/extractor/TestXWPFWordExtractor.java +++ b/src/ooxml/testcases/org/apache/poi/xwpf/extractor/TestXWPFWordExtractor.java @@ -192,6 +192,13 @@ public class TestXWPFWordExtractor extends TestCase { assertTrue(extractor.getText().contains("XXX")); } + public void testInsertedDeletedText() throws Exception { + XWPFDocument doc = open("delins.docx"); + XWPFWordExtractor extractor = new XWPFWordExtractor(doc); + + assertTrue(extractor.getText().contains("pendant worn")); + assertTrue(extractor.getText().contains("extremely well")); + } //TODO use the same logic for opening test files as in HSSFTestDataSamples private XWPFDocument open(String sampleFileName) throws IOException { diff --git a/src/scratchpad/testcases/org/apache/poi/hwpf/data/delins.docx b/src/scratchpad/testcases/org/apache/poi/hwpf/data/delins.docx new file mode 100755 index 000000000..b53069135 Binary files /dev/null and b/src/scratchpad/testcases/org/apache/poi/hwpf/data/delins.docx differ