BUG 54771 extract text from SDTs at the cell level within a table row

git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@1602955 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Tim Allison 2014-06-16 18:46:00 +00:00
parent 1a969ea635
commit af7b947bb9
15 changed files with 469 additions and 119 deletions

View File

@ -27,6 +27,7 @@ import org.apache.poi.openxml4j.opc.OPCPackage;
import org.apache.poi.xwpf.model.XWPFCommentsDecorator; import org.apache.poi.xwpf.model.XWPFCommentsDecorator;
import org.apache.poi.xwpf.model.XWPFHeaderFooterPolicy; import org.apache.poi.xwpf.model.XWPFHeaderFooterPolicy;
import org.apache.poi.xwpf.usermodel.IBodyElement; import org.apache.poi.xwpf.usermodel.IBodyElement;
import org.apache.poi.xwpf.usermodel.ICell;
import org.apache.poi.xwpf.usermodel.IRunElement; import org.apache.poi.xwpf.usermodel.IRunElement;
import org.apache.poi.xwpf.usermodel.XWPFDocument; import org.apache.poi.xwpf.usermodel.XWPFDocument;
import org.apache.poi.xwpf.usermodel.XWPFHyperlink; import org.apache.poi.xwpf.usermodel.XWPFHyperlink;
@ -34,6 +35,7 @@ import org.apache.poi.xwpf.usermodel.XWPFHyperlinkRun;
import org.apache.poi.xwpf.usermodel.XWPFParagraph; import org.apache.poi.xwpf.usermodel.XWPFParagraph;
import org.apache.poi.xwpf.usermodel.XWPFRelation; import org.apache.poi.xwpf.usermodel.XWPFRelation;
import org.apache.poi.xwpf.usermodel.XWPFSDT; import org.apache.poi.xwpf.usermodel.XWPFSDT;
import org.apache.poi.xwpf.usermodel.XWPFSDTCell;
import org.apache.poi.xwpf.usermodel.XWPFTable; import org.apache.poi.xwpf.usermodel.XWPFTable;
import org.apache.poi.xwpf.usermodel.XWPFTableCell; import org.apache.poi.xwpf.usermodel.XWPFTableCell;
import org.apache.poi.xwpf.usermodel.XWPFTableRow; import org.apache.poi.xwpf.usermodel.XWPFTableRow;
@ -161,14 +163,18 @@ public class XWPFWordExtractor extends POIXMLTextExtractor {
} }
private void appendTableText(StringBuffer text, XWPFTable table){ private void appendTableText(StringBuffer text, XWPFTable table) {
//this works recursively to pull embedded tables from tables //this works recursively to pull embedded tables from tables
for (XWPFTableRow row : table.getRows()){ for (XWPFTableRow row : table.getRows()) {
List<XWPFTableCell> cells = row.getTableCells(); List<ICell> cells = row.getTableICells();
for (int i = 0; i < cells.size(); i++){ for (int i = 0; i < cells.size(); i++) {
XWPFTableCell cell = cells.get(i); ICell cell = cells.get(i);
text.append(cell.getTextRecursively()); if (cell instanceof XWPFTableCell) {
if (i < cells.size()-1){ text.append(((XWPFTableCell)cell).getTextRecursively());
} else if (cell instanceof XWPFSDTCell) {
text.append(((XWPFSDTCell)cell).getContent().getText());
}
if (i < cells.size()-1) {
text.append("\t"); text.append("\t");
} }
} }

View File

@ -0,0 +1,113 @@
/* ====================================================================
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==================================================================== */
package org.apache.poi.xwpf.usermodel;
import java.util.List;
import org.apache.poi.POIXMLDocumentPart;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTSdtPr;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTString;
/**
 * Experimental abstract class that is a base for XWPFSDT and XWPFSDTCell
 * <p>
 * WARNING - APIs expected to change rapidly.
 * <p>
 * These classes have so far been built only for read-only processing.
 */
public abstract class AbstractXWPFSDT implements ISDTContents {
    private final String title;
    private final String tag;
    private final IBody part;

    /**
     * Reads the first alias (title) and the first tag from the SDT properties.
     *
     * @param pr   the SDT properties; may be <code>null</code> (the sdtPr child
     *             is optional in the WordprocessingML schema), in which case
     *             both title and tag are the empty string
     * @param part the body this SDT belongs to
     */
    public AbstractXWPFSDT(CTSdtPr pr, IBody part){
        // Guard against a missing sdtPr instead of failing with a NullPointerException.
        this.title = firstValueOrEmpty(pr == null ? null : pr.getAliasList());
        this.tag = firstValueOrEmpty(pr == null ? null : pr.getTagList());
        this.part = part;
    }

    /**
     * @return the value of the first entry, or "" if the list is null or empty
     */
    private static String firstValueOrEmpty(List<CTString> values) {
        if (values == null || values.isEmpty()) {
            return "";
        }
        return values.get(0).getVal();
    }

    /**
     *
     * @return first SDT Title
     */
    public String getTitle(){
        return title;
    }

    /**
     *
     * @return first SDT Tag
     */
    public String getTag(){
        return tag;
    }

    /**
     *
     * @return the content object
     */
    public abstract ISDTContent getContent();

    /**
     *
     * @return null
     */
    public IBody getBody() {
        return null;
    }

    /**
     *
     * @return document part
     */
    public POIXMLDocumentPart getPart() {
        return part.getPart();
    }

    /**
     *
     * @return partType
     */
    public BodyType getPartType() {
        return BodyType.CONTENTCONTROL;
    }

    /**
     *
     * @return element type
     */
    public BodyElementType getElementType() {
        return BodyElementType.CONTENTCONTROL;
    }

    /**
     *
     * @return the document of the body this SDT belongs to
     */
    public XWPFDocument getDocument() {
        return part.getXWPFDocument();
    }
}

View File

@ -0,0 +1,27 @@
/* ====================================================================
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==================================================================== */
package org.apache.poi.xwpf.usermodel;
/**
 * Interface for anything that can be at a table cell level:
 * {@link XWPFTableCell}, {@link XWPFSDTCell}
 * <p>
 * This is a marker interface: it declares no methods.
 * <p>
 * Schematically something like this:
 * &lt;tr&gt;&lt;tc/&gt;&lt;tc/&gt;&lt;sdt&gt;&lt;tc/&gt;&lt;/sdt&gt;&lt;/tr&gt;
 */
public interface ICell {
}

View File

@ -17,7 +17,7 @@
package org.apache.poi.xwpf.usermodel; package org.apache.poi.xwpf.usermodel;
/** /**
* Interface for anything that can be within a STD: * Interface for anything that can be within an SDT:
* {@link XWPFRun}, {@link XWPFTable}, {@link XWPFParagraph}, * {@link XWPFRun}, {@link XWPFTable}, {@link XWPFParagraph},
* {@link XWPFSDT} etc * {@link XWPFSDT} etc
*/ */

View File

@ -16,95 +16,32 @@
==================================================================== */ ==================================================================== */
package org.apache.poi.xwpf.usermodel; package org.apache.poi.xwpf.usermodel;
import java.util.List;
import org.apache.poi.POIXMLDocumentPart;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTSdtBlock; import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTSdtBlock;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTSdtPr;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTSdtRun; import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTSdtRun;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTString;
/** /**
* Experimental class to offer rudimentary read-only processing of * Experimental class to offer rudimentary read-only processing of
* of StructuredDocumentTags/ContentControl * of StructuredDocumentTags/ContentControl
*
*
* *
* WARNING - APIs expected to change rapidly * WARNING - APIs expected to change rapidly
* *
*/ */
public class XWPFSDT implements IBodyElement, IRunBody, ISDTContents, IRunElement { public class XWPFSDT extends AbstractXWPFSDT
private final String title; implements IBodyElement, IRunBody, ISDTContents, IRunElement {
private final String tag; private final ISDTContent content;
private final XWPFSDTContent content;
private final IBody part;
public XWPFSDT(CTSdtRun sdtRun, IBody part){ public XWPFSDT(CTSdtRun sdtRun, IBody part){
this.part = part; super(sdtRun.getSdtPr(), part);
this.content = new XWPFSDTContent(sdtRun.getSdtContent(), part, this); this.content = new XWPFSDTContent(sdtRun.getSdtContent(), part, this);
CTSdtPr pr = sdtRun.getSdtPr();
List<CTString> aliases = pr.getAliasList();
if (aliases != null && aliases.size() > 0){
title = aliases.get(0).getVal();
} else {
title = "";
}
@SuppressWarnings("deprecation")
CTString[] array = pr.getTagArray();
if (array != null && array.length > 0){
tag = array[0].getVal();
} else {
tag = "";
}
} }
public XWPFSDT(CTSdtBlock block, IBody part){ public XWPFSDT(CTSdtBlock block, IBody part){
this.part = part; super(block.getSdtPr(), part);
this.content = new XWPFSDTContent( block.getSdtContent(), part, this); this.content = new XWPFSDTContent( block.getSdtContent(), part, this);
CTSdtPr pr = block.getSdtPr();
List<CTString> aliases = pr.getAliasList();
if (aliases != null && aliases.size() > 0){
title = aliases.get(0).getVal();
} else {
title = "";
}
@SuppressWarnings("deprecation")
CTString[] array = pr.getTagArray();
if (array != null && array.length > 0){
tag = array[0].getVal();
} else {
tag = "";
}
} }
public String getTitle(){
return title; public ISDTContent getContent(){
}
public String getTag(){
return tag;
}
public XWPFSDTContent getContent(){
return content; return content;
} }
public IBody getBody() {
// TODO Auto-generated method stub
return null;
}
public POIXMLDocumentPart getPart() {
return part.getPart();
}
public BodyType getPartType() {
return BodyType.CONTENTCONTROL;
}
public BodyElementType getElementType() {
return BodyElementType.CONTENTCONTROL;
}
public XWPFDocument getDocument() {
return part.getXWPFDocument();
}
} }

View File

@ -0,0 +1,44 @@
/* ====================================================================
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==================================================================== */
package org.apache.poi.xwpf.usermodel;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTSdtCell;
/**
 * Experimental class offering rudimentary read-only processing
 * of a StructuredDocumentTag/ContentControl that appears
 * in a table row in the position of a table cell.
 * <p>
 * These can contain one or more cells or other SDTs within them.
 * <p>
 * WARNING - APIs expected to change rapidly
 */
public class XWPFSDTCell extends AbstractXWPFSDT implements ICell {

    /** Text-only view of whatever this cell-level SDT wraps. */
    private final XWPFSDTContentCell content;

    /**
     * @param sdtCell      the underlying cell-level SDT bean
     * @param xwpfTableRow the row containing this SDT
     * @param part         the body the enclosing table belongs to
     */
    public XWPFSDTCell(CTSdtCell sdtCell, XWPFTableRow xwpfTableRow, IBody part){
        super(sdtCell.getSdtPr(), part);
        XWPFSDTContentCell wrapped =
                new XWPFSDTContentCell(sdtCell.getSdtContent(), xwpfTableRow, part);
        this.content = wrapped;
    }

    /**
     * @return the content of this cell-level SDT
     */
    @Override
    public ISDTContent getContent(){
        return this.content;
    }
}

View File

@ -39,7 +39,7 @@ import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTTbl;
* WARNING - APIs expected to change rapidly * WARNING - APIs expected to change rapidly
* *
*/ */
public class XWPFSDTContent { public class XWPFSDTContent implements ISDTContent {
// private final IBody part; // private final IBody part;
// private final XWPFDocument document; // private final XWPFDocument document;
@ -87,10 +87,10 @@ public class XWPFSDTContent {
for (int i = 0; i < bodyElements.size(); i++){ for (int i = 0; i < bodyElements.size(); i++){
Object o = bodyElements.get(i); Object o = bodyElements.get(i);
if (o instanceof XWPFParagraph){ if (o instanceof XWPFParagraph){
text.append(((XWPFParagraph)o).getText()); appendParagraph((XWPFParagraph)o, text);
addNewLine = true; addNewLine = true;
} else if (o instanceof XWPFTable){ } else if (o instanceof XWPFTable){
text.append(((XWPFTable)o).getText()); appendTable((XWPFTable)o, text);
addNewLine = true; addNewLine = true;
} else if (o instanceof XWPFSDT){ } else if (o instanceof XWPFSDT){
text.append(((XWPFSDT)o).getContent().getText()); text.append(((XWPFSDT)o).getContent().getText());
@ -106,6 +106,31 @@ public class XWPFSDTContent {
return text.toString(); return text.toString();
} }
/**
 * Appends the text of every row of the table: cells are separated by a tab
 * and each row is terminated by a newline. Regular cells contribute their
 * recursive text (so embedded tables are included); cell-level SDTs
 * contribute the text of their content.
 */
private void appendTable(XWPFTable table, StringBuilder text) {
    for (XWPFTableRow tableRow : table.getRows()) {
        boolean firstCell = true;
        for (ICell current : tableRow.getTableICells()) {
            // separator goes between cells, never after the last one
            if (!firstCell) {
                text.append("\t");
            }
            firstCell = false;

            if (current instanceof XWPFTableCell) {
                XWPFTableCell plainCell = (XWPFTableCell) current;
                text.append(plainCell.getTextRecursively());
            } else if (current instanceof XWPFSDTCell) {
                XWPFSDTCell sdtCell = (XWPFSDTCell) current;
                text.append(sdtCell.getContent().getText());
            }
        }
        text.append('\n');
    }
}
/**
 * Appends the string form of each run of the paragraph, in order,
 * without adding any separator between runs.
 */
private void appendParagraph(XWPFParagraph paragraph, StringBuilder text) {
    for (IRunElement element : paragraph.getRuns()) {
        String runText = element.toString();
        text.append(runText);
    }
}
public String toString(){ public String toString(){
return getText(); return getText();
} }

View File

@ -0,0 +1,114 @@
/* ====================================================================
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==================================================================== */
package org.apache.poi.xwpf.usermodel;
import javax.xml.namespace.QName;
import org.apache.xmlbeans.XmlCursor;
import org.apache.xmlbeans.XmlCursor.TokenType;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTSdtContentCell;
/**
 * Experimental class to offer rudimentary read-only processing
 * of the content of a cell-level SDT (see {@link XWPFSDTCell}).
 * <p>
 * The text is extracted once, in the constructor, by walking the XML tokens
 * below the content element.
 * <p>
 * WARNING - APIs expected to change rapidly
 */
public class XWPFSDTContentCell implements ISDTContent {

    //A full implementation would grab the icells
    //that a content cell can contain.  This would require
    //significant changes, including changing the notion that the
    //parent of a cell can be not just a row, but an sdt.
    //For now we are just grabbing the text out of the text tokentypes.

    //private List<ICell> cells = new ArrayList<ICell>().

    private final String text;

    /**
     * @param sdtContentCell the content element to extract text from
     * @param xwpfTableRow   the row containing the SDT (currently unused)
     * @param part           the body the enclosing table belongs to (currently unused)
     */
    public XWPFSDTContentCell(CTSdtContentCell sdtContentCell,
                              XWPFTableRow xwpfTableRow, IBody part){
        StringBuilder sb = new StringBuilder();
        XmlCursor cursor = sdtContentCell.newCursor();
        try {
            //keep track of the following,
            //and add "\n" only before the start of a body
            //element if it is not the first body element.

            //index of cell in row
            int tcCnt = 0;
            //count of body objects
            int iBodyCnt = 0;
            //starts at 1 for the element the cursor was created on;
            //the walk stops once that element has been closed
            int depth = 1;

            while (cursor.hasNextToken() && depth > 0) {
                TokenType t = cursor.toNextToken();
                if (t.isText()){
                    sb.append(cursor.getTextValue());
                } else if (isStartToken(cursor, "tr")) {
                    tcCnt = 0;
                    iBodyCnt = 0;
                } else if (isStartToken(cursor, "tc")) {
                    if (tcCnt++ > 0) {
                        sb.append("\t");
                    }
                    iBodyCnt = 0;
                } else if (isStartToken(cursor, "p") ||
                        isStartToken(cursor, "tbl") ||
                        isStartToken(cursor, "sdt")) {
                    if (iBodyCnt > 0) {
                        sb.append("\n");
                    }
                    iBodyCnt++;
                }
                if (cursor.isStart()){
                    depth++;
                } else if (cursor.isEnd()){
                    depth--;
                }
            }
        } finally {
            //cursors hold resources of the underlying store; always release them
            cursor.dispose();
        }
        text = sb.toString();
    }

    /**
     * @return true if the cursor is on a start token whose local name equals <code>string</code>
     */
    private static boolean isStartToken(XmlCursor cursor, String string) {
        if (! cursor.isStart()) {
            return false;
        }
        QName qName = cursor.getName();
        return qName != null && string.equals(qName.getLocalPart());
    }

    /**
     * @return the text extracted at construction time
     */
    public String getText(){
        return text;
    }

    public String toString(){
        return getText();
    }
}

View File

@ -159,6 +159,13 @@ public class XWPFTable implements IBodyElement, ISDTContents {
} }
/** /**
* Convenience method to extract text in cells. This
* does not extract text recursively in cells, and it does not
* currently include text in SDT (form) components.
* <p>
* To get all text within a table, see XWPFWordExtractor's appendTableText
* as an example.
*
* @return text * @return text
*/ */
public String getText() { public String getText() {

View File

@ -42,7 +42,7 @@ import org.openxmlformats.schemas.wordprocessingml.x2006.main.STVerticalJc;
* Represents a Cell within a {@link XWPFTable}. The * Represents a Cell within a {@link XWPFTable}. The
* Cell is the thing that holds the actual content (paragraphs etc) * Cell is the thing that holds the actual content (paragraphs etc)
*/ */
public class XWPFTableCell implements IBody { public class XWPFTableCell implements IBody, ICell {
private final CTTc ctTc; private final CTTc ctTc;
protected List<XWPFParagraph> paragraphs = null; protected List<XWPFParagraph> paragraphs = null;
protected List<XWPFTable> tables = null; protected List<XWPFTable> tables = null;

View File

@ -21,9 +21,12 @@ import java.util.ArrayList;
import java.util.List; import java.util.List;
import org.apache.poi.util.Internal; import org.apache.poi.util.Internal;
import org.apache.xmlbeans.XmlCursor;
import org.apache.xmlbeans.XmlObject;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTHeight; import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTHeight;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTOnOff; import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTOnOff;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTRow; import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTRow;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTSdtCell;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTTc; import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTTc;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTTrPr; import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTTrPr;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.STOnOff; import org.openxmlformats.schemas.wordprocessingml.x2006.main.STOnOff;
@ -121,6 +124,29 @@ public class XWPFTableRow {
return table; return table;
} }
/**
 * create and return a list of everything at cell level in this row:
 * regular table cells ({@link XWPFTableCell}) as well as
 * SDTs that stand in for cells ({@link XWPFSDTCell}).
 * <p>
 * New wrapper objects are created on every call.
 * @return a list of {@link ICell}
 */
public List<ICell> getTableICells(){

    List<ICell> cells = new ArrayList<ICell>();
    //Can't use ctRow.getTcList because that only gets table cells
    //Can't use ctRow.getSdtList because that only gets sdts that are at cell level
    XmlCursor cursor = ctRow.newCursor();
    try {
        cursor.selectPath("./*");
        while (cursor.toNextSelection()) {
            XmlObject o = cursor.getObject();
            if (o instanceof CTTc){
                cells.add(new XWPFTableCell((CTTc)o, this, table.getBody()));
            } else if (o instanceof CTSdtCell) {
                cells.add(new XWPFSDTCell((CTSdtCell)o, this, table.getBody()));
            }
        }
    } finally {
        //release the cursor even if wrapping one of the children fails
        cursor.dispose();
    }
    return cells;
}
/** /**
* create and return a list of all XWPFTableCell * create and return a list of all XWPFTableCell
* who belongs to this row * who belongs to this row

View File

@ -18,6 +18,8 @@
package org.apache.poi.xwpf.extractor; package org.apache.poi.xwpf.extractor;
import java.io.IOException; import java.io.IOException;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import junit.framework.TestCase; import junit.framework.TestCase;
@ -327,12 +329,14 @@ public class TestXWPFWordExtractor extends TestCase {
String[] targs = new String[]{ String[] targs = new String[]{
"header_rich_text", "header_rich_text",
"rich_text", "rich_text",
"rich_text_pre_table\nrich_text_cell1\t\t\t\n\nrich_text_post_table", "rich_text_pre_table\nrich_text_cell1\t\t\t\n\t\t\t\n\t\t\t\n\nrich_text_post_table",
"plain_text_no_newlines", "plain_text_no_newlines",
"plain_text_with_newlines1\nplain_text_with_newlines2\n", "plain_text_with_newlines1\nplain_text_with_newlines2\n",
"watermelon\n", "watermelon\n",
"dirt\n", "dirt\n",
"4/16/2013\n", "4/16/2013\n",
"rich_text_in_cell",
"abc",
"rich_text_in_paragraph_in_cell", "rich_text_in_paragraph_in_cell",
"footer_rich_text", "footer_rich_text",
"footnote_sdt", "footnote_sdt",
@ -352,6 +356,36 @@ public class TestXWPFWordExtractor extends TestCase {
} }
assertEquals("controlled content loading hit count", targs.length, hits); assertEquals("controlled content loading hit count", targs.length, hits);
ex.close(); ex.close();
doc = XWPFTestDataSamples.openSampleDocument("Bug54771a.docx");
targs = new String[]{
"bb",
"test subtitle\n",
"test user\n",
};
ex = new XWPFWordExtractor(doc);
s = ex.getText().toLowerCase();
//At one point in development there were three copies of the text.
//This ensures that there is only one copy.
for (String targ : targs){
Matcher m = Pattern.compile(targ).matcher(s);
int hit = 0;
while (m.find()) {
hit++;
}
assertEquals("controlled content loading-"+targ, 1, hit);
}
//"test\n" appears twice: once as the "title" and once in the text.
//This also happens when you save this document as text from MSWord.
Matcher m = Pattern.compile("test\n").matcher(s);
int hit = 0;
while (m.find()){
hit++;
}
assertEquals("test<N>", 2, hit);
ex.close();
} }
/** No Header or Footer in document */ /** No Header or Footer in document */

View File

@ -18,8 +18,10 @@
package org.apache.poi.xwpf.usermodel; package org.apache.poi.xwpf.usermodel;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.HashSet;
import java.util.List; import java.util.List;
import java.util.Map; import java.util.Map;
import java.util.Set;
import junit.framework.TestCase; import junit.framework.TestCase;
@ -35,15 +37,16 @@ public final class TestXWPFSDT extends TestCase {
XWPFDocument doc = XWPFTestDataSamples.openSampleDocument("Bug54849.docx"); XWPFDocument doc = XWPFTestDataSamples.openSampleDocument("Bug54849.docx");
String tag = null; String tag = null;
String title= null; String title= null;
List<XWPFSDT> sdts = extractAllSDTs(doc); List<AbstractXWPFSDT> sdts = extractAllSDTs(doc);
for (XWPFSDT sdt :sdts){ for (AbstractXWPFSDT sdt :sdts){
if (sdt.getContent().toString().equals("Rich_text")){ if (sdt.getContent().toString().equals("Rich_text")){
tag = "MyTag"; tag = "MyTag";
title = "MyTitle"; title = "MyTitle";
break; break;
} }
} }
assertEquals("controls size", 12, sdts.size()); assertEquals("controls size", 13, sdts.size());
assertEquals("tag", "MyTag", tag); assertEquals("tag", "MyTag", tag);
assertEquals("title", "MyTitle", title); assertEquals("title", "MyTitle", title);
@ -54,12 +57,13 @@ public final class TestXWPFSDT extends TestCase {
String[] contents = new String[]{ String[] contents = new String[]{
"header_rich_text", "header_rich_text",
"Rich_text", "Rich_text",
"Rich_text_pre_table\nRich_text_cell1\t\t\t\n\nRich_text_post_table", "Rich_text_pre_table\nRich_text_cell1\t\t\t\n\t\t\t\n\t\t\t\n\nRich_text_post_table",
"Plain_text_no_newlines", "Plain_text_no_newlines",
"Plain_text_with_newlines1\nplain_text_with_newlines2", "Plain_text_with_newlines1\nplain_text_with_newlines2",
"Watermelon", "Watermelon",
"Dirt", "Dirt",
"4/16/2013", "4/16/2013",
"Rich_text_in_cell",
"rich_text_in_paragraph_in_cell", "rich_text_in_paragraph_in_cell",
"Footer_rich_text", "Footer_rich_text",
"Footnote_sdt", "Footnote_sdt",
@ -67,31 +71,40 @@ public final class TestXWPFSDT extends TestCase {
}; };
XWPFDocument doc = XWPFTestDataSamples.openSampleDocument("Bug54849.docx"); XWPFDocument doc = XWPFTestDataSamples.openSampleDocument("Bug54849.docx");
List<XWPFSDT> sdts = extractAllSDTs(doc); List<AbstractXWPFSDT> sdts = extractAllSDTs(doc);
assertEquals("number of sdts", contents.length, sdts.size()); assertEquals("number of sdts", contents.length, sdts.size());
for (int i = 0; i < sdts.size(); i++){//contents.length; i++){ for (int i = 0; i < contents.length; i++){
XWPFSDT sdt = sdts.get(i); AbstractXWPFSDT sdt = sdts.get(i);
assertEquals(i+ ": " + contents[i], contents[i], sdt.getContent().toString()); assertEquals(i+ ": " + contents[i], contents[i], sdt.getContent().toString());
} }
} }
/**
* POI-54771 and TIKA-1317
*/
public void testSDTAsCell() throws Exception {
//Bug54771a.docx and Bug54771b.docx test slightly
//different recursion patterns. Keep both!
XWPFDocument doc = XWPFTestDataSamples.openSampleDocument("Bug54771a.docx");
List<AbstractXWPFSDT> sdts = extractAllSDTs(doc);
String text = sdts.get(0).getContent().getText();
assertEquals(2, sdts.size());
assertTrue(text.indexOf("Test") > -1);
text = sdts.get(1).getContent().getText();
assertTrue(text.indexOf("Test Subtitle") > -1);
assertTrue(text.indexOf("Test User") > -1);
assertTrue(text.indexOf("Test") < text.indexOf("Test Subtitle"));
doc = XWPFTestDataSamples.openSampleDocument("Bug54771b.docx");
sdts = extractAllSDTs(doc);
assertEquals(3, sdts.size());
assertTrue(sdts.get(0).getContent().getText().indexOf("Test") > -1);
assertTrue(sdts.get(1).getContent().getText().indexOf("Test Subtitle") > -1);
assertTrue(sdts.get(2).getContent().getText().indexOf("Test User") > -1);
public void testFailureToGetSDTAsCell() throws Exception{
/**
* The current code fails to extract an sdt if it comprises/is the parent
* of a cell in a table.
*/
XWPFDocument doc = XWPFTestDataSamples.openSampleDocument("Bug54849.docx");
List<XWPFSDT> sdts = extractAllSDTs(doc);
boolean found = false;
for (XWPFSDT sdt : sdts){
if (sdt.getContent().getText().toLowerCase().indexOf("rich_text_in_cell") > -1){
found = true;
}
}
assertEquals("SDT as cell known failure", false, found);
} }
/** /**
@ -99,7 +112,7 @@ public final class TestXWPFSDT extends TestCase {
*/ */
public void testNewLinesBetweenRuns() throws Exception{ public void testNewLinesBetweenRuns() throws Exception{
XWPFDocument doc = XWPFTestDataSamples.openSampleDocument("Bug55142.docx"); XWPFDocument doc = XWPFTestDataSamples.openSampleDocument("Bug55142.docx");
List<XWPFSDT> sdts = extractAllSDTs(doc); List<AbstractXWPFSDT> sdts = extractAllSDTs(doc);
List<String> targs = new ArrayList<String>(); List<String> targs = new ArrayList<String>();
//these test newlines and tabs in paragraphs/body elements //these test newlines and tabs in paragraphs/body elements
targs.add("Rich-text1 abcdefghi"); targs.add("Rich-text1 abcdefghi");
@ -114,14 +127,14 @@ public final class TestXWPFSDT extends TestCase {
targs.add("sdt_incell2 abcdefg"); targs.add("sdt_incell2 abcdefg");
for (int i = 0; i < sdts.size(); i++){ for (int i = 0; i < sdts.size(); i++){
XWPFSDT sdt = sdts.get(i); AbstractXWPFSDT sdt = sdts.get(i);
assertEquals(targs.get(i), targs.get(i), sdt.getContent().getText()); assertEquals(targs.get(i), targs.get(i), sdt.getContent().getText());
} }
} }
private List<XWPFSDT> extractAllSDTs(XWPFDocument doc){ private List<AbstractXWPFSDT> extractAllSDTs(XWPFDocument doc){
List<XWPFSDT> sdts = new ArrayList<XWPFSDT>(); List<AbstractXWPFSDT> sdts = new ArrayList<AbstractXWPFSDT>();
List<XWPFHeader> headers = doc.getHeaderList(); List<XWPFHeader> headers = doc.getHeaderList();
for (XWPFHeader header : headers){ for (XWPFHeader header : headers){
@ -135,7 +148,6 @@ public final class TestXWPFSDT extends TestCase {
} }
for (XWPFFootnote footnote : doc.getFootnotes()){ for (XWPFFootnote footnote : doc.getFootnotes()){
sdts.addAll(extractSDTsFromBodyElements(footnote.getBodyElements())); sdts.addAll(extractSDTsFromBodyElements(footnote.getBodyElements()));
} }
for (Map.Entry<Integer, XWPFFootnote> e : doc.endnotes.entrySet()){ for (Map.Entry<Integer, XWPFFootnote> e : doc.endnotes.entrySet()){
@ -144,8 +156,8 @@ public final class TestXWPFSDT extends TestCase {
return sdts; return sdts;
} }
private List<XWPFSDT> extractSDTsFromBodyElements(List<IBodyElement> elements){ private List<AbstractXWPFSDT> extractSDTsFromBodyElements(List<IBodyElement> elements){
List<XWPFSDT> sdts = new ArrayList<XWPFSDT>(); List<AbstractXWPFSDT> sdts = new ArrayList<AbstractXWPFSDT>();
for (IBodyElement e : elements){ for (IBodyElement e : elements){
if (e instanceof XWPFSDT){ if (e instanceof XWPFSDT){
XWPFSDT sdt = (XWPFSDT)e; XWPFSDT sdt = (XWPFSDT)e;
@ -167,11 +179,16 @@ public final class TestXWPFSDT extends TestCase {
return sdts; return sdts;
} }
private List<XWPFSDT> extractSDTsFromTable(XWPFTable table){ private List<AbstractXWPFSDT> extractSDTsFromTable(XWPFTable table) {
List<XWPFSDT> sdts = new ArrayList<XWPFSDT>();
for (XWPFTableRow r : table.getRows()){ List<AbstractXWPFSDT> sdts = new ArrayList<AbstractXWPFSDT>();
for (XWPFTableCell c : r.getTableCells()){ for (XWPFTableRow r : table.getRows()) {
sdts.addAll(extractSDTsFromBodyElements(c.getBodyElements())); for (ICell c : r.getTableICells()) {
if (c instanceof XWPFSDTCell) {
sdts.add((XWPFSDTCell)c);
} else if (c instanceof XWPFTableCell) {
sdts.addAll(extractSDTsFromBodyElements(((XWPFTableCell)c).getBodyElements()));
}
} }
} }
return sdts; return sdts;

Binary file not shown.

Binary file not shown.