BUG 54771 extract text from SDTs at the cell level within a table row
git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@1602955 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
1a969ea635
commit
af7b947bb9
@ -27,6 +27,7 @@ import org.apache.poi.openxml4j.opc.OPCPackage;
|
|||||||
import org.apache.poi.xwpf.model.XWPFCommentsDecorator;
|
import org.apache.poi.xwpf.model.XWPFCommentsDecorator;
|
||||||
import org.apache.poi.xwpf.model.XWPFHeaderFooterPolicy;
|
import org.apache.poi.xwpf.model.XWPFHeaderFooterPolicy;
|
||||||
import org.apache.poi.xwpf.usermodel.IBodyElement;
|
import org.apache.poi.xwpf.usermodel.IBodyElement;
|
||||||
|
import org.apache.poi.xwpf.usermodel.ICell;
|
||||||
import org.apache.poi.xwpf.usermodel.IRunElement;
|
import org.apache.poi.xwpf.usermodel.IRunElement;
|
||||||
import org.apache.poi.xwpf.usermodel.XWPFDocument;
|
import org.apache.poi.xwpf.usermodel.XWPFDocument;
|
||||||
import org.apache.poi.xwpf.usermodel.XWPFHyperlink;
|
import org.apache.poi.xwpf.usermodel.XWPFHyperlink;
|
||||||
@ -34,6 +35,7 @@ import org.apache.poi.xwpf.usermodel.XWPFHyperlinkRun;
|
|||||||
import org.apache.poi.xwpf.usermodel.XWPFParagraph;
|
import org.apache.poi.xwpf.usermodel.XWPFParagraph;
|
||||||
import org.apache.poi.xwpf.usermodel.XWPFRelation;
|
import org.apache.poi.xwpf.usermodel.XWPFRelation;
|
||||||
import org.apache.poi.xwpf.usermodel.XWPFSDT;
|
import org.apache.poi.xwpf.usermodel.XWPFSDT;
|
||||||
|
import org.apache.poi.xwpf.usermodel.XWPFSDTCell;
|
||||||
import org.apache.poi.xwpf.usermodel.XWPFTable;
|
import org.apache.poi.xwpf.usermodel.XWPFTable;
|
||||||
import org.apache.poi.xwpf.usermodel.XWPFTableCell;
|
import org.apache.poi.xwpf.usermodel.XWPFTableCell;
|
||||||
import org.apache.poi.xwpf.usermodel.XWPFTableRow;
|
import org.apache.poi.xwpf.usermodel.XWPFTableRow;
|
||||||
@ -161,14 +163,18 @@ public class XWPFWordExtractor extends POIXMLTextExtractor {
|
|||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
private void appendTableText(StringBuffer text, XWPFTable table){
|
private void appendTableText(StringBuffer text, XWPFTable table) {
|
||||||
//this works recursively to pull embedded tables from tables
|
//this works recursively to pull embedded tables from tables
|
||||||
for (XWPFTableRow row : table.getRows()){
|
for (XWPFTableRow row : table.getRows()) {
|
||||||
List<XWPFTableCell> cells = row.getTableCells();
|
List<ICell> cells = row.getTableICells();
|
||||||
for (int i = 0; i < cells.size(); i++){
|
for (int i = 0; i < cells.size(); i++) {
|
||||||
XWPFTableCell cell = cells.get(i);
|
ICell cell = cells.get(i);
|
||||||
text.append(cell.getTextRecursively());
|
if (cell instanceof XWPFTableCell) {
|
||||||
if (i < cells.size()-1){
|
text.append(((XWPFTableCell)cell).getTextRecursively());
|
||||||
|
} else if (cell instanceof XWPFSDTCell) {
|
||||||
|
text.append(((XWPFSDTCell)cell).getContent().getText());
|
||||||
|
}
|
||||||
|
if (i < cells.size()-1) {
|
||||||
text.append("\t");
|
text.append("\t");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -0,0 +1,113 @@
|
|||||||
|
/* ====================================================================
|
||||||
|
Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
contributor license agreements. See the NOTICE file distributed with
|
||||||
|
this work for additional information regarding copyright ownership.
|
||||||
|
The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
(the "License"); you may not use this file except in compliance with
|
||||||
|
the License. You may obtain a copy of the License at
|
||||||
|
|
||||||
|
http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
|
||||||
|
Unless required by applicable law or agreed to in writing, software
|
||||||
|
distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
See the License for the specific language governing permissions and
|
||||||
|
limitations under the License.
|
||||||
|
==================================================================== */
|
||||||
|
package org.apache.poi.xwpf.usermodel;
|
||||||
|
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
import org.apache.poi.POIXMLDocumentPart;
|
||||||
|
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTSdtPr;
|
||||||
|
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTString;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Experimental abstract class that is a base for XWPFSDT and XWPFSDTCell
|
||||||
|
*
|
||||||
|
* WARNING - APIs expected to change rapidly.
|
||||||
|
*
|
||||||
|
* These classes have so far been built only for read-only processing.
|
||||||
|
*
|
||||||
|
*/
|
||||||
|
public abstract class AbstractXWPFSDT implements ISDTContents {
|
||||||
|
private final String title;
|
||||||
|
private final String tag;
|
||||||
|
private final IBody part;
|
||||||
|
|
||||||
|
public AbstractXWPFSDT(CTSdtPr pr, IBody part){
|
||||||
|
|
||||||
|
List<CTString> aliases = pr.getAliasList();
|
||||||
|
if (aliases != null && aliases.size() > 0){
|
||||||
|
title = aliases.get(0).getVal();
|
||||||
|
} else {
|
||||||
|
title = "";
|
||||||
|
}
|
||||||
|
List<CTString> tags = pr.getTagList();
|
||||||
|
if (tags != null && tags.size() > 0){
|
||||||
|
tag = tags.get(0).getVal();
|
||||||
|
} else {
|
||||||
|
tag = "";
|
||||||
|
}
|
||||||
|
this.part = part;
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
*
|
||||||
|
* @return first SDT Title
|
||||||
|
*/
|
||||||
|
public String getTitle(){
|
||||||
|
return title;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
*
|
||||||
|
* @return first SDT Tag
|
||||||
|
*/
|
||||||
|
public String getTag(){
|
||||||
|
return tag;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
*
|
||||||
|
* @return the content object
|
||||||
|
*/
|
||||||
|
public abstract ISDTContent getContent();
|
||||||
|
|
||||||
|
/**
|
||||||
|
*
|
||||||
|
* @return null
|
||||||
|
*/
|
||||||
|
public IBody getBody() {
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
*
|
||||||
|
* @return document part
|
||||||
|
*/
|
||||||
|
public POIXMLDocumentPart getPart() {
|
||||||
|
return part.getPart();
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
*
|
||||||
|
* @return partType
|
||||||
|
*/
|
||||||
|
public BodyType getPartType() {
|
||||||
|
return BodyType.CONTENTCONTROL;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
*
|
||||||
|
* @return element type
|
||||||
|
*/
|
||||||
|
public BodyElementType getElementType() {
|
||||||
|
return BodyElementType.CONTENTCONTROL;
|
||||||
|
}
|
||||||
|
|
||||||
|
public XWPFDocument getDocument() {
|
||||||
|
return part.getXWPFDocument();
|
||||||
|
}
|
||||||
|
}
|
27
src/ooxml/java/org/apache/poi/xwpf/usermodel/ICell.java
Normal file
27
src/ooxml/java/org/apache/poi/xwpf/usermodel/ICell.java
Normal file
@ -0,0 +1,27 @@
|
|||||||
|
/* ====================================================================
|
||||||
|
Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
contributor license agreements. See the NOTICE file distributed with
|
||||||
|
this work for additional information regarding copyright ownership.
|
||||||
|
The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
(the "License"); you may not use this file except in compliance with
|
||||||
|
the License. You may obtain a copy of the License at
|
||||||
|
|
||||||
|
http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
|
||||||
|
Unless required by applicable law or agreed to in writing, software
|
||||||
|
distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
See the License for the specific language governing permissions and
|
||||||
|
limitations under the License.
|
||||||
|
==================================================================== */
|
||||||
|
package org.apache.poi.xwpf.usermodel;
|
||||||
|
|
||||||
|
/**
 * Marker interface for anything that can be at a table cell level:
 * {@link XWPFTableCell}, {@link XWPFSDTCell}
 * <p>
 * Schematically something like this:
 * <p>
 * {@code <tr><tc/><tc/><sdt><tc/></sdt></tr>}
 */
public interface ICell {
}
|
@ -17,7 +17,7 @@
|
|||||||
package org.apache.poi.xwpf.usermodel;
|
package org.apache.poi.xwpf.usermodel;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Interface for anything that can be within a STD:
|
* Interface for anything that can be within an SDT:
|
||||||
* {@link XWPFRun}, {@link XWPFTable}, {@link XWPFParagraph},
|
* {@link XWPFRun}, {@link XWPFTable}, {@link XWPFParagraph},
|
||||||
* {@link XWPFSDT} etc
|
* {@link XWPFSDT} etc
|
||||||
*/
|
*/
|
||||||
|
@ -16,95 +16,32 @@
|
|||||||
==================================================================== */
|
==================================================================== */
|
||||||
package org.apache.poi.xwpf.usermodel;
|
package org.apache.poi.xwpf.usermodel;
|
||||||
|
|
||||||
import java.util.List;
|
|
||||||
|
|
||||||
import org.apache.poi.POIXMLDocumentPart;
|
|
||||||
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTSdtBlock;
|
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTSdtBlock;
|
||||||
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTSdtPr;
|
|
||||||
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTSdtRun;
|
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTSdtRun;
|
||||||
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTString;
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Experimental class to offer rudimentary read-only processing of
|
* Experimental class to offer rudimentary read-only processing of
|
||||||
* of StructuredDocumentTags/ContentControl
|
* of StructuredDocumentTags/ContentControl
|
||||||
*
|
*
|
||||||
*
|
|
||||||
*
|
|
||||||
* WARNING - APIs expected to change rapidly
|
* WARNING - APIs expected to change rapidly
|
||||||
*
|
*
|
||||||
*/
|
*/
|
||||||
public class XWPFSDT implements IBodyElement, IRunBody, ISDTContents, IRunElement {
|
public class XWPFSDT extends AbstractXWPFSDT
|
||||||
private final String title;
|
implements IBodyElement, IRunBody, ISDTContents, IRunElement {
|
||||||
private final String tag;
|
private final ISDTContent content;
|
||||||
private final XWPFSDTContent content;
|
|
||||||
private final IBody part;
|
|
||||||
|
|
||||||
public XWPFSDT(CTSdtRun sdtRun, IBody part){
|
public XWPFSDT(CTSdtRun sdtRun, IBody part){
|
||||||
this.part = part;
|
super(sdtRun.getSdtPr(), part);
|
||||||
this.content = new XWPFSDTContent(sdtRun.getSdtContent(), part, this);
|
this.content = new XWPFSDTContent(sdtRun.getSdtContent(), part, this);
|
||||||
CTSdtPr pr = sdtRun.getSdtPr();
|
|
||||||
List<CTString> aliases = pr.getAliasList();
|
|
||||||
if (aliases != null && aliases.size() > 0){
|
|
||||||
title = aliases.get(0).getVal();
|
|
||||||
} else {
|
|
||||||
title = "";
|
|
||||||
}
|
|
||||||
@SuppressWarnings("deprecation")
|
|
||||||
CTString[] array = pr.getTagArray();
|
|
||||||
if (array != null && array.length > 0){
|
|
||||||
tag = array[0].getVal();
|
|
||||||
} else {
|
|
||||||
tag = "";
|
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
|
||||||
public XWPFSDT(CTSdtBlock block, IBody part){
|
public XWPFSDT(CTSdtBlock block, IBody part){
|
||||||
this.part = part;
|
super(block.getSdtPr(), part);
|
||||||
this.content = new XWPFSDTContent( block.getSdtContent(), part, this);
|
this.content = new XWPFSDTContent( block.getSdtContent(), part, this);
|
||||||
CTSdtPr pr = block.getSdtPr();
|
|
||||||
List<CTString> aliases = pr.getAliasList();
|
|
||||||
if (aliases != null && aliases.size() > 0){
|
|
||||||
title = aliases.get(0).getVal();
|
|
||||||
} else {
|
|
||||||
title = "";
|
|
||||||
}
|
|
||||||
@SuppressWarnings("deprecation")
|
|
||||||
CTString[] array = pr.getTagArray();
|
|
||||||
if (array != null && array.length > 0){
|
|
||||||
tag = array[0].getVal();
|
|
||||||
} else {
|
|
||||||
tag = "";
|
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
public ISDTContent getContent(){
|
||||||
public String getTitle(){
|
|
||||||
return title;
|
|
||||||
}
|
|
||||||
public String getTag(){
|
|
||||||
return tag;
|
|
||||||
}
|
|
||||||
public XWPFSDTContent getContent(){
|
|
||||||
return content;
|
return content;
|
||||||
}
|
}
|
||||||
|
|
||||||
public IBody getBody() {
|
|
||||||
// TODO Auto-generated method stub
|
|
||||||
return null;
|
|
||||||
}
|
|
||||||
|
|
||||||
public POIXMLDocumentPart getPart() {
|
|
||||||
return part.getPart();
|
|
||||||
}
|
|
||||||
|
|
||||||
public BodyType getPartType() {
|
|
||||||
return BodyType.CONTENTCONTROL;
|
|
||||||
}
|
|
||||||
|
|
||||||
public BodyElementType getElementType() {
|
|
||||||
return BodyElementType.CONTENTCONTROL;
|
|
||||||
}
|
|
||||||
|
|
||||||
public XWPFDocument getDocument() {
|
|
||||||
return part.getXWPFDocument();
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
@ -0,0 +1,44 @@
|
|||||||
|
/* ====================================================================
|
||||||
|
Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
contributor license agreements. See the NOTICE file distributed with
|
||||||
|
this work for additional information regarding copyright ownership.
|
||||||
|
The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
(the "License"); you may not use this file except in compliance with
|
||||||
|
the License. You may obtain a copy of the License at
|
||||||
|
|
||||||
|
http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
|
||||||
|
Unless required by applicable law or agreed to in writing, software
|
||||||
|
distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
See the License for the specific language governing permissions and
|
||||||
|
limitations under the License.
|
||||||
|
==================================================================== */
|
||||||
|
package org.apache.poi.xwpf.usermodel;
|
||||||
|
|
||||||
|
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTSdtCell;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Experimental class to offer rudimentary read-only processing of
|
||||||
|
* of StructuredDocumentTags/ContentControl that can appear
|
||||||
|
* in a table row as if a table cell.
|
||||||
|
* <p>
|
||||||
|
* These can contain one or more cells or other SDTs within them.
|
||||||
|
*
|
||||||
|
* WARNING - APIs expected to change rapidly
|
||||||
|
*
|
||||||
|
*/
|
||||||
|
public class XWPFSDTCell extends AbstractXWPFSDT implements ICell {
|
||||||
|
private final XWPFSDTContentCell cellContent;
|
||||||
|
|
||||||
|
public XWPFSDTCell(CTSdtCell sdtCell, XWPFTableRow xwpfTableRow, IBody part){
|
||||||
|
super(sdtCell.getSdtPr(), part);
|
||||||
|
cellContent = new XWPFSDTContentCell(sdtCell.getSdtContent(), xwpfTableRow, part);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public ISDTContent getContent(){
|
||||||
|
return cellContent;
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
@ -39,7 +39,7 @@ import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTTbl;
|
|||||||
* WARNING - APIs expected to change rapidly
|
* WARNING - APIs expected to change rapidly
|
||||||
*
|
*
|
||||||
*/
|
*/
|
||||||
public class XWPFSDTContent {
|
public class XWPFSDTContent implements ISDTContent {
|
||||||
|
|
||||||
// private final IBody part;
|
// private final IBody part;
|
||||||
// private final XWPFDocument document;
|
// private final XWPFDocument document;
|
||||||
@ -87,10 +87,10 @@ public class XWPFSDTContent {
|
|||||||
for (int i = 0; i < bodyElements.size(); i++){
|
for (int i = 0; i < bodyElements.size(); i++){
|
||||||
Object o = bodyElements.get(i);
|
Object o = bodyElements.get(i);
|
||||||
if (o instanceof XWPFParagraph){
|
if (o instanceof XWPFParagraph){
|
||||||
text.append(((XWPFParagraph)o).getText());
|
appendParagraph((XWPFParagraph)o, text);
|
||||||
addNewLine = true;
|
addNewLine = true;
|
||||||
} else if (o instanceof XWPFTable){
|
} else if (o instanceof XWPFTable){
|
||||||
text.append(((XWPFTable)o).getText());
|
appendTable((XWPFTable)o, text);
|
||||||
addNewLine = true;
|
addNewLine = true;
|
||||||
} else if (o instanceof XWPFSDT){
|
} else if (o instanceof XWPFSDT){
|
||||||
text.append(((XWPFSDT)o).getContent().getText());
|
text.append(((XWPFSDT)o).getContent().getText());
|
||||||
@ -106,6 +106,31 @@ public class XWPFSDTContent {
|
|||||||
return text.toString();
|
return text.toString();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private void appendTable(XWPFTable table, StringBuilder text) {
|
||||||
|
//this works recursively to pull embedded tables from within cells
|
||||||
|
for (XWPFTableRow row : table.getRows()) {
|
||||||
|
List<ICell> cells = row.getTableICells();
|
||||||
|
for (int i = 0; i < cells.size(); i++) {
|
||||||
|
ICell cell = cells.get(i);
|
||||||
|
if (cell instanceof XWPFTableCell) {
|
||||||
|
text.append(((XWPFTableCell)cell).getTextRecursively());
|
||||||
|
} else if (cell instanceof XWPFSDTCell) {
|
||||||
|
text.append(((XWPFSDTCell)cell).getContent().getText());
|
||||||
|
}
|
||||||
|
if (i < cells.size()-1) {
|
||||||
|
text.append("\t");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
text.append('\n');
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private void appendParagraph(XWPFParagraph paragraph, StringBuilder text) {
|
||||||
|
for(IRunElement run : paragraph.getRuns()) {
|
||||||
|
text.append(run.toString());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
public String toString(){
|
public String toString(){
|
||||||
return getText();
|
return getText();
|
||||||
}
|
}
|
||||||
|
@ -0,0 +1,114 @@
|
|||||||
|
/* ====================================================================
|
||||||
|
Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
contributor license agreements. See the NOTICE file distributed with
|
||||||
|
this work for additional information regarding copyright ownership.
|
||||||
|
The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
(the "License"); you may not use this file except in compliance with
|
||||||
|
the License. You may obtain a copy of the License at
|
||||||
|
|
||||||
|
http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
|
||||||
|
Unless required by applicable law or agreed to in writing, software
|
||||||
|
distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
See the License for the specific language governing permissions and
|
||||||
|
limitations under the License.
|
||||||
|
==================================================================== */
|
||||||
|
package org.apache.poi.xwpf.usermodel;
|
||||||
|
|
||||||
|
|
||||||
|
import javax.xml.namespace.QName;
|
||||||
|
|
||||||
|
import org.apache.xmlbeans.XmlCursor;
|
||||||
|
import org.apache.xmlbeans.XmlCursor.TokenType;
|
||||||
|
|
||||||
|
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTSdtContentCell;
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Experimental class to offer rudimentary read-only processing of
|
||||||
|
* of the XWPFSDTCellContent.
|
||||||
|
|
||||||
|
* WARNING - APIs expected to change rapidly
|
||||||
|
*
|
||||||
|
*/
|
||||||
|
public class XWPFSDTContentCell implements ISDTContent {
|
||||||
|
|
||||||
|
//A full implementation would grab the icells
|
||||||
|
//that a content cell can contain. This would require
|
||||||
|
//significant changes, including changing the notion that the
|
||||||
|
//parent of a cell can be not just a row, but an sdt.
|
||||||
|
//For now we are just grabbing the text out of the text tokentypes.
|
||||||
|
|
||||||
|
//private List<ICell> cells = new ArrayList<ICell>().
|
||||||
|
|
||||||
|
private String text = "";
|
||||||
|
public XWPFSDTContentCell(CTSdtContentCell sdtContentCell,
|
||||||
|
XWPFTableRow xwpfTableRow, IBody part){
|
||||||
|
super();
|
||||||
|
StringBuilder sb = new StringBuilder();
|
||||||
|
XmlCursor cursor = sdtContentCell.newCursor();
|
||||||
|
|
||||||
|
//keep track of the following,
|
||||||
|
//and add "\n" only before the start of a body
|
||||||
|
//element if it is not the first body element.
|
||||||
|
|
||||||
|
//index of cell in row
|
||||||
|
int tcCnt = 0;
|
||||||
|
//count of body objects
|
||||||
|
int iBodyCnt = 0;
|
||||||
|
int depth = 1;
|
||||||
|
|
||||||
|
while (cursor.hasNextToken() && depth > 0) {
|
||||||
|
TokenType t = cursor.toNextToken();
|
||||||
|
if (t.isText()){
|
||||||
|
sb.append(cursor.getTextValue());
|
||||||
|
} else if (isStartToken(cursor, "tr")) {
|
||||||
|
tcCnt = 0;
|
||||||
|
iBodyCnt = 0;
|
||||||
|
} else if (isStartToken(cursor, "tc")) {
|
||||||
|
if (tcCnt++ > 0) {
|
||||||
|
sb.append("\t");
|
||||||
|
}
|
||||||
|
iBodyCnt = 0;
|
||||||
|
} else if (isStartToken(cursor, "p") ||
|
||||||
|
isStartToken(cursor, "tbl") ||
|
||||||
|
isStartToken(cursor, "sdt")) {
|
||||||
|
if (iBodyCnt > 0) {
|
||||||
|
sb.append("\n");
|
||||||
|
}
|
||||||
|
iBodyCnt++;
|
||||||
|
}
|
||||||
|
if (cursor.isStart()){
|
||||||
|
depth++;
|
||||||
|
} else if (cursor.isEnd()){
|
||||||
|
depth--;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
text = sb.toString();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
private boolean isStartToken(XmlCursor cursor, String string) {
|
||||||
|
if (! cursor.isStart()) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
QName qName = cursor.getName();
|
||||||
|
if (qName != null && qName.getLocalPart() != null &&
|
||||||
|
qName.getLocalPart().equals(string)) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public String getText(){
|
||||||
|
return text;
|
||||||
|
}
|
||||||
|
|
||||||
|
public String toString(){
|
||||||
|
return getText();
|
||||||
|
}
|
||||||
|
}
|
@ -159,6 +159,13 @@ public class XWPFTable implements IBodyElement, ISDTContents {
|
|||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
* Convenience method to extract text in cells. This
|
||||||
|
* does not extract text recursively in cells, and it does not
|
||||||
|
* currently include text in SDT (form) components.
|
||||||
|
* <p>
|
||||||
|
* To get all text within a table, see XWPFWordExtractor's appendTableText
|
||||||
|
* as an example.
|
||||||
|
*
|
||||||
* @return text
|
* @return text
|
||||||
*/
|
*/
|
||||||
public String getText() {
|
public String getText() {
|
||||||
|
@ -42,7 +42,7 @@ import org.openxmlformats.schemas.wordprocessingml.x2006.main.STVerticalJc;
|
|||||||
* Represents a Cell within a {@link XWPFTable}. The
|
* Represents a Cell within a {@link XWPFTable}. The
|
||||||
* Cell is the thing that holds the actual content (paragraphs etc)
|
* Cell is the thing that holds the actual content (paragraphs etc)
|
||||||
*/
|
*/
|
||||||
public class XWPFTableCell implements IBody {
|
public class XWPFTableCell implements IBody, ICell {
|
||||||
private final CTTc ctTc;
|
private final CTTc ctTc;
|
||||||
protected List<XWPFParagraph> paragraphs = null;
|
protected List<XWPFParagraph> paragraphs = null;
|
||||||
protected List<XWPFTable> tables = null;
|
protected List<XWPFTable> tables = null;
|
||||||
|
@ -21,9 +21,12 @@ import java.util.ArrayList;
|
|||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
|
||||||
import org.apache.poi.util.Internal;
|
import org.apache.poi.util.Internal;
|
||||||
|
import org.apache.xmlbeans.XmlCursor;
|
||||||
|
import org.apache.xmlbeans.XmlObject;
|
||||||
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTHeight;
|
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTHeight;
|
||||||
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTOnOff;
|
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTOnOff;
|
||||||
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTRow;
|
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTRow;
|
||||||
|
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTSdtCell;
|
||||||
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTTc;
|
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTTc;
|
||||||
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTTrPr;
|
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTTrPr;
|
||||||
import org.openxmlformats.schemas.wordprocessingml.x2006.main.STOnOff;
|
import org.openxmlformats.schemas.wordprocessingml.x2006.main.STOnOff;
|
||||||
@ -121,6 +124,29 @@ public class XWPFTableRow {
|
|||||||
return table;
|
return table;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* create and return a list of all XWPFTableCell
|
||||||
|
* who belongs to this row
|
||||||
|
* @return a list of {@link XWPFTableCell}
|
||||||
|
*/
|
||||||
|
public List<ICell> getTableICells(){
|
||||||
|
|
||||||
|
List<ICell> cells = new ArrayList<ICell>();
|
||||||
|
//Can't use ctRow.getTcList because that only gets table cells
|
||||||
|
//Can't use ctRow.getSdtList because that only gets sdts that are at cell level
|
||||||
|
XmlCursor cursor = ctRow.newCursor();
|
||||||
|
cursor.selectPath("./*");
|
||||||
|
while (cursor.toNextSelection()) {
|
||||||
|
XmlObject o = cursor.getObject();
|
||||||
|
if (o instanceof CTTc){
|
||||||
|
cells.add(new XWPFTableCell((CTTc)o, this, table.getBody()));
|
||||||
|
} else if (o instanceof CTSdtCell) {
|
||||||
|
cells.add(new XWPFSDTCell((CTSdtCell)o, this, table.getBody()));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return cells;
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* create and return a list of all XWPFTableCell
|
* create and return a list of all XWPFTableCell
|
||||||
* who belongs to this row
|
* who belongs to this row
|
||||||
|
@ -18,6 +18,8 @@
|
|||||||
package org.apache.poi.xwpf.extractor;
|
package org.apache.poi.xwpf.extractor;
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
|
import java.util.regex.Matcher;
|
||||||
|
import java.util.regex.Pattern;
|
||||||
|
|
||||||
import junit.framework.TestCase;
|
import junit.framework.TestCase;
|
||||||
|
|
||||||
@ -327,12 +329,14 @@ public class TestXWPFWordExtractor extends TestCase {
|
|||||||
String[] targs = new String[]{
|
String[] targs = new String[]{
|
||||||
"header_rich_text",
|
"header_rich_text",
|
||||||
"rich_text",
|
"rich_text",
|
||||||
"rich_text_pre_table\nrich_text_cell1\t\t\t\n\nrich_text_post_table",
|
"rich_text_pre_table\nrich_text_cell1\t\t\t\n\t\t\t\n\t\t\t\n\nrich_text_post_table",
|
||||||
"plain_text_no_newlines",
|
"plain_text_no_newlines",
|
||||||
"plain_text_with_newlines1\nplain_text_with_newlines2\n",
|
"plain_text_with_newlines1\nplain_text_with_newlines2\n",
|
||||||
"watermelon\n",
|
"watermelon\n",
|
||||||
"dirt\n",
|
"dirt\n",
|
||||||
"4/16/2013\n",
|
"4/16/2013\n",
|
||||||
|
"rich_text_in_cell",
|
||||||
|
"abc",
|
||||||
"rich_text_in_paragraph_in_cell",
|
"rich_text_in_paragraph_in_cell",
|
||||||
"footer_rich_text",
|
"footer_rich_text",
|
||||||
"footnote_sdt",
|
"footnote_sdt",
|
||||||
@ -352,6 +356,36 @@ public class TestXWPFWordExtractor extends TestCase {
|
|||||||
}
|
}
|
||||||
assertEquals("controlled content loading hit count", targs.length, hits);
|
assertEquals("controlled content loading hit count", targs.length, hits);
|
||||||
ex.close();
|
ex.close();
|
||||||
|
|
||||||
|
|
||||||
|
doc = XWPFTestDataSamples.openSampleDocument("Bug54771a.docx");
|
||||||
|
targs = new String[]{
|
||||||
|
"bb",
|
||||||
|
"test subtitle\n",
|
||||||
|
"test user\n",
|
||||||
|
};
|
||||||
|
ex = new XWPFWordExtractor(doc);
|
||||||
|
s = ex.getText().toLowerCase();
|
||||||
|
|
||||||
|
//At one point in development there were three copies of the text.
|
||||||
|
//This ensures that there is only one copy.
|
||||||
|
for (String targ : targs){
|
||||||
|
Matcher m = Pattern.compile(targ).matcher(s);
|
||||||
|
int hit = 0;
|
||||||
|
while (m.find()) {
|
||||||
|
hit++;
|
||||||
|
}
|
||||||
|
assertEquals("controlled content loading-"+targ, 1, hit);
|
||||||
|
}
|
||||||
|
//"test\n" appears twice: once as the "title" and once in the text.
|
||||||
|
//This also happens when you save this document as text from MSWord.
|
||||||
|
Matcher m = Pattern.compile("test\n").matcher(s);
|
||||||
|
int hit = 0;
|
||||||
|
while (m.find()){
|
||||||
|
hit++;
|
||||||
|
}
|
||||||
|
assertEquals("test<N>", 2, hit);
|
||||||
|
ex.close();
|
||||||
}
|
}
|
||||||
|
|
||||||
/** No Header or Footer in document */
|
/** No Header or Footer in document */
|
||||||
|
@ -18,8 +18,10 @@
|
|||||||
package org.apache.poi.xwpf.usermodel;
|
package org.apache.poi.xwpf.usermodel;
|
||||||
|
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
|
import java.util.HashSet;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
|
import java.util.Set;
|
||||||
|
|
||||||
import junit.framework.TestCase;
|
import junit.framework.TestCase;
|
||||||
|
|
||||||
@ -35,15 +37,16 @@ public final class TestXWPFSDT extends TestCase {
|
|||||||
XWPFDocument doc = XWPFTestDataSamples.openSampleDocument("Bug54849.docx");
|
XWPFDocument doc = XWPFTestDataSamples.openSampleDocument("Bug54849.docx");
|
||||||
String tag = null;
|
String tag = null;
|
||||||
String title= null;
|
String title= null;
|
||||||
List<XWPFSDT> sdts = extractAllSDTs(doc);
|
List<AbstractXWPFSDT> sdts = extractAllSDTs(doc);
|
||||||
for (XWPFSDT sdt :sdts){
|
for (AbstractXWPFSDT sdt :sdts){
|
||||||
if (sdt.getContent().toString().equals("Rich_text")){
|
if (sdt.getContent().toString().equals("Rich_text")){
|
||||||
tag = "MyTag";
|
tag = "MyTag";
|
||||||
title = "MyTitle";
|
title = "MyTitle";
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
assertEquals("controls size", 12, sdts.size());
|
assertEquals("controls size", 13, sdts.size());
|
||||||
|
|
||||||
assertEquals("tag", "MyTag", tag);
|
assertEquals("tag", "MyTag", tag);
|
||||||
assertEquals("title", "MyTitle", title);
|
assertEquals("title", "MyTitle", title);
|
||||||
@ -54,12 +57,13 @@ public final class TestXWPFSDT extends TestCase {
|
|||||||
String[] contents = new String[]{
|
String[] contents = new String[]{
|
||||||
"header_rich_text",
|
"header_rich_text",
|
||||||
"Rich_text",
|
"Rich_text",
|
||||||
"Rich_text_pre_table\nRich_text_cell1\t\t\t\n\nRich_text_post_table",
|
"Rich_text_pre_table\nRich_text_cell1\t\t\t\n\t\t\t\n\t\t\t\n\nRich_text_post_table",
|
||||||
"Plain_text_no_newlines",
|
"Plain_text_no_newlines",
|
||||||
"Plain_text_with_newlines1\nplain_text_with_newlines2",
|
"Plain_text_with_newlines1\nplain_text_with_newlines2",
|
||||||
"Watermelon",
|
"Watermelon",
|
||||||
"Dirt",
|
"Dirt",
|
||||||
"4/16/2013",
|
"4/16/2013",
|
||||||
|
"Rich_text_in_cell",
|
||||||
"rich_text_in_paragraph_in_cell",
|
"rich_text_in_paragraph_in_cell",
|
||||||
"Footer_rich_text",
|
"Footer_rich_text",
|
||||||
"Footnote_sdt",
|
"Footnote_sdt",
|
||||||
@ -67,31 +71,40 @@ public final class TestXWPFSDT extends TestCase {
|
|||||||
|
|
||||||
};
|
};
|
||||||
XWPFDocument doc = XWPFTestDataSamples.openSampleDocument("Bug54849.docx");
|
XWPFDocument doc = XWPFTestDataSamples.openSampleDocument("Bug54849.docx");
|
||||||
List<XWPFSDT> sdts = extractAllSDTs(doc);
|
List<AbstractXWPFSDT> sdts = extractAllSDTs(doc);
|
||||||
|
|
||||||
assertEquals("number of sdts", contents.length, sdts.size());
|
assertEquals("number of sdts", contents.length, sdts.size());
|
||||||
|
|
||||||
for (int i = 0; i < sdts.size(); i++){//contents.length; i++){
|
for (int i = 0; i < contents.length; i++){
|
||||||
XWPFSDT sdt = sdts.get(i);
|
AbstractXWPFSDT sdt = sdts.get(i);
|
||||||
|
|
||||||
assertEquals(i+ ": " + contents[i], contents[i], sdt.getContent().toString());
|
assertEquals(i+ ": " + contents[i], contents[i], sdt.getContent().toString());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
public void testFailureToGetSDTAsCell() throws Exception{
|
|
||||||
/**
|
/**
|
||||||
* The current code fails to extract an sdt if it comprises/is the parent
|
* POI-54771 and TIKA-1317
|
||||||
* of a cell in a table.
|
|
||||||
*/
|
*/
|
||||||
XWPFDocument doc = XWPFTestDataSamples.openSampleDocument("Bug54849.docx");
|
public void testSDTAsCell() throws Exception {
|
||||||
List<XWPFSDT> sdts = extractAllSDTs(doc);
|
//Bug54771a.docx and Bug54771b.docx test slightly
|
||||||
boolean found = false;
|
//different recursion patterns. Keep both!
|
||||||
for (XWPFSDT sdt : sdts){
|
XWPFDocument doc = XWPFTestDataSamples.openSampleDocument("Bug54771a.docx");
|
||||||
if (sdt.getContent().getText().toLowerCase().indexOf("rich_text_in_cell") > -1){
|
List<AbstractXWPFSDT> sdts = extractAllSDTs(doc);
|
||||||
found = true;
|
String text = sdts.get(0).getContent().getText();
|
||||||
}
|
assertEquals(2, sdts.size());
|
||||||
}
|
assertTrue(text.indexOf("Test") > -1);
|
||||||
assertEquals("SDT as cell known failure", false, found);
|
|
||||||
|
text = sdts.get(1).getContent().getText();
|
||||||
|
assertTrue(text.indexOf("Test Subtitle") > -1);
|
||||||
|
assertTrue(text.indexOf("Test User") > -1);
|
||||||
|
assertTrue(text.indexOf("Test") < text.indexOf("Test Subtitle"));
|
||||||
|
|
||||||
|
doc = XWPFTestDataSamples.openSampleDocument("Bug54771b.docx");
|
||||||
|
sdts = extractAllSDTs(doc);
|
||||||
|
assertEquals(3, sdts.size());
|
||||||
|
assertTrue(sdts.get(0).getContent().getText().indexOf("Test") > -1);
|
||||||
|
|
||||||
|
assertTrue(sdts.get(1).getContent().getText().indexOf("Test Subtitle") > -1);
|
||||||
|
assertTrue(sdts.get(2).getContent().getText().indexOf("Test User") > -1);
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@ -99,7 +112,7 @@ public final class TestXWPFSDT extends TestCase {
|
|||||||
*/
|
*/
|
||||||
public void testNewLinesBetweenRuns() throws Exception{
|
public void testNewLinesBetweenRuns() throws Exception{
|
||||||
XWPFDocument doc = XWPFTestDataSamples.openSampleDocument("Bug55142.docx");
|
XWPFDocument doc = XWPFTestDataSamples.openSampleDocument("Bug55142.docx");
|
||||||
List<XWPFSDT> sdts = extractAllSDTs(doc);
|
List<AbstractXWPFSDT> sdts = extractAllSDTs(doc);
|
||||||
List<String> targs = new ArrayList<String>();
|
List<String> targs = new ArrayList<String>();
|
||||||
//these test newlines and tabs in paragraphs/body elements
|
//these test newlines and tabs in paragraphs/body elements
|
||||||
targs.add("Rich-text1 abcdefghi");
|
targs.add("Rich-text1 abcdefghi");
|
||||||
@ -114,14 +127,14 @@ public final class TestXWPFSDT extends TestCase {
|
|||||||
targs.add("sdt_incell2 abcdefg");
|
targs.add("sdt_incell2 abcdefg");
|
||||||
|
|
||||||
for (int i = 0; i < sdts.size(); i++){
|
for (int i = 0; i < sdts.size(); i++){
|
||||||
XWPFSDT sdt = sdts.get(i);
|
AbstractXWPFSDT sdt = sdts.get(i);
|
||||||
assertEquals(targs.get(i), targs.get(i), sdt.getContent().getText());
|
assertEquals(targs.get(i), targs.get(i), sdt.getContent().getText());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
private List<XWPFSDT> extractAllSDTs(XWPFDocument doc){
|
private List<AbstractXWPFSDT> extractAllSDTs(XWPFDocument doc){
|
||||||
|
|
||||||
List<XWPFSDT> sdts = new ArrayList<XWPFSDT>();
|
List<AbstractXWPFSDT> sdts = new ArrayList<AbstractXWPFSDT>();
|
||||||
|
|
||||||
List<XWPFHeader> headers = doc.getHeaderList();
|
List<XWPFHeader> headers = doc.getHeaderList();
|
||||||
for (XWPFHeader header : headers){
|
for (XWPFHeader header : headers){
|
||||||
@ -135,7 +148,6 @@ public final class TestXWPFSDT extends TestCase {
|
|||||||
}
|
}
|
||||||
|
|
||||||
for (XWPFFootnote footnote : doc.getFootnotes()){
|
for (XWPFFootnote footnote : doc.getFootnotes()){
|
||||||
|
|
||||||
sdts.addAll(extractSDTsFromBodyElements(footnote.getBodyElements()));
|
sdts.addAll(extractSDTsFromBodyElements(footnote.getBodyElements()));
|
||||||
}
|
}
|
||||||
for (Map.Entry<Integer, XWPFFootnote> e : doc.endnotes.entrySet()){
|
for (Map.Entry<Integer, XWPFFootnote> e : doc.endnotes.entrySet()){
|
||||||
@ -144,8 +156,8 @@ public final class TestXWPFSDT extends TestCase {
|
|||||||
return sdts;
|
return sdts;
|
||||||
}
|
}
|
||||||
|
|
||||||
private List<XWPFSDT> extractSDTsFromBodyElements(List<IBodyElement> elements){
|
private List<AbstractXWPFSDT> extractSDTsFromBodyElements(List<IBodyElement> elements){
|
||||||
List<XWPFSDT> sdts = new ArrayList<XWPFSDT>();
|
List<AbstractXWPFSDT> sdts = new ArrayList<AbstractXWPFSDT>();
|
||||||
for (IBodyElement e : elements){
|
for (IBodyElement e : elements){
|
||||||
if (e instanceof XWPFSDT){
|
if (e instanceof XWPFSDT){
|
||||||
XWPFSDT sdt = (XWPFSDT)e;
|
XWPFSDT sdt = (XWPFSDT)e;
|
||||||
@ -167,11 +179,16 @@ public final class TestXWPFSDT extends TestCase {
|
|||||||
return sdts;
|
return sdts;
|
||||||
}
|
}
|
||||||
|
|
||||||
private List<XWPFSDT> extractSDTsFromTable(XWPFTable table){
|
private List<AbstractXWPFSDT> extractSDTsFromTable(XWPFTable table) {
|
||||||
List<XWPFSDT> sdts = new ArrayList<XWPFSDT>();
|
|
||||||
for (XWPFTableRow r : table.getRows()){
|
List<AbstractXWPFSDT> sdts = new ArrayList<AbstractXWPFSDT>();
|
||||||
for (XWPFTableCell c : r.getTableCells()){
|
for (XWPFTableRow r : table.getRows()) {
|
||||||
sdts.addAll(extractSDTsFromBodyElements(c.getBodyElements()));
|
for (ICell c : r.getTableICells()) {
|
||||||
|
if (c instanceof XWPFSDTCell) {
|
||||||
|
sdts.add((XWPFSDTCell)c);
|
||||||
|
} else if (c instanceof XWPFTableCell) {
|
||||||
|
sdts.addAll(extractSDTsFromBodyElements(((XWPFTableCell)c).getBodyElements()));
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return sdts;
|
return sdts;
|
||||||
|
BIN
test-data/document/Bug54771a.docx
Normal file
BIN
test-data/document/Bug54771a.docx
Normal file
Binary file not shown.
BIN
test-data/document/Bug54771b.docx
Normal file
BIN
test-data/document/Bug54771b.docx
Normal file
Binary file not shown.
Loading…
Reference in New Issue
Block a user