BUG 54771 extract text from SDTs at the cell level within a table row

git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@1602955 13f79535-47bb-0310-9956-ffa450edef68
2014-06-16 18:46:00 +00:00 · 2014-06-16 18:46:00 +00:00 · af7b947bb9
commit af7b947bb9
parent 1a969ea635
15 changed files with 469 additions and 119 deletions
--- a/src/ooxml/java/org/apache/poi/xwpf/extractor/XWPFWordExtractor.java
+++ b/src/ooxml/java/org/apache/poi/xwpf/extractor/XWPFWordExtractor.java
@ -27,6 +27,7 @@ import org.apache.poi.openxml4j.opc.OPCPackage;
 import org.apache.poi.xwpf.model.XWPFCommentsDecorator;
 import org.apache.poi.xwpf.model.XWPFHeaderFooterPolicy;
 import org.apache.poi.xwpf.usermodel.IBodyElement;
+import org.apache.poi.xwpf.usermodel.ICell;
 import org.apache.poi.xwpf.usermodel.IRunElement;
 import org.apache.poi.xwpf.usermodel.XWPFDocument;
 import org.apache.poi.xwpf.usermodel.XWPFHyperlink;
@ -34,6 +35,7 @@ import org.apache.poi.xwpf.usermodel.XWPFHyperlinkRun;
 import org.apache.poi.xwpf.usermodel.XWPFParagraph;
 import org.apache.poi.xwpf.usermodel.XWPFRelation;
 import org.apache.poi.xwpf.usermodel.XWPFSDT;
+import org.apache.poi.xwpf.usermodel.XWPFSDTCell;
 import org.apache.poi.xwpf.usermodel.XWPFTable;
 import org.apache.poi.xwpf.usermodel.XWPFTableCell;
 import org.apache.poi.xwpf.usermodel.XWPFTableRow;
@ -161,14 +163,18 @@ public class XWPFWordExtractor extends POIXMLTextExtractor {
     
   }

-   private void appendTableText(StringBuffer text, XWPFTable table){
+   private void appendTableText(StringBuffer text, XWPFTable table) {
      //this works recursively to pull embedded tables from tables
-      for (XWPFTableRow row : table.getRows()){
-          List<XWPFTableCell> cells = row.getTableCells();
-          for (int i = 0; i < cells.size(); i++){
-              XWPFTableCell cell = cells.get(i);
-              text.append(cell.getTextRecursively());
-              if (i < cells.size()-1){
+      for (XWPFTableRow row : table.getRows()) {
+          List<ICell> cells = row.getTableICells();
+          for (int i = 0; i < cells.size(); i++) {
+              ICell cell = cells.get(i);
+              if (cell instanceof XWPFTableCell) {
+                  text.append(((XWPFTableCell)cell).getTextRecursively());
+              } else if (cell instanceof XWPFSDTCell) {
+                  text.append(((XWPFSDTCell)cell).getContent().getText());
+              }
+              if (i < cells.size()-1) {
                  text.append("\t");
              }
          }
--- a/src/ooxml/java/org/apache/poi/xwpf/usermodel/AbstractXWPFSDT.java
+++ b/src/ooxml/java/org/apache/poi/xwpf/usermodel/AbstractXWPFSDT.java
@ -0,0 +1,113 @@
+/* ====================================================================
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+==================================================================== */
+package org.apache.poi.xwpf.usermodel;
+
+import java.util.List;
+
+import org.apache.poi.POIXMLDocumentPart;
+import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTSdtPr;
+import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTString;
+
+/**
+ * Experimental abstract class that is a base for XWPFSDT and XWPFSDTCell
+ *  
+ * WARNING - APIs expected to change rapidly.
+ * 
+ * These classes have so far been built only for read-only processing.
+ * 
+ */
+public abstract class AbstractXWPFSDT implements ISDTContents {
+   private final String title;
+   private final String tag;
+   private final IBody part;
+
+   public AbstractXWPFSDT(CTSdtPr pr, IBody part){
+       
+       List<CTString> aliases = pr.getAliasList();
+       if (aliases != null && aliases.size() > 0){
+          title = aliases.get(0).getVal();
+       } else {
+          title = "";
+       }
+       List<CTString> tags = pr.getTagList();
+       if (tags != null && tags.size() > 0){
+          tag = tags.get(0).getVal();
+       } else {
+          tag = "";
+       }
+       this.part = part;
+       
+   }
+
+   /**
+    * 
+    * @return first SDT Title
+    */
+   public String getTitle(){
+      return title;
+   }
+   
+   /**
+    * 
+    * @return first SDT Tag
+    */
+   public String getTag(){
+      return tag;
+   }
+   
+   /**
+    * 
+    * @return the content object
+    */
+   public abstract ISDTContent getContent();
+
+   /**
+    * 
+    * @return null
+    */
+   public IBody getBody() {
+      return null;
+   }
+
+   /**
+    * 
+    * @return document part
+    */
+   public POIXMLDocumentPart getPart() {
+      return part.getPart();
+   }
+
+   /**
+    * 
+    * @return partType
+    */
+   public BodyType getPartType() {
+      return BodyType.CONTENTCONTROL;
+   }
+
+   /**
+    * 
+    * @return element type
+    */
+   public BodyElementType getElementType() {
+      return BodyElementType.CONTENTCONTROL;
+   }
+
+   public XWPFDocument getDocument() {
+      return part.getXWPFDocument();
+   }
+}
--- a/src/ooxml/java/org/apache/poi/xwpf/usermodel/ICell.java
+++ b/src/ooxml/java/org/apache/poi/xwpf/usermodel/ICell.java
@ -0,0 +1,27 @@
+/* ====================================================================
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+==================================================================== */
+package org.apache.poi.xwpf.usermodel;
+
+/**
+ * Interface for anything that can be at a table cell level:
+ *  {@link XWPFTableCell}, {@link XWPFSDTCell}
+ *  <p>
+ *  Schematically something like this:
+ *  &lt;tr&gt;&lt;tc/&gt;&lt;tc/&gt;&lt;sdt&gt;&lt;tc/&gt;&lt;/sdt&gt;&lt;/tr&gt;
+ */
+public interface ICell {
+}
--- a/src/ooxml/java/org/apache/poi/xwpf/usermodel/ISDTContents.java
+++ b/src/ooxml/java/org/apache/poi/xwpf/usermodel/ISDTContents.java
@ -17,7 +17,7 @@
 package org.apache.poi.xwpf.usermodel;

 /**
- * Interface for anything that can be within a STD:
+ * Interface for anything that can be within an SDT:
 *  {@link XWPFRun}, {@link XWPFTable}, {@link XWPFParagraph},
 *  {@link XWPFSDT} etc
 */
--- a/src/ooxml/java/org/apache/poi/xwpf/usermodel/XWPFSDT.java
+++ b/src/ooxml/java/org/apache/poi/xwpf/usermodel/XWPFSDT.java
@ -16,95 +16,32 @@
 ==================================================================== */
 package org.apache.poi.xwpf.usermodel;

-import java.util.List;
-
-import org.apache.poi.POIXMLDocumentPart;
 import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTSdtBlock;
-import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTSdtPr;
 import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTSdtRun;
-import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTString;

 /**
 * Experimental class to offer rudimentary read-only processing of 
 *  of StructuredDocumentTags/ContentControl
- *  
- *
 *
 * WARNING - APIs expected to change rapidly
 * 
 */
-public class XWPFSDT implements IBodyElement, IRunBody, ISDTContents, IRunElement {
-   private final String title;
-   private final String tag;
-   private final XWPFSDTContent content;
-   private final IBody part;
+public class XWPFSDT extends AbstractXWPFSDT
+    implements IBodyElement, IRunBody, ISDTContents, IRunElement {
+   private final ISDTContent content;

   public XWPFSDT(CTSdtRun sdtRun, IBody part){
-       this.part = part;
+       super(sdtRun.getSdtPr(), part);
       this.content = new XWPFSDTContent(sdtRun.getSdtContent(), part, this);
-       CTSdtPr pr = sdtRun.getSdtPr();
-       List<CTString> aliases = pr.getAliasList();
-       if (aliases != null && aliases.size() > 0){
-          title = aliases.get(0).getVal();
-       } else {
-          title = "";
-       }
-       @SuppressWarnings("deprecation")
-       CTString[] array = pr.getTagArray();
-       if (array != null && array.length > 0){
-          tag = array[0].getVal();
-       } else {
-          tag = "";
-       }
-  
   }
+   
   public XWPFSDT(CTSdtBlock block, IBody part){
-      this.part = part;
+      super(block.getSdtPr(), part);
      this.content = new XWPFSDTContent( block.getSdtContent(), part, this);
-      CTSdtPr pr = block.getSdtPr();
-      List<CTString> aliases = pr.getAliasList();
-      if (aliases != null && aliases.size() > 0){
-         title = aliases.get(0).getVal();
-      } else {
-         title = "";
-      }
-      @SuppressWarnings("deprecation")
-      CTString[] array = pr.getTagArray();
-      if (array != null && array.length > 0){
-         tag = array[0].getVal();
-      } else {
-         tag = "";
-      }
- 
   }
-   public String getTitle(){
-      return title;
-   }
-   public String getTag(){
-      return tag;
-   }
-   public XWPFSDTContent getContent(){
+
+   public ISDTContent getContent(){
      return content;
   }

-   public IBody getBody() {
-      // TODO Auto-generated method stub
-      return null;
-   }
-
-   public POIXMLDocumentPart getPart() {
-      return part.getPart();
-   }
-
-   public BodyType getPartType() {
-      return BodyType.CONTENTCONTROL;
-   }
-
-   public BodyElementType getElementType() {
-      return BodyElementType.CONTENTCONTROL;
-   }
-
-   public XWPFDocument getDocument() {
-      return part.getXWPFDocument();
-   }
 }
--- a/src/ooxml/java/org/apache/poi/xwpf/usermodel/XWPFSDTCell.java
+++ b/src/ooxml/java/org/apache/poi/xwpf/usermodel/XWPFSDTCell.java
@ -0,0 +1,44 @@
+/* ====================================================================
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+==================================================================== */
+package org.apache.poi.xwpf.usermodel;
+
+import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTSdtCell;
+
+/**
+ * Experimental class to offer rudimentary read-only processing
+ *  of StructuredDocumentTags/ContentControl that can appear
+ *  in a table row as if a table cell.
+ *  <p>
+ *  These can contain one or more cells or other SDTs within them.
+ *
+ * WARNING - APIs expected to change rapidly
+ * 
+ */
+public class XWPFSDTCell extends AbstractXWPFSDT implements ICell {
+   private final XWPFSDTContentCell cellContent;
+
+   public XWPFSDTCell(CTSdtCell sdtCell, XWPFTableRow xwpfTableRow, IBody part){
+       super(sdtCell.getSdtPr(), part);
+       cellContent = new XWPFSDTContentCell(sdtCell.getSdtContent(), xwpfTableRow, part);
+   }
+
+   @Override
+   public ISDTContent getContent(){
+      return cellContent;
+   }
+   
+}
--- a/src/ooxml/java/org/apache/poi/xwpf/usermodel/XWPFSDTContent.java
+++ b/src/ooxml/java/org/apache/poi/xwpf/usermodel/XWPFSDTContent.java
@ -39,7 +39,7 @@ import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTTbl;
 * WARNING - APIs expected to change rapidly
 * 
 */
-public class XWPFSDTContent  {
+public class XWPFSDTContent implements ISDTContent {

    // private final IBody part;
    // private final XWPFDocument document;
@ -87,10 +87,10 @@ public class XWPFSDTContent  {
        for (int i = 0; i < bodyElements.size(); i++){
            Object o = bodyElements.get(i);
            if (o instanceof XWPFParagraph){
-                text.append(((XWPFParagraph)o).getText());
+                appendParagraph((XWPFParagraph)o, text);
                addNewLine = true;
            } else if (o instanceof XWPFTable){
-                text.append(((XWPFTable)o).getText());
+                appendTable((XWPFTable)o, text);
                addNewLine = true;
            } else if (o instanceof XWPFSDT){
                text.append(((XWPFSDT)o).getContent().getText());
@ -106,6 +106,31 @@ public class XWPFSDTContent  {
        return text.toString();
    }

+    private void appendTable(XWPFTable table, StringBuilder text) {
+        //this works recursively to pull embedded tables from within cells
+        for (XWPFTableRow row : table.getRows()) {
+            List<ICell> cells = row.getTableICells();
+            for (int i = 0; i < cells.size(); i++) {
+                ICell cell = cells.get(i);
+                if (cell instanceof XWPFTableCell) {
+                    text.append(((XWPFTableCell)cell).getTextRecursively());
+                } else if (cell instanceof XWPFSDTCell) {
+                    text.append(((XWPFSDTCell)cell).getContent().getText());
+                }
+                if (i < cells.size()-1) {
+                    text.append("\t");
+                }
+            }
+            text.append('\n');
+        }
+    }
+    
+    private void appendParagraph(XWPFParagraph paragraph, StringBuilder text) {
+        for(IRunElement run : paragraph.getRuns()) {
+            text.append(run.toString());
+        }
+    }
+    
    public String toString(){
        return getText();
    }
--- a/src/ooxml/java/org/apache/poi/xwpf/usermodel/XWPFSDTContentCell.java
+++ b/src/ooxml/java/org/apache/poi/xwpf/usermodel/XWPFSDTContentCell.java
@ -0,0 +1,114 @@
+/* ====================================================================
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+==================================================================== */
+package org.apache.poi.xwpf.usermodel;
+
+
+import javax.xml.namespace.QName;
+
+import org.apache.xmlbeans.XmlCursor;
+import org.apache.xmlbeans.XmlCursor.TokenType;
+
+import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTSdtContentCell;
+
+
+
+/**
+ * Experimental class to offer rudimentary read-only processing
+ *  of the content of an {@link XWPFSDTCell}.
+
+ * WARNING - APIs expected to change rapidly
+ * 
+ */
+public class XWPFSDTContentCell implements ISDTContent {
+
+    //A full implementation would grab the icells
+    //that a content cell can contain.  This would require
+    //significant changes, including changing the notion that the
+    //parent of a cell can be not just a row, but an sdt.
+    //For now we are just grabbing the text out of the text tokentypes.
+
+    //private List<ICell> cells = new ArrayList<ICell>().
+
+    private String text = "";
+    public XWPFSDTContentCell(CTSdtContentCell sdtContentCell, 
+            XWPFTableRow xwpfTableRow, IBody part){
+        super();
+        StringBuilder sb = new StringBuilder();
+        XmlCursor cursor = sdtContentCell.newCursor();
+
+        //keep track of the following,
+        //and add "\n" only before the start of a body
+        //element if it is not the first body element.
+        
+        //index of cell in row
+        int tcCnt = 0;
+        //count of body objects
+        int iBodyCnt = 0;
+        int depth = 1;
+
+        while (cursor.hasNextToken() && depth > 0) {
+            TokenType t = cursor.toNextToken();
+            if (t.isText()){
+                sb.append(cursor.getTextValue());
+            } else if (isStartToken(cursor, "tr")) {
+                tcCnt = 0;
+                iBodyCnt = 0;
+            } else if (isStartToken(cursor, "tc")) {
+                if (tcCnt++ > 0) {
+                    sb.append("\t");
+                }
+                iBodyCnt = 0;
+            } else if (isStartToken(cursor, "p") ||
+                    isStartToken(cursor, "tbl") ||
+                    isStartToken(cursor, "sdt")) {
+                if (iBodyCnt > 0) {
+                    sb.append("\n");
+                }
+                iBodyCnt++;
+            }
+            if (cursor.isStart()){
+                depth++;
+            } else if (cursor.isEnd()){
+                depth--;
+            }
+        }
+        text = sb.toString();
+    }
+
+
+
+    private boolean isStartToken(XmlCursor cursor, String string) {
+        if (! cursor.isStart()) {
+            return false;
+        }
+        QName qName = cursor.getName();
+        if (qName != null && qName.getLocalPart() != null &&
+                qName.getLocalPart().equals(string)) {
+            return true;
+        }
+        return false;
+    }
+
+
+    public String getText(){
+        return text;
+    }
+
+    public String toString(){
+        return getText();
+    }
+}
--- a/src/ooxml/java/org/apache/poi/xwpf/usermodel/XWPFTable.java
+++ b/src/ooxml/java/org/apache/poi/xwpf/usermodel/XWPFTable.java
@ -159,6 +159,13 @@ public class XWPFTable implements IBodyElement, ISDTContents {
    }

    /**
+     * Convenience method to extract text in cells.  This
+     * does not extract text recursively in cells, and it does not
+     * currently include text in SDT (form) components.
+     * <p>
+     * To get all text within a table, see XWPFWordExtractor's appendTableText
+     * as an example. 
+     *
     * @return text
     */
    public String getText() {
--- a/src/ooxml/java/org/apache/poi/xwpf/usermodel/XWPFTableCell.java
+++ b/src/ooxml/java/org/apache/poi/xwpf/usermodel/XWPFTableCell.java
@ -42,7 +42,7 @@ import org.openxmlformats.schemas.wordprocessingml.x2006.main.STVerticalJc;
 * Represents a Cell within a {@link XWPFTable}. The
 *  Cell is the thing that holds the actual content (paragraphs etc)
 */
-public class XWPFTableCell implements IBody {
+public class XWPFTableCell implements IBody, ICell {
    private final CTTc ctTc;
    protected List<XWPFParagraph> paragraphs = null;
    protected List<XWPFTable> tables = null;
--- a/src/ooxml/java/org/apache/poi/xwpf/usermodel/XWPFTableRow.java
+++ b/src/ooxml/java/org/apache/poi/xwpf/usermodel/XWPFTableRow.java
@ -21,9 +21,12 @@ import java.util.ArrayList;
 import java.util.List;

 import org.apache.poi.util.Internal;
+import org.apache.xmlbeans.XmlCursor;
+import org.apache.xmlbeans.XmlObject;
 import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTHeight;
 import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTOnOff;
 import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTRow;
+import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTSdtCell;
 import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTTc;
 import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTTrPr;
 import org.openxmlformats.schemas.wordprocessingml.x2006.main.STOnOff;
@ -121,6 +124,29 @@ public class XWPFTableRow {
 	return table;
    }

+    /**
+     * create and return a list of all cell-level elements
+     * ({@link XWPFTableCell} and {@link XWPFSDTCell}) that belong to this row
+     * @return a list of {@link ICell}
+     */
+    public List<ICell> getTableICells(){
+    
+        List<ICell> cells = new ArrayList<ICell>();
+        //Can't use ctRow.getTcList because that only gets table cells
+        //Can't use ctRow.getSdtList because that only gets sdts that are at cell level
+        XmlCursor cursor = ctRow.newCursor();
+        cursor.selectPath("./*");
+        while (cursor.toNextSelection()) {
+            XmlObject o = cursor.getObject();
+            if (o instanceof CTTc){
+                cells.add(new XWPFTableCell((CTTc)o, this, table.getBody()));
+            } else if (o instanceof CTSdtCell) {
+                cells.add(new XWPFSDTCell((CTSdtCell)o, this, table.getBody()));
+            }
+        }
+        return cells;
+    }
+
    /**
     * create and return a list of all XWPFTableCell
     * who belongs to this row
--- a/src/ooxml/testcases/org/apache/poi/xwpf/extractor/TestXWPFWordExtractor.java
+++ b/src/ooxml/testcases/org/apache/poi/xwpf/extractor/TestXWPFWordExtractor.java
@ -18,6 +18,8 @@
 package org.apache.poi.xwpf.extractor;

 import java.io.IOException;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;

 import junit.framework.TestCase;

@ -327,12 +329,14 @@ public class TestXWPFWordExtractor extends TestCase {
        String[] targs = new String[]{
                "header_rich_text",
                "rich_text",
-                "rich_text_pre_table\nrich_text_cell1\t\t\t\n\nrich_text_post_table",
+                "rich_text_pre_table\nrich_text_cell1\t\t\t\n\t\t\t\n\t\t\t\n\nrich_text_post_table",
                "plain_text_no_newlines",
                "plain_text_with_newlines1\nplain_text_with_newlines2\n",
                "watermelon\n",
                "dirt\n",
                "4/16/2013\n",
+                "rich_text_in_cell",
+                "abc",
                "rich_text_in_paragraph_in_cell",
                "footer_rich_text",
                "footnote_sdt",
@ -352,6 +356,36 @@ public class TestXWPFWordExtractor extends TestCase {
        }
        assertEquals("controlled content loading hit count", targs.length, hits);
        ex.close();
+        
+        
+        doc = XWPFTestDataSamples.openSampleDocument("Bug54771a.docx");
+        targs = new String[]{
+                "bb",
+                "test subtitle\n",
+                "test user\n",
+        };
+        ex = new XWPFWordExtractor(doc);
+        s = ex.getText().toLowerCase();
+        
+        //At one point in development there were three copies of the text.
+        //This ensures that there is only one copy.
+        for (String targ : targs){
+            Matcher m = Pattern.compile(targ).matcher(s);
+            int hit = 0;
+            while (m.find()) {
+                hit++;
+            }
+            assertEquals("controlled content loading-"+targ, 1, hit);
+        }
+        //"test\n" appears twice: once as the "title" and once in the text.
+        //This also happens when you save this document as text from MSWord.
+        Matcher m = Pattern.compile("test\n").matcher(s);
+        int hit = 0;
+        while (m.find()){
+            hit++;
+        }
+        assertEquals("test<N>", 2, hit);
+        ex.close();
    }

    /** No Header or Footer in document */
--- a/src/ooxml/testcases/org/apache/poi/xwpf/usermodel/TestXWPFSDT.java
+++ b/src/ooxml/testcases/org/apache/poi/xwpf/usermodel/TestXWPFSDT.java
@ -18,8 +18,10 @@
 package org.apache.poi.xwpf.usermodel;

 import java.util.ArrayList;
+import java.util.HashSet;
 import java.util.List;
 import java.util.Map;
+import java.util.Set;

 import junit.framework.TestCase;

@ -35,15 +37,16 @@ public final class TestXWPFSDT extends TestCase {
        XWPFDocument doc = XWPFTestDataSamples.openSampleDocument("Bug54849.docx");
        String tag = null;
        String title= null;
-        List<XWPFSDT> sdts = extractAllSDTs(doc);
-        for (XWPFSDT sdt :sdts){
+        List<AbstractXWPFSDT> sdts = extractAllSDTs(doc);
+        for (AbstractXWPFSDT sdt :sdts){
            if (sdt.getContent().toString().equals("Rich_text")){
                tag = "MyTag";
                title = "MyTitle";
                break;
            }
+            
        }
-        assertEquals("controls size", 12, sdts.size());
+        assertEquals("controls size", 13, sdts.size());

        assertEquals("tag", "MyTag", tag);
        assertEquals("title", "MyTitle", title);
@ -54,12 +57,13 @@ public final class TestXWPFSDT extends TestCase {
        String[] contents = new String[]{
                "header_rich_text",
                "Rich_text",
-                "Rich_text_pre_table\nRich_text_cell1\t\t\t\n\nRich_text_post_table",
+                "Rich_text_pre_table\nRich_text_cell1\t\t\t\n\t\t\t\n\t\t\t\n\nRich_text_post_table",
                "Plain_text_no_newlines",
                "Plain_text_with_newlines1\nplain_text_with_newlines2",
                "Watermelon",
                "Dirt",
                "4/16/2013",
+                "Rich_text_in_cell",
                "rich_text_in_paragraph_in_cell",
                "Footer_rich_text",
                "Footnote_sdt",
@ -67,31 +71,40 @@ public final class TestXWPFSDT extends TestCase {

        };
        XWPFDocument doc = XWPFTestDataSamples.openSampleDocument("Bug54849.docx");
-        List<XWPFSDT> sdts = extractAllSDTs(doc);
+        List<AbstractXWPFSDT> sdts = extractAllSDTs(doc);

        assertEquals("number of sdts", contents.length, sdts.size());

-        for (int i = 0; i < sdts.size(); i++){//contents.length; i++){
-            XWPFSDT sdt = sdts.get(i);
-
+        for (int i = 0; i < contents.length; i++){
+            AbstractXWPFSDT sdt = sdts.get(i);
            assertEquals(i+ ": " + contents[i], contents[i], sdt.getContent().toString());
        } 
    }
+    /**
+     * POI-54771 and TIKA-1317
+     */
+    public void testSDTAsCell() throws Exception {
+        //Bug54771a.docx and Bug54771b.docx test slightly 
+        //different recursion patterns. Keep both!
+        XWPFDocument doc = XWPFTestDataSamples.openSampleDocument("Bug54771a.docx");
+        List<AbstractXWPFSDT> sdts = extractAllSDTs(doc);
+        String text = sdts.get(0).getContent().getText();
+        assertEquals(2, sdts.size()); 
+        assertTrue(text.indexOf("Test") > -1);
+
+        text = sdts.get(1).getContent().getText();
+        assertTrue(text.indexOf("Test Subtitle") > -1);
+        assertTrue(text.indexOf("Test User") > -1);
+        assertTrue(text.indexOf("Test") < text.indexOf("Test Subtitle"));
+
+        doc = XWPFTestDataSamples.openSampleDocument("Bug54771b.docx");
+        sdts = extractAllSDTs(doc);
+        assertEquals(3, sdts.size()); 
+        assertTrue(sdts.get(0).getContent().getText().indexOf("Test") > -1);
+
+        assertTrue(sdts.get(1).getContent().getText().indexOf("Test Subtitle") > -1);
+        assertTrue(sdts.get(2).getContent().getText().indexOf("Test User") > -1);

-    public void testFailureToGetSDTAsCell() throws Exception{
-        /**
-         * The current code fails to extract an sdt if it comprises/is the parent
-         * of a cell in a table.
-         */
-        XWPFDocument doc = XWPFTestDataSamples.openSampleDocument("Bug54849.docx");
-        List<XWPFSDT> sdts = extractAllSDTs(doc);
-        boolean found = false;
-        for (XWPFSDT sdt : sdts){
-            if (sdt.getContent().getText().toLowerCase().indexOf("rich_text_in_cell") > -1){
-                found = true;
-            }
-        }
-        assertEquals("SDT as cell known failure", false, found);
    }
    
    /**
@ -99,7 +112,7 @@ public final class TestXWPFSDT extends TestCase {
     */
    public void testNewLinesBetweenRuns() throws Exception{
       XWPFDocument doc = XWPFTestDataSamples.openSampleDocument("Bug55142.docx");
-       List<XWPFSDT> sdts = extractAllSDTs(doc);
+       List<AbstractXWPFSDT> sdts = extractAllSDTs(doc);
       List<String> targs = new ArrayList<String>();
       //these test newlines and tabs in paragraphs/body elements
       targs.add("Rich-text1 abcdefghi");
@ -114,14 +127,14 @@ public final class TestXWPFSDT extends TestCase {
       targs.add("sdt_incell2 abcdefg");
       
       for (int i = 0; i < sdts.size(); i++){
-          XWPFSDT sdt = sdts.get(i);
+          AbstractXWPFSDT sdt = sdts.get(i);
          assertEquals(targs.get(i), targs.get(i), sdt.getContent().getText());
       }
    }

-    private List<XWPFSDT> extractAllSDTs(XWPFDocument doc){
-
-        List<XWPFSDT> sdts = new ArrayList<XWPFSDT>();
+    private List<AbstractXWPFSDT> extractAllSDTs(XWPFDocument doc){
+        
+        List<AbstractXWPFSDT> sdts = new ArrayList<AbstractXWPFSDT>();

        List<XWPFHeader> headers = doc.getHeaderList();
        for (XWPFHeader header : headers){
@ -135,7 +148,6 @@ public final class TestXWPFSDT extends TestCase {
        }

        for (XWPFFootnote footnote : doc.getFootnotes()){
-
            sdts.addAll(extractSDTsFromBodyElements(footnote.getBodyElements()));
        }
        for (Map.Entry<Integer, XWPFFootnote> e : doc.endnotes.entrySet()){
@ -144,8 +156,8 @@ public final class TestXWPFSDT extends TestCase {
        return sdts;
    }

-    private List<XWPFSDT> extractSDTsFromBodyElements(List<IBodyElement> elements){
-        List<XWPFSDT> sdts = new ArrayList<XWPFSDT>();
+    private List<AbstractXWPFSDT> extractSDTsFromBodyElements(List<IBodyElement> elements){
+        List<AbstractXWPFSDT> sdts = new ArrayList<AbstractXWPFSDT>();
        for (IBodyElement e : elements){
            if (e instanceof XWPFSDT){
                XWPFSDT sdt = (XWPFSDT)e;
@ -167,11 +179,16 @@ public final class TestXWPFSDT extends TestCase {
        return sdts;
    }

-    private List<XWPFSDT> extractSDTsFromTable(XWPFTable table){
-        List<XWPFSDT> sdts = new ArrayList<XWPFSDT>();
-        for (XWPFTableRow r : table.getRows()){
-            for (XWPFTableCell c : r.getTableCells()){
-                sdts.addAll(extractSDTsFromBodyElements(c.getBodyElements()));
+    private List<AbstractXWPFSDT> extractSDTsFromTable(XWPFTable table) {
+
+        List<AbstractXWPFSDT> sdts = new ArrayList<AbstractXWPFSDT>();
+        for (XWPFTableRow r : table.getRows()) {
+            for (ICell c : r.getTableICells()) {
+                if (c instanceof XWPFSDTCell) {
+                    sdts.add((XWPFSDTCell)c);
+                } else if (c instanceof XWPFTableCell) {
+                    sdts.addAll(extractSDTsFromBodyElements(((XWPFTableCell)c).getBodyElements()));
+                }
            }
        }
        return sdts;
--- a/test-data/document/Bug54771a.docx
+++ b/test-data/document/Bug54771a.docx
--- a/test-data/document/Bug54771b.docx
+++ b/test-data/document/Bug54771b.docx