diff --git a/src/ooxml/java/org/apache/poi/xwpf/extractor/XWPFWordExtractor.java b/src/ooxml/java/org/apache/poi/xwpf/extractor/XWPFWordExtractor.java index bc0c914f7..0f4a52e66 100644 --- a/src/ooxml/java/org/apache/poi/xwpf/extractor/XWPFWordExtractor.java +++ b/src/ooxml/java/org/apache/poi/xwpf/extractor/XWPFWordExtractor.java @@ -18,6 +18,7 @@ package org.apache.poi.xwpf.extractor; import java.io.IOException; import java.util.Iterator; +import java.util.List; import org.apache.poi.POIXMLDocument; import org.apache.poi.POIXMLException; @@ -26,13 +27,18 @@ import org.apache.poi.openxml4j.exceptions.OpenXML4JException; import org.apache.poi.openxml4j.opc.OPCPackage; import org.apache.poi.xwpf.model.XWPFCommentsDecorator; import org.apache.poi.xwpf.model.XWPFHeaderFooterPolicy; +import org.apache.poi.xwpf.usermodel.IBodyElement; +import org.apache.poi.xwpf.usermodel.IRunElement; import org.apache.poi.xwpf.usermodel.XWPFDocument; import org.apache.poi.xwpf.usermodel.XWPFHyperlink; import org.apache.poi.xwpf.usermodel.XWPFHyperlinkRun; import org.apache.poi.xwpf.usermodel.XWPFParagraph; import org.apache.poi.xwpf.usermodel.XWPFRelation; import org.apache.poi.xwpf.usermodel.XWPFRun; +import org.apache.poi.xwpf.usermodel.XWPFSDT; import org.apache.poi.xwpf.usermodel.XWPFTable; +import org.apache.poi.xwpf.usermodel.XWPFTableCell; +import org.apache.poi.xwpf.usermodel.XWPFTableRow; import org.apache.xmlbeans.XmlException; import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTSectPr; @@ -86,59 +92,11 @@ public class XWPFWordExtractor extends POIXMLTextExtractor { // Start out with all headers extractHeaders(text, hfPolicy); - // First up, all our paragraph based text - Iterator i = document.getParagraphsIterator(); - while(i.hasNext()) { - XWPFParagraph paragraph = i.next(); - - try { - CTSectPr ctSectPr = null; - if (paragraph.getCTP().getPPr()!=null) { - ctSectPr = paragraph.getCTP().getPPr().getSectPr(); - } - - XWPFHeaderFooterPolicy headerFooterPolicy = null; - - if (ctSectPr!=null) { - headerFooterPolicy = new XWPFHeaderFooterPolicy(document, ctSectPr); - extractHeaders(text, headerFooterPolicy); - } - - // Do the paragraph text - for(XWPFRun run : paragraph.getRuns()) { - text.append(run.toString()); - if(run instanceof XWPFHyperlinkRun && fetchHyperlinks) { - XWPFHyperlink link = ((XWPFHyperlinkRun)run).getHyperlink(document); - if(link != null) - text.append(" <" + link.getURL() + ">"); - } - } - - // Add comments - XWPFCommentsDecorator decorator = new XWPFCommentsDecorator(paragraph, null); - text.append(decorator.getCommentText()).append('\n'); - - // Do endnotes and footnotes - String footnameText = paragraph.getFootnoteText(); - if(footnameText != null && footnameText.length() > 0) { - text.append(footnameText + "\n"); - } - - if (ctSectPr!=null) { - extractFooters(text, headerFooterPolicy); - } - } catch (IOException e) { - throw new POIXMLException(e); - } catch (XmlException e) { - throw new POIXMLException(e); - } - } - - // Then our table based text - Iterator j = document.getTablesIterator(); - while(j.hasNext()) { - text.append(j.next().getText()).append('\n'); - } + // body elements + for (IBodyElement e : document.getBodyElements()){ + appendBodyElementText(text, e); + text.append('\n'); + } // Finish up with all the footers extractFooters(text, hfPolicy); @@ -146,6 +104,79 @@ public class XWPFWordExtractor extends POIXMLTextExtractor { return text.toString(); } + public void appendBodyElementText(StringBuffer text, IBodyElement e){ + if (e instanceof XWPFParagraph){ + appendParagraphText(text, (XWPFParagraph)e); + } else if (e instanceof XWPFTable){ + appendTableText(text, (XWPFTable)e); + } else if (e instanceof XWPFSDT){ + text.append(((XWPFSDT)e).getContent().getText()); + } + } + + public void appendParagraphText(StringBuffer text, XWPFParagraph paragraph){ + try { + CTSectPr ctSectPr = null; + if (paragraph.getCTP().getPPr()!=null) { + ctSectPr = paragraph.getCTP().getPPr().getSectPr(); + } + + XWPFHeaderFooterPolicy headerFooterPolicy = null; + + if (ctSectPr!=null) { + headerFooterPolicy = new XWPFHeaderFooterPolicy(document, ctSectPr); + extractHeaders(text, headerFooterPolicy); + } + + + for(IRunElement run : paragraph.getRuns()) { + text.append(run.toString()); + if(run instanceof XWPFHyperlinkRun && fetchHyperlinks) { + XWPFHyperlink link = ((XWPFHyperlinkRun)run).getHyperlink(document); + if(link != null) + text.append(" <" + link.getURL() + ">"); + } + } + + // Add comments + XWPFCommentsDecorator decorator = new XWPFCommentsDecorator(paragraph, null); + String commentText = decorator.getCommentText(); + if (commentText.length() > 0){ + text.append(commentText).append('\n'); + } + + // Do endnotes and footnotes + String footnameText = paragraph.getFootnoteText(); + if(footnameText != null && footnameText.length() > 0) { + text.append(footnameText + '\n'); + } + + if (ctSectPr!=null) { + extractFooters(text, headerFooterPolicy); + } + } catch (IOException e) { + throw new POIXMLException(e); + } catch (XmlException e) { + throw new POIXMLException(e); + } + + } + + private void appendTableText(StringBuffer text, XWPFTable table){ + //this works recursively to pull embedded tables from tables + for (XWPFTableRow row : table.getRows()){ + List cells = row.getTableCells(); + for (int i = 0; i < cells.size(); i++){ + XWPFTableCell cell = cells.get(i); + text.append(cell.getTextRecursively()); + if (i < cells.size()-1){ + text.append("\t"); + } + } + text.append('\n'); + } + } + private void extractFooters(StringBuffer text, XWPFHeaderFooterPolicy hfPolicy) { if(hfPolicy.getFirstPageFooter() != null) { text.append( hfPolicy.getFirstPageFooter().getText() ); diff --git a/src/ooxml/java/org/apache/poi/xwpf/usermodel/BodyElementType.java b/src/ooxml/java/org/apache/poi/xwpf/usermodel/BodyElementType.java index 3df31061a..160c637bd 100644 --- a/src/ooxml/java/org/apache/poi/xwpf/usermodel/BodyElementType.java +++ b/src/ooxml/java/org/apache/poi/xwpf/usermodel/BodyElementType.java @@ -28,7 +28,8 @@ package org.apache.poi.xwpf.usermodel; * */ public enum BodyElementType { - PARAGRAPH, + CONTENTCONTROL, + PARAGRAPH, TABLE, } diff --git a/src/ooxml/java/org/apache/poi/xwpf/usermodel/BodyType.java b/src/ooxml/java/org/apache/poi/xwpf/usermodel/BodyType.java index 6aa945321..39e4758b6 100644 --- a/src/ooxml/java/org/apache/poi/xwpf/usermodel/BodyType.java +++ b/src/ooxml/java/org/apache/poi/xwpf/usermodel/BodyType.java @@ -21,6 +21,7 @@ package org.apache.poi.xwpf.usermodel; * The different kinds of {@link IBody} that exist */ public enum BodyType { + CONTENTCONTROL, DOCUMENT, HEADER, FOOTER, diff --git a/src/ooxml/java/org/apache/poi/xwpf/usermodel/IRunBody.java b/src/ooxml/java/org/apache/poi/xwpf/usermodel/IRunBody.java new file mode 100644 index 000000000..55510ce4d --- /dev/null +++ b/src/ooxml/java/org/apache/poi/xwpf/usermodel/IRunBody.java @@ -0,0 +1,28 @@ +/* ==================================================================== + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +==================================================================== */ +package org.apache.poi.xwpf.usermodel; + +import org.apache.poi.POIXMLDocumentPart; + +/** + * Simple interface describing both {@link XWPFParagraph} + * and {@link XWPFSDT} + */ +public interface IRunBody { + public XWPFDocument getDocument(); + public POIXMLDocumentPart getPart(); +} diff --git a/src/ooxml/java/org/apache/poi/xwpf/usermodel/IRunElement.java b/src/ooxml/java/org/apache/poi/xwpf/usermodel/IRunElement.java new file mode 100644 index 000000000..6d845364b --- /dev/null +++ b/src/ooxml/java/org/apache/poi/xwpf/usermodel/IRunElement.java @@ -0,0 +1,26 @@ +/* ==================================================================== + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +==================================================================== */ +package org.apache.poi.xwpf.usermodel; + +/** + * Common interface for things that can occur + * where a run (text with common stylings) can, + * eg {@link XWPFRun} or {@link XWPFSDT}. + * More methods to follow shortly! + */ +public interface IRunElement { +} \ No newline at end of file diff --git a/src/ooxml/java/org/apache/poi/xwpf/usermodel/ISDTContents.java b/src/ooxml/java/org/apache/poi/xwpf/usermodel/ISDTContents.java new file mode 100644 index 000000000..7ee12bd2d --- /dev/null +++ b/src/ooxml/java/org/apache/poi/xwpf/usermodel/ISDTContents.java @@ -0,0 +1,25 @@ +/* ==================================================================== + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +==================================================================== */ +package org.apache.poi.xwpf.usermodel; + +/** + * Interface for anything that can be within a STD: + * {@link XWPFRun}, {@link XWPFTable}, {@link XWPFParagraph}, + * {@link XWPFSDT} etc + */ +public interface ISDTContents { +} diff --git a/src/ooxml/java/org/apache/poi/xwpf/usermodel/XWPFDocument.java b/src/ooxml/java/org/apache/poi/xwpf/usermodel/XWPFDocument.java index 0f79a6e6b..d5f548786 100644 --- a/src/ooxml/java/org/apache/poi/xwpf/usermodel/XWPFDocument.java +++ b/src/ooxml/java/org/apache/poi/xwpf/usermodel/XWPFDocument.java @@ -98,6 +98,7 @@ public class XWPFDocument extends POIXMLDocument implements Document, IBody { protected List hyperlinks = new ArrayList(); protected List paragraphs = new ArrayList(); protected List tables = new ArrayList(); + protected List contentControls = new ArrayList(); protected List bodyElements = new ArrayList(); protected List pictures = new ArrayList(); protected Map> packagePictures = new HashMap>(); @@ -150,7 +151,11 @@ public class XWPFDocument extends POIXMLDocument implements Document, IBody { XWPFTable t = new XWPFTable((CTTbl) o, this); bodyElements.add(t); tables.add(t); - } + } else if (o instanceof CTSdtBlock){ + XWPFSDT c = new XWPFSDT((CTSdtBlock)o, this); + bodyElements.add(c); + contentControls.add(c); + } } cursor.dispose(); @@ -230,10 +235,10 @@ public class XWPFDocument extends POIXMLDocument implements Document, IBody { for(POIXMLDocumentPart p : getRelations()){ String relation = p.getPackageRelationship().getRelationshipType(); if (relation.equals(XWPFRelation.FOOTNOTE.getRelation())) { - FootnotesDocument footnotesDocument = FootnotesDocument.Factory.parse(p.getPackagePart().getInputStream()); this.footnotes = (XWPFFootnotes)p; this.footnotes.onDocumentRead(); - + // Warning - this apparently doubles footnotes - see bug #???? + FootnotesDocument footnotesDocument = FootnotesDocument.Factory.parse(p.getPackagePart().getInputStream()); for(CTFtnEdn ctFtnEdn : footnotesDocument.getFootnotes().getFootnoteList()) { footnotes.addFootnote(ctFtnEdn); } diff --git a/src/ooxml/java/org/apache/poi/xwpf/usermodel/XWPFFooter.java b/src/ooxml/java/org/apache/poi/xwpf/usermodel/XWPFFooter.java index c6f668162..8485f1d6c 100644 --- a/src/ooxml/java/org/apache/poi/xwpf/usermodel/XWPFFooter.java +++ b/src/ooxml/java/org/apache/poi/xwpf/usermodel/XWPFFooter.java @@ -34,6 +34,7 @@ import org.apache.xmlbeans.XmlOptions; import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTHdrFtr; import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTNumbering; import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTP; +import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTSdtBlock; import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTTbl; import org.openxmlformats.schemas.wordprocessingml.x2006.main.FtrDocument; @@ -61,6 +62,7 @@ public class XWPFFooter extends XWPFHeaderFooter { tables.add(t); bodyElements.add(t); } + } cursor.dispose(); } @@ -118,6 +120,10 @@ public class XWPFFooter extends XWPFHeaderFooter { tables.add(t); bodyElements.add(t); } + if (o instanceof CTSdtBlock){ + XWPFSDT c = new XWPFSDT((CTSdtBlock)o, this); + bodyElements.add(c); + } } cursor.dispose(); } catch (Exception e) { diff --git a/src/ooxml/java/org/apache/poi/xwpf/usermodel/XWPFFootnote.java b/src/ooxml/java/org/apache/poi/xwpf/usermodel/XWPFFootnote.java index 96199aa29..7ecb4d77c 100644 --- a/src/ooxml/java/org/apache/poi/xwpf/usermodel/XWPFFootnote.java +++ b/src/ooxml/java/org/apache/poi/xwpf/usermodel/XWPFFootnote.java @@ -26,6 +26,7 @@ import org.apache.xmlbeans.XmlObject; import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTFtnEdn; import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTP; import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTRow; +import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTSdtBlock; import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTTbl; import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTTc; @@ -37,20 +38,44 @@ public class XWPFFootnote implements Iterable,IBody { private CTFtnEdn ctFtnEdn; private XWPFFootnotes footnotes; + private XWPFDocument document; public XWPFFootnote(CTFtnEdn note, XWPFFootnotes xFootnotes) { footnotes = xFootnotes; ctFtnEdn = note; - for (CTP p : ctFtnEdn.getPList()) { - paragraphs.add(new XWPFParagraph(p, this)); - } + document = xFootnotes.getXWPFDocument(); + init(); } public XWPFFootnote(XWPFDocument document, CTFtnEdn body) { - for (CTP p : body.getPList()) { - paragraphs.add(new XWPFParagraph(p, document)); - } + ctFtnEdn = body; + this.document = document; + init(); } + + private void init(){ + XmlCursor cursor = ctFtnEdn.newCursor(); + //copied from XWPFDocument...should centralize this code + //to avoid duplication + cursor.selectPath("./*"); + while (cursor.toNextSelection()) { + XmlObject o = cursor.getObject(); + if (o instanceof CTP) { + XWPFParagraph p = new XWPFParagraph((CTP) o, this); + bodyElements.add(p); + paragraphs.add(p); + } else if (o instanceof CTTbl) { + XWPFTable t = new XWPFTable((CTTbl) o, this); + bodyElements.add(t); + tables.add(t); + } else if (o instanceof CTSdtBlock){ + XWPFSDT c = new XWPFSDT((CTSdtBlock)o, this); + bodyElements.add(c); + } + + } + cursor.dispose(); + } public List getParagraphs() { return paragraphs; @@ -314,7 +339,7 @@ public class XWPFFootnote implements Iterable,IBody { * @see org.apache.poi.xwpf.usermodel.IBody#getXWPFDocument() */ public XWPFDocument getXWPFDocument() { - return footnotes.getXWPFDocument(); + return document; } /** diff --git a/src/ooxml/java/org/apache/poi/xwpf/usermodel/XWPFHeader.java b/src/ooxml/java/org/apache/poi/xwpf/usermodel/XWPFHeader.java index f18e0c7bd..a07bf4141 100644 --- a/src/ooxml/java/org/apache/poi/xwpf/usermodel/XWPFHeader.java +++ b/src/ooxml/java/org/apache/poi/xwpf/usermodel/XWPFHeader.java @@ -35,6 +35,7 @@ import org.apache.xmlbeans.XmlOptions; import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTHdrFtr; import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTNumbering; import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTP; +import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTSdtBlock; import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTTbl; import org.openxmlformats.schemas.wordprocessingml.x2006.main.HdrDocument; @@ -121,6 +122,10 @@ public class XWPFHeader extends XWPFHeaderFooter { tables.add(t); bodyElements.add(t); } + if (o instanceof CTSdtBlock){ + XWPFSDT c = new XWPFSDT((CTSdtBlock)o, this); + bodyElements.add(c); + } } cursor.dispose(); } catch (XmlException e) { diff --git a/src/ooxml/java/org/apache/poi/xwpf/usermodel/XWPFHeaderFooter.java b/src/ooxml/java/org/apache/poi/xwpf/usermodel/XWPFHeaderFooter.java index 7a2dc9a74..fa65abb2f 100644 --- a/src/ooxml/java/org/apache/poi/xwpf/usermodel/XWPFHeaderFooter.java +++ b/src/ooxml/java/org/apache/poi/xwpf/usermodel/XWPFHeaderFooter.java @@ -129,7 +129,7 @@ public abstract class XWPFHeaderFooter extends POIXMLDocumentPart implements IBo */ public String getText() { StringBuffer t = new StringBuffer(); - + //TODO: simplify this to get ibody elements in order for(int i=0; i */ -public class XWPFParagraph implements IBodyElement { +public class XWPFParagraph implements IBodyElement, IRunBody, ISDTContents { private final CTP paragraph; protected IBody part; /** For access to the document's hyperlink, comments, tables etc */ protected XWPFDocument document; protected List runs; + protected List iruns; private StringBuffer footnoteText = new StringBuffer(); @@ -82,6 +84,7 @@ public class XWPFParagraph implements IBodyElement { // Build up the character runs runs = new ArrayList(); + iruns = new ArrayList(); buildRunsInOrderFromXml(paragraph); // Look for bits associated with the runs @@ -96,7 +99,7 @@ public class XWPFParagraph implements IBodyElement { XmlObject o = c.getObject(); if(o instanceof CTFtnEdnRef) { CTFtnEdnRef ftn = (CTFtnEdnRef)o; - footnoteText.append("[").append(ftn.getId()).append(": "); + footnoteText.append(" [").append(ftn.getId()).append(": "); XWPFFootnote footnote = ftn.getDomNode().getLocalName().equals("footnoteReference") ? document.getFootnoteByID(ftn.getId().intValue()) : @@ -111,7 +114,7 @@ public class XWPFParagraph implements IBodyElement { footnoteText.append(p.getText()); } - footnoteText.append("]"); + footnoteText.append("] "); } } c.dispose(); @@ -129,30 +132,40 @@ public class XWPFParagraph implements IBodyElement { while (c.toNextSelection()) { XmlObject o = c.getObject(); if (o instanceof CTR) { - runs.add(new XWPFRun((CTR) o, this)); - } - if (o instanceof CTHyperlink) { - CTHyperlink link = (CTHyperlink) o; - for (CTR r : link.getRList()) { - runs.add(new XWPFHyperlinkRun(link, r, this)); - } - } - if (o instanceof CTSdtRun) { - CTSdtContentRun run = ((CTSdtRun) o).getSdtContent(); - for (CTR r : run.getRList()) { - runs.add(new XWPFRun(r, this)); - } - } - if (o instanceof CTRunTrackChange) { - for (CTR r : ((CTRunTrackChange) o).getRList()) { - runs.add(new XWPFRun(r, this)); - } - } - if (o instanceof CTSimpleField) { - for (CTR r : ((CTSimpleField) o).getRList()) { - runs.add(new XWPFRun(r, this)); - } - } + XWPFRun r = new XWPFRun((CTR) o, this); + runs.add(r); + iruns.add(r); + } + if (o instanceof CTHyperlink) { + CTHyperlink link = (CTHyperlink) o; + for (CTR r : link.getRList()) { + XWPFHyperlinkRun hr = new XWPFHyperlinkRun(link, r, this); + runs.add(hr); + iruns.add(hr); + } + } + if (o instanceof CTSdtBlock) { + XWPFSDT cc = new XWPFSDT((CTSdtBlock) o, part); + iruns.add(cc); + } + if (o instanceof CTSdtRun) { + XWPFSDT cc = new XWPFSDT((CTSdtRun) o, part); + iruns.add(cc); + } + if (o instanceof CTRunTrackChange) { + for (CTR r : ((CTRunTrackChange) o).getRList()) { + XWPFRun cr = new XWPFRun(r, this); + runs.add(cr); + iruns.add(cr); + } + } + if (o instanceof CTSimpleField) { + for (CTR r : ((CTSimpleField) o).getRList()) { + XWPFRun cr = new XWPFRun(r, this); + runs.add(cr); + iruns.add(cr); + } + } if (o instanceof CTSmartTagRun) { // Smart Tags can be nested many times. // This implementation does not preserve the tagging information @@ -170,7 +183,15 @@ public class XWPFParagraph implements IBodyElement { public List getRuns(){ return Collections.unmodifiableList(runs); } - + + /** + * Return literal runs and sdt/content control objects. + * @return List + */ + public List getIRuns() { + return Collections.unmodifiableList(iruns); + } + public boolean isEmpty(){ return !paragraph.getDomNode().hasChildNodes(); } @@ -181,12 +202,16 @@ public class XWPFParagraph implements IBodyElement { /** * Return the textual content of the paragraph, including text from pictures - * in it. + * and sdt elements in it. */ public String getText() { StringBuffer out = new StringBuffer(); - for(XWPFRun run : runs) { - out.append(run.toString()); + for (IRunElement run : iruns) { + if (run instanceof XWPFSDT){ + out.append(((XWPFSDT)run).getContent().getText()); + } else { + out.append(run.toString()); + } } out.append(footnoteText); return out.toString(); diff --git a/src/ooxml/java/org/apache/poi/xwpf/usermodel/XWPFPicture.java b/src/ooxml/java/org/apache/poi/xwpf/usermodel/XWPFPicture.java index 7dec18513..eba552cc2 100644 --- a/src/ooxml/java/org/apache/poi/xwpf/usermodel/XWPFPicture.java +++ b/src/ooxml/java/org/apache/poi/xwpf/usermodel/XWPFPicture.java @@ -67,7 +67,7 @@ public class XWPFPicture { } String blipId = blipProps.getBlip().getEmbed(); - POIXMLDocumentPart part = run.getParagraph().getPart(); + POIXMLDocumentPart part = run.getParent().getPart(); if (part != null) { POIXMLDocumentPart relatedPart = part.getRelationById(blipId); diff --git a/src/ooxml/java/org/apache/poi/xwpf/usermodel/XWPFRun.java b/src/ooxml/java/org/apache/poi/xwpf/usermodel/XWPFRun.java index d6bc531bf..3e25656f0 100644 --- a/src/ooxml/java/org/apache/poi/xwpf/usermodel/XWPFRun.java +++ b/src/ooxml/java/org/apache/poi/xwpf/usermodel/XWPFRun.java @@ -74,24 +74,20 @@ import org.openxmlformats.schemas.drawingml.x2006.picture.CTPictureNonVisual; /** * XWPFRun object defines a region of text with a common set of properties - * - * @author Yegor Kozlov - * @author Gregg Morris (gregg dot morris at gmail dot com) - added getColor(), setColor() - * */ -public class XWPFRun { +public class XWPFRun implements ISDTContents, IRunElement{ private CTR run; private String pictureText; - private XWPFParagraph paragraph; + private IRunBody parent; private List pictures; /** * @param r the CTR bean which holds the run attributes * @param p the parent paragraph */ - public XWPFRun(CTR r, XWPFParagraph p) { + public XWPFRun(CTR r, IRunBody p) { this.run = r; - this.paragraph = p; + this.parent = p; /** * reserve already occupied drawing ids, so reserving new ids later will @@ -143,6 +139,12 @@ public class XWPFRun { } } } + /** + * @deprecated Use {@link XWPFRun#XWPFRun(CTR, IRunBody)} + */ + public XWPFRun(CTR r, XWPFParagraph p) { + this(r, (IRunBody)p); + } private List getCTPictures(XmlObject o) { List pictures = new ArrayList(); @@ -173,11 +175,20 @@ public class XWPFRun { } /** - * Get the currenty referenced paragraph object - * @return current paragraph + * Get the currently referenced paragraph/SDT object + * @return current parent + */ + public IRunBody getParent() { + return parent; + } + /** + * Get the currently referenced paragraph, or null if a SDT object + * @deprecated use {@link XWPFRun#getParent()} instead */ public XWPFParagraph getParagraph() { - return paragraph; + if (parent instanceof XWPFParagraph) + return (XWPFParagraph)parent; + return null; } /** @@ -185,8 +196,8 @@ public class XWPFRun { * null if parent structure (paragraph > document) is not properly set. */ public XWPFDocument getDocument() { - if (paragraph != null) { - return paragraph.getDocument(); + if (parent != null) { + return parent.getDocument(); } return null; } @@ -663,7 +674,7 @@ public class XWPFRun { */ public XWPFPicture addPicture(InputStream pictureData, int pictureType, String filename, int width, int height) throws InvalidFormatException, IOException { - XWPFDocument doc = paragraph.document; + XWPFDocument doc = parent.getDocument(); // Add the picture + relationship String relationId = doc.addPictureData(pictureData, pictureType); @@ -691,7 +702,7 @@ public class XWPFRun { inline.setDistL(0); CTNonVisualDrawingProps docPr = inline.addNewDocPr(); - long id = getParagraph().document.getDrawingIdManager().reserveNew(); + long id = getParent().getDocument().getDrawingIdManager().reserveNew(); docPr.setId(id); /* This name is not visible in Word 2010 anywhere. */ docPr.setName("Drawing " + id); diff --git a/src/ooxml/java/org/apache/poi/xwpf/usermodel/XWPFSDT.java b/src/ooxml/java/org/apache/poi/xwpf/usermodel/XWPFSDT.java new file mode 100644 index 000000000..a17d51f8d --- /dev/null +++ b/src/ooxml/java/org/apache/poi/xwpf/usermodel/XWPFSDT.java @@ -0,0 +1,110 @@ +/* ==================================================================== + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +==================================================================== */ +package org.apache.poi.xwpf.usermodel; + +import java.util.List; + +import org.apache.poi.POIXMLDocumentPart; +import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTSdtBlock; +import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTSdtPr; +import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTSdtRun; +import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTString; + +/** + * Experimental class to offer rudimentary read-only processing of + * of StructuredDocumentTags/ContentControl + * + * + * + * WARNING - APIs expected to change rapidly + * + */ +public class XWPFSDT implements IBodyElement, IRunBody, ISDTContents, IRunElement { + private final String title; + private final String tag; + private final XWPFSDTContent content; + private final IBody part; + + public XWPFSDT(CTSdtRun sdtRun, IBody part){ + this.part = part; + this.content = new XWPFSDTContent(sdtRun.getSdtContent(), part, this); + CTSdtPr pr = sdtRun.getSdtPr(); + List aliases = pr.getAliasList(); + if (aliases != null && aliases.size() > 0){ + title = aliases.get(0).getVal(); + } else { + title = ""; + } + @SuppressWarnings("deprecation") + CTString[] array = pr.getTagArray(); + if (array != null && array.length > 0){ + tag = array[0].getVal(); + } else { + tag = ""; + } + + } + public XWPFSDT(CTSdtBlock block, IBody part){ + this.part = part; + this.content = new XWPFSDTContent( block.getSdtContent(), part, this); + CTSdtPr pr = block.getSdtPr(); + List aliases = pr.getAliasList(); + if (aliases != null && aliases.size() > 0){ + title = aliases.get(0).getVal(); + } else { + title = ""; + } + @SuppressWarnings("deprecation") + CTString[] array = pr.getTagArray(); + if (array != null && array.length > 0){ + tag = array[0].getVal(); + } else { + tag = ""; + } + + } + public String getTitle(){ + return title; + } + public String getTag(){ + return tag; + } + public XWPFSDTContent getContent(){ + return content; + } + + public IBody getBody() { + // TODO Auto-generated method stub + return null; + } + + public POIXMLDocumentPart getPart() { + return part.getPart(); + } + + public BodyType getPartType() { + return BodyType.CONTENTCONTROL; + } + + public BodyElementType getElementType() { + return BodyElementType.CONTENTCONTROL; + } + + public XWPFDocument getDocument() { + return part.getXWPFDocument(); + } +} diff --git a/src/ooxml/java/org/apache/poi/xwpf/usermodel/XWPFSDTContent.java b/src/ooxml/java/org/apache/poi/xwpf/usermodel/XWPFSDTContent.java new file mode 100644 index 000000000..39b385396 --- /dev/null +++ b/src/ooxml/java/org/apache/poi/xwpf/usermodel/XWPFSDTContent.java @@ -0,0 +1,107 @@ +/* ==================================================================== + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +==================================================================== */ +package org.apache.poi.xwpf.usermodel; + +import java.util.ArrayList; +import java.util.List; + + +import org.apache.xmlbeans.XmlCursor; +import org.apache.xmlbeans.XmlObject; +import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTP; +import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTR; + +import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTSdtBlock; +import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTSdtContentBlock; +import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTSdtContentRun; +import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTTbl; + +/** + * Experimental class to offer rudimentary read-only processing of + * of the contentblock of an SDT/ContentControl. + * + * + * + * WARNING - APIs expected to change rapidly + * + */ +public class XWPFSDTContent { + + // private final IBody part; + // private final XWPFDocument document; + private List paragraphs = new ArrayList(); + private List tables = new ArrayList(); + private List runs = new ArrayList(); + private List contentControls = new ArrayList(); + private List bodyElements = new ArrayList(); + + public XWPFSDTContent(CTSdtContentRun sdtRun, IBody part, IRunBody parent){ + for (CTR ctr : sdtRun.getRList()){ + XWPFRun run = new XWPFRun((CTR) ctr, parent); + runs.add(run); + bodyElements.add(run); + } + } + public XWPFSDTContent(CTSdtContentBlock block, IBody part, IRunBody parent){ + XmlCursor cursor = block.newCursor(); + cursor.selectPath("./*"); + while (cursor.toNextSelection()) { + XmlObject o = cursor.getObject(); + if (o instanceof CTP) { + XWPFParagraph p = new XWPFParagraph((CTP) o, part); + bodyElements.add(p); + paragraphs.add(p); + } else if (o instanceof CTTbl) { + XWPFTable t = new XWPFTable((CTTbl) o, part); + bodyElements.add(t); + tables.add(t); + } else if (o instanceof CTSdtBlock){ + XWPFSDT c = new XWPFSDT(((CTSdtBlock)o), part); + bodyElements.add(c); + contentControls.add(c); + } else if (o instanceof CTR) { + XWPFRun run = new XWPFRun((CTR) o, parent); + runs.add(run); + bodyElements.add(run); + } + } + } + + public String getText(){ + StringBuilder text = new StringBuilder(); + for (int i = 0; i < bodyElements.size(); i++){ + Object o = bodyElements.get(i); + if (o instanceof XWPFParagraph){ + text.append(((XWPFParagraph)o).getText()); + } else if (o instanceof XWPFTable){ + text.append(((XWPFTable)o).getText()); + } else if (o instanceof XWPFSDT){ + text.append(((XWPFSDT)o).getContent().getText()); + } else if (o instanceof XWPFRun){ + text.append(((XWPFRun)o).toString()); + } + if (i < bodyElements.size()-1){ + text.append("\n"); + } + } + return text.toString(); + } + + public String toString(){ + return getText(); + } +} diff --git a/src/ooxml/java/org/apache/poi/xwpf/usermodel/XWPFTable.java b/src/ooxml/java/org/apache/poi/xwpf/usermodel/XWPFTable.java index 8152b25c9..c2b1c175f 100644 --- a/src/ooxml/java/org/apache/poi/xwpf/usermodel/XWPFTable.java +++ b/src/ooxml/java/org/apache/poi/xwpf/usermodel/XWPFTable.java @@ -43,7 +43,7 @@ import org.openxmlformats.schemas.wordprocessingml.x2006.main.STTblWidth; *

Specifies the contents of a table present in the document. A table is a set * of paragraphs (and other block-level content) arranged in rows and columns.

*/ -public class XWPFTable implements IBodyElement { +public class XWPFTable implements IBodyElement, ISDTContents { protected StringBuffer text = new StringBuffer(); private CTTbl ctTbl; protected List tableRows; diff --git a/src/ooxml/java/org/apache/poi/xwpf/usermodel/XWPFTableCell.java b/src/ooxml/java/org/apache/poi/xwpf/usermodel/XWPFTableCell.java index 192add4d2..148bd2072 100644 --- a/src/ooxml/java/org/apache/poi/xwpf/usermodel/XWPFTableCell.java +++ b/src/ooxml/java/org/apache/poi/xwpf/usermodel/XWPFTableCell.java @@ -28,6 +28,8 @@ import org.apache.xmlbeans.XmlCursor; import org.apache.xmlbeans.XmlObject; import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTP; import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTRow; +import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTSdtBlock; +import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTSdtRun; import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTShd; import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTTbl; import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTTc; @@ -97,6 +99,15 @@ public class XWPFTableCell implements IBody { tables.add(t); bodyElements.add(t); } + if (o instanceof CTSdtBlock){ + XWPFSDT c = new XWPFSDT((CTSdtBlock)o, this); + bodyElements.add(c); + } + if (o instanceof CTSdtRun){ + XWPFSDT c = new XWPFSDT((CTSdtRun)o, this); + System.out.println(c.getContent().getText()); + bodyElements.add(c); + } } cursor.dispose(); } @@ -407,6 +418,48 @@ public class XWPFTableCell implements IBody { return text.toString(); } + /** + * extracts all text recursively through embedded tables and embedded SDTs + */ + public String getTextRecursively(){ + + StringBuffer text = new StringBuffer(); + for (int i = 0; i < bodyElements.size(); i++){ + boolean isLast = (i== bodyElements.size()-1)? true : false; + appendBodyElementText(text, bodyElements.get(i), isLast); + } + + return text.toString(); + } + + private void appendBodyElementText(StringBuffer text, IBodyElement e, boolean isLast){ + if (e instanceof XWPFParagraph){ + text.append(((XWPFParagraph)e).getText()); + if (isLast == false){ + text.append('\t'); + } + } else if (e instanceof XWPFTable){ + XWPFTable eTable = (XWPFTable)e; + for (XWPFTableRow row : eTable.getRows()){ + for (XWPFTableCell cell : row.getTableCells()){ + List localBodyElements = cell.getBodyElements(); + for (int i = 0; i < localBodyElements.size(); i++){ + boolean localIsLast = (i== localBodyElements.size()-1)? true : false; + appendBodyElementText(text, localBodyElements.get(i), localIsLast); + } + } + } + + if (isLast == false){ + text.append('\n'); + } + } else if (e instanceof XWPFSDT){ + text.append(((XWPFSDT)e).getContent().getText()); + if (isLast == false){ + text.append('\t'); + } + } + } /** * get the TableCell which belongs to the TableCell diff --git a/src/ooxml/java/org/apache/poi/xwpf/usermodel/XWPFTableRow.java b/src/ooxml/java/org/apache/poi/xwpf/usermodel/XWPFTableRow.java index 0d042d591..a16b247a5 100644 --- a/src/ooxml/java/org/apache/poi/xwpf/usermodel/XWPFTableRow.java +++ b/src/ooxml/java/org/apache/poi/xwpf/usermodel/XWPFTableRow.java @@ -132,6 +132,9 @@ public class XWPFTableRow { for (CTTc tableCell : ctRow.getTcList()) { cells.add(new XWPFTableCell(tableCell, this, table.getBody())); } + //TODO: it is possible to have an SDT that contains a cell in within a row + //need to modify this code so that it pulls out SDT wrappers around cells, too. + this.tableCells = cells; } return tableCells; diff --git a/src/ooxml/testcases/org/apache/poi/xwpf/extractor/TestXWPFWordExtractor.java b/src/ooxml/testcases/org/apache/poi/xwpf/extractor/TestXWPFWordExtractor.java index f62749f52..9ad3cc102 100644 --- a/src/ooxml/testcases/org/apache/poi/xwpf/extractor/TestXWPFWordExtractor.java +++ b/src/ooxml/testcases/org/apache/poi/xwpf/extractor/TestXWPFWordExtractor.java @@ -80,10 +80,10 @@ public class TestXWPFWordExtractor extends TestCase { " \n(V) ILLUSTRATIVE CASES\n\n" )); assertTrue(text.contains( - "As well as gaining " + euro + "90 from child benefit increases, he will also receive the early childhood supplement of " + euro + "250 per quarter for Vincent for the full four quarters of the year.\n\n\n\n \n\n\n" + "As well as gaining " + euro + "90 from child benefit increases, he will also receive the early childhood supplement of " + euro + "250 per quarter for Vincent for the full four quarters of the year.\n\n\n\n"// \n\n\n" )); assertTrue(text.endsWith( - "11.4%\t\t90\t\t\t\t\t250\t\t1,310\t\n\n" + "11.4%\t\t90\t\t\t\t\t250\t\t1,310\t\n\n \n\n\n" )); // Check number of paragraphs @@ -317,4 +317,39 @@ public class TestXWPFWordExtractor extends TestCase { extractor.close(); } + + /** + * Test for basic extraction of SDT content + * @throws IOException + */ + public void testSimpleControlContent() throws IOException { + XWPFDocument doc = XWPFTestDataSamples.openSampleDocument("Bug54849.docx"); + String[] targs = new String[]{ + "header_rich_text", + "rich_text", + "rich_text_pre_table\nrich_text_cell1\t\t\t\n\nrich_text_post_table", + "plain_text_no_newlines", + "plain_text_with_newlines1\nplain_text_with_newlines2\n", + "watermelon\n", + "dirt\n", + "4/16/2013\n", + "rich_text_in_paragraph_in_cell", + "footer_rich_text", + "footnote_sdt", + "endnote_sdt" + }; + XWPFWordExtractor ex = new XWPFWordExtractor(doc); + String s = ex.getText().toLowerCase(); + int hits = 0; + + for (String targ : targs){ + boolean hit = false; + if (s.indexOf(targ) > -1){ + hit = true; + hits++; + } + assertEquals("controlled content loading-"+targ, true, hit); + } + assertEquals("controlled content loading hit count", targs.length, hits); + } } diff --git a/src/ooxml/testcases/org/apache/poi/xwpf/usermodel/TestXWPFSDT.java b/src/ooxml/testcases/org/apache/poi/xwpf/usermodel/TestXWPFSDT.java new file mode 100644 index 000000000..5a25e9e2c --- /dev/null +++ b/src/ooxml/testcases/org/apache/poi/xwpf/usermodel/TestXWPFSDT.java @@ -0,0 +1,158 @@ +/* ==================================================================== + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +==================================================================== */ + +package org.apache.poi.xwpf.usermodel; + +import java.util.ArrayList; +import java.util.List; +import java.util.Map; + +import junit.framework.TestCase; + +import org.apache.poi.xwpf.XWPFTestDataSamples; + +public final class TestXWPFSDT extends TestCase { + + /** + * Test simple tag and title extraction from SDT + * @throws Exception + */ + public void testTagTitle() throws Exception { + XWPFDocument doc = XWPFTestDataSamples.openSampleDocument("Bug54849.docx"); + String tag = null; + String title= null; + List sdts = extractAllSDTs(doc); + for (XWPFSDT sdt :sdts){ + if (sdt.getContent().toString().equals("Rich_text")){ + tag = "MyTag"; + title = "MyTitle"; + break; + } + } + // TODO Fix footnotes issues then enable +// assertEquals("controls size", 12, sdts.size()); + + assertEquals("tag", "MyTag", tag); + assertEquals("title", "MyTitle", title); + } + + + public void testGetSDTs() throws Exception{ + String[] contents = new String[]{ + "header_rich_text", + "Rich_text", + "Rich_text_pre_table\nRich_text_cell1\t\t\t\n\nRich_text_post_table", + "Plain_text_no_newlines", + "Plain_text_with_newlines1\nplain_text_with_newlines2", + "Watermelon", + "Dirt", + "4/16/2013", + "rich_text_in_paragraph_in_cell", + "Footer_rich_text", + "Footnote_sdt", + "Endnote_sdt" + + }; + XWPFDocument doc = XWPFTestDataSamples.openSampleDocument("Bug54849.docx"); + List sdts = extractAllSDTs(doc); + + // TODO Fix footnotes issue +/* + assertEquals("number of sdts", contents.length, sdts.size()); + + for (int i = 0; i < sdts.size(); i++){//contents.length; i++){ + XWPFSDT sdt = sdts.get(i); + + assertEquals(i+ ": " + contents[i], contents[i], sdt.getContent().toString()); + } +*/ + } + + public void testFailureToGetSDTAsCell() throws Exception{ + /** + * The current code fails to extract an sdt if it comprises/is the parent + * of a cell in a table. + */ + XWPFDocument doc = XWPFTestDataSamples.openSampleDocument("Bug54849.docx"); + List sdts = extractAllSDTs(doc); + boolean found = false; + for (XWPFSDT sdt : sdts){ + if (sdt.getContent().getText().toLowerCase().indexOf("rich_text_in_cell") > -1){ + found = true; + } + } + assertEquals("SDT as cell known failure", false, found); + } + + private List extractAllSDTs(XWPFDocument doc){ + + List sdts = new ArrayList(); + + List headers = doc.getHeaderList(); + for (XWPFHeader header : headers){ + sdts.addAll(extractSDTsFromBodyElements(header.getBodyElements())); + } + sdts.addAll(extractSDTsFromBodyElements(doc.getBodyElements())); + + List footers = doc.getFooterList(); + for (XWPFFooter footer : footers){ + sdts.addAll(extractSDTsFromBodyElements(footer.getBodyElements())); + } + + for (XWPFFootnote footnote : doc.getFootnotes()){ + + sdts.addAll(extractSDTsFromBodyElements(footnote.getBodyElements())); + } + for (Map.Entry e : doc.endnotes.entrySet()){ + sdts.addAll(extractSDTsFromBodyElements(e.getValue().getBodyElements())); + } + return sdts; + } + + private List extractSDTsFromBodyElements(List elements){ + List sdts = new ArrayList(); + for (IBodyElement e : elements){ + if (e instanceof XWPFSDT){ + XWPFSDT sdt = (XWPFSDT)e; + sdts.add(sdt); + } else if (e instanceof XWPFParagraph){ + + XWPFParagraph p = (XWPFParagraph)e; + for (IRunElement e2 : p.getIRuns()){ + if (e2 instanceof XWPFSDT){ + XWPFSDT sdt = (XWPFSDT)e2; + sdts.add(sdt); + } + } + } else if (e instanceof XWPFTable){ + XWPFTable table = (XWPFTable)e; + sdts.addAll(extractSDTsFromTable(table)); + } + } + return sdts; + } + + private List extractSDTsFromTable(XWPFTable table){ + List sdts = new ArrayList(); + for (XWPFTableRow r : table.getRows()){ + for (XWPFTableCell c : r.getTableCells()){ + sdts.addAll(extractSDTsFromBodyElements(c.getBodyElements())); + } + } + return sdts; + } +}