Patch from Tim Allison from bug #54849 - Controlled content/Form (Std/StdBlock) content

git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@1494376 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Nick Burch 2013-06-18 23:35:11 +00:00
parent 426a6f3451
commit 58b92caebe
22 changed files with 777 additions and 117 deletions

View File

@ -18,6 +18,7 @@ package org.apache.poi.xwpf.extractor;
import java.io.IOException;
import java.util.Iterator;
import java.util.List;
import org.apache.poi.POIXMLDocument;
import org.apache.poi.POIXMLException;
@ -26,13 +27,18 @@ import org.apache.poi.openxml4j.exceptions.OpenXML4JException;
import org.apache.poi.openxml4j.opc.OPCPackage;
import org.apache.poi.xwpf.model.XWPFCommentsDecorator;
import org.apache.poi.xwpf.model.XWPFHeaderFooterPolicy;
import org.apache.poi.xwpf.usermodel.IBodyElement;
import org.apache.poi.xwpf.usermodel.IRunElement;
import org.apache.poi.xwpf.usermodel.XWPFDocument;
import org.apache.poi.xwpf.usermodel.XWPFHyperlink;
import org.apache.poi.xwpf.usermodel.XWPFHyperlinkRun;
import org.apache.poi.xwpf.usermodel.XWPFParagraph;
import org.apache.poi.xwpf.usermodel.XWPFRelation;
import org.apache.poi.xwpf.usermodel.XWPFRun;
import org.apache.poi.xwpf.usermodel.XWPFSDT;
import org.apache.poi.xwpf.usermodel.XWPFTable;
import org.apache.poi.xwpf.usermodel.XWPFTableCell;
import org.apache.poi.xwpf.usermodel.XWPFTableRow;
import org.apache.xmlbeans.XmlException;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTSectPr;
@ -86,59 +92,11 @@ public class XWPFWordExtractor extends POIXMLTextExtractor {
// Start out with all headers
extractHeaders(text, hfPolicy);
// First up, all our paragraph based text
Iterator<XWPFParagraph> i = document.getParagraphsIterator();
while(i.hasNext()) {
XWPFParagraph paragraph = i.next();
try {
CTSectPr ctSectPr = null;
if (paragraph.getCTP().getPPr()!=null) {
ctSectPr = paragraph.getCTP().getPPr().getSectPr();
}
XWPFHeaderFooterPolicy headerFooterPolicy = null;
if (ctSectPr!=null) {
headerFooterPolicy = new XWPFHeaderFooterPolicy(document, ctSectPr);
extractHeaders(text, headerFooterPolicy);
}
// Do the paragraph text
for(XWPFRun run : paragraph.getRuns()) {
text.append(run.toString());
if(run instanceof XWPFHyperlinkRun && fetchHyperlinks) {
XWPFHyperlink link = ((XWPFHyperlinkRun)run).getHyperlink(document);
if(link != null)
text.append(" <" + link.getURL() + ">");
}
}
// Add comments
XWPFCommentsDecorator decorator = new XWPFCommentsDecorator(paragraph, null);
text.append(decorator.getCommentText()).append('\n');
// Do endnotes and footnotes
String footnameText = paragraph.getFootnoteText();
if(footnameText != null && footnameText.length() > 0) {
text.append(footnameText + "\n");
}
if (ctSectPr!=null) {
extractFooters(text, headerFooterPolicy);
}
} catch (IOException e) {
throw new POIXMLException(e);
} catch (XmlException e) {
throw new POIXMLException(e);
}
}
// Then our table based text
Iterator<XWPFTable> j = document.getTablesIterator();
while(j.hasNext()) {
text.append(j.next().getText()).append('\n');
}
// body elements
for (IBodyElement e : document.getBodyElements()){
appendBodyElementText(text, e);
text.append('\n');
}
// Finish up with all the footers
extractFooters(text, hfPolicy);
@ -146,6 +104,79 @@ public class XWPFWordExtractor extends POIXMLTextExtractor {
return text.toString();
}
public void appendBodyElementText(StringBuffer text, IBodyElement e){
if (e instanceof XWPFParagraph){
appendParagraphText(text, (XWPFParagraph)e);
} else if (e instanceof XWPFTable){
appendTableText(text, (XWPFTable)e);
} else if (e instanceof XWPFSDT){
text.append(((XWPFSDT)e).getContent().getText());
}
}
public void appendParagraphText(StringBuffer text, XWPFParagraph paragraph){
try {
CTSectPr ctSectPr = null;
if (paragraph.getCTP().getPPr()!=null) {
ctSectPr = paragraph.getCTP().getPPr().getSectPr();
}
XWPFHeaderFooterPolicy headerFooterPolicy = null;
if (ctSectPr!=null) {
headerFooterPolicy = new XWPFHeaderFooterPolicy(document, ctSectPr);
extractHeaders(text, headerFooterPolicy);
}
for(IRunElement run : paragraph.getRuns()) {
text.append(run.toString());
if(run instanceof XWPFHyperlinkRun && fetchHyperlinks) {
XWPFHyperlink link = ((XWPFHyperlinkRun)run).getHyperlink(document);
if(link != null)
text.append(" <" + link.getURL() + ">");
}
}
// Add comments
XWPFCommentsDecorator decorator = new XWPFCommentsDecorator(paragraph, null);
String commentText = decorator.getCommentText();
if (commentText.length() > 0){
text.append(commentText).append('\n');
}
// Do endnotes and footnotes
String footnameText = paragraph.getFootnoteText();
if(footnameText != null && footnameText.length() > 0) {
text.append(footnameText + '\n');
}
if (ctSectPr!=null) {
extractFooters(text, headerFooterPolicy);
}
} catch (IOException e) {
throw new POIXMLException(e);
} catch (XmlException e) {
throw new POIXMLException(e);
}
}
private void appendTableText(StringBuffer text, XWPFTable table){
//this works recursively to pull embedded tables from tables
for (XWPFTableRow row : table.getRows()){
List<XWPFTableCell> cells = row.getTableCells();
for (int i = 0; i < cells.size(); i++){
XWPFTableCell cell = cells.get(i);
text.append(cell.getTextRecursively());
if (i < cells.size()-1){
text.append("\t");
}
}
text.append('\n');
}
}
private void extractFooters(StringBuffer text, XWPFHeaderFooterPolicy hfPolicy) {
if(hfPolicy.getFirstPageFooter() != null) {
text.append( hfPolicy.getFirstPageFooter().getText() );

View File

@ -28,7 +28,8 @@ package org.apache.poi.xwpf.usermodel;
*
*/
public enum BodyElementType {
PARAGRAPH,
CONTENTCONTROL,
PARAGRAPH,
TABLE,
}

View File

@ -21,6 +21,7 @@ package org.apache.poi.xwpf.usermodel;
* The different kinds of {@link IBody} that exist
*/
public enum BodyType {
CONTENTCONTROL,
DOCUMENT,
HEADER,
FOOTER,

View File

@ -0,0 +1,28 @@
/* ====================================================================
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==================================================================== */
package org.apache.poi.xwpf.usermodel;
import org.apache.poi.POIXMLDocumentPart;
/**
* Simple interface describing both {@link XWPFParagraph}
* and {@link XWPFSDT}
*/
public interface IRunBody {
public XWPFDocument getDocument();
public POIXMLDocumentPart getPart();
}

View File

@ -0,0 +1,26 @@
/* ====================================================================
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==================================================================== */
package org.apache.poi.xwpf.usermodel;
/**
* Common interface for things that can occur
* where a run (text with common stylings) can,
* eg {@link XWPFRun} or {@link XWPFSDT}.
* More methods to follow shortly!
*/
public interface IRunElement {
}

View File

@ -0,0 +1,25 @@
/* ====================================================================
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==================================================================== */
package org.apache.poi.xwpf.usermodel;
/**
* Interface for anything that can be within a STD:
* {@link XWPFRun}, {@link XWPFTable}, {@link XWPFParagraph},
* {@link XWPFSDT} etc
*/
public interface ISDTContents {
}

View File

@ -98,6 +98,7 @@ public class XWPFDocument extends POIXMLDocument implements Document, IBody {
protected List<XWPFHyperlink> hyperlinks = new ArrayList<XWPFHyperlink>();
protected List<XWPFParagraph> paragraphs = new ArrayList<XWPFParagraph>();
protected List<XWPFTable> tables = new ArrayList<XWPFTable>();
protected List<XWPFSDT> contentControls = new ArrayList<XWPFSDT>();
protected List<IBodyElement> bodyElements = new ArrayList<IBodyElement>();
protected List<XWPFPictureData> pictures = new ArrayList<XWPFPictureData>();
protected Map<Long, List<XWPFPictureData>> packagePictures = new HashMap<Long, List<XWPFPictureData>>();
@ -150,7 +151,11 @@ public class XWPFDocument extends POIXMLDocument implements Document, IBody {
XWPFTable t = new XWPFTable((CTTbl) o, this);
bodyElements.add(t);
tables.add(t);
}
} else if (o instanceof CTSdtBlock){
XWPFSDT c = new XWPFSDT((CTSdtBlock)o, this);
bodyElements.add(c);
contentControls.add(c);
}
}
cursor.dispose();
@ -230,10 +235,10 @@ public class XWPFDocument extends POIXMLDocument implements Document, IBody {
for(POIXMLDocumentPart p : getRelations()){
String relation = p.getPackageRelationship().getRelationshipType();
if (relation.equals(XWPFRelation.FOOTNOTE.getRelation())) {
FootnotesDocument footnotesDocument = FootnotesDocument.Factory.parse(p.getPackagePart().getInputStream());
this.footnotes = (XWPFFootnotes)p;
this.footnotes.onDocumentRead();
// Warning - this apparently doubles footnotes - see bug #????
FootnotesDocument footnotesDocument = FootnotesDocument.Factory.parse(p.getPackagePart().getInputStream());
for(CTFtnEdn ctFtnEdn : footnotesDocument.getFootnotes().getFootnoteList()) {
footnotes.addFootnote(ctFtnEdn);
}

View File

@ -34,6 +34,7 @@ import org.apache.xmlbeans.XmlOptions;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTHdrFtr;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTNumbering;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTP;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTSdtBlock;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTTbl;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.FtrDocument;
@ -61,6 +62,7 @@ public class XWPFFooter extends XWPFHeaderFooter {
tables.add(t);
bodyElements.add(t);
}
}
cursor.dispose();
}
@ -118,6 +120,10 @@ public class XWPFFooter extends XWPFHeaderFooter {
tables.add(t);
bodyElements.add(t);
}
if (o instanceof CTSdtBlock){
XWPFSDT c = new XWPFSDT((CTSdtBlock)o, this);
bodyElements.add(c);
}
}
cursor.dispose();
} catch (Exception e) {

View File

@ -26,6 +26,7 @@ import org.apache.xmlbeans.XmlObject;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTFtnEdn;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTP;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTRow;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTSdtBlock;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTTbl;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTTc;
@ -37,21 +38,45 @@ public class XWPFFootnote implements Iterable<XWPFParagraph>,IBody {
private CTFtnEdn ctFtnEdn;
private XWPFFootnotes footnotes;
private XWPFDocument document;
public XWPFFootnote(CTFtnEdn note, XWPFFootnotes xFootnotes) {
footnotes = xFootnotes;
ctFtnEdn = note;
for (CTP p : ctFtnEdn.getPList()) {
paragraphs.add(new XWPFParagraph(p, this));
}
document = xFootnotes.getXWPFDocument();
init();
}
public XWPFFootnote(XWPFDocument document, CTFtnEdn body) {
for (CTP p : body.getPList()) {
paragraphs.add(new XWPFParagraph(p, document));
}
ctFtnEdn = body;
this.document = document;
init();
}
private void init(){
XmlCursor cursor = ctFtnEdn.newCursor();
//copied from XWPFDocument...should centralize this code
//to avoid duplication
cursor.selectPath("./*");
while (cursor.toNextSelection()) {
XmlObject o = cursor.getObject();
if (o instanceof CTP) {
XWPFParagraph p = new XWPFParagraph((CTP) o, this);
bodyElements.add(p);
paragraphs.add(p);
} else if (o instanceof CTTbl) {
XWPFTable t = new XWPFTable((CTTbl) o, this);
bodyElements.add(t);
tables.add(t);
} else if (o instanceof CTSdtBlock){
XWPFSDT c = new XWPFSDT((CTSdtBlock)o, this);
bodyElements.add(c);
}
}
cursor.dispose();
}
public List<XWPFParagraph> getParagraphs() {
return paragraphs;
}
@ -314,7 +339,7 @@ public class XWPFFootnote implements Iterable<XWPFParagraph>,IBody {
* @see org.apache.poi.xwpf.usermodel.IBody#getXWPFDocument()
*/
public XWPFDocument getXWPFDocument() {
return footnotes.getXWPFDocument();
return document;
}
/**

View File

@ -35,6 +35,7 @@ import org.apache.xmlbeans.XmlOptions;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTHdrFtr;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTNumbering;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTP;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTSdtBlock;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTTbl;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.HdrDocument;
@ -121,6 +122,10 @@ public class XWPFHeader extends XWPFHeaderFooter {
tables.add(t);
bodyElements.add(t);
}
if (o instanceof CTSdtBlock){
XWPFSDT c = new XWPFSDT((CTSdtBlock)o, this);
bodyElements.add(c);
}
}
cursor.dispose();
} catch (XmlException e) {

View File

@ -129,7 +129,7 @@ public abstract class XWPFHeaderFooter extends POIXMLDocumentPart implements IBo
*/
public String getText() {
StringBuffer t = new StringBuffer();
//TODO: simplify this to get ibody elements in order
for(int i=0; i<paragraphs.size(); i++) {
if(! paragraphs.get(i).isEmpty()) {
String text = paragraphs.get(i).getText();
@ -149,6 +149,11 @@ public abstract class XWPFHeaderFooter extends POIXMLDocumentPart implements IBo
}
}
for (IBodyElement bodyElement : getBodyElements()){
if (bodyElement instanceof XWPFSDT){
t.append(((XWPFSDT) bodyElement).getContent().getText()+'\n');
}
}
return t.toString();
}

View File

@ -27,7 +27,7 @@ public class XWPFHyperlinkRun extends XWPFRun
{
private CTHyperlink hyperlink;
public XWPFHyperlinkRun(CTHyperlink hyperlink, CTR run, XWPFParagraph p) {
public XWPFHyperlinkRun(CTHyperlink hyperlink, CTR run, IRunBody p) {
super(run, p);
this.hyperlink = hyperlink;
}

View File

@ -40,6 +40,7 @@ import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTProofErr;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTR;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTRPr;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTRunTrackChange;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTSdtBlock;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTSdtContentRun;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTSdtRun;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTSimpleField;
@ -61,12 +62,13 @@ import org.openxmlformats.schemas.wordprocessingml.x2006.main.STTextAlignment;
* actual text (possibly along with more styling) is held on
* the child {@link XWPFRun}s.</p>
*/
public class XWPFParagraph implements IBodyElement {
public class XWPFParagraph implements IBodyElement, IRunBody, ISDTContents {
private final CTP paragraph;
protected IBody part;
/** For access to the document's hyperlink, comments, tables etc */
protected XWPFDocument document;
protected List<XWPFRun> runs;
protected List<IRunElement> iruns;
private StringBuffer footnoteText = new StringBuffer();
@ -82,6 +84,7 @@ public class XWPFParagraph implements IBodyElement {
// Build up the character runs
runs = new ArrayList<XWPFRun>();
iruns = new ArrayList<IRunElement>();
buildRunsInOrderFromXml(paragraph);
// Look for bits associated with the runs
@ -96,7 +99,7 @@ public class XWPFParagraph implements IBodyElement {
XmlObject o = c.getObject();
if(o instanceof CTFtnEdnRef) {
CTFtnEdnRef ftn = (CTFtnEdnRef)o;
footnoteText.append("[").append(ftn.getId()).append(": ");
footnoteText.append(" [").append(ftn.getId()).append(": ");
XWPFFootnote footnote =
ftn.getDomNode().getLocalName().equals("footnoteReference") ?
document.getFootnoteByID(ftn.getId().intValue()) :
@ -111,7 +114,7 @@ public class XWPFParagraph implements IBodyElement {
footnoteText.append(p.getText());
}
footnoteText.append("]");
footnoteText.append("] ");
}
}
c.dispose();
@ -129,30 +132,40 @@ public class XWPFParagraph implements IBodyElement {
while (c.toNextSelection()) {
XmlObject o = c.getObject();
if (o instanceof CTR) {
runs.add(new XWPFRun((CTR) o, this));
}
if (o instanceof CTHyperlink) {
CTHyperlink link = (CTHyperlink) o;
for (CTR r : link.getRList()) {
runs.add(new XWPFHyperlinkRun(link, r, this));
}
}
if (o instanceof CTSdtRun) {
CTSdtContentRun run = ((CTSdtRun) o).getSdtContent();
for (CTR r : run.getRList()) {
runs.add(new XWPFRun(r, this));
}
}
if (o instanceof CTRunTrackChange) {
for (CTR r : ((CTRunTrackChange) o).getRList()) {
runs.add(new XWPFRun(r, this));
}
}
if (o instanceof CTSimpleField) {
for (CTR r : ((CTSimpleField) o).getRList()) {
runs.add(new XWPFRun(r, this));
}
}
XWPFRun r = new XWPFRun((CTR) o, this);
runs.add(r);
iruns.add(r);
}
if (o instanceof CTHyperlink) {
CTHyperlink link = (CTHyperlink) o;
for (CTR r : link.getRList()) {
XWPFHyperlinkRun hr = new XWPFHyperlinkRun(link, r, this);
runs.add(hr);
iruns.add(hr);
}
}
if (o instanceof CTSdtBlock) {
XWPFSDT cc = new XWPFSDT((CTSdtBlock) o, part);
iruns.add(cc);
}
if (o instanceof CTSdtRun) {
XWPFSDT cc = new XWPFSDT((CTSdtRun) o, part);
iruns.add(cc);
}
if (o instanceof CTRunTrackChange) {
for (CTR r : ((CTRunTrackChange) o).getRList()) {
XWPFRun cr = new XWPFRun(r, this);
runs.add(cr);
iruns.add(cr);
}
}
if (o instanceof CTSimpleField) {
for (CTR r : ((CTSimpleField) o).getRList()) {
XWPFRun cr = new XWPFRun(r, this);
runs.add(cr);
iruns.add(cr);
}
}
if (o instanceof CTSmartTagRun) {
// Smart Tags can be nested many times.
// This implementation does not preserve the tagging information
@ -171,6 +184,14 @@ public class XWPFParagraph implements IBodyElement {
return Collections.unmodifiableList(runs);
}
/**
* Return literal runs and sdt/content control objects.
* @return List<IRunElement>
*/
public List<IRunElement> getIRuns() {
return Collections.unmodifiableList(iruns);
}
public boolean isEmpty(){
return !paragraph.getDomNode().hasChildNodes();
}
@ -181,12 +202,16 @@ public class XWPFParagraph implements IBodyElement {
/**
* Return the textual content of the paragraph, including text from pictures
* in it.
* and sdt elements in it.
*/
public String getText() {
StringBuffer out = new StringBuffer();
for(XWPFRun run : runs) {
out.append(run.toString());
for (IRunElement run : iruns) {
if (run instanceof XWPFSDT){
out.append(((XWPFSDT)run).getContent().getText());
} else {
out.append(run.toString());
}
}
out.append(footnoteText);
return out.toString();

View File

@ -67,7 +67,7 @@ public class XWPFPicture {
}
String blipId = blipProps.getBlip().getEmbed();
POIXMLDocumentPart part = run.getParagraph().getPart();
POIXMLDocumentPart part = run.getParent().getPart();
if (part != null)
{
POIXMLDocumentPart relatedPart = part.getRelationById(blipId);

View File

@ -74,24 +74,20 @@ import org.openxmlformats.schemas.drawingml.x2006.picture.CTPictureNonVisual;
/**
* XWPFRun object defines a region of text with a common set of properties
*
* @author Yegor Kozlov
* @author Gregg Morris (gregg dot morris at gmail dot com) - added getColor(), setColor()
*
*/
public class XWPFRun {
public class XWPFRun implements ISDTContents, IRunElement{
private CTR run;
private String pictureText;
private XWPFParagraph paragraph;
private IRunBody parent;
private List<XWPFPicture> pictures;
/**
* @param r the CTR bean which holds the run attributes
* @param p the parent paragraph
*/
public XWPFRun(CTR r, XWPFParagraph p) {
public XWPFRun(CTR r, IRunBody p) {
this.run = r;
this.paragraph = p;
this.parent = p;
/**
* reserve already occupied drawing ids, so reserving new ids later will
@ -143,6 +139,12 @@ public class XWPFRun {
}
}
}
/**
* @deprecated Use {@link XWPFRun#XWPFRun(CTR, IRunBody)}
*/
public XWPFRun(CTR r, XWPFParagraph p) {
this(r, (IRunBody)p);
}
private List<CTPicture> getCTPictures(XmlObject o) {
List<CTPicture> pictures = new ArrayList<CTPicture>();
@ -173,11 +175,20 @@ public class XWPFRun {
}
/**
* Get the currenty referenced paragraph object
* @return current paragraph
* Get the currently referenced paragraph/SDT object
* @return current parent
*/
public IRunBody getParent() {
return parent;
}
/**
* Get the currently referenced paragraph, or null if a SDT object
* @deprecated use {@link XWPFRun#getParent()} instead
*/
public XWPFParagraph getParagraph() {
return paragraph;
if (parent instanceof XWPFParagraph)
return (XWPFParagraph)parent;
return null;
}
/**
@ -185,8 +196,8 @@ public class XWPFRun {
* <code>null</code> if parent structure (paragraph > document) is not properly set.
*/
public XWPFDocument getDocument() {
if (paragraph != null) {
return paragraph.getDocument();
if (parent != null) {
return parent.getDocument();
}
return null;
}
@ -663,7 +674,7 @@ public class XWPFRun {
*/
public XWPFPicture addPicture(InputStream pictureData, int pictureType, String filename, int width, int height)
throws InvalidFormatException, IOException {
XWPFDocument doc = paragraph.document;
XWPFDocument doc = parent.getDocument();
// Add the picture + relationship
String relationId = doc.addPictureData(pictureData, pictureType);
@ -691,7 +702,7 @@ public class XWPFRun {
inline.setDistL(0);
CTNonVisualDrawingProps docPr = inline.addNewDocPr();
long id = getParagraph().document.getDrawingIdManager().reserveNew();
long id = getParent().getDocument().getDrawingIdManager().reserveNew();
docPr.setId(id);
/* This name is not visible in Word 2010 anywhere. */
docPr.setName("Drawing " + id);

View File

@ -0,0 +1,110 @@
/* ====================================================================
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==================================================================== */
package org.apache.poi.xwpf.usermodel;
import java.util.List;
import org.apache.poi.POIXMLDocumentPart;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTSdtBlock;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTSdtPr;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTSdtRun;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTString;
/**
* Experimental class to offer rudimentary read-only processing of
* of StructuredDocumentTags/ContentControl
*
*
*
* WARNING - APIs expected to change rapidly
*
*/
public class XWPFSDT implements IBodyElement, IRunBody, ISDTContents, IRunElement {
private final String title;
private final String tag;
private final XWPFSDTContent content;
private final IBody part;
public XWPFSDT(CTSdtRun sdtRun, IBody part){
this.part = part;
this.content = new XWPFSDTContent(sdtRun.getSdtContent(), part, this);
CTSdtPr pr = sdtRun.getSdtPr();
List<CTString> aliases = pr.getAliasList();
if (aliases != null && aliases.size() > 0){
title = aliases.get(0).getVal();
} else {
title = "";
}
@SuppressWarnings("deprecation")
CTString[] array = pr.getTagArray();
if (array != null && array.length > 0){
tag = array[0].getVal();
} else {
tag = "";
}
}
public XWPFSDT(CTSdtBlock block, IBody part){
this.part = part;
this.content = new XWPFSDTContent( block.getSdtContent(), part, this);
CTSdtPr pr = block.getSdtPr();
List<CTString> aliases = pr.getAliasList();
if (aliases != null && aliases.size() > 0){
title = aliases.get(0).getVal();
} else {
title = "";
}
@SuppressWarnings("deprecation")
CTString[] array = pr.getTagArray();
if (array != null && array.length > 0){
tag = array[0].getVal();
} else {
tag = "";
}
}
public String getTitle(){
return title;
}
public String getTag(){
return tag;
}
public XWPFSDTContent getContent(){
return content;
}
public IBody getBody() {
// TODO Auto-generated method stub
return null;
}
public POIXMLDocumentPart getPart() {
return part.getPart();
}
public BodyType getPartType() {
return BodyType.CONTENTCONTROL;
}
public BodyElementType getElementType() {
return BodyElementType.CONTENTCONTROL;
}
public XWPFDocument getDocument() {
return part.getXWPFDocument();
}
}

View File

@ -0,0 +1,107 @@
/* ====================================================================
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==================================================================== */
package org.apache.poi.xwpf.usermodel;
import java.util.ArrayList;
import java.util.List;
import org.apache.xmlbeans.XmlCursor;
import org.apache.xmlbeans.XmlObject;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTP;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTR;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTSdtBlock;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTSdtContentBlock;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTSdtContentRun;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTTbl;
/**
* Experimental class to offer rudimentary read-only processing of
* of the contentblock of an SDT/ContentControl.
*
*
*
* WARNING - APIs expected to change rapidly
*
*/
public class XWPFSDTContent {
// private final IBody part;
// private final XWPFDocument document;
private List<XWPFParagraph> paragraphs = new ArrayList<XWPFParagraph>();
private List<XWPFTable> tables = new ArrayList<XWPFTable>();
private List<XWPFRun> runs = new ArrayList<XWPFRun>();
private List<XWPFSDT> contentControls = new ArrayList<XWPFSDT>();
private List<ISDTContents> bodyElements = new ArrayList<ISDTContents>();
public XWPFSDTContent(CTSdtContentRun sdtRun, IBody part, IRunBody parent){
for (CTR ctr : sdtRun.getRList()){
XWPFRun run = new XWPFRun((CTR) ctr, parent);
runs.add(run);
bodyElements.add(run);
}
}
public XWPFSDTContent(CTSdtContentBlock block, IBody part, IRunBody parent){
XmlCursor cursor = block.newCursor();
cursor.selectPath("./*");
while (cursor.toNextSelection()) {
XmlObject o = cursor.getObject();
if (o instanceof CTP) {
XWPFParagraph p = new XWPFParagraph((CTP) o, part);
bodyElements.add(p);
paragraphs.add(p);
} else if (o instanceof CTTbl) {
XWPFTable t = new XWPFTable((CTTbl) o, part);
bodyElements.add(t);
tables.add(t);
} else if (o instanceof CTSdtBlock){
XWPFSDT c = new XWPFSDT(((CTSdtBlock)o), part);
bodyElements.add(c);
contentControls.add(c);
} else if (o instanceof CTR) {
XWPFRun run = new XWPFRun((CTR) o, parent);
runs.add(run);
bodyElements.add(run);
}
}
}
public String getText(){
StringBuilder text = new StringBuilder();
for (int i = 0; i < bodyElements.size(); i++){
Object o = bodyElements.get(i);
if (o instanceof XWPFParagraph){
text.append(((XWPFParagraph)o).getText());
} else if (o instanceof XWPFTable){
text.append(((XWPFTable)o).getText());
} else if (o instanceof XWPFSDT){
text.append(((XWPFSDT)o).getContent().getText());
} else if (o instanceof XWPFRun){
text.append(((XWPFRun)o).toString());
}
if (i < bodyElements.size()-1){
text.append("\n");
}
}
return text.toString();
}
public String toString(){
return getText();
}
}

View File

@ -43,7 +43,7 @@ import org.openxmlformats.schemas.wordprocessingml.x2006.main.STTblWidth;
* <p>Specifies the contents of a table present in the document. A table is a set
* of paragraphs (and other block-level content) arranged in rows and columns.</p>
*/
public class XWPFTable implements IBodyElement {
public class XWPFTable implements IBodyElement, ISDTContents {
protected StringBuffer text = new StringBuffer();
private CTTbl ctTbl;
protected List<XWPFTableRow> tableRows;

View File

@ -28,6 +28,8 @@ import org.apache.xmlbeans.XmlCursor;
import org.apache.xmlbeans.XmlObject;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTP;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTRow;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTSdtBlock;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTSdtRun;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTShd;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTTbl;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTTc;
@ -97,6 +99,15 @@ public class XWPFTableCell implements IBody {
tables.add(t);
bodyElements.add(t);
}
if (o instanceof CTSdtBlock){
XWPFSDT c = new XWPFSDT((CTSdtBlock)o, this);
bodyElements.add(c);
}
if (o instanceof CTSdtRun){
XWPFSDT c = new XWPFSDT((CTSdtRun)o, this);
System.out.println(c.getContent().getText());
bodyElements.add(c);
}
}
cursor.dispose();
}
@ -407,6 +418,48 @@ public class XWPFTableCell implements IBody {
return text.toString();
}
/**
* extracts all text recursively through embedded tables and embedded SDTs
*/
public String getTextRecursively(){
StringBuffer text = new StringBuffer();
for (int i = 0; i < bodyElements.size(); i++){
boolean isLast = (i== bodyElements.size()-1)? true : false;
appendBodyElementText(text, bodyElements.get(i), isLast);
}
return text.toString();
}
private void appendBodyElementText(StringBuffer text, IBodyElement e, boolean isLast){
if (e instanceof XWPFParagraph){
text.append(((XWPFParagraph)e).getText());
if (isLast == false){
text.append('\t');
}
} else if (e instanceof XWPFTable){
XWPFTable eTable = (XWPFTable)e;
for (XWPFTableRow row : eTable.getRows()){
for (XWPFTableCell cell : row.getTableCells()){
List<IBodyElement> localBodyElements = cell.getBodyElements();
for (int i = 0; i < localBodyElements.size(); i++){
boolean localIsLast = (i== localBodyElements.size()-1)? true : false;
appendBodyElementText(text, localBodyElements.get(i), localIsLast);
}
}
}
if (isLast == false){
text.append('\n');
}
} else if (e instanceof XWPFSDT){
text.append(((XWPFSDT)e).getContent().getText());
if (isLast == false){
text.append('\t');
}
}
}
/**
* get the TableCell which belongs to the TableCell

View File

@ -132,6 +132,9 @@ public class XWPFTableRow {
for (CTTc tableCell : ctRow.getTcList()) {
cells.add(new XWPFTableCell(tableCell, this, table.getBody()));
}
//TODO: it is possible to have an SDT that contains a cell in within a row
//need to modify this code so that it pulls out SDT wrappers around cells, too.
this.tableCells = cells;
}
return tableCells;

View File

@ -80,10 +80,10 @@ public class TestXWPFWordExtractor extends TestCase {
" \n(V) ILLUSTRATIVE CASES\n\n"
));
assertTrue(text.contains(
"As well as gaining " + euro + "90 from child benefit increases, he will also receive the early childhood supplement of " + euro + "250 per quarter for Vincent for the full four quarters of the year.\n\n\n\n \n\n\n"
"As well as gaining " + euro + "90 from child benefit increases, he will also receive the early childhood supplement of " + euro + "250 per quarter for Vincent for the full four quarters of the year.\n\n\n\n"// \n\n\n"
));
assertTrue(text.endsWith(
"11.4%\t\t90\t\t\t\t\t250\t\t1,310\t\n\n"
"11.4%\t\t90\t\t\t\t\t250\t\t1,310\t\n\n \n\n\n"
));
// Check number of paragraphs
@ -317,4 +317,39 @@ public class TestXWPFWordExtractor extends TestCase {
extractor.close();
}
/**
* Test for basic extraction of SDT content
* @throws IOException
*/
public void testSimpleControlContent() throws IOException {
XWPFDocument doc = XWPFTestDataSamples.openSampleDocument("Bug54849.docx");
String[] targs = new String[]{
"header_rich_text",
"rich_text",
"rich_text_pre_table\nrich_text_cell1\t\t\t\n\nrich_text_post_table",
"plain_text_no_newlines",
"plain_text_with_newlines1\nplain_text_with_newlines2\n",
"watermelon\n",
"dirt\n",
"4/16/2013\n",
"rich_text_in_paragraph_in_cell",
"footer_rich_text",
"footnote_sdt",
"endnote_sdt"
};
XWPFWordExtractor ex = new XWPFWordExtractor(doc);
String s = ex.getText().toLowerCase();
int hits = 0;
for (String targ : targs){
boolean hit = false;
if (s.indexOf(targ) > -1){
hit = true;
hits++;
}
assertEquals("controlled content loading-"+targ, true, hit);
}
assertEquals("controlled content loading hit count", targs.length, hits);
}
}

View File

@ -0,0 +1,158 @@
/* ====================================================================
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==================================================================== */
package org.apache.poi.xwpf.usermodel;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import junit.framework.TestCase;
import org.apache.poi.xwpf.XWPFTestDataSamples;
public final class TestXWPFSDT extends TestCase {
/**
* Test simple tag and title extraction from SDT
* @throws Exception
*/
public void testTagTitle() throws Exception {
XWPFDocument doc = XWPFTestDataSamples.openSampleDocument("Bug54849.docx");
String tag = null;
String title= null;
List<XWPFSDT> sdts = extractAllSDTs(doc);
for (XWPFSDT sdt :sdts){
if (sdt.getContent().toString().equals("Rich_text")){
tag = "MyTag";
title = "MyTitle";
break;
}
}
// TODO Fix footnotes issues then enable
// assertEquals("controls size", 12, sdts.size());
assertEquals("tag", "MyTag", tag);
assertEquals("title", "MyTitle", title);
}
public void testGetSDTs() throws Exception{
String[] contents = new String[]{
"header_rich_text",
"Rich_text",
"Rich_text_pre_table\nRich_text_cell1\t\t\t\n\nRich_text_post_table",
"Plain_text_no_newlines",
"Plain_text_with_newlines1\nplain_text_with_newlines2",
"Watermelon",
"Dirt",
"4/16/2013",
"rich_text_in_paragraph_in_cell",
"Footer_rich_text",
"Footnote_sdt",
"Endnote_sdt"
};
XWPFDocument doc = XWPFTestDataSamples.openSampleDocument("Bug54849.docx");
List<XWPFSDT> sdts = extractAllSDTs(doc);
// TODO Fix footnotes issue
/*
assertEquals("number of sdts", contents.length, sdts.size());
for (int i = 0; i < sdts.size(); i++){//contents.length; i++){
XWPFSDT sdt = sdts.get(i);
assertEquals(i+ ": " + contents[i], contents[i], sdt.getContent().toString());
}
*/
}
public void testFailureToGetSDTAsCell() throws Exception{
/**
* The current code fails to extract an sdt if it comprises/is the parent
* of a cell in a table.
*/
XWPFDocument doc = XWPFTestDataSamples.openSampleDocument("Bug54849.docx");
List<XWPFSDT> sdts = extractAllSDTs(doc);
boolean found = false;
for (XWPFSDT sdt : sdts){
if (sdt.getContent().getText().toLowerCase().indexOf("rich_text_in_cell") > -1){
found = true;
}
}
assertEquals("SDT as cell known failure", false, found);
}
private List<XWPFSDT> extractAllSDTs(XWPFDocument doc){
List<XWPFSDT> sdts = new ArrayList<XWPFSDT>();
List<XWPFHeader> headers = doc.getHeaderList();
for (XWPFHeader header : headers){
sdts.addAll(extractSDTsFromBodyElements(header.getBodyElements()));
}
sdts.addAll(extractSDTsFromBodyElements(doc.getBodyElements()));
List<XWPFFooter> footers = doc.getFooterList();
for (XWPFFooter footer : footers){
sdts.addAll(extractSDTsFromBodyElements(footer.getBodyElements()));
}
for (XWPFFootnote footnote : doc.getFootnotes()){
sdts.addAll(extractSDTsFromBodyElements(footnote.getBodyElements()));
}
for (Map.Entry<Integer, XWPFFootnote> e : doc.endnotes.entrySet()){
sdts.addAll(extractSDTsFromBodyElements(e.getValue().getBodyElements()));
}
return sdts;
}
private List<XWPFSDT> extractSDTsFromBodyElements(List<IBodyElement> elements){
List<XWPFSDT> sdts = new ArrayList<XWPFSDT>();
for (IBodyElement e : elements){
if (e instanceof XWPFSDT){
XWPFSDT sdt = (XWPFSDT)e;
sdts.add(sdt);
} else if (e instanceof XWPFParagraph){
XWPFParagraph p = (XWPFParagraph)e;
for (IRunElement e2 : p.getIRuns()){
if (e2 instanceof XWPFSDT){
XWPFSDT sdt = (XWPFSDT)e2;
sdts.add(sdt);
}
}
} else if (e instanceof XWPFTable){
XWPFTable table = (XWPFTable)e;
sdts.addAll(extractSDTsFromTable(table));
}
}
return sdts;
}
private List<XWPFSDT> extractSDTsFromTable(XWPFTable table){
List<XWPFSDT> sdts = new ArrayList<XWPFSDT>();
for (XWPFTableRow r : table.getRows()){
for (XWPFTableCell c : r.getTableCells()){
sdts.addAll(extractSDTsFromBodyElements(c.getBodyElements()));
}
}
return sdts;
}
}