XWPF paragraph improvements - Make XWPFParagraph rely more on XWPFRun and less on internal StringBuffers. Also improve handling of hyperlinks inside XWPFParagraph objects through XWPFHyperlinkRun

git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@996899 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Nick Burch 2010-09-14 13:46:22 +00:00
parent 77f5742c75
commit 7fad16fd1b
10 changed files with 224 additions and 120 deletions

View File

@ -34,6 +34,8 @@
<changes>
<release version="3.7-beta3" date="2010-??-??">
<action dev="poi-developers" type="fix">Improve handling of Hyperlinks inside XWPFParagraph objects through XWPFHyperlinkRun</action>
<action dev="poi-developers" type="fix">Make XWPFParagraph make more use of XWPFRun, and less on internal StringBuffers</action>
<action dev="poi-developers" type="add">Add a getBodyElements() method to XWPF IBody, to make access to embedded paragraphs and tables easier</action>
<action dev="poi-developers" type="add">More XSLFRelation entries for common .pptx file parts</action>
<action dev="poi-developers" type="fix">49872 - avoid exception in XSSFFormulaEvaluator.evaluateInCell when evaluating shared formulas</action>

View File

@ -29,8 +29,11 @@ import org.apache.poi.xwpf.model.XWPFHeaderFooterPolicy;
import org.apache.poi.xwpf.model.XWPFHyperlinkDecorator;
import org.apache.poi.xwpf.model.XWPFParagraphDecorator;
import org.apache.poi.xwpf.usermodel.XWPFDocument;
import org.apache.poi.xwpf.usermodel.XWPFHyperlink;
import org.apache.poi.xwpf.usermodel.XWPFHyperlinkRun;
import org.apache.poi.xwpf.usermodel.XWPFParagraph;
import org.apache.poi.xwpf.usermodel.XWPFRelation;
import org.apache.poi.xwpf.usermodel.XWPFRun;
import org.apache.poi.xwpf.usermodel.XWPFTable;
import org.apache.xmlbeans.XmlException;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTSectPr;
@ -103,9 +106,28 @@ public class XWPFWordExtractor extends POIXMLTextExtractor {
extractHeaders(text, headerFooterPolicy);
}
XWPFParagraphDecorator decorator = new XWPFCommentsDecorator(
new XWPFHyperlinkDecorator(paragraph, null, fetchHyperlinks));
text.append(decorator.getText()).append('\n');
// Do the paragraph text
for(XWPFRun run : paragraph.getRuns()) {
text.append(run.toString());
if(run instanceof XWPFHyperlinkRun && fetchHyperlinks) {
XWPFHyperlink link = ((XWPFHyperlinkRun)run).getHyperlink(document);
if(link != null)
text.append(" <" + link.getURL() + ">");
}
}
// Add comments
XWPFCommentsDecorator decorator = new XWPFCommentsDecorator(paragraph, null);
text.append(decorator.getCommentText()).append('\n');
// Do endnotes, footnotes and pictures
for(String str : new String[] {
paragraph.getFootnoteText(), paragraph.getPictureText()
}) {
if(str != null && str.length() > 0) {
text.append(str + "\n");
}
}
if (ctSectPr!=null) {
extractFooters(text, headerFooterPolicy);

View File

@ -46,6 +46,10 @@ public class XWPFCommentsDecorator extends XWPFParagraphDecorator {
}
}
/**
 * Returns just the contents of the comment text buffer as a String.
 * Unlike {@link #getText()}, nothing from the wrapped decorator
 * (super.getText()) is included.
 */
public String getCommentText() {
    final String commentsOnly = commentText.toString();
    return commentsOnly;
}
/**
 * Returns the text produced by the parent decorator, followed
 * by the contents of the comment text buffer.
 */
public String getText() {
    // Fetch the parent text first, then tack the comments on the end
    final String decoratedText = super.getText();
    return decoratedText + commentText;
}

View File

@ -19,15 +19,18 @@ package org.apache.poi.xwpf.model;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTHyperlink;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTR;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTText;
import org.apache.poi.xwpf.usermodel.XWPFHyperlinkRun;
import org.apache.poi.xwpf.usermodel.XWPFParagraph;
/**
* Decorator class for XWPFParagraph allowing to add hyperlinks
* found in paragraph to its text.
*
* TODO - add the hyperlink text in the right place, and not just
* at the end
* Note - adds the hyperlink at the end, not in the right place...
*
* @deprecated Use {@link XWPFHyperlinkRun} instead
*/
@Deprecated
public class XWPFHyperlinkDecorator extends XWPFParagraphDecorator {
private StringBuffer hyperlinkText;

View File

@ -0,0 +1,64 @@
/* ====================================================================
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==================================================================== */
package org.apache.poi.xwpf.usermodel;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTHyperlink;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTR;
/**
 * A run of text with a Hyperlink applied to it.
 * Any given Hyperlink may be made up of multiple of these.
 */
public class XWPFHyperlinkRun extends XWPFRun
{
    /** The hyperlink bean this run was created with. Never reassigned. */
    private final CTHyperlink hyperlink;

    /**
     * Creates a run that is associated with a hyperlink.
     *
     * @param hyperlink the hyperlink bean associated with this run
     * @param run the underlying run bean, handed on to {@link XWPFRun}
     * @param p the paragraph, handed on to {@link XWPFRun}
     */
    public XWPFHyperlinkRun(CTHyperlink hyperlink, CTR run, XWPFParagraph p) {
        super(run, p);
        this.hyperlink = hyperlink;
    }

    /**
     * Returns the underlying hyperlink bean, exactly as it was
     * passed to the constructor.
     */
    public CTHyperlink getCTHyperlink() {
        return hyperlink;
    }

    /**
     * Returns the anchor attribute of the underlying hyperlink bean.
     * NOTE(review): presumably this names a target inside the
     *  document rather than an external one - confirm against the spec.
     */
    public String getAnchor() {
        return hyperlink.getAnchor();
    }

    /**
     * Returns the ID of the hyperlink, if one is set.
     * {@link #getHyperlink(XWPFDocument)} treats a null ID as
     *  meaning there is nothing to look up.
     */
    public String getHyperlinkId() {
        return hyperlink.getId();
    }

    /**
     * Sets the ID on the underlying hyperlink bean.
     *
     * @param id the new hyperlink ID
     */
    public void setHyperlinkId(String id) {
        hyperlink.setId(id);
    }

    /**
     * If this Hyperlink is an external reference hyperlink,
     *  return the object for it.
     *
     * @param document the document whose hyperlinks are searched by ID
     * @return null if this run has no hyperlink ID, otherwise whatever
     *  {@link XWPFDocument#getHyperlinkByID(String)} returns for that ID
     */
    public XWPFHyperlink getHyperlink(XWPFDocument document) {
        String id = getHyperlinkId();
        if (id == null) {
            return null;
        }
        return document.getHyperlinkByID(id);
    }
}

View File

@ -20,21 +20,19 @@ import java.math.BigInteger;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Arrays;
import org.apache.poi.util.Internal;
import org.apache.xmlbeans.XmlCursor;
import org.apache.xmlbeans.XmlObject;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTBorder;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTEmpty;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTFtnEdnRef;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTHyperlink;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTInd;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTJc;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTOnOff;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTP;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTPBdr;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTPPr;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTPTab;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTPicture;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTProofErr;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTR;
@ -66,10 +64,6 @@ public class XWPFParagraph implements IBodyElement{
protected XWPFDocument document;
protected List<XWPFRun> runs;
/**
* TODO - replace with RichText String
*/
private StringBuffer text = new StringBuffer();
private StringBuffer pictureText = new StringBuffer();
private StringBuffer footnoteText = new StringBuffer();
@ -91,79 +85,53 @@ public class XWPFParagraph implements IBodyElement{
}
runs = new ArrayList<XWPFRun>();
if (prgrph.getRList().size() > 0) {
for(CTR ctRun : prgrph.getRList()) {
runs.add(new XWPFRun(ctRun, this));
}
}
if (!isEmpty()) {
readNewText();
}
}
protected String readNewText() {
StringBuffer text = new StringBuffer();
// All the runs to loop over
// TODO - replace this with some sort of XPath expression
// to directly find all the CTRs, in the right order
ArrayList<CTR> rs = new ArrayList<CTR>();
rs.addAll( paragraph.getRList() );
for (CTSdtRun sdt : paragraph.getSdtList()) {
CTSdtContentRun run = sdt.getSdtContent();
rs.addAll( run.getRList() );
}
for (CTRunTrackChange c : paragraph.getDelList()) {
rs.addAll( c.getRList() );
}
for (CTRunTrackChange c : paragraph.getInsList()) {
rs.addAll( c.getRList() );
}
for (CTSimpleField f : paragraph.getFldSimpleList()) {
rs.addAll( f.getRList() );
}
// Get text of the paragraph
for (int j = 0; j < rs.size(); j++) {
// Grab the text and tabs of the paragraph
// Do so in a way that preserves the ordering
XmlCursor c = rs.get(j).newCursor();
c.selectPath("./*");
// Get all our child nodes in order, and process them
// into XWPFRuns where we can
XmlCursor c = paragraph.newCursor();
c.selectPath("child::*");
while (c.toNextSelection()) {
XmlObject o = c.getObject();
if (o instanceof CTText) {
String tagName = o.getDomNode().getNodeName();
// Field Codes (w:instrText, defined in spec sec. 17.16.23)
// come up as instances of CTText, but we don't want them
// in the normal text output
if (!"w:instrText".equals(tagName)) {
text.append(((CTText) o).getStringValue());
if(o instanceof CTR) {
runs.add(new XWPFRun((CTR)o, this));
}
if(o instanceof CTHyperlink) {
CTHyperlink link = (CTHyperlink)o;
for(CTR r : link.getRList()) {
runs.add(new XWPFHyperlinkRun(link, r, this));
}
}
if (o instanceof CTPTab) {
text.append("\t");
}
if (o instanceof CTEmpty) {
// Some inline text elements get returned not as
// themselves, but as CTEmpty, owing to some odd
// definitions around line 5642 of the XSDs
String tagName = o.getDomNode().getNodeName();
if ("w:tab".equals(tagName)) {
text.append("\t");
}
if ("w:cr".equals(tagName)) {
text.append("\n");
if(o instanceof CTSdtRun) {
CTSdtContentRun run = ((CTSdtRun)o).getSdtContent();
for(CTR r : run.getRList()) {
runs.add(new XWPFRun(r, this));
}
}
if(o instanceof CTRunTrackChange) {
for(CTR r : ((CTRunTrackChange)o).getRList()) {
runs.add(new XWPFRun(r, this));
}
}
if(o instanceof CTSimpleField) {
for(CTR r : ((CTSimpleField)o).getRList()) {
runs.add(new XWPFRun(r, this));
}
}
}
// Look for bits associated with the runs
for(XWPFRun run : runs) {
CTR r = run.getCTR();
// Check for bits that only apply when
// attached to a core document
if(document != null) {
//got a reference to a footnote
if (o instanceof CTFtnEdnRef) {
CTFtnEdnRef ftn = (CTFtnEdnRef) o;
c = r.newCursor();
c.selectPath("child::*");
while (c.toNextSelection()) {
XmlObject o = c.getObject();
if(o instanceof CTFtnEdnRef) {
CTFtnEdnRef ftn = (CTFtnEdnRef)o;
footnoteText.append("[").append(ftn.getId()).append(": ");
XWPFFootnote footnote =
ftn.getDomNode().getLocalName().equals("footnoteReference") ?
@ -186,7 +154,7 @@ public class XWPFParagraph implements IBodyElement{
// Loop over pictures inside our
// paragraph, looking for text in them
for(CTPicture pict : rs.get(j).getPictList()) {
for(CTPicture pict : r.getPictList()) {
XmlObject[] t = pict
.selectPath("declare namespace w='http://schemas.openxmlformats.org/wordprocessingml/2006/main' .//w:t");
for (int m = 0; m < t.length; m++) {
@ -200,9 +168,6 @@ public class XWPFParagraph implements IBodyElement{
}
}
}
this.text = text;
return text.toString();
}
@Internal
@ -228,7 +193,10 @@ public class XWPFParagraph implements IBodyElement{
*/
public String getText() {
StringBuffer out = new StringBuffer();
out.append(text).append(footnoteText).append(pictureText);
for(XWPFRun run : runs) {
out.append(run.toString());
}
out.append(footnoteText).append(pictureText);
return out.toString();
}
@ -282,7 +250,11 @@ public class XWPFParagraph implements IBodyElement{
* paragraph
*/
public String getParagraphText() {
return text.toString();
StringBuffer out = new StringBuffer();
for(XWPFRun run : runs) {
out.append(run.toString());
}
return out.toString();
}
/**
@ -1143,9 +1115,6 @@ public class XWPFParagraph implements IBodyElement{
pos = paragraph.getRList().size();
paragraph.addNewR();
paragraph.setRArray(pos, run);
for (CTText ctText: paragraph.getRArray(pos).getTList()) {
this.text.append(ctText.getStringValue());
}
}
/**

View File

@ -19,12 +19,15 @@ package org.apache.poi.xwpf.usermodel;
import java.math.BigInteger;
import org.apache.poi.util.Internal;
import org.apache.xmlbeans.XmlObject;
import org.apache.xmlbeans.XmlString;
import org.apache.xmlbeans.XmlCursor;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTBr;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTEmpty;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTFonts;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTHpsMeasure;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTOnOff;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTPTab;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTR;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTRPr;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTSignedHpsMeasure;
@ -492,4 +495,45 @@ public class XWPFRun {
}
}
/**
 * Returns the string version of the text, with tabs and
 * carriage returns in place of their xml equivalents.
 *
 * The children of the run are visited in document order, so
 *  that text, tabs and carriage returns come out in the same
 *  sequence as they appear in the xml.
 */
@Override
public String toString() {
    // Purely local, so the unsynchronised builder is enough
    StringBuilder text = new StringBuilder();

    // Grab the text and tabs of the text run
    // Do so in a way that preserves the ordering
    XmlCursor c = run.newCursor();
    try {
        c.selectPath("./*");
        while (c.toNextSelection()) {
            XmlObject o = c.getObject();
            if (o instanceof CTText) {
                String tagName = o.getDomNode().getNodeName();
                // Field Codes (w:instrText, defined in spec sec. 17.16.23)
                //  come up as instances of CTText, but we don't want them
                //  in the normal text output
                if (!"w:instrText".equals(tagName)) {
                    text.append(((CTText) o).getStringValue());
                }
            }

            if (o instanceof CTPTab) {
                text.append("\t");
            }

            if (o instanceof CTEmpty) {
                // Some inline text elements get returned not as
                //  themselves, but as CTEmpty, owing to some odd
                //  definitions around line 5642 of the XSDs
                String tagName = o.getDomNode().getNodeName();
                if ("w:tab".equals(tagName)) {
                    text.append("\t");
                }
                if ("w:cr".equals(tagName)) {
                    text.append("\n");
                }
            }
        }
    } finally {
        // This method is called once per run by the paragraph text
        //  getters, so always release the cursor rather than leaving
        //  it for the garbage collector
        c.dispose();
    }

    return text.toString();
}
}

View File

@ -327,7 +327,7 @@ public class XWPFTableCell implements IBody {
public String getText(){
StringBuffer text = new StringBuffer();
for (XWPFParagraph p : paragraphs) {
text.append(p.readNewText());
text.append(p.getText());
}
return text.toString();
}

View File

@ -96,22 +96,18 @@ public class TestXWPFWordExtractor extends TestCase {
XWPFWordExtractor extractor = new XWPFWordExtractor(doc);
// Now check contents
// TODO - fix once correctly handling contents
extractor.setFetchHyperlinks(false);
assertEquals(
// "This is a test document\nThis bit is in bold and italic\n" +
// "Back to normal\nWe have a hyperlink here, and another.\n",
"This is a test document\nThis bit is in bold and italic\n" +
"Back to normal\nWe have a here, and .hyperlinkanother\n",
"Back to normal\nWe have a hyperlink here, and another.\n",
extractor.getText()
);
// One hyperlink is a real one, one is just to the top of page
extractor.setFetchHyperlinks(true);
assertEquals(
// "This is a test document\nThis bit is in bold and italic\n" +
// "Back to normal\nWe have a hyperlink here, and another.\n",
"This is a test document\nThis bit is in bold and italic\n" +
"Back to normal\nWe have a here, and .hyperlink <http://poi.apache.org/>another\n",
"Back to normal\nWe have a hyperlink <http://poi.apache.org/> here, and another.\n",
extractor.getText()
);
}

View File

@ -144,7 +144,7 @@ public class TestXWPFHeaderFooterPolicy extends TestCase {
policy = oddEven.getHeaderFooterPolicy();
assertEquals(
"[]ODD Page Header text\n\n",
"[ODD Page Header text]\n\n",
policy.getDefaultHeader().getText()
);
assertEquals(