XWPF paragraph improvements - Make XWPFParagraph rely more on XWPFRun and less on internal StringBuffers. Also improve the handling of hyperlinks inside XWPFParagraph objects through XWPFHyperlinkRun
git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@996899 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
77f5742c75
commit
7fad16fd1b
@ -34,6 +34,8 @@
|
||||
|
||||
<changes>
|
||||
<release version="3.7-beta3" date="2010-??-??">
|
||||
<action dev="poi-developers" type="fix">Improve handling of Hyperlinks inside XWPFParagraph objects through XWPFHyperlinkRun</action>
|
||||
<action dev="poi-developers" type="fix">Make XWPFParagraph rely more on XWPFRun and less on internal StringBuffers</action>
|
||||
<action dev="poi-developers" type="add">Add a getBodyElements() method to XWPF IBody, to make access to embedded paragraphs and tables easier</action>
|
||||
<action dev="poi-developers" type="add">More XSLFRelation entries for common .pptx file parts</action>
|
||||
<action dev="poi-developers" type="fix">49872 - avoid exception in XSSFFormulaEvaluator.evaluateInCell when evaluating shared formulas</action>
|
||||
|
@ -29,8 +29,11 @@ import org.apache.poi.xwpf.model.XWPFHeaderFooterPolicy;
|
||||
import org.apache.poi.xwpf.model.XWPFHyperlinkDecorator;
|
||||
import org.apache.poi.xwpf.model.XWPFParagraphDecorator;
|
||||
import org.apache.poi.xwpf.usermodel.XWPFDocument;
|
||||
import org.apache.poi.xwpf.usermodel.XWPFHyperlink;
|
||||
import org.apache.poi.xwpf.usermodel.XWPFHyperlinkRun;
|
||||
import org.apache.poi.xwpf.usermodel.XWPFParagraph;
|
||||
import org.apache.poi.xwpf.usermodel.XWPFRelation;
|
||||
import org.apache.poi.xwpf.usermodel.XWPFRun;
|
||||
import org.apache.poi.xwpf.usermodel.XWPFTable;
|
||||
import org.apache.xmlbeans.XmlException;
|
||||
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTSectPr;
|
||||
@ -103,9 +106,28 @@ public class XWPFWordExtractor extends POIXMLTextExtractor {
|
||||
extractHeaders(text, headerFooterPolicy);
|
||||
}
|
||||
|
||||
XWPFParagraphDecorator decorator = new XWPFCommentsDecorator(
|
||||
new XWPFHyperlinkDecorator(paragraph, null, fetchHyperlinks));
|
||||
text.append(decorator.getText()).append('\n');
|
||||
// Do the paragraph text
|
||||
for(XWPFRun run : paragraph.getRuns()) {
|
||||
text.append(run.toString());
|
||||
if(run instanceof XWPFHyperlinkRun && fetchHyperlinks) {
|
||||
XWPFHyperlink link = ((XWPFHyperlinkRun)run).getHyperlink(document);
|
||||
if(link != null)
|
||||
text.append(" <" + link.getURL() + ">");
|
||||
}
|
||||
}
|
||||
|
||||
// Add comments
|
||||
XWPFCommentsDecorator decorator = new XWPFCommentsDecorator(paragraph, null);
|
||||
text.append(decorator.getCommentText()).append('\n');
|
||||
|
||||
// Do endnotes, footnotes and pictures
|
||||
for(String str : new String[] {
|
||||
paragraph.getFootnoteText(), paragraph.getPictureText()
|
||||
}) {
|
||||
if(str != null && str.length() > 0) {
|
||||
text.append(str + "\n");
|
||||
}
|
||||
}
|
||||
|
||||
if (ctSectPr!=null) {
|
||||
extractFooters(text, headerFooterPolicy);
|
||||
|
@ -46,6 +46,10 @@ public class XWPFCommentsDecorator extends XWPFParagraphDecorator {
|
||||
}
|
||||
}
|
||||
|
||||
public String getCommentText() {
|
||||
return commentText.toString();
|
||||
}
|
||||
|
||||
public String getText() {
|
||||
return super.getText() + commentText;
|
||||
}
|
||||
|
@ -19,15 +19,18 @@ package org.apache.poi.xwpf.model;
|
||||
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTHyperlink;
|
||||
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTR;
|
||||
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTText;
|
||||
import org.apache.poi.xwpf.usermodel.XWPFHyperlinkRun;
|
||||
import org.apache.poi.xwpf.usermodel.XWPFParagraph;
|
||||
|
||||
/**
|
||||
* Decorator class for XWPFParagraph allowing to add hyperlinks
|
||||
* found in paragraph to its text.
|
||||
*
|
||||
* TODO - add the hyperlink text in the right place, and not just
|
||||
* at the end
|
||||
* Note - adds the hyperlink at the end, not in the right place...
|
||||
*
|
||||
* @deprecated Use {@link XWPFHyperlinkRun} instead
|
||||
*/
|
||||
@Deprecated
|
||||
public class XWPFHyperlinkDecorator extends XWPFParagraphDecorator {
|
||||
private StringBuffer hyperlinkText;
|
||||
|
||||
|
@ -0,0 +1,64 @@
|
||||
/* ====================================================================
|
||||
Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
contributor license agreements. See the NOTICE file distributed with
|
||||
this work for additional information regarding copyright ownership.
|
||||
The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
(the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
==================================================================== */
|
||||
package org.apache.poi.xwpf.usermodel;
|
||||
|
||||
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTHyperlink;
|
||||
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTR;
|
||||
|
||||
/**
|
||||
* A run of text with a Hyperlink applied to it.
|
||||
* Any given Hyperlink may be made up of multiple of these.
|
||||
*/
|
||||
public class XWPFHyperlinkRun extends XWPFRun
|
||||
{
|
||||
private CTHyperlink hyperlink;
|
||||
|
||||
public XWPFHyperlinkRun(CTHyperlink hyperlink, CTR run, XWPFParagraph p) {
|
||||
super(run, p);
|
||||
this.hyperlink = hyperlink;
|
||||
}
|
||||
|
||||
public CTHyperlink getCTHyperlink() {
|
||||
return hyperlink;
|
||||
}
|
||||
|
||||
public String getAnchor() {
|
||||
return hyperlink.getAnchor();
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the ID of the hyperlink, if one is set.
|
||||
*/
|
||||
public String getHyperlinkId() {
|
||||
return hyperlink.getId();
|
||||
}
|
||||
public void setHyperlinkId(String id) {
|
||||
hyperlink.setId(id);
|
||||
}
|
||||
|
||||
/**
|
||||
* If this Hyperlink is an external reference hyperlink,
|
||||
* return the object for it.
|
||||
*/
|
||||
public XWPFHyperlink getHyperlink(XWPFDocument document) {
|
||||
String id = getHyperlinkId();
|
||||
if(id == null)
|
||||
return null;
|
||||
|
||||
return document.getHyperlinkByID(id);
|
||||
}
|
||||
}
|
@ -20,21 +20,19 @@ import java.math.BigInteger;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collections;
|
||||
import java.util.List;
|
||||
import java.util.Arrays;
|
||||
|
||||
import org.apache.poi.util.Internal;
|
||||
import org.apache.xmlbeans.XmlCursor;
|
||||
import org.apache.xmlbeans.XmlObject;
|
||||
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTBorder;
|
||||
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTEmpty;
|
||||
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTFtnEdnRef;
|
||||
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTHyperlink;
|
||||
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTInd;
|
||||
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTJc;
|
||||
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTOnOff;
|
||||
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTP;
|
||||
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTPBdr;
|
||||
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTPPr;
|
||||
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTPTab;
|
||||
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTPicture;
|
||||
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTProofErr;
|
||||
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTR;
|
||||
@ -66,10 +64,6 @@ public class XWPFParagraph implements IBodyElement{
|
||||
protected XWPFDocument document;
|
||||
protected List<XWPFRun> runs;
|
||||
|
||||
/**
|
||||
* TODO - replace with RichText String
|
||||
*/
|
||||
private StringBuffer text = new StringBuffer();
|
||||
private StringBuffer pictureText = new StringBuffer();
|
||||
private StringBuffer footnoteText = new StringBuffer();
|
||||
|
||||
@ -91,79 +85,53 @@ public class XWPFParagraph implements IBodyElement{
|
||||
}
|
||||
|
||||
runs = new ArrayList<XWPFRun>();
|
||||
if (prgrph.getRList().size() > 0) {
|
||||
for(CTR ctRun : prgrph.getRList()) {
|
||||
runs.add(new XWPFRun(ctRun, this));
|
||||
}
|
||||
}
|
||||
|
||||
if (!isEmpty()) {
|
||||
readNewText();
|
||||
}
|
||||
}
|
||||
|
||||
protected String readNewText() {
|
||||
StringBuffer text = new StringBuffer();
|
||||
|
||||
// All the runs to loop over
|
||||
// TODO - replace this with some sort of XPath expression
|
||||
// to directly find all the CTRs, in the right order
|
||||
ArrayList<CTR> rs = new ArrayList<CTR>();
|
||||
rs.addAll( paragraph.getRList() );
|
||||
|
||||
for (CTSdtRun sdt : paragraph.getSdtList()) {
|
||||
CTSdtContentRun run = sdt.getSdtContent();
|
||||
rs.addAll( run.getRList() );
|
||||
}
|
||||
for (CTRunTrackChange c : paragraph.getDelList()) {
|
||||
rs.addAll( c.getRList() );
|
||||
}
|
||||
for (CTRunTrackChange c : paragraph.getInsList()) {
|
||||
rs.addAll( c.getRList() );
|
||||
}
|
||||
for (CTSimpleField f : paragraph.getFldSimpleList()) {
|
||||
rs.addAll( f.getRList() );
|
||||
}
|
||||
|
||||
// Get text of the paragraph
|
||||
for (int j = 0; j < rs.size(); j++) {
|
||||
// Grab the text and tabs of the paragraph
|
||||
// Do so in a way that preserves the ordering
|
||||
XmlCursor c = rs.get(j).newCursor();
|
||||
c.selectPath("./*");
|
||||
// Get all our child nodes in order, and process them
|
||||
// into XWPFRuns where we can
|
||||
XmlCursor c = paragraph.newCursor();
|
||||
c.selectPath("child::*");
|
||||
while (c.toNextSelection()) {
|
||||
XmlObject o = c.getObject();
|
||||
if (o instanceof CTText) {
|
||||
String tagName = o.getDomNode().getNodeName();
|
||||
// Field Codes (w:instrText, defined in spec sec. 17.16.23)
|
||||
// come up as instances of CTText, but we don't want them
|
||||
// in the normal text output
|
||||
if (!"w:instrText".equals(tagName)) {
|
||||
text.append(((CTText) o).getStringValue());
|
||||
if(o instanceof CTR) {
|
||||
runs.add(new XWPFRun((CTR)o, this));
|
||||
}
|
||||
if(o instanceof CTHyperlink) {
|
||||
CTHyperlink link = (CTHyperlink)o;
|
||||
for(CTR r : link.getRList()) {
|
||||
runs.add(new XWPFHyperlinkRun(link, r, this));
|
||||
}
|
||||
}
|
||||
if (o instanceof CTPTab) {
|
||||
text.append("\t");
|
||||
}
|
||||
if (o instanceof CTEmpty) {
|
||||
// Some inline text elements get returned not as
|
||||
// themselves, but as CTEmpty, owing to some odd
|
||||
// definitions around line 5642 of the XSDs
|
||||
String tagName = o.getDomNode().getNodeName();
|
||||
if ("w:tab".equals(tagName)) {
|
||||
text.append("\t");
|
||||
}
|
||||
if ("w:cr".equals(tagName)) {
|
||||
text.append("\n");
|
||||
if(o instanceof CTSdtRun) {
|
||||
CTSdtContentRun run = ((CTSdtRun)o).getSdtContent();
|
||||
for(CTR r : run.getRList()) {
|
||||
runs.add(new XWPFRun(r, this));
|
||||
}
|
||||
}
|
||||
if(o instanceof CTRunTrackChange) {
|
||||
for(CTR r : ((CTRunTrackChange)o).getRList()) {
|
||||
runs.add(new XWPFRun(r, this));
|
||||
}
|
||||
}
|
||||
if(o instanceof CTSimpleField) {
|
||||
for(CTR r : ((CTSimpleField)o).getRList()) {
|
||||
runs.add(new XWPFRun(r, this));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Look for bits associated with the runs
|
||||
for(XWPFRun run : runs) {
|
||||
CTR r = run.getCTR();
|
||||
|
||||
// Check for bits that only apply when
|
||||
// attached to a core document
|
||||
if(document != null) {
|
||||
//got a reference to a footnote
|
||||
if (o instanceof CTFtnEdnRef) {
|
||||
CTFtnEdnRef ftn = (CTFtnEdnRef) o;
|
||||
c = r.newCursor();
|
||||
c.selectPath("child::*");
|
||||
while (c.toNextSelection()) {
|
||||
XmlObject o = c.getObject();
|
||||
if(o instanceof CTFtnEdnRef) {
|
||||
CTFtnEdnRef ftn = (CTFtnEdnRef)o;
|
||||
footnoteText.append("[").append(ftn.getId()).append(": ");
|
||||
XWPFFootnote footnote =
|
||||
ftn.getDomNode().getLocalName().equals("footnoteReference") ?
|
||||
@ -186,7 +154,7 @@ public class XWPFParagraph implements IBodyElement{
|
||||
|
||||
// Loop over pictures inside our
|
||||
// paragraph, looking for text in them
|
||||
for(CTPicture pict : rs.get(j).getPictList()) {
|
||||
for(CTPicture pict : r.getPictList()) {
|
||||
XmlObject[] t = pict
|
||||
.selectPath("declare namespace w='http://schemas.openxmlformats.org/wordprocessingml/2006/main' .//w:t");
|
||||
for (int m = 0; m < t.length; m++) {
|
||||
@ -200,9 +168,6 @@ public class XWPFParagraph implements IBodyElement{
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
this.text = text;
|
||||
return text.toString();
|
||||
}
|
||||
|
||||
@Internal
|
||||
@ -228,7 +193,10 @@ public class XWPFParagraph implements IBodyElement{
|
||||
*/
|
||||
public String getText() {
|
||||
StringBuffer out = new StringBuffer();
|
||||
out.append(text).append(footnoteText).append(pictureText);
|
||||
for(XWPFRun run : runs) {
|
||||
out.append(run.toString());
|
||||
}
|
||||
out.append(footnoteText).append(pictureText);
|
||||
return out.toString();
|
||||
}
|
||||
|
||||
@ -282,7 +250,11 @@ public class XWPFParagraph implements IBodyElement{
|
||||
* paragraph
|
||||
*/
|
||||
public String getParagraphText() {
|
||||
return text.toString();
|
||||
StringBuffer out = new StringBuffer();
|
||||
for(XWPFRun run : runs) {
|
||||
out.append(run.toString());
|
||||
}
|
||||
return out.toString();
|
||||
}
|
||||
|
||||
/**
|
||||
@ -1143,9 +1115,6 @@ public class XWPFParagraph implements IBodyElement{
|
||||
pos = paragraph.getRList().size();
|
||||
paragraph.addNewR();
|
||||
paragraph.setRArray(pos, run);
|
||||
for (CTText ctText: paragraph.getRArray(pos).getTList()) {
|
||||
this.text.append(ctText.getStringValue());
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -19,12 +19,15 @@ package org.apache.poi.xwpf.usermodel;
|
||||
import java.math.BigInteger;
|
||||
|
||||
import org.apache.poi.util.Internal;
|
||||
import org.apache.xmlbeans.XmlObject;
|
||||
import org.apache.xmlbeans.XmlString;
|
||||
import org.apache.xmlbeans.XmlCursor;
|
||||
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTBr;
|
||||
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTEmpty;
|
||||
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTFonts;
|
||||
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTHpsMeasure;
|
||||
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTOnOff;
|
||||
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTPTab;
|
||||
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTR;
|
||||
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTRPr;
|
||||
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTSignedHpsMeasure;
|
||||
@ -492,4 +495,45 @@ public class XWPFRun {
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the string version of the text, with tabs and
|
||||
* carriage returns in place of their xml equivalents.
|
||||
*/
|
||||
public String toString() {
|
||||
StringBuffer text = new StringBuffer();
|
||||
|
||||
// Grab the text and tabs of the text run
|
||||
// Do so in a way that preserves the ordering
|
||||
XmlCursor c = run.newCursor();
|
||||
c.selectPath("./*");
|
||||
while (c.toNextSelection()) {
|
||||
XmlObject o = c.getObject();
|
||||
if (o instanceof CTText) {
|
||||
String tagName = o.getDomNode().getNodeName();
|
||||
// Field Codes (w:instrText, defined in spec sec. 17.16.23)
|
||||
// come up as instances of CTText, but we don't want them
|
||||
// in the normal text output
|
||||
if (!"w:instrText".equals(tagName)) {
|
||||
text.append(((CTText) o).getStringValue());
|
||||
}
|
||||
}
|
||||
if (o instanceof CTPTab) {
|
||||
text.append("\t");
|
||||
}
|
||||
if (o instanceof CTEmpty) {
|
||||
// Some inline text elements get returned not as
|
||||
// themselves, but as CTEmpty, owing to some odd
|
||||
// definitions around line 5642 of the XSDs
|
||||
String tagName = o.getDomNode().getNodeName();
|
||||
if ("w:tab".equals(tagName)) {
|
||||
text.append("\t");
|
||||
}
|
||||
if ("w:cr".equals(tagName)) {
|
||||
text.append("\n");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return text.toString();
|
||||
}
|
||||
}
|
||||
|
@ -327,7 +327,7 @@ public class XWPFTableCell implements IBody {
|
||||
public String getText(){
|
||||
StringBuffer text = new StringBuffer();
|
||||
for (XWPFParagraph p : paragraphs) {
|
||||
text.append(p.readNewText());
|
||||
text.append(p.getText());
|
||||
}
|
||||
return text.toString();
|
||||
}
|
||||
|
@ -96,22 +96,18 @@ public class TestXWPFWordExtractor extends TestCase {
|
||||
XWPFWordExtractor extractor = new XWPFWordExtractor(doc);
|
||||
|
||||
// Now check contents
|
||||
// TODO - fix once correctly handling contents
|
||||
extractor.setFetchHyperlinks(false);
|
||||
assertEquals(
|
||||
// "This is a test document\nThis bit is in bold and italic\n" +
|
||||
// "Back to normal\nWe have a hyperlink here, and another.\n",
|
||||
"This is a test document\nThis bit is in bold and italic\n" +
|
||||
"Back to normal\nWe have a here, and .hyperlinkanother\n",
|
||||
"Back to normal\nWe have a hyperlink here, and another.\n",
|
||||
extractor.getText()
|
||||
);
|
||||
|
||||
// One hyperlink is a real one, one is just to the top of page
|
||||
extractor.setFetchHyperlinks(true);
|
||||
assertEquals(
|
||||
// "This is a test document\nThis bit is in bold and italic\n" +
|
||||
// "Back to normal\nWe have a hyperlink here, and another.\n",
|
||||
"This is a test document\nThis bit is in bold and italic\n" +
|
||||
"Back to normal\nWe have a here, and .hyperlink <http://poi.apache.org/>another\n",
|
||||
"Back to normal\nWe have a hyperlink <http://poi.apache.org/> here, and another.\n",
|
||||
extractor.getText()
|
||||
);
|
||||
}
|
||||
|
@ -144,7 +144,7 @@ public class TestXWPFHeaderFooterPolicy extends TestCase {
|
||||
policy = oddEven.getHeaderFooterPolicy();
|
||||
|
||||
assertEquals(
|
||||
"[]ODD Page Header text\n\n",
|
||||
"[ODD Page Header text]\n\n",
|
||||
policy.getDefaultHeader().getText()
|
||||
);
|
||||
assertEquals(
|
||||
|
Loading…
Reference in New Issue
Block a user