diff --git a/src/documentation/content/xdocs/status.xml b/src/documentation/content/xdocs/status.xml index a0fc2a015..014a08565 100644 --- a/src/documentation/content/xdocs/status.xml +++ b/src/documentation/content/xdocs/status.xml @@ -34,6 +34,8 @@ + Improve handling of Hyperlinks inside XWPFParagraph objects through XWPFHyperlinkRun + Make XWPFParagraph make more use of XWPFRun, and less on internal StringBuffers Add a getBodyElements() method to XWPF IBody, to make access to embedded paragraphs and tables easier More XSLFRelation entries for common .pptx file parts 49872 - avoid exception in XSSFFormulaEvaluator.evaluateInCell when evaluating shared formulas diff --git a/src/ooxml/java/org/apache/poi/xwpf/extractor/XWPFWordExtractor.java b/src/ooxml/java/org/apache/poi/xwpf/extractor/XWPFWordExtractor.java index 9c159956d..c990776da 100644 --- a/src/ooxml/java/org/apache/poi/xwpf/extractor/XWPFWordExtractor.java +++ b/src/ooxml/java/org/apache/poi/xwpf/extractor/XWPFWordExtractor.java @@ -29,8 +29,11 @@ import org.apache.poi.xwpf.model.XWPFHeaderFooterPolicy; import org.apache.poi.xwpf.model.XWPFHyperlinkDecorator; import org.apache.poi.xwpf.model.XWPFParagraphDecorator; import org.apache.poi.xwpf.usermodel.XWPFDocument; +import org.apache.poi.xwpf.usermodel.XWPFHyperlink; +import org.apache.poi.xwpf.usermodel.XWPFHyperlinkRun; import org.apache.poi.xwpf.usermodel.XWPFParagraph; import org.apache.poi.xwpf.usermodel.XWPFRelation; +import org.apache.poi.xwpf.usermodel.XWPFRun; import org.apache.poi.xwpf.usermodel.XWPFTable; import org.apache.xmlbeans.XmlException; import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTSectPr; @@ -103,9 +106,28 @@ public class XWPFWordExtractor extends POIXMLTextExtractor { extractHeaders(text, headerFooterPolicy); } - XWPFParagraphDecorator decorator = new XWPFCommentsDecorator( - new XWPFHyperlinkDecorator(paragraph, null, fetchHyperlinks)); - text.append(decorator.getText()).append('\n'); + // Do the paragraph text + for(XWPFRun run : paragraph.getRuns()) { + text.append(run.toString()); + if(run instanceof XWPFHyperlinkRun && fetchHyperlinks) { + XWPFHyperlink link = ((XWPFHyperlinkRun)run).getHyperlink(document); + if(link != null) + text.append(" <" + link.getURL() + ">"); + } + } + + // Add comments + XWPFCommentsDecorator decorator = new XWPFCommentsDecorator(paragraph, null); + text.append(decorator.getCommentText()).append('\n'); + + // Do endnotes, footnotes and pictures + for(String str : new String[] { + paragraph.getFootnoteText(), paragraph.getPictureText() + }) { + if(str != null && str.length() > 0) { + text.append(str + "\n"); + } + } if (ctSectPr!=null) { extractFooters(text, headerFooterPolicy); diff --git a/src/ooxml/java/org/apache/poi/xwpf/model/XWPFCommentsDecorator.java b/src/ooxml/java/org/apache/poi/xwpf/model/XWPFCommentsDecorator.java index 7a528237a..be7bf0408 100644 --- a/src/ooxml/java/org/apache/poi/xwpf/model/XWPFCommentsDecorator.java +++ b/src/ooxml/java/org/apache/poi/xwpf/model/XWPFCommentsDecorator.java @@ -46,6 +46,10 @@ public class XWPFCommentsDecorator extends XWPFParagraphDecorator { } } + public String getCommentText() { + return commentText.toString(); + } + public String getText() { return super.getText() + commentText; } diff --git a/src/ooxml/java/org/apache/poi/xwpf/model/XWPFHyperlinkDecorator.java b/src/ooxml/java/org/apache/poi/xwpf/model/XWPFHyperlinkDecorator.java index 3ad4c492d..fb7d6ba93 100644 --- a/src/ooxml/java/org/apache/poi/xwpf/model/XWPFHyperlinkDecorator.java +++ b/src/ooxml/java/org/apache/poi/xwpf/model/XWPFHyperlinkDecorator.java @@ -19,15 +19,18 @@ package org.apache.poi.xwpf.model; import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTHyperlink; import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTR; import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTText; +import org.apache.poi.xwpf.usermodel.XWPFHyperlinkRun; import org.apache.poi.xwpf.usermodel.XWPFParagraph; /** * Decorator class for XWPFParagraph allowing to add hyperlinks * found in paragraph to its text. * - * TODO - add the hyperlink text in the right place, and not just - * at the end + * Note - adds the hyperlink at the end, not in the right place... + * + * @deprecated Use {@link XWPFHyperlinkRun} instead */ +@Deprecated public class XWPFHyperlinkDecorator extends XWPFParagraphDecorator { private StringBuffer hyperlinkText; diff --git a/src/ooxml/java/org/apache/poi/xwpf/usermodel/XWPFHyperlinkRun.java b/src/ooxml/java/org/apache/poi/xwpf/usermodel/XWPFHyperlinkRun.java new file mode 100644 index 000000000..9a40b52dd --- /dev/null +++ b/src/ooxml/java/org/apache/poi/xwpf/usermodel/XWPFHyperlinkRun.java @@ -0,0 +1,64 @@ +/* ==================================================================== + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +==================================================================== */ +package org.apache.poi.xwpf.usermodel; + +import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTHyperlink; +import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTR; + +/** + * A run of text with a Hyperlink applied to it. + * Any given Hyperlink may be made up of multiple of these. + */ +public class XWPFHyperlinkRun extends XWPFRun +{ + private CTHyperlink hyperlink; + + public XWPFHyperlinkRun(CTHyperlink hyperlink, CTR run, XWPFParagraph p) { + super(run, p); + this.hyperlink = hyperlink; + } + + public CTHyperlink getCTHyperlink() { + return hyperlink; + } + + public String getAnchor() { + return hyperlink.getAnchor(); + } + + /** + * Returns the ID of the hyperlink, if one is set. + */ + public String getHyperlinkId() { + return hyperlink.getId(); + } + public void setHyperlinkId(String id) { + hyperlink.setId(id); + } + + /** + * If this Hyperlink is an external reference hyperlink, + * return the object for it. + */ + public XWPFHyperlink getHyperlink(XWPFDocument document) { + String id = getHyperlinkId(); + if(id == null) + return null; + + return document.getHyperlinkByID(id); + } +} diff --git a/src/ooxml/java/org/apache/poi/xwpf/usermodel/XWPFParagraph.java b/src/ooxml/java/org/apache/poi/xwpf/usermodel/XWPFParagraph.java index 57e1a6467..2a57f6988 100644 --- a/src/ooxml/java/org/apache/poi/xwpf/usermodel/XWPFParagraph.java +++ b/src/ooxml/java/org/apache/poi/xwpf/usermodel/XWPFParagraph.java @@ -20,21 +20,19 @@ import java.math.BigInteger; import java.util.ArrayList; import java.util.Collections; import java.util.List; -import java.util.Arrays; import org.apache.poi.util.Internal; import org.apache.xmlbeans.XmlCursor; import org.apache.xmlbeans.XmlObject; import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTBorder; -import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTEmpty; import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTFtnEdnRef; +import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTHyperlink; import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTInd; import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTJc; import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTOnOff; import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTP; import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTPBdr; import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTPPr; -import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTPTab; import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTPicture; import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTProofErr; import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTR; @@ -66,10 +64,6 @@ public class XWPFParagraph implements IBodyElement{ protected XWPFDocument document; protected List runs; - /** - * TODO - replace with RichText String - */ - private StringBuffer text = new StringBuffer(); private StringBuffer pictureText = new StringBuffer(); private StringBuffer footnoteText = new StringBuffer(); @@ -91,102 +85,76 @@ public class XWPFParagraph implements IBodyElement{ } runs = new ArrayList(); - if (prgrph.getRList().size() > 0) { - for(CTR ctRun : prgrph.getRList()) { - runs.add(new XWPFRun(ctRun, this)); - } - } - if (!isEmpty()) { - readNewText(); - } - } - - protected String readNewText() { - StringBuffer text = new StringBuffer(); - - // All the runs to loop over - // TODO - replace this with some sort of XPath expression - // to directly find all the CTRs, in the right order - ArrayList rs = new ArrayList(); - rs.addAll( paragraph.getRList() ); - - for (CTSdtRun sdt : paragraph.getSdtList()) { - CTSdtContentRun run = sdt.getSdtContent(); - rs.addAll( run.getRList() ); - } - for (CTRunTrackChange c : paragraph.getDelList()) { - rs.addAll( c.getRList() ); - } - for (CTRunTrackChange c : paragraph.getInsList()) { - rs.addAll( c.getRList() ); - } - for (CTSimpleField f : paragraph.getFldSimpleList()) { - rs.addAll( f.getRList() ); - } - - // Get text of the paragraph - for (int j = 0; j < rs.size(); j++) { - // Grab the text and tabs of the paragraph - // Do so in a way that preserves the ordering - XmlCursor c = rs.get(j).newCursor(); - c.selectPath("./*"); - while (c.toNextSelection()) { - XmlObject o = c.getObject(); - if (o instanceof CTText) { - String tagName = o.getDomNode().getNodeName(); - // Field Codes (w:instrText, defined in spec sec. 17.16.23) - // come up as instances of CTText, but we don't want them - // in the normal text output - if (!"w:instrText".equals(tagName)) { - text.append(((CTText) o).getStringValue()); - } - } - if (o instanceof CTPTab) { - text.append("\t"); - } - if (o instanceof CTEmpty) { - // Some inline text elements get returned not as - // themselves, but as CTEmpty, owing to some odd - // definitions around line 5642 of the XSDs - String tagName = o.getDomNode().getNodeName(); - if ("w:tab".equals(tagName)) { - text.append("\t"); - } - if ("w:cr".equals(tagName)) { - text.append("\n"); - } - } - - // Check for bits that only apply when - // attached to a core document - if(document != null) { - //got a reference to a footnote - if (o instanceof CTFtnEdnRef) { - CTFtnEdnRef ftn = (CTFtnEdnRef) o; - footnoteText.append("[").append(ftn.getId()).append(": "); - XWPFFootnote footnote = - ftn.getDomNode().getLocalName().equals("footnoteReference") ? - document.getFootnoteByID(ftn.getId().intValue()) : - document.getEndnoteByID(ftn.getId().intValue()); + // Get all our child nodes in order, and process them + // into XWPFRuns where we can + XmlCursor c = paragraph.newCursor(); + c.selectPath("child::*"); + while (c.toNextSelection()) { + XmlObject o = c.getObject(); + if(o instanceof CTR) { + runs.add(new XWPFRun((CTR)o, this)); + } + if(o instanceof CTHyperlink) { + CTHyperlink link = (CTHyperlink)o; + for(CTR r : link.getRList()) { + runs.add(new XWPFHyperlinkRun(link, r, this)); + } + } + if(o instanceof CTSdtRun) { + CTSdtContentRun run = ((CTSdtRun)o).getSdtContent(); + for(CTR r : run.getRList()) { + runs.add(new XWPFRun(r, this)); + } + } + if(o instanceof CTRunTrackChange) { + for(CTR r : ((CTRunTrackChange)o).getRList()) { + runs.add(new XWPFRun(r, this)); + } + } + if(o instanceof CTSimpleField) { + for(CTR r : ((CTSimpleField)o).getRList()) { + runs.add(new XWPFRun(r, this)); + } + } + } + + // Look for bits associated with the runs + for(XWPFRun run : runs) { + CTR r = run.getCTR(); + + // Check for bits that only apply when + // attached to a core document + if(document != null) { + c = r.newCursor(); + c.selectPath("child::*"); + while (c.toNextSelection()) { + XmlObject o = c.getObject(); + if(o instanceof CTFtnEdnRef) { + CTFtnEdnRef ftn = (CTFtnEdnRef)o; + footnoteText.append("[").append(ftn.getId()).append(": "); + XWPFFootnote footnote = + ftn.getDomNode().getLocalName().equals("footnoteReference") ? + document.getFootnoteByID(ftn.getId().intValue()) : + document.getEndnoteByID(ftn.getId().intValue()); - boolean first = true; - for (XWPFParagraph p : footnote.getParagraphs()) { - if (!first) { - footnoteText.append("\n"); - first = false; - } - footnoteText.append(p.getText()); - } + boolean first = true; + for (XWPFParagraph p : footnote.getParagraphs()) { + if (!first) { + footnoteText.append("\n"); + first = false; + } + footnoteText.append(p.getText()); + } - footnoteText.append("]"); - } - } + footnoteText.append("]"); + } + } } // Loop over pictures inside our // paragraph, looking for text in them - for(CTPicture pict : rs.get(j).getPictList()) { + for(CTPicture pict : r.getPictList()) { XmlObject[] t = pict .selectPath("declare namespace w='http://schemas.openxmlformats.org/wordprocessingml/2006/main' .//w:t"); for (int m = 0; m < t.length; m++) { @@ -200,9 +168,6 @@ public class XWPFParagraph implements IBodyElement{ } } } - - this.text = text; - return text.toString(); } @Internal @@ -228,7 +193,10 @@ public class XWPFParagraph implements IBodyElement{ */ public String getText() { StringBuffer out = new StringBuffer(); - out.append(text).append(footnoteText).append(pictureText); + for(XWPFRun run : runs) { + out.append(run.toString()); + } + out.append(footnoteText).append(pictureText); return out.toString(); } @@ -282,7 +250,11 @@ public class XWPFParagraph implements IBodyElement{ * paragraph */ public String getParagraphText() { - return text.toString(); + StringBuffer out = new StringBuffer(); + for(XWPFRun run : runs) { + out.append(run.toString()); + } + return out.toString(); } /** @@ -1143,9 +1115,6 @@ public class XWPFParagraph implements IBodyElement{ pos = paragraph.getRList().size(); paragraph.addNewR(); paragraph.setRArray(pos, run); - for (CTText ctText: paragraph.getRArray(pos).getTList()) { - this.text.append(ctText.getStringValue()); - } } /** diff --git a/src/ooxml/java/org/apache/poi/xwpf/usermodel/XWPFRun.java b/src/ooxml/java/org/apache/poi/xwpf/usermodel/XWPFRun.java index dccd02bfb..2b37383b3 100644 --- a/src/ooxml/java/org/apache/poi/xwpf/usermodel/XWPFRun.java +++ b/src/ooxml/java/org/apache/poi/xwpf/usermodel/XWPFRun.java @@ -19,12 +19,15 @@ package org.apache.poi.xwpf.usermodel; import java.math.BigInteger; import org.apache.poi.util.Internal; +import org.apache.xmlbeans.XmlObject; import org.apache.xmlbeans.XmlString; import org.apache.xmlbeans.XmlCursor; import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTBr; +import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTEmpty; import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTFonts; import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTHpsMeasure; import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTOnOff; +import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTPTab; import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTR; import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTRPr; import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTSignedHpsMeasure; @@ -492,4 +495,45 @@ public class XWPFRun { } } + /** + * Returns the string version of the text, with tabs and + * carriage returns in place of their xml equivalents. + */ + public String toString() { + StringBuffer text = new StringBuffer(); + + // Grab the text and tabs of the text run + // Do so in a way that preserves the ordering + XmlCursor c = run.newCursor(); + c.selectPath("./*"); + while (c.toNextSelection()) { + XmlObject o = c.getObject(); + if (o instanceof CTText) { + String tagName = o.getDomNode().getNodeName(); + // Field Codes (w:instrText, defined in spec sec. 17.16.23) + // come up as instances of CTText, but we don't want them + // in the normal text output + if (!"w:instrText".equals(tagName)) { + text.append(((CTText) o).getStringValue()); + } + } + if (o instanceof CTPTab) { + text.append("\t"); + } + if (o instanceof CTEmpty) { + // Some inline text elements get returned not as + // themselves, but as CTEmpty, owing to some odd + // definitions around line 5642 of the XSDs + String tagName = o.getDomNode().getNodeName(); + if ("w:tab".equals(tagName)) { + text.append("\t"); + } + if ("w:cr".equals(tagName)) { + text.append("\n"); + } + } + } + + return text.toString(); + } } diff --git a/src/ooxml/java/org/apache/poi/xwpf/usermodel/XWPFTableCell.java b/src/ooxml/java/org/apache/poi/xwpf/usermodel/XWPFTableCell.java index 2497d801a..008b5904b 100644 --- a/src/ooxml/java/org/apache/poi/xwpf/usermodel/XWPFTableCell.java +++ b/src/ooxml/java/org/apache/poi/xwpf/usermodel/XWPFTableCell.java @@ -327,7 +327,7 @@ public class XWPFTableCell implements IBody { public String getText(){ StringBuffer text = new StringBuffer(); for (XWPFParagraph p : paragraphs) { - text.append(p.readNewText()); + text.append(p.getText()); } return text.toString(); } diff --git a/src/ooxml/testcases/org/apache/poi/xwpf/extractor/TestXWPFWordExtractor.java b/src/ooxml/testcases/org/apache/poi/xwpf/extractor/TestXWPFWordExtractor.java index 5b47b6d0b..77315f795 100644 --- a/src/ooxml/testcases/org/apache/poi/xwpf/extractor/TestXWPFWordExtractor.java +++ b/src/ooxml/testcases/org/apache/poi/xwpf/extractor/TestXWPFWordExtractor.java @@ -96,22 +96,18 @@ public class TestXWPFWordExtractor extends TestCase { XWPFWordExtractor extractor = new XWPFWordExtractor(doc); // Now check contents - // TODO - fix once correctly handling contents extractor.setFetchHyperlinks(false); assertEquals( -// "This is a test document\nThis bit is in bold and italic\n" + -// "Back to normal\nWe have a hyperlink here, and another.\n", - "This is a test document\nThis bit is in bold and italic\n" + - "Back to normal\nWe have a here, and .hyperlinkanother\n", + "This is a test document\nThis bit is in bold and italic\n" + + "Back to normal\nWe have a hyperlink here, and another.\n", extractor.getText() ); + // One hyperlink is a real one, one is just to the top of page extractor.setFetchHyperlinks(true); assertEquals( -// "This is a test document\nThis bit is in bold and italic\n" + -// "Back to normal\nWe have a hyperlink here, and another.\n", - "This is a test document\nThis bit is in bold and italic\n" + - "Back to normal\nWe have a here, and .hyperlink another\n", + "This is a test document\nThis bit is in bold and italic\n" + + "Back to normal\nWe have a hyperlink here, and another.\n", extractor.getText() ); } diff --git a/src/ooxml/testcases/org/apache/poi/xwpf/model/TestXWPFHeaderFooterPolicy.java b/src/ooxml/testcases/org/apache/poi/xwpf/model/TestXWPFHeaderFooterPolicy.java index d30503b0a..86b22466b 100644 --- a/src/ooxml/testcases/org/apache/poi/xwpf/model/TestXWPFHeaderFooterPolicy.java +++ b/src/ooxml/testcases/org/apache/poi/xwpf/model/TestXWPFHeaderFooterPolicy.java @@ -144,7 +144,7 @@ public class TestXWPFHeaderFooterPolicy extends TestCase { policy = oddEven.getHeaderFooterPolicy(); assertEquals( - "[]ODD Page Header text\n\n", + "[ODD Page Header text]\n\n", policy.getDefaultHeader().getText() ); assertEquals(