Support for extraction of footnotes from docx files, see Bugzilla 45556

git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@795328 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Yegor Kozlov 2009-07-18 09:09:59 +00:00
parent 3304fa1887
commit fa31a65d14
14 changed files with 471 additions and 381 deletions

View File

@ -33,6 +33,8 @@
<changes>
<release version="3.5-beta7" date="2009-??-??">
<action dev="POI-DEVELOPERS" type="fix">45556 - Fixed ExtractorFactory to support .xltx and .dotx files</action>
<action dev="POI-DEVELOPERS" type="add">45556 - Support for extraction of footnotes from docx files</action>
<action dev="POI-DEVELOPERS" type="add">47520 - Initial support for custom XML mappings in XSSF</action>
<action dev="POI-DEVELOPERS" type="fix">47460 - Fixed NPE when retrieving core properties from a newly created workbook</action>
<action dev="POI-DEVELOPERS" type="fix">47498 - Fixed HyperlinkRecord to properly handle URL monikers</action>
@ -41,7 +43,7 @@
<action dev="POI-DEVELOPERS" type="fix">47448 - Allow HSSFEventFactory to handle non-zero padding at the end of the workbook stream</action>
<action dev="POI-DEVELOPERS" type="add">47456 - Support for getting OLE object data in PowerPointExtractor</action>
<action dev="POI-DEVELOPERS" type="fix">47411 - Explicitly set the 1900 date system when creating XSSF workbooks</action>
<action dev="POI-DEVELOPERS" type="add">47400 - Support fo text extraction of footnotes, endnotes and comments in HWPF</action>
<action dev="POI-DEVELOPERS" type="add">47400 - Support for text extraction of footnotes, endnotes and comments in HWPF</action>
<action dev="POI-DEVELOPERS" type="fix">47415 - Fixed PageSettingsBlock to allow multiple PLS records</action>
<action dev="POI-DEVELOPERS" type="fix">47412 - Fixed concurrency issue with EscherProperties.initProps()</action>
<action dev="POI-DEVELOPERS" type="fix">47143 - Fixed OOM in HSSFWorkbook#getAllPictures when reading .xls files containing metafiles</action>

View File

@ -66,6 +66,24 @@ public final class XSSFRelation extends POIXMLRelation {
"/xl/workbook.xml",
null
);
/**
* Main workbook part of a spreadsheet template package. Identical to the
* regular workbook relation except for the "template.main+xml" content type
* (presumably what .xltx files declare - confirm against a sample package).
*/
public static final XSSFRelation TEMPLATE_WORKBOOK = new XSSFRelation(
"application/vnd.openxmlformats-officedocument.spreadsheetml.template.main+xml",
"http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument",
"/xl/workbook.xml",
null
);
/**
* Main workbook part of a macro-enabled spreadsheet template package,
* recognised by the "ms-excel.template.macroEnabled" content type
* (presumably .xltm files - confirm).
*/
public static final XSSFRelation MACRO_TEMPLATE_WORKBOOK = new XSSFRelation(
"application/vnd.ms-excel.template.macroEnabled.main+xml",
"http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument",
"/xl/workbook.xml",
null
);
/**
* Main workbook part of a macro-enabled add-in package, recognised by the
* "ms-excel.addin.macroEnabled" content type (presumably .xlam files - confirm).
*/
public static final XSSFRelation MACRO_ADDIN_WORKBOOK = new XSSFRelation(
"application/vnd.ms-excel.addin.macroEnabled.main+xml",
"http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument",
"/xl/workbook.xml",
null
);
public static final XSSFRelation WORKSHEET = new XSSFRelation(
"application/vnd.openxmlformats-officedocument.spreadsheetml.worksheet+xml",
"http://schemas.openxmlformats.org/officeDocument/2006/relationships/worksheet",

View File

@ -19,7 +19,7 @@ package org.apache.poi.xwpf.model;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTHyperlink;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTR;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTText;
import org.apache.poi.xwpf.usermodel.XWPFParagraph;;
import org.apache.poi.xwpf.usermodel.XWPFParagraph;
/**
* Decorator class for XWPFParagraph allowing to add hyperlinks

View File

@ -30,15 +30,7 @@ import org.apache.xmlbeans.XmlOptions;
import org.apache.poi.openxml4j.exceptions.InvalidFormatException;
import org.apache.poi.openxml4j.exceptions.OpenXML4JException;
import org.apache.poi.openxml4j.opc.*;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTBody;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTComment;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTDocument1;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTP;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTStyles;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTTbl;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CommentsDocument;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.DocumentDocument;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.StylesDocument;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.*;
import javax.xml.namespace.QName;
@ -60,6 +52,7 @@ public class XWPFDocument extends POIXMLDocument {
protected List<XWPFHyperlink> hyperlinks;
protected List<XWPFParagraph> paragraphs;
protected List<XWPFTable> tables;
protected Map<Integer, XWPFFootnote> footnotes;
/** Handles the joy of different headers/footers for different pages */
private XWPFHeaderFooterPolicy headerFooterPolicy;
@ -87,6 +80,7 @@ public class XWPFDocument extends POIXMLDocument {
comments = new ArrayList<XWPFComment>();
paragraphs = new ArrayList<XWPFParagraph>();
tables= new ArrayList<XWPFTable>();
footnotes = new HashMap<Integer, XWPFFootnote>();
try {
DocumentDocument doc = DocumentDocument.Factory.parse(getPackagePart().getInputStream());
@ -94,6 +88,8 @@ public class XWPFDocument extends POIXMLDocument {
CTBody body = ctDocument.getBody();
initFootnotes();
// filling paragraph list
for (CTP p : body.getPArray()) {
paragraphs.add(new XWPFParagraph(p, this));
@ -101,7 +97,7 @@ public class XWPFDocument extends POIXMLDocument {
// Get any tables
for(CTTbl table : body.getTblArray()) {
tables.add(new XWPFTable(table));
tables.add(new XWPFTable(this, table));
}
// Sort out headers and footers
@ -118,7 +114,6 @@ public class XWPFDocument extends POIXMLDocument {
}
initHyperlinks();
} catch (XmlException e) {
throw new POIXMLException(e);
}
@ -139,6 +134,19 @@ public class XWPFDocument extends POIXMLDocument {
}
}
/**
 * Populates the {@code footnotes} map from every related part whose
 * relationship type matches {@link XWPFRelation#FOOTNOTE}.
 * Each footnote is stored under its numeric id so that paragraphs can
 * resolve footnote references while they are being constructed.
 *
 * @throws XmlException if a footnotes part cannot be parsed
 * @throws IOException  if a footnotes part cannot be read or closed
 */
private void initFootnotes() throws XmlException, IOException {
    for (POIXMLDocumentPart p : getRelations()) {
        String relation = p.getPackageRelationship().getRelationshipType();
        if (!relation.equals(XWPFRelation.FOOTNOTE.getRelation())) {
            continue;
        }

        // The stream is owned by this method: release it even when parsing fails.
        java.io.InputStream in = p.getPackagePart().getInputStream();
        try {
            FootnotesDocument footnotesDocument = FootnotesDocument.Factory.parse(in);
            for (CTFtnEdn ctFtnEdn : footnotesDocument.getFootnotes().getFootnoteArray()) {
                footnotes.put(ctFtnEdn.getId().intValue(), new XWPFFootnote(this, ctFtnEdn));
            }
        } finally {
            in.close();
        }
    }
}
/**
* Create a new WordprocessingML package and set up the default minimal content
*/
@ -205,6 +213,15 @@ public class XWPFDocument extends POIXMLDocument {
return null;
}
/**
 * Looks up a footnote by its numeric id.
 *
 * @param id the footnote id, as used by footnote references in the document body
 * @return the matching footnote, or <code>null</code> if no footnote with
 *         that id is known or no footnotes have been loaded
 */
public XWPFFootnote getFootnoteByID(int id) {
    // getFootnotes() treats a null map as "no footnotes"; do the same here
    // instead of failing with a NullPointerException.
    if (footnotes == null) {
        return null;
    }
    return footnotes.get(id);
}
/**
 * Returns all footnotes known to this document.
 *
 * @return the values of the footnote map, or a new empty list when the
 *         map has not been created
 */
public Collection<XWPFFootnote> getFootnotes() {
    if (footnotes != null) {
        return footnotes.values();
    }
    return new ArrayList<XWPFFootnote>();
}
public XWPFHyperlink[] getHyperlinks() {
return hyperlinks.toArray(
new XWPFHyperlink[hyperlinks.size()]
@ -323,7 +340,7 @@ public class XWPFDocument extends POIXMLDocument {
* @return a new table
*/
public XWPFTable createTable(){
return new XWPFTable(ctDocument.getBody().addNewTbl());
return new XWPFTable(this, ctDocument.getBody().addNewTbl());
}
/**
@ -333,7 +350,7 @@ public class XWPFDocument extends POIXMLDocument {
* @return table
*/
public XWPFTable createTable(int rows, int cols) {
return new XWPFTable(ctDocument.getBody().addNewTbl(), rows, cols);
return new XWPFTable(this, ctDocument.getBody().addNewTbl(), rows, cols);
}
}

View File

@ -0,0 +1,43 @@
/* ====================================================================
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==================================================================== */
package org.apache.poi.xwpf.usermodel;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTFtnEdn;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTP;
import java.util.List;
import java.util.ArrayList;
import java.util.Iterator;
/**
 * A single footnote, exposed as the paragraphs it is made of.
 * Iterating over a footnote iterates over those paragraphs in order.
 */
public class XWPFFootnote implements Iterable<XWPFParagraph> {
    private List<XWPFParagraph> paragraphs;

    /**
     * Wraps every paragraph of the given footnote bean.
     *
     * @param document the document handed to each created paragraph
     * @param body     the underlying footnote bean whose paragraphs are wrapped
     */
    public XWPFFootnote(XWPFDocument document, CTFtnEdn body) {
        paragraphs = new ArrayList<XWPFParagraph>();
        CTP[] ctps = body.getPArray();
        for (int i = 0; i < ctps.length; i++) {
            paragraphs.add(new XWPFParagraph(ctps[i], document));
        }
    }

    /**
     * @return the list of paragraphs held by this footnote (the internal list, not a copy)
     */
    public List<XWPFParagraph> getParagraphs() {
        return paragraphs;
    }

    /**
     * @return an iterator over the paragraphs of this footnote
     */
    public Iterator<XWPFParagraph> iterator() {
        return getParagraphs().iterator();
    }
}

View File

@ -65,7 +65,8 @@ public abstract class XWPFHeaderFooter {
new XWPFTable[headerFooter.getTblArray().length];
for(int i=0; i<tables.length; i++) {
tables[i] = new XWPFTable(
headerFooter.getTblArray(i)
null,
headerFooter.getTblArray(i)
);
}
return tables;

View File

@ -21,26 +21,7 @@ import java.util.ArrayList;
import org.apache.xmlbeans.XmlCursor;
import org.apache.xmlbeans.XmlObject;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTBorder;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTInd;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTJc;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTOnOff;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTP;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTPBdr;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTPPr;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTPTab;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTPicture;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTR;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTSdtContentRun;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTSdtRun;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTSpacing;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTText;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTTextAlignment;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.STBorder;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.STJc;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.STLineSpacingRule;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.STOnOff;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.STTextAlignment;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.*;
import org.w3c.dom.NodeList;
import org.w3c.dom.Text;
@ -58,6 +39,7 @@ public class XWPFParagraph {
*/
private StringBuffer text = new StringBuffer();
private StringBuffer pictureText = new StringBuffer();
private StringBuffer footnoteText = new StringBuffer();
protected XWPFParagraph(CTP prgrph) {
@ -66,79 +48,96 @@ public class XWPFParagraph {
protected XWPFParagraph(CTP prgrph, XWPFDocument docRef) {
this.paragraph = prgrph;
this.document = docRef;
this.paragraph = prgrph;
this.document = docRef;
if (!isEmpty()) {
// All the runs to loop over
// TODO - replace this with some sort of XPath expression
// to directly find all the CTRs, in the right order
ArrayList<CTR> rs = new ArrayList<CTR>();
CTR[] tmp;
if (!isEmpty()) {
// All the runs to loop over
// TODO - replace this with some sort of XPath expression
// to directly find all the CTRs, in the right order
ArrayList<CTR> rs = new ArrayList<CTR>();
CTR[] tmp;
// Get the main text runs
tmp = paragraph.getRArray();
for (int i = 0; i < tmp.length; i++) {
rs.add(tmp[i]);
}
// Get the main text runs
tmp = paragraph.getRArray();
for (int i = 0; i < tmp.length; i++) {
rs.add(tmp[i]);
}
// Not sure quite what these are, but they hold
// more text runs
CTSdtRun[] sdts = paragraph.getSdtArray();
for (int i = 0; i < sdts.length; i++) {
CTSdtContentRun run = sdts[i].getSdtContent();
tmp = run.getRArray();
for (int j = 0; j < tmp.length; j++) {
rs.add(tmp[j]);
}
}
// Not sure quite what these are, but they hold
// more text runs
CTSdtRun[] sdts = paragraph.getSdtArray();
for (int i = 0; i < sdts.length; i++) {
CTSdtContentRun run = sdts[i].getSdtContent();
tmp = run.getRArray();
for (int j = 0; j < tmp.length; j++) {
rs.add(tmp[j]);
}
}
// Get text of the paragraph
for (int j = 0; j < rs.size(); j++) {
// Grab the text and tabs of the paragraph
// Do so in a way that preserves the ordering
XmlCursor c = rs.get(j).newCursor();
c.selectPath("./*");
while (c.toNextSelection()) {
XmlObject o = c.getObject();
if (o instanceof CTText) {
text.append(((CTText) o).getStringValue());
}
if (o instanceof CTPTab) {
text.append("\t");
}
}
// Get text of the paragraph
for (int j = 0; j < rs.size(); j++) {
// Grab the text and tabs of the paragraph
// Do so in a way that preserves the ordering
XmlCursor c = rs.get(j).newCursor();
c.selectPath("./*");
while (c.toNextSelection()) {
XmlObject o = c.getObject();
if (o instanceof CTText) {
text.append(((CTText) o).getStringValue());
}
if (o instanceof CTPTab) {
text.append("\t");
}
//got a reference to a footnote
if (o instanceof CTFtnEdnRef) {
CTFtnEdnRef ftn = (CTFtnEdnRef) o;
footnoteText.append("[").append(ftn.getId()).append(": ");
XWPFFootnote footnote = document.getFootnoteByID(ftn.getId().intValue());
// Loop over pictures inside our
// paragraph, looking for text in them
CTPicture[] picts = rs.get(j).getPictArray();
for (int k = 0; k < picts.length; k++) {
XmlObject[] t = picts[k]
.selectPath("declare namespace w='http://schemas.openxmlformats.org/wordprocessingml/2006/main' .//w:t");
for (int m = 0; m < t.length; m++) {
NodeList kids = t[m].getDomNode().getChildNodes();
for (int n = 0; n < kids.getLength(); n++) {
if (kids.item(n) instanceof Text) {
pictureText.append("\n");
pictureText.append(kids.item(n).getNodeValue());
}
}
}
}
}
}
// Append the footnote's paragraphs, separated by newlines.
// A reference whose id does not resolve to a footnote contributes no text.
if (footnote != null) {
    boolean first = true;
    for (XWPFParagraph p : footnote.getParagraphs()) {
        if (!first) {
            footnoteText.append("\n");
        }
        // Clear the flag outside the branch above; when it was reset inside
        // "if (!first)" it never changed and no separator was ever written.
        first = false;
        footnoteText.append(p.getText());
    }
}
footnoteText.append("]");
}
}
// Loop over pictures inside our
// paragraph, looking for text in them
CTPicture[] picts = rs.get(j).getPictArray();
for (int k = 0; k < picts.length; k++) {
XmlObject[] t = picts[k]
.selectPath("declare namespace w='http://schemas.openxmlformats.org/wordprocessingml/2006/main' .//w:t");
for (int m = 0; m < t.length; m++) {
NodeList kids = t[m].getDomNode().getChildNodes();
for (int n = 0; n < kids.getLength(); n++) {
if (kids.item(n) instanceof Text) {
pictureText.append("\n");
pictureText.append(kids.item(n).getNodeValue());
}
}
}
}
}
}
}
public CTP getCTP() {
return paragraph;
return paragraph;
}
public boolean isEmpty() {
return !paragraph.getDomNode().hasChildNodes();
return !paragraph.getDomNode().hasChildNodes();
}
public XWPFDocument getDocument() {
return document;
return document;
}
/**
@ -146,7 +145,9 @@ public class XWPFParagraph {
* in it.
*/
public String getText() {
return getParagraphText() + getPictureText();
StringBuffer out = new StringBuffer();
out.append(text).append(footnoteText).append(pictureText);
return out.toString();
}
/**
@ -154,14 +155,23 @@ public class XWPFParagraph {
* paragraph
*/
public String getParagraphText() {
return text.toString();
return text.toString();
}
/**
* Returns any text from any suitable pictures in the paragraph
*/
public String getPictureText() {
return pictureText.toString();
return pictureText.toString();
}
/**
* Returns the text of all footnotes referenced from this paragraph.
* The text is collected once, when the paragraph is constructed; each
* referenced footnote is rendered as <code>[id: footnote text]</code>.
*
* @return the footnote text or empty string if the paragraph does not have footnotes
*/
public String getFootnoteText() {
return footnoteText.toString();
}
/**
@ -170,7 +180,7 @@ public class XWPFParagraph {
* @return a new text run
*/
public XWPFRun createRun() {
return new XWPFRun(paragraph.addNewR(), this);
return new XWPFRun(paragraph.addNewR(), this);
}
/**
@ -350,12 +360,12 @@ public class XWPFParagraph {
* @see Borders a list of all types of borders
*/
public void setBorderBottom(Borders border) {
CTPBdr ct = getCTPBrd(true);
CTBorder pr = ct.isSetBottom() ? ct.getBottom() : ct.addNewBottom();
if (border.getValue() == Borders.NONE.getValue())
ct.unsetBottom();
else
pr.setVal(STBorder.Enum.forInt(border.getValue()));
CTPBdr ct = getCTPBrd(true);
CTBorder pr = ct.isSetBottom() ? ct.getBottom() : ct.addNewBottom();
if (border.getValue() == Borders.NONE.getValue())
ct.unsetBottom();
else
pr.setVal(STBorder.Enum.forInt(border.getValue()));
}
/**
@ -367,13 +377,13 @@ public class XWPFParagraph {
* @see Borders a list of all types of borders
*/
public Borders getBorderBottom() {
CTPBdr border = getCTPBrd(false);
CTBorder ct = null;
if (border != null) {
ct = border.getBottom();
}
STBorder.Enum ptrn = ct != null ? ct.getVal() : STBorder.NONE;
return Borders.valueOf(ptrn.intValue());
CTPBdr border = getCTPBrd(false);
CTBorder ct = null;
if (border != null) {
ct = border.getBottom();
}
STBorder.Enum ptrn = ct != null ? ct.getVal() : STBorder.NONE;
return Borders.valueOf(ptrn.intValue());
}
/**
@ -399,12 +409,12 @@ public class XWPFParagraph {
* @see Borders for a list of all possible borders
*/
public void setBorderLeft(Borders border) {
CTPBdr ct = getCTPBrd(true);
CTBorder pr = ct.isSetLeft() ? ct.getLeft() : ct.addNewLeft();
if (border.getValue() == Borders.NONE.getValue())
ct.unsetLeft();
else
pr.setVal(STBorder.Enum.forInt(border.getValue()));
CTPBdr ct = getCTPBrd(true);
CTBorder pr = ct.isSetLeft() ? ct.getLeft() : ct.addNewLeft();
if (border.getValue() == Borders.NONE.getValue())
ct.unsetLeft();
else
pr.setVal(STBorder.Enum.forInt(border.getValue()));
}
/**
@ -416,13 +426,13 @@ public class XWPFParagraph {
* @see Borders for a list of all possible borders
*/
public Borders getBorderLeft() {
CTPBdr border = getCTPBrd(false);
CTBorder ct = null;
if (border != null) {
ct = border.getLeft();
}
STBorder.Enum ptrn = ct != null ? ct.getVal() : STBorder.NONE;
return Borders.valueOf(ptrn.intValue());
CTPBdr border = getCTPBrd(false);
CTBorder ct = null;
if (border != null) {
ct = border.getLeft();
}
STBorder.Enum ptrn = ct != null ? ct.getVal() : STBorder.NONE;
return Borders.valueOf(ptrn.intValue());
}
/**
@ -448,12 +458,12 @@ public class XWPFParagraph {
* @see Borders for a list of all possible borders
*/
public void setBorderRight(Borders border) {
CTPBdr ct = getCTPBrd(true);
CTBorder pr = ct.isSetRight() ? ct.getRight() : ct.addNewRight();
if (border.getValue() == Borders.NONE.getValue())
ct.unsetRight();
else
pr.setVal(STBorder.Enum.forInt(border.getValue()));
CTPBdr ct = getCTPBrd(true);
CTBorder pr = ct.isSetRight() ? ct.getRight() : ct.addNewRight();
if (border.getValue() == Borders.NONE.getValue())
ct.unsetRight();
else
pr.setVal(STBorder.Enum.forInt(border.getValue()));
}
/**
@ -465,13 +475,13 @@ public class XWPFParagraph {
* @see Borders for a list of all possible borders
*/
public Borders getBorderRight() {
CTPBdr border = getCTPBrd(false);
CTBorder ct = null;
if (border != null) {
ct = border.getRight();
}
STBorder.Enum ptrn = ct != null ? ct.getVal() : STBorder.NONE;
return Borders.valueOf(ptrn.intValue());
CTPBdr border = getCTPBrd(false);
CTBorder ct = null;
if (border != null) {
ct = border.getRight();
}
STBorder.Enum ptrn = ct != null ? ct.getVal() : STBorder.NONE;
return Borders.valueOf(ptrn.intValue());
}
/**
@ -501,12 +511,12 @@ public class XWPFParagraph {
* @see Borders for a list of all possible borders
*/
public void setBorderBetween(Borders border) {
CTPBdr ct = getCTPBrd(true);
CTBorder pr = ct.isSetBetween() ? ct.getBetween() : ct.addNewBetween();
if (border.getValue() == Borders.NONE.getValue())
ct.unsetBetween();
else
pr.setVal(STBorder.Enum.forInt(border.getValue()));
CTPBdr ct = getCTPBrd(true);
CTBorder pr = ct.isSetBetween() ? ct.getBetween() : ct.addNewBetween();
if (border.getValue() == Borders.NONE.getValue())
ct.unsetBetween();
else
pr.setVal(STBorder.Enum.forInt(border.getValue()));
}
/**
@ -518,13 +528,13 @@ public class XWPFParagraph {
* @see Borders for a list of all possible borders
*/
public Borders getBorderBetween() {
CTPBdr border = getCTPBrd(false);
CTBorder ct = null;
if (border != null) {
ct = border.getBetween();
}
STBorder.Enum ptrn = ct != null ? ct.getVal() : STBorder.NONE;
return Borders.valueOf(ptrn.intValue());
CTPBdr border = getCTPBrd(false);
CTBorder ct = null;
if (border != null) {
ct = border.getBetween();
}
STBorder.Enum ptrn = ct != null ? ct.getVal() : STBorder.NONE;
return Borders.valueOf(ptrn.intValue());
}
/**
@ -544,13 +554,13 @@ public class XWPFParagraph {
* boolean value
*/
public void setPageBreak(boolean pageBreak) {
CTPPr ppr = getCTPPr();
CTOnOff ct_pageBreak = ppr.isSetPageBreakBefore() ? ppr
.getPageBreakBefore() : ppr.addNewPageBreakBefore();
if (pageBreak)
ct_pageBreak.setVal(STOnOff.TRUE);
else
ct_pageBreak.setVal(STOnOff.FALSE);
CTPPr ppr = getCTPPr();
CTOnOff ct_pageBreak = ppr.isSetPageBreakBefore() ? ppr
.getPageBreakBefore() : ppr.addNewPageBreakBefore();
if (pageBreak)
ct_pageBreak.setVal(STOnOff.TRUE);
else
ct_pageBreak.setVal(STOnOff.FALSE);
}
/**
@ -569,14 +579,14 @@ public class XWPFParagraph {
* @return boolean - if page break is set
*/
public boolean isPageBreak() {
CTPPr ppr = getCTPPr();
CTOnOff ct_pageBreak = ppr.isSetPageBreakBefore() ? ppr
.getPageBreakBefore() : null;
if (ct_pageBreak != null
&& ct_pageBreak.getVal().intValue() == STOnOff.INT_TRUE)
return true;
else
return false;
CTPPr ppr = getCTPPr();
CTOnOff ct_pageBreak = ppr.isSetPageBreakBefore() ? ppr
.getPageBreakBefore() : null;
if (ct_pageBreak != null
&& ct_pageBreak.getVal().intValue() == STOnOff.INT_TRUE)
return true;
else
return false;
}
/**
@ -902,12 +912,12 @@ public class XWPFParagraph {
* @param wrap - boolean
*/
public void setWordWrap(boolean wrap) {
CTOnOff wordWrap = getCTPPr().isSetWordWrap() ? getCTPPr()
.getWordWrap() : getCTPPr().addNewWordWrap();
if (wrap)
wordWrap.setVal(STOnOff.TRUE);
else
wordWrap.unsetVal();
CTOnOff wordWrap = getCTPPr().isSetWordWrap() ? getCTPPr()
.getWordWrap() : getCTPPr().addNewWordWrap();
if (wrap)
wordWrap.setVal(STOnOff.TRUE);
else
wordWrap.unsetVal();
}
/**
@ -919,14 +929,14 @@ public class XWPFParagraph {
* @return boolean
*/
public boolean isWordWrap() {
CTOnOff wordWrap = getCTPPr().isSetWordWrap() ? getCTPPr()
.getWordWrap() : null;
if (wordWrap != null) {
return (wordWrap.getVal() == STOnOff.ON
|| wordWrap.getVal() == STOnOff.TRUE || wordWrap.getVal() == STOnOff.X_1) ? true
: false;
} else
return false;
CTOnOff wordWrap = getCTPPr().isSetWordWrap() ? getCTPPr()
.getWordWrap() : null;
if (wordWrap != null) {
return (wordWrap.getVal() == STOnOff.ON
|| wordWrap.getVal() == STOnOff.TRUE || wordWrap.getVal() == STOnOff.X_1) ? true
: false;
} else
return false;
}
/**

View File

@ -40,6 +40,24 @@ public final class XWPFRelation extends POIXMLRelation {
"/word/document.xml",
null
);
/**
* Main document part of a word-processing template package. Differs from the
* regular document relation only in its "template.main+xml" content type
* (presumably what .dotx files declare - confirm against a sample package).
*/
public static final XWPFRelation TEMPLATE = new XWPFRelation(
"application/vnd.openxmlformats-officedocument.wordprocessingml.template.main+xml",
"http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument",
"/word/document.xml",
null
);
/**
* Main document part of a macro-enabled document package, recognised by the
* "ms-word.document.macroEnabled" content type (presumably .docm files - confirm).
*/
public static final XWPFRelation MACRO_DOCUMENT = new XWPFRelation(
"application/vnd.ms-word.document.macroEnabled.main+xml",
"http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument",
"/word/document.xml",
null
);
/**
* Main document part of a macro-enabled template package, recognised by the
* "ms-word.template.macroEnabledTemplate" content type
* (presumably .dotm files - confirm).
*/
public static final XWPFRelation MACRO_TEMPLATE_DOCUMENT = new XWPFRelation(
"application/vnd.ms-word.template.macroEnabledTemplate.main+xml",
"http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument",
"/word/document.xml",
null
);
public static final XWPFRelation FONT_TABLE = new XWPFRelation(
"application/vnd.openxmlformats-officedocument.wordprocessingml.fontTable+xml",
"http://schemas.openxmlformats.org/officeDocument/2006/relationships/fontTable",
@ -88,6 +106,12 @@ public final class XWPFRelation extends POIXMLRelation {
null,
null
);
/**
* Relationship pointing at the footnotes part of a document.
* Only the relationship type is supplied; the content type, default part
* name and implementing class are left null. The relationship type is what
* XWPFDocument compares against when it looks for footnote parts to load.
*/
public static final XWPFRelation FOOTNOTE = new XWPFRelation(
null,
"http://schemas.openxmlformats.org/officeDocument/2006/relationships/footnotes",
null,
null
);
private XWPFRelation(String type, String rel, String defaultName, Class<? extends POIXMLDocumentPart> cls) {

View File

@ -42,8 +42,8 @@ public class XWPFTable {
private CTTbl ctTbl;
public XWPFTable(CTTbl table, int row, int col) {
this(table);
public XWPFTable(XWPFDocument doc, CTTbl table, int row, int col) {
this(doc, table);
for (int i = 0; i < row; i++) {
XWPFTableRow tabRow = (getRow(i) == null) ? createRow() : getRow(i);
for (int k = 0; k < col; k++) {
@ -54,7 +54,7 @@ public class XWPFTable {
}
public XWPFTable(CTTbl table) {
public XWPFTable(XWPFDocument doc, CTTbl table) {
this.ctTbl = table;
// is an empty table: I add one row and one column as default
@ -65,7 +65,7 @@ public class XWPFTable {
StringBuffer rowText = new StringBuffer();
for (CTTc cell : row.getTcArray()) {
for (CTP ctp : cell.getPArray()) {
XWPFParagraph p = new XWPFParagraph(ctp, null);
XWPFParagraph p = new XWPFParagraph(ctp, doc);
if (rowText.length() > 0) {
rowText.append('\t');
}

View File

@ -17,6 +17,7 @@
package org.apache.poi.xwpf.extractor;
import java.io.File;
import java.io.IOException;
import org.apache.poi.POIXMLDocument;
import org.apache.poi.xwpf.usermodel.XWPFDocument;
@ -27,202 +28,176 @@ import junit.framework.TestCase;
* Tests for HXFWordExtractor
*/
public class TestXWPFWordExtractor extends TestCase {
/**
* A very simple file
*/
private XWPFDocument xmlA;
private File fileA;
/**
* A fairly complex file
*/
private XWPFDocument xmlB;
private File fileB;
/**
* With a simplish header+footer
*/
private XWPFDocument xmlC;
private File fileC;
/**
* With different header+footer on first/rest
*/
private XWPFDocument xmlD;
private File fileD;
/**
* File with hyperlinks
*/
private XWPFDocument xmlE;
private File fileE;
/**
* Get text out of the simple file
*/
public void testGetSimpleText() throws Exception {
XWPFDocument doc = open("sample.docx");
XWPFWordExtractor extractor = new XWPFWordExtractor(doc);
protected void setUp() throws Exception {
super.setUp();
String text = extractor.getText();
assertTrue(text.length() > 0);
fileA = new File(
System.getProperty("HWPF.testdata.path") +
File.separator + "sample.docx"
);
fileB = new File(
System.getProperty("HWPF.testdata.path") +
File.separator + "IllustrativeCases.docx"
);
fileC = new File(
System.getProperty("HWPF.testdata.path") +
File.separator + "ThreeColHeadFoot.docx"
);
fileD = new File(
System.getProperty("HWPF.testdata.path") +
File.separator + "DiffFirstPageHeadFoot.docx"
);
fileE = new File(
System.getProperty("HWPF.testdata.path") +
File.separator + "TestDocument.docx"
);
assertTrue(fileA.exists());
assertTrue(fileB.exists());
assertTrue(fileC.exists());
assertTrue(fileD.exists());
assertTrue(fileE.exists());
// Check contents
assertTrue(text.startsWith(
"Lorem ipsum dolor sit amet, consectetuer adipiscing elit. Nunc at risus vel erat tempus posuere. Aenean non ante. Suspendisse vehicula dolor sit amet odio."
));
assertTrue(text.endsWith(
"Phasellus ultricies mi nec leo. Sed tempus. In sit amet lorem at velit faucibus vestibulum.\n"
));
xmlA = new XWPFDocument(POIXMLDocument.openPackage(fileA.toString()));
xmlB = new XWPFDocument(POIXMLDocument.openPackage(fileB.toString()));
xmlC = new XWPFDocument(POIXMLDocument.openPackage(fileC.toString()));
xmlD = new XWPFDocument(POIXMLDocument.openPackage(fileD.toString()));
xmlE = new XWPFDocument(POIXMLDocument.openPackage(fileE.toString()));
}
// Check number of paragraphs
int ps = 0;
char[] t = text.toCharArray();
for (int i = 0; i < t.length; i++) {
if (t[i] == '\n') {
ps++;
}
}
assertEquals(3, ps);
}
/**
* Get text out of the simple file
*/
public void testGetSimpleText() throws Exception {
new XWPFWordExtractor(xmlA);
new XWPFWordExtractor(POIXMLDocument.openPackage(fileA.toString()));
/**
* Tests getting the text out of a complex file
*/
public void testGetComplexText() throws Exception {
XWPFDocument doc = open("IllustrativeCases.docx");
XWPFWordExtractor extractor = new XWPFWordExtractor(doc);
XWPFWordExtractor extractor =
new XWPFWordExtractor(xmlA);
extractor.getText();
String text = extractor.getText();
assertTrue(text.length() > 0);
String text = extractor.getText();
assertTrue(text.length() > 0);
// Check contents
assertTrue(text.startsWith(
"Lorem ipsum dolor sit amet, consectetuer adipiscing elit. Nunc at risus vel erat tempus posuere. Aenean non ante. Suspendisse vehicula dolor sit amet odio."
));
assertTrue(text.endsWith(
"Phasellus ultricies mi nec leo. Sed tempus. In sit amet lorem at velit faucibus vestibulum.\n"
));
// Check number of paragraphs
int ps = 0;
char[] t = text.toCharArray();
for (int i = 0; i < t.length; i++) {
if(t[i] == '\n') { ps++; }
}
assertEquals(3, ps);
}
/**
* Tests getting the text out of a complex file
*/
public void testGetComplexText() throws Exception {
XWPFWordExtractor extractor =
new XWPFWordExtractor(xmlB);
extractor.getText();
String text = extractor.getText();
assertTrue(text.length() > 0);
char euro = '\u20ac';
char euro = '\u20ac';
// System.err.println("'"+text.substring(text.length() - 40) + "'");
// Check contents
assertTrue(text.startsWith(
" \n(V) ILLUSTRATIVE CASES\n\n"
));
assertTrue(text.contains(
"As well as gaining "+euro+"90 from child benefit increases, he will also receive the early childhood supplement of "+euro+"250 per quarter for Vincent for the full four quarters of the year.\n\n\n\n \n\n\n"
));
assertTrue(text.endsWith(
"11.4%\t\t90\t\t\t\t\t250\t\t1,310\t\n\n"
));
// Check contents
assertTrue(text.startsWith(
" \n(V) ILLUSTRATIVE CASES\n\n"
));
assertTrue(text.contains(
"As well as gaining " + euro + "90 from child benefit increases, he will also receive the early childhood supplement of " + euro + "250 per quarter for Vincent for the full four quarters of the year.\n\n\n\n \n\n\n"
));
assertTrue(text.endsWith(
"11.4%\t\t90\t\t\t\t\t250\t\t1,310\t\n\n"
));
// Check number of paragraphs
int ps = 0;
char[] t = text.toCharArray();
for (int i = 0; i < t.length; i++) {
if(t[i] == '\n') { ps++; }
}
assertEquals(103, ps);
}
// Check number of paragraphs
int ps = 0;
char[] t = text.toCharArray();
for (int i = 0; i < t.length; i++) {
if (t[i] == '\n') {
ps++;
}
}
assertEquals(103, ps);
}
public void testGetWithHyperlinks() throws Exception {
XWPFWordExtractor extractor =
new XWPFWordExtractor(xmlE);
extractor.getText();
extractor.setFetchHyperlinks(true);
extractor.getText();
public void testGetWithHyperlinks() throws Exception {
XWPFDocument doc = open("TestDocument.docx");
XWPFWordExtractor extractor = new XWPFWordExtractor(doc);
// Now check contents
// TODO - fix once correctly handling contents
extractor.setFetchHyperlinks(false);
assertEquals(
// Now check contents
// TODO - fix once correctly handling contents
extractor.setFetchHyperlinks(false);
assertEquals(
// "This is a test document\nThis bit is in bold and italic\n" +
// "Back to normal\nWe have a hyperlink here, and another.\n",
"This is a test document\nThis bit is in bold and italic\n" +
"Back to normal\nWe have a here, and .hyperlinkanother\n",
extractor.getText()
);
"This is a test document\nThis bit is in bold and italic\n" +
"Back to normal\nWe have a here, and .hyperlinkanother\n",
extractor.getText()
);
extractor.setFetchHyperlinks(true);
assertEquals(
extractor.setFetchHyperlinks(true);
assertEquals(
// "This is a test document\nThis bit is in bold and italic\n" +
// "Back to normal\nWe have a hyperlink here, and another.\n",
"This is a test document\nThis bit is in bold and italic\n" +
"Back to normal\nWe have a here, and .hyperlink <http://poi.apache.org/>another\n",
extractor.getText()
);
}
"This is a test document\nThis bit is in bold and italic\n" +
"Back to normal\nWe have a here, and .hyperlink <http://poi.apache.org/>another\n",
extractor.getText()
);
}
public void testHeadersFooters() throws Exception {
XWPFWordExtractor extractor =
new XWPFWordExtractor(xmlC);
extractor.getText();
public void testHeadersFooters() throws Exception {
XWPFDocument doc = open("ThreeColHeadFoot.docx");
XWPFWordExtractor extractor = new XWPFWordExtractor(doc);
assertEquals(
"First header column!\tMid header\tRight header!\n" +
"This is a sample word document. It has two pages. It has a three column heading, and a three column footer\n" +
"\n" +
"HEADING TEXT\n" +
"\n" +
"More on page one\n" +
"\n\n" +
"End of page 1\n\n" +
"This is page two. It also has a three column heading, and a three column footer.\n" +
"Footer Left\tFooter Middle\tFooter Right\n",
extractor.getText()
);
assertEquals(
"First header column!\tMid header\tRight header!\n" +
"This is a sample word document. It has two pages. It has a three column heading, and a three column footer\n" +
"\n" +
"HEADING TEXT\n" +
"\n" +
"More on page one\n" +
"\n\n" +
"End of page 1\n\n" +
"This is page two. It also has a three column heading, and a three column footer.\n" +
"Footer Left\tFooter Middle\tFooter Right\n",
extractor.getText()
);
// Now another file, expect multiple headers
// and multiple footers
doc = open("DiffFirstPageHeadFoot.docx");
extractor = new XWPFWordExtractor(doc);
extractor =
new XWPFWordExtractor(doc);
extractor.getText();
assertEquals(
"I am the header on the first page, and I" + '\u2019' + "m nice and simple\n" +
"First header column!\tMid header\tRight header!\n" +
"This is a sample word document. It has two pages. It has a simple header and footer, which is different to all the other pages.\n" +
"\n" +
"HEADING TEXT\n" +
"\n" +
"More on page one\n" +
"\n\n" +
"End of page 1\n\n" +
"This is page two. It also has a three column heading, and a three column footer.\n" +
"The footer of the first page\n" +
"Footer Left\tFooter Middle\tFooter Right\n",
extractor.getText()
);
}
/**
 * Opens the sample document "footnotes.docx" and verifies that the
 * text returned by the extractor contains the word "snoska".
 */
public void testFootnotes() throws Exception {
    XWPFWordExtractor footnoteExtractor = new XWPFWordExtractor(open("footnotes.docx"));
    String extracted = footnoteExtractor.getText();
    assertTrue(extracted.contains("snoska"));
}
// Now another file, expect multiple headers
// and multiple footers
extractor =
new XWPFWordExtractor(xmlD);
extractor.getText();
public void testTableFootnotes() throws Exception {
XWPFDocument doc = open("table_footnotes.docx");
XWPFWordExtractor extractor = new XWPFWordExtractor(doc);
assertEquals(
"I am the header on the first page, and I" + '\u2019' + "m nice and simple\n" +
"First header column!\tMid header\tRight header!\n" +
"This is a sample word document. It has two pages. It has a simple header and footer, which is different to all the other pages.\n" +
"\n" +
"HEADING TEXT\n" +
"\n" +
"More on page one\n" +
"\n\n" +
"End of page 1\n\n" +
"This is page two. It also has a three column heading, and a three column footer.\n" +
"The footer of the first page\n" +
"Footer Left\tFooter Middle\tFooter Right\n",
extractor.getText()
);
}
assertTrue(extractor.getText().contains("snoska"));
}
/**
 * Opens the sample document "form_footnotes.docx" and verifies that the
 * extracted text contains both "testdoc" and "test phrase". On failure
 * the full extracted text is included in the assertion message.
 */
public void testFormFootnotes() throws Exception {
    XWPFWordExtractor formExtractor = new XWPFWordExtractor(open("form_footnotes.docx"));
    String extracted = formExtractor.getText();
    String failureMessage = "Unable to find expected word in text\n" + extracted;

    // Checked in the same order as before: "testdoc" first, then "test phrase".
    String[] expectedFragments = { "testdoc", "test phrase" };
    for (String expected : expectedFragments) {
        assertTrue(failureMessage, extracted.contains(expected));
    }
}
//TODO use the same logic as in HSSFTestDataSamples
/**
 * Opens a sample .docx file from the directory named by the
 * "HWPF.testdata.path" system property.
 *
 * The requested name is compared against the name of the canonical file,
 * and a mismatch is rejected with a message that reports both names.
 *
 * @param sampleFileName name of the sample file, relative to the test data directory
 * @return the document opened from the sample file
 * @throws IOException declared for callers; a failure while resolving the
 *         canonical file is rethrown wrapped in a RuntimeException
 * @throws RuntimeException if the canonical file name differs from
 *         <code>sampleFileName</code>, or if canonical resolution fails
 */
private XWPFDocument open(String sampleFileName) throws IOException {
    File sample = new File(
            System.getProperty("HWPF.testdata.path"), sampleFileName);

    // Resolve the canonical name once so that the value that is checked and
    // the value that is reported come from the same filesystem lookup.
    String canonicalName;
    try {
        canonicalName = sample.getCanonicalFile().getName();
    } catch (IOException e) {
        throw new RuntimeException(e);
    }

    if (!sampleFileName.equals(canonicalName)) {
        throw new RuntimeException("File name is case-sensitive: requested '" + sampleFileName
                + "' but actual file is '" + canonicalName + "'");
    }
    return new XWPFDocument(POIXMLDocument.openPackage(sample.getPath()));
}
}

View File

@ -43,14 +43,14 @@ public class TestXWPFTable extends TestCase {
public void testConstructor() {
CTTbl ctTable=CTTbl.Factory.newInstance();
XWPFTable xtab=new XWPFTable(ctTable);
XWPFTable xtab=new XWPFTable(null, ctTable);
assertNotNull(xtab);
assertEquals(1,ctTable.sizeOfTrArray());
assertEquals(1,ctTable.getTrArray(0).sizeOfTcArray());
assertNotNull(ctTable.getTrArray(0).getTcArray(0).getPArray(0));
ctTable=CTTbl.Factory.newInstance();
xtab=new XWPFTable(ctTable, 3,2);
xtab=new XWPFTable(null, ctTable, 3,2);
assertNotNull(xtab);
assertEquals(3,ctTable.sizeOfTrArray());
assertEquals(2,ctTable.getTrArray(0).sizeOfTcArray());
@ -67,7 +67,7 @@ public class TestXWPFTable extends TestCase {
CTText text=run.addNewT();
text.setStringValue("finally I can write!");
XWPFTable xtab=new XWPFTable(table);
XWPFTable xtab=new XWPFTable(null, table);
assertEquals("finally I can write!\n",xtab.getText());
}
@ -84,7 +84,7 @@ public class TestXWPFTable extends TestCase {
r3.addNewTc().addNewP();
r3.addNewTc().addNewP();
XWPFTable xtab=new XWPFTable(table);
XWPFTable xtab=new XWPFTable(null, table);
assertEquals(3,xtab.getNumberOfRows());
assertNotNull(xtab.getRow(2));
@ -95,7 +95,7 @@ public class TestXWPFTable extends TestCase {
assertEquals(2,table.getTrArray(0).sizeOfTcArray());
//check creation of first row
xtab=new XWPFTable(CTTbl.Factory.newInstance());
xtab=new XWPFTable(null, CTTbl.Factory.newInstance());
assertEquals(1,xtab.getCTTbl().getTrArray(0).sizeOfTcArray());
}
@ -104,7 +104,7 @@ public class TestXWPFTable extends TestCase {
CTTbl table = CTTbl.Factory.newInstance();
table.addNewTblPr().addNewTblW().setW(new BigInteger("1000"));
XWPFTable xtab=new XWPFTable(table);
XWPFTable xtab=new XWPFTable(null, table);
assertEquals(1000,xtab.getWidth());