Improve paragraph text stuff, and further header tests
git-svn-id: https://svn.apache.org/repos/asf/poi/branches/ooxml@684273 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
c96a79fdf9
commit
0740316520
@ -37,6 +37,8 @@
|
||||
|
||||
<!-- Don't forget to update status.xml too! -->
|
||||
<release version="3.5.1-beta2" date="2008-??-??">
|
||||
<action dev="POI-DEVELOPERS" type="fix">Improve how XWPF handles paragraph text</action>
|
||||
<action dev="POI-DEVELOPERS" type="add">Support in XWPF for headers and footers</action>
|
||||
<action dev="POI-DEVELOPERS" type="add">45592 - Improve XWPF text extraction to include tables always, and picture text where possible</action>
|
||||
<action dev="POI-DEVELOPERS" type="add">45545 - Improve XSLF usermodel support, and include XSLF comments in extracted text</action>
|
||||
<action dev="POI-DEVELOPERS" type="add">45540 - Fix XSSF header and footer support, and include headers and footers in the output of XSSFExcelExtractor</action>
|
||||
|
@ -34,6 +34,8 @@
|
||||
<!-- Don't forget to update changes.xml too! -->
|
||||
<changes>
|
||||
<release version="3.5.1-beta2" date="2008-??-??">
|
||||
<action dev="POI-DEVELOPERS" type="fix">Improve how XWPF handles paragraph text</action>
|
||||
<action dev="POI-DEVELOPERS" type="add">Support in XWPF for headers and footers</action>
|
||||
<action dev="POI-DEVELOPERS" type="add">45592 - Improve XWPF text extraction to include tables always, and picture text where possible</action>
|
||||
<action dev="POI-DEVELOPERS" type="add">45545 - Improve XSLF usermodel support, and include XSLF comments in extracted text</action>
|
||||
<action dev="POI-DEVELOPERS" type="add">45540 - Fix XSSF header and footer support, and include headers and footers in the output of XSSFExcelExtractor</action>
|
||||
|
@ -39,7 +39,8 @@ public abstract class XWPFHeaderFooter {
|
||||
* Returns the paragraph(s) that holds
|
||||
* the text of the header or footer.
|
||||
* Normally there is only the one paragraph, but
|
||||
* there could be more in certain cases.
|
||||
* there could be more in certain cases, or
|
||||
* a table.
|
||||
*/
|
||||
public XWPFParagraph[] getParagraphs() {
|
||||
XWPFParagraph[] paras =
|
||||
@ -51,6 +52,24 @@ public abstract class XWPFHeaderFooter {
|
||||
}
|
||||
return paras;
|
||||
}
|
||||
/**
|
||||
* Returns the table(s) that hold the text
|
||||
* of the header or footer, for complex cases
|
||||
* where paragraphs alone aren't used.
|
||||
* Normally there's just one paragraph, but some
|
||||
* complex headers/footers have a table or two
|
||||
* in addition. Each call wraps every table in a new XWPFTable; the array is empty if there are none.
|
||||
*/
|
||||
public XWPFTable[] getTables() {
|
||||
XWPFTable[] tables =
|
||||
new XWPFTable[headerFooter.getTblArray().length];
|
||||
for(int i=0; i<tables.length; i++) {
|
||||
tables[i] = new XWPFTable(
|
||||
headerFooter.getTblArray(i)
|
||||
);
|
||||
}
|
||||
return tables;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the textual content of the header/footer,
|
||||
@ -58,11 +77,21 @@ public abstract class XWPFHeaderFooter {
|
||||
*/
|
||||
public String getText() {
|
||||
StringBuffer t = new StringBuffer();
|
||||
|
||||
XWPFParagraph[] paras = getParagraphs();
|
||||
for (int i = 0; i < paras.length; i++) {
|
||||
for(int i=0; i<paras.length; i++) {
|
||||
if(! paras[i].isEmpty()) {
|
||||
t.append(paras[i].getText());
|
||||
t.append('\n');
|
||||
}
|
||||
}
|
||||
|
||||
XWPFTable[] tables = getTables();
|
||||
for(int i=0; i<tables.length; i++) {
|
||||
t.append(tables[i].getText());
|
||||
t.append('\n');
|
||||
}
|
||||
|
||||
return t.toString();
|
||||
}
|
||||
}
|
||||
|
@ -16,6 +16,8 @@
|
||||
==================================================================== */
|
||||
package org.apache.poi.xwpf.usermodel;
|
||||
|
||||
import java.util.ArrayList;
|
||||
|
||||
import org.apache.poi.xwpf.XWPFDocument;
|
||||
import org.apache.poi.xwpf.model.XMLParagraph;
|
||||
import org.apache.xmlbeans.XmlCursor;
|
||||
@ -24,6 +26,10 @@ import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTP;
|
||||
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTPTab;
|
||||
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTPicture;
|
||||
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTR;
|
||||
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTRPr;
|
||||
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTSdtBlock;
|
||||
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTSdtContentRun;
|
||||
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTSdtRun;
|
||||
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTText;
|
||||
import org.w3c.dom.NodeList;
|
||||
import org.w3c.dom.Text;
|
||||
@ -43,15 +49,37 @@ public class XWPFParagraph extends XMLParagraph
|
||||
public XWPFParagraph(CTP prgrph, XWPFDocument docRef)
|
||||
{
|
||||
super(prgrph);
|
||||
|
||||
this.docRef = docRef;
|
||||
CTR[] rs = paragraph.getRArray();
|
||||
|
||||
// All the runs to loop over
|
||||
// TODO - replace this with some sort of XPath expression
|
||||
// to directly find all the CTRs, in the right order
|
||||
ArrayList<CTR> rs = new ArrayList<CTR>();
|
||||
CTR[] tmp;
|
||||
|
||||
// Get the main text runs
|
||||
tmp = paragraph.getRArray();
|
||||
for(int i=0; i<tmp.length; i++) {
|
||||
rs.add(tmp[i]);
|
||||
}
|
||||
|
||||
// Not sure quite what these are, but they hold
|
||||
// more text runs
|
||||
CTSdtRun[] sdts = paragraph.getSdtArray();
|
||||
for(int i=0; i<sdts.length; i++) {
|
||||
CTSdtContentRun run = sdts[i].getSdtContent();
|
||||
tmp = run.getRArray();
|
||||
for(int j=0; j<tmp.length; j++) {
|
||||
rs.add(tmp[j]);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
// Get text of the paragraph
|
||||
for (int j = 0; j < rs.length; j++) {
|
||||
for (int j = 0; j < rs.size(); j++) {
|
||||
// Grab the text and tabs of the paragraph
|
||||
// Do so in a way that preserves the ordering
|
||||
XmlCursor c = rs[j].newCursor();
|
||||
XmlCursor c = rs.get(j).newCursor();
|
||||
c.selectPath( "./*" );
|
||||
while(c.toNextSelection()) {
|
||||
XmlObject o = c.getObject();
|
||||
@ -65,7 +93,7 @@ public class XWPFParagraph extends XMLParagraph
|
||||
|
||||
// Loop over pictures inside our
|
||||
// paragraph, looking for text in them
|
||||
CTPicture[] picts = rs[j].getPictArray();
|
||||
CTPicture[] picts = rs.get(j).getPictArray();
|
||||
for (int k = 0; k < picts.length; k++) {
|
||||
XmlObject[] t = picts[k].selectPath("declare namespace w='http://schemas.openxmlformats.org/wordprocessingml/2006/main' .//w:t");
|
||||
for (int m = 0; m < t.length; m++) {
|
||||
|
@ -32,24 +32,26 @@ public class XWPFTable
|
||||
{
|
||||
protected StringBuffer text=new StringBuffer();
|
||||
|
||||
public XWPFTable(CTTbl table)
|
||||
{
|
||||
for(CTRow row : table.getTrArray())
|
||||
{
|
||||
for(CTTc cell : row.getTcArray())
|
||||
{
|
||||
for(CTP ctp : cell.getPArray())
|
||||
{
|
||||
public XWPFTable(CTTbl table) {
|
||||
for(CTRow row : table.getTrArray()) {
|
||||
StringBuffer rowText = new StringBuffer();
|
||||
for(CTTc cell : row.getTcArray()) {
|
||||
for(CTP ctp : cell.getPArray()) {
|
||||
XWPFParagraph p = new XWPFParagraph(ctp);
|
||||
this.text.append(p.getText()+"\t");
|
||||
if(rowText.length() > 0) {
|
||||
rowText.append('\t');
|
||||
}
|
||||
rowText.append(p.getText());
|
||||
}
|
||||
}
|
||||
this.text.append("\n");
|
||||
if(rowText.length() > 0) {
|
||||
this.text.append(rowText);
|
||||
this.text.append('\n');
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
public String getText()
|
||||
{
|
||||
public String getText() {
|
||||
return text.toString();
|
||||
}
|
||||
}
|
||||
|
@ -111,7 +111,7 @@ public class TestXWPFWordExtractor extends TestCase {
|
||||
assertTrue(text.length() > 0);
|
||||
|
||||
char euro = '\u20ac';
|
||||
// System.err.println("'"+text.substring(text.length() - 20) + "'");
|
||||
// System.err.println("'"+text.substring(text.length() - 40) + "'");
|
||||
|
||||
// Check contents
|
||||
assertTrue(text.startsWith(
|
||||
@ -121,7 +121,7 @@ public class TestXWPFWordExtractor extends TestCase {
|
||||
"As well as gaining "+euro+"90 from child benefit increases, he will also receive the early childhood supplement of "+euro+"250 per quarter for Vincent for the full four quarters of the year.\n\n\n\n \n\n\n"
|
||||
));
|
||||
assertTrue(text.endsWith(
|
||||
"11.4%\t\t90\t\t\t\t\t250\t\t1,310\t\t\n\n"
|
||||
"11.4%\t\t90\t\t\t\t\t250\t\t1,310\t\n\n"
|
||||
));
|
||||
|
||||
// Check number of paragraphs
|
||||
|
@ -165,7 +165,7 @@ public class TestXWPFHeaderFooterPolicy extends TestCase {
|
||||
public void testContents() throws Exception {
|
||||
XWPFHeaderFooterPolicy policy;
|
||||
|
||||
// Just test a few bits
|
||||
// Test a few simple bits off a simple header
|
||||
policy = diffFirst.getHeaderFooterPolicy();
|
||||
|
||||
assertEquals(
|
||||
@ -176,5 +176,18 @@ public class TestXWPFHeaderFooterPolicy extends TestCase {
|
||||
"First header column!\tMid header\tRight header!\n",
|
||||
policy.getDefaultHeader().getText()
|
||||
);
|
||||
|
||||
|
||||
// And a few bits off a more complex header
|
||||
policy = oddEven.getHeaderFooterPolicy();
|
||||
|
||||
assertEquals(
|
||||
"\n[]ODD Page Header text\n\n",
|
||||
policy.getDefaultHeader().getText()
|
||||
);
|
||||
assertEquals(
|
||||
"\n[This is an Even Page, with a Header]\n\n",
|
||||
policy.getEvenPageHeader().getText()
|
||||
);
|
||||
}
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user