Improve XWPF paragraph text handling, and add further header tests

git-svn-id: https://svn.apache.org/repos/asf/poi/branches/ooxml@684273 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Nick Burch 2008-08-09 14:50:16 +00:00
parent c96a79fdf9
commit 0740316520
7 changed files with 99 additions and 23 deletions

View File

@ -37,6 +37,8 @@
<!-- Don't forget to update status.xml too! -->
<release version="3.5.1-beta2" date="2008-??-??">
<action dev="POI-DEVELOPERS" type="fix">Improve how XWPF handles paragraph text</action>
<action dev="POI-DEVELOPERS" type="add">Support in XWPF for headers and footers</action>
<action dev="POI-DEVELOPERS" type="add">45592 - Improve XWPF text extraction to include tables always, and picture text where possible</action>
<action dev="POI-DEVELOPERS" type="add">45545 - Improve XSLF usermodel support, and include XSLF comments in extracted text</action>
<action dev="POI-DEVELOPERS" type="add">45540 - Fix XSSF header and footer support, and include headers and footers in the output of XSSFExcelExtractor</action>

View File

@ -34,6 +34,8 @@
<!-- Don't forget to update changes.xml too! -->
<changes>
<release version="3.5.1-beta2" date="2008-??-??">
<action dev="POI-DEVELOPERS" type="fix">Improve how XWPF handles paragraph text</action>
<action dev="POI-DEVELOPERS" type="add">Support in XWPF for headers and footers</action>
<action dev="POI-DEVELOPERS" type="add">45592 - Improve XWPF text extraction to include tables always, and picture text where possible</action>
<action dev="POI-DEVELOPERS" type="add">45545 - Improve XSLF usermodel support, and include XSLF comments in extracted text</action>
<action dev="POI-DEVELOPERS" type="add">45540 - Fix XSSF header and footer support, and include headers and footers in the output of XSSFExcelExtractor</action>

View File

@ -39,7 +39,8 @@ public abstract class XWPFHeaderFooter {
* Returns the paragraph(s) that holds
* the text of the header or footer.
* Normally there is only the one paragraph, but
* there could be more in certain cases.
* there could be more in certain cases, or
* a table.
*/
public XWPFParagraph[] getParagraphs() {
XWPFParagraph[] paras =
@ -51,6 +52,24 @@ public abstract class XWPFHeaderFooter {
}
return paras;
}
/**
 * Returns the table(s) of this header or footer.
 * Most headers/footers keep their text in a single
 * paragraph, but the more complex ones may carry
 * one or more tables as well, and those are what
 * this method exposes.
 *
 * @return one {@link XWPFTable} wrapper for each table
 *  entry of the underlying header/footer, in the same
 *  order; a zero length array if there are no tables
 */
public XWPFTable[] getTables() {
    // Size the result from the underlying table array
    int count = headerFooter.getTblArray().length;
    XWPFTable[] wrapped = new XWPFTable[count];

    // Wrap each underlying table, keeping the ordering
    for(int idx=0; idx<count; idx++) {
        wrapped[idx] = new XWPFTable(headerFooter.getTblArray(idx));
    }
    return wrapped;
}
/**
* Returns the textual content of the header/footer,
@ -58,11 +77,21 @@ public abstract class XWPFHeaderFooter {
*/
public String getText() {
StringBuffer t = new StringBuffer();
XWPFParagraph[] paras = getParagraphs();
for (int i = 0; i < paras.length; i++) {
t.append(paras[i].getText());
for(int i=0; i<paras.length; i++) {
if(! paras[i].isEmpty()) {
t.append(paras[i].getText());
t.append('\n');
}
}
XWPFTable[] tables = getTables();
for(int i=0; i<tables.length; i++) {
t.append(tables[i].getText());
t.append('\n');
}
return t.toString();
}
}

View File

@ -16,6 +16,8 @@
==================================================================== */
package org.apache.poi.xwpf.usermodel;
import java.util.ArrayList;
import org.apache.poi.xwpf.XWPFDocument;
import org.apache.poi.xwpf.model.XMLParagraph;
import org.apache.xmlbeans.XmlCursor;
@ -24,6 +26,10 @@ import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTP;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTPTab;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTPicture;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTR;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTRPr;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTSdtBlock;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTSdtContentRun;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTSdtRun;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTText;
import org.w3c.dom.NodeList;
import org.w3c.dom.Text;
@ -43,15 +49,37 @@ public class XWPFParagraph extends XMLParagraph
public XWPFParagraph(CTP prgrph, XWPFDocument docRef)
{
super(prgrph);
this.docRef = docRef;
this.docRef = docRef;
CTR[] rs = paragraph.getRArray();
// All the runs to loop over
// TODO - replace this with some sort of XPath expression
// to directly find all the CTRs, in the right order
ArrayList<CTR> rs = new ArrayList<CTR>();
CTR[] tmp;
// Get the main text runs
tmp = paragraph.getRArray();
for(int i=0; i<tmp.length; i++) {
rs.add(tmp[i]);
}
// Not sure quite what these are, but they hold
// more text runs
CTSdtRun[] sdts = paragraph.getSdtArray();
for(int i=0; i<sdts.length; i++) {
CTSdtContentRun run = sdts[i].getSdtContent();
tmp = run.getRArray();
for(int j=0; j<tmp.length; j++) {
rs.add(tmp[j]);
}
}
// Get text of the paragraph
for (int j = 0; j < rs.length; j++) {
for (int j = 0; j < rs.size(); j++) {
// Grab the text and tabs of the paragraph
// Do so in a way that preserves the ordering
XmlCursor c = rs[j].newCursor();
XmlCursor c = rs.get(j).newCursor();
c.selectPath( "./*" );
while(c.toNextSelection()) {
XmlObject o = c.getObject();
@ -65,7 +93,7 @@ public class XWPFParagraph extends XMLParagraph
// Loop over pictures inside our
// paragraph, looking for text in them
CTPicture[] picts = rs[j].getPictArray();
CTPicture[] picts = rs.get(j).getPictArray();
for (int k = 0; k < picts.length; k++) {
XmlObject[] t = picts[k].selectPath("declare namespace w='http://schemas.openxmlformats.org/wordprocessingml/2006/main' .//w:t");
for (int m = 0; m < t.length; m++) {

View File

@ -32,24 +32,26 @@ public class XWPFTable
{
protected StringBuffer text=new StringBuffer();
public XWPFTable(CTTbl table)
{
for(CTRow row : table.getTrArray())
{
for(CTTc cell : row.getTcArray())
{
for(CTP ctp : cell.getPArray())
{
public XWPFTable(CTTbl table) {
for(CTRow row : table.getTrArray()) {
StringBuffer rowText = new StringBuffer();
for(CTTc cell : row.getTcArray()) {
for(CTP ctp : cell.getPArray()) {
XWPFParagraph p = new XWPFParagraph(ctp);
this.text.append(p.getText()+"\t");
if(rowText.length() > 0) {
rowText.append('\t');
}
rowText.append(p.getText());
}
}
this.text.append("\n");
if(rowText.length() > 0) {
this.text.append(rowText);
this.text.append('\n');
}
}
}
public String getText()
{
public String getText() {
return text.toString();
}
}

View File

@ -111,7 +111,7 @@ public class TestXWPFWordExtractor extends TestCase {
assertTrue(text.length() > 0);
char euro = '\u20ac';
// System.err.println("'"+text.substring(text.length() - 20) + "'");
// System.err.println("'"+text.substring(text.length() - 40) + "'");
// Check contents
assertTrue(text.startsWith(
@ -121,7 +121,7 @@ public class TestXWPFWordExtractor extends TestCase {
"As well as gaining "+euro+"90 from child benefit increases, he will also receive the early childhood supplement of "+euro+"250 per quarter for Vincent for the full four quarters of the year.\n\n\n\n \n\n\n"
));
assertTrue(text.endsWith(
"11.4%\t\t90\t\t\t\t\t250\t\t1,310\t\t\n\n"
"11.4%\t\t90\t\t\t\t\t250\t\t1,310\t\n\n"
));
// Check number of paragraphs

View File

@ -165,7 +165,7 @@ public class TestXWPFHeaderFooterPolicy extends TestCase {
public void testContents() throws Exception {
XWPFHeaderFooterPolicy policy;
// Just test a few bits
// Test a few simple bits off a simple header
policy = diffFirst.getHeaderFooterPolicy();
assertEquals(
@ -176,5 +176,18 @@ public class TestXWPFHeaderFooterPolicy extends TestCase {
"First header column!\tMid header\tRight header!\n",
policy.getDefaultHeader().getText()
);
// And a few bits off a more complex header
policy = oddEven.getHeaderFooterPolicy();
assertEquals(
"\n[]ODD Page Header text\n\n",
policy.getDefaultHeader().getText()
);
assertEquals(
"\n[This is an Even Page, with a Header]\n\n",
policy.getEvenPageHeader().getText()
);
}
}