Have XWPFWordExtractor extract headers and footers

git-svn-id: https://svn.apache.org/repos/asf/poi/branches/ooxml@684276 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Nick Burch 2008-08-09 15:08:11 +00:00
parent 0740316520
commit d36f6623f1
6 changed files with 113 additions and 14 deletions

View File

@ -37,6 +37,7 @@
<!-- Don't forget to update status.xml too! --> <!-- Don't forget to update status.xml too! -->
<release version="3.5.1-beta2" date="2008-??-??"> <release version="3.5.1-beta2" date="2008-??-??">
<action dev="POI-DEVELOPERS" type="add">45539 - Improve XWPFWordExtractor to extract headers and footers</action>
<action dev="POI-DEVELOPERS" type="fix">Improve how XWPF handles paragraph text</action> <action dev="POI-DEVELOPERS" type="fix">Improve how XWPF handles paragraph text</action>
<action dev="POI-DEVELOPERS" type="add">Support in XWPF for headers and footers</action> <action dev="POI-DEVELOPERS" type="add">Support in XWPF for headers and footers</action>
<action dev="POI-DEVELOPERS" type="add">45592 - Improve XWPF text extraction to include tables always, and picture text where possible</action> <action dev="POI-DEVELOPERS" type="add">45592 - Improve XWPF text extraction to include tables always, and picture text where possible</action>

View File

@ -34,6 +34,7 @@
<!-- Don't forget to update changes.xml too! --> <!-- Don't forget to update changes.xml too! -->
<changes> <changes>
<release version="3.5.1-beta2" date="2008-??-??"> <release version="3.5.1-beta2" date="2008-??-??">
<action dev="POI-DEVELOPERS" type="add">45539 - Improve XWPFWordExtractor to extract headers and footers</action>
<action dev="POI-DEVELOPERS" type="fix">Improve how XWPF handles paragraph text</action> <action dev="POI-DEVELOPERS" type="fix">Improve how XWPF handles paragraph text</action>
<action dev="POI-DEVELOPERS" type="add">Support in XWPF for headers and footers</action> <action dev="POI-DEVELOPERS" type="add">Support in XWPF for headers and footers</action>
<action dev="POI-DEVELOPERS" type="add">45592 - Improve XWPF text extraction to include tables always, and picture text where possible</action> <action dev="POI-DEVELOPERS" type="add">45592 - Improve XWPF text extraction to include tables always, and picture text where possible</action>

View File

@ -23,6 +23,7 @@ import org.apache.poi.POIXMLDocument;
import org.apache.poi.POIXMLTextExtractor; import org.apache.poi.POIXMLTextExtractor;
import org.apache.poi.xwpf.XWPFDocument; import org.apache.poi.xwpf.XWPFDocument;
import org.apache.poi.xwpf.model.XWPFCommentsDecorator; import org.apache.poi.xwpf.model.XWPFCommentsDecorator;
import org.apache.poi.xwpf.model.XWPFHeaderFooterPolicy;
import org.apache.poi.xwpf.model.XWPFHyperlinkDecorator; import org.apache.poi.xwpf.model.XWPFHyperlinkDecorator;
import org.apache.poi.xwpf.model.XWPFParagraphDecorator; import org.apache.poi.xwpf.model.XWPFParagraphDecorator;
import org.apache.poi.xwpf.usermodel.XWPFParagraph; import org.apache.poi.xwpf.usermodel.XWPFParagraph;
@ -70,8 +71,21 @@ public class XWPFWordExtractor extends POIXMLTextExtractor {
public String getText() { public String getText() {
StringBuffer text = new StringBuffer(); StringBuffer text = new StringBuffer();
XWPFHeaderFooterPolicy hfPolicy = document.getHeaderFooterPolicy();
// Start out with all headers
// TODO - put them in where they're needed
if(hfPolicy.getFirstPageHeader() != null) {
text.append( hfPolicy.getFirstPageHeader().getText() );
}
if(hfPolicy.getEvenPageHeader() != null) {
text.append( hfPolicy.getEvenPageHeader().getText() );
}
if(hfPolicy.getDefaultHeader() != null) {
text.append( hfPolicy.getDefaultHeader().getText() );
}
// First up, all our paragraph based text
Iterator<XWPFParagraph> i = document.getParagraphsIterator(); Iterator<XWPFParagraph> i = document.getParagraphsIterator();
while(i.hasNext()) { while(i.hasNext()) {
XWPFParagraphDecorator decorator = new XWPFCommentsDecorator( XWPFParagraphDecorator decorator = new XWPFCommentsDecorator(
@ -79,12 +93,24 @@ public class XWPFWordExtractor extends POIXMLTextExtractor {
text.append(decorator.getText()+"\n"); text.append(decorator.getText()+"\n");
} }
// Then our table based text
Iterator<XWPFTable> j = document.getTablesIterator(); Iterator<XWPFTable> j = document.getTablesIterator();
while(j.hasNext()) while(j.hasNext()) {
{
text.append(j.next().getText()+"\n"); text.append(j.next().getText()+"\n");
} }
// Finish up with all the footers
// TODO - put them in where they're needed
if(hfPolicy.getFirstPageFooter() != null) {
text.append( hfPolicy.getFirstPageFooter().getText() );
}
if(hfPolicy.getEvenPageFooter() != null) {
text.append( hfPolicy.getEvenPageFooter().getText() );
}
if(hfPolicy.getDefaultFooter() != null) {
text.append( hfPolicy.getDefaultFooter().getText() );
}
return text.toString(); return text.toString();
} }
} }

View File

@ -81,16 +81,22 @@ public abstract class XWPFHeaderFooter {
XWPFParagraph[] paras = getParagraphs(); XWPFParagraph[] paras = getParagraphs();
for(int i=0; i<paras.length; i++) { for(int i=0; i<paras.length; i++) {
if(! paras[i].isEmpty()) { if(! paras[i].isEmpty()) {
t.append(paras[i].getText()); String text = paras[i].getText();
if(text != null && text.length() > 0) {
t.append(text);
t.append('\n'); t.append('\n');
} }
} }
}
XWPFTable[] tables = getTables(); XWPFTable[] tables = getTables();
for(int i=0; i<tables.length; i++) { for(int i=0; i<tables.length; i++) {
t.append(tables[i].getText()); String text = tables[i].getText();
if(text != null && text.length() > 0) {
t.append(text);
t.append('\n'); t.append('\n');
} }
}
return t.toString(); return t.toString();
} }

View File

@ -37,12 +37,22 @@ public class TestXWPFWordExtractor extends TestCase {
*/ */
private XWPFDocument xmlB; private XWPFDocument xmlB;
private File fileB; private File fileB;
/**
* With a simplish header+footer
*/
private XWPFDocument xmlC;
private File fileC;
/**
* With different header+footer on first/rest
*/
private XWPFDocument xmlD;
private File fileD;
/** /**
* File with hyperlinks * File with hyperlinks
*/ */
private XWPFDocument xmlC; private XWPFDocument xmlE;
private File fileC; private File fileE;
protected void setUp() throws Exception { protected void setUp() throws Exception {
super.setUp(); super.setUp();
@ -56,16 +66,28 @@ public class TestXWPFWordExtractor extends TestCase {
File.separator + "IllustrativeCases.docx" File.separator + "IllustrativeCases.docx"
); );
fileC = new File( fileC = new File(
System.getProperty("HWPF.testdata.path") +
File.separator + "ThreeColHeadFoot.docx"
);
fileD = new File(
System.getProperty("HWPF.testdata.path") +
File.separator + "DiffFirstPageHeadFoot.docx"
);
fileE = new File(
System.getProperty("HWPF.testdata.path") + System.getProperty("HWPF.testdata.path") +
File.separator + "TestDocument.docx" File.separator + "TestDocument.docx"
); );
assertTrue(fileA.exists()); assertTrue(fileA.exists());
assertTrue(fileB.exists()); assertTrue(fileB.exists());
assertTrue(fileC.exists()); assertTrue(fileC.exists());
assertTrue(fileD.exists());
assertTrue(fileE.exists());
xmlA = new XWPFDocument(POIXMLDocument.openPackage(fileA.toString())); xmlA = new XWPFDocument(POIXMLDocument.openPackage(fileA.toString()));
xmlB = new XWPFDocument(POIXMLDocument.openPackage(fileB.toString())); xmlB = new XWPFDocument(POIXMLDocument.openPackage(fileB.toString()));
xmlC = new XWPFDocument(POIXMLDocument.openPackage(fileC.toString())); xmlC = new XWPFDocument(POIXMLDocument.openPackage(fileC.toString()));
xmlD = new XWPFDocument(POIXMLDocument.openPackage(fileD.toString()));
xmlE = new XWPFDocument(POIXMLDocument.openPackage(fileE.toString()));
} }
/** /**
@ -135,7 +157,7 @@ public class TestXWPFWordExtractor extends TestCase {
public void testGetWithHyperlinks() throws Exception { public void testGetWithHyperlinks() throws Exception {
XWPFWordExtractor extractor = XWPFWordExtractor extractor =
new XWPFWordExtractor(xmlC); new XWPFWordExtractor(xmlE);
extractor.getText(); extractor.getText();
extractor.setFetchHyperlinks(true); extractor.setFetchHyperlinks(true);
extractor.getText(); extractor.getText();
@ -160,4 +182,47 @@ public class TestXWPFWordExtractor extends TestCase {
extractor.getText() extractor.getText()
); );
} }
/**
 * Verifies that the extractor's text output contains header text,
 * followed by the body text, followed by footer text. Two fixture
 * documents are checked: xmlC, which yields a single header and a
 * single footer, and xmlD, which yields two of each.
 */
public void testHeadersFooters() throws Exception {
    // Pieces of expected output that are common to both documents
    String columnHeader =
        "First header column!\tMid header\tRight header!\n";
    String sharedBody =
        "\n" +
        "HEADING TEXT\n" +
        "\n" +
        "More on page one\n" +
        "\n\n" +
        "End of page 1\n\n" +
        "This is page two. It also has a three column heading, and a three column footer.\n";
    String columnFooter =
        "Footer Left\tFooter Middle\tFooter Right\n";

    // First document: one header, then the body, then one footer
    String expectedSingle =
        columnHeader +
        "This is a sample word document. It has two pages. It has a three column heading, and a three column footer\n" +
        sharedBody +
        columnFooter;

    XWPFWordExtractor singleExtractor = new XWPFWordExtractor(xmlC);
    // Result deliberately discarded; the checked value comes from
    // the second call below
    singleExtractor.getText();
    assertEquals(expectedSingle, singleExtractor.getText());

    // Second document: an extra header line is expected ahead of the
    // column header, and an extra footer line ahead of the column footer
    String expectedMultiple =
        "I am the header on the first page, and I" + '\u2019' + "m nice and simple\n" +
        columnHeader +
        "This is a sample word document. It has two pages. It has a simple header and footer, which is different to all the other pages.\n" +
        sharedBody +
        "The footer of the first page\n" +
        columnFooter;

    XWPFWordExtractor multiExtractor = new XWPFWordExtractor(xmlD);
    multiExtractor.getText();
    assertEquals(expectedMultiple, multiExtractor.getText());
}
} }

View File

@ -182,11 +182,11 @@ public class TestXWPFHeaderFooterPolicy extends TestCase {
policy = oddEven.getHeaderFooterPolicy(); policy = oddEven.getHeaderFooterPolicy();
assertEquals( assertEquals(
"\n[]ODD Page Header text\n\n", "[]ODD Page Header text\n\n",
policy.getDefaultHeader().getText() policy.getDefaultHeader().getText()
); );
assertEquals( assertEquals(
"\n[This is an Even Page, with a Header]\n\n", "[This is an Even Page, with a Header]\n\n",
policy.getEvenPageHeader().getText() policy.getEvenPageHeader().getText()
); );
} }