Have XWPFWordExtractor extract headers and footers
git-svn-id: https://svn.apache.org/repos/asf/poi/branches/ooxml@684276 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
0740316520
commit
d36f6623f1
@ -37,6 +37,7 @@
|
||||
|
||||
<!-- Don't forget to update status.xml too! -->
|
||||
<release version="3.5.1-beta2" date="2008-??-??">
|
||||
<action dev="POI-DEVELOPERS" type="add">45539 - Improve XWPFWordExtractor to extract headers and footers</action>
|
||||
<action dev="POI-DEVELOPERS" type="fix">Improve how XWPF handles paragraph text</action>
|
||||
<action dev="POI-DEVELOPERS" type="add">Support in XWPF for handling headers and footers</action>
|
||||
<action dev="POI-DEVELOPERS" type="add">45592 - Improve XWPF text extraction to include tables always, and picture text where possible</action>
|
||||
|
@ -34,6 +34,7 @@
|
||||
<!-- Don't forget to update changes.xml too! -->
|
||||
<changes>
|
||||
<release version="3.5.1-beta2" date="2008-??-??">
|
||||
<action dev="POI-DEVELOPERS" type="add">45539 - Improve XWPFWordExtractor to extract headers and footers</action>
|
||||
<action dev="POI-DEVELOPERS" type="fix">Improve how XWPF handles paragraph text</action>
|
||||
<action dev="POI-DEVELOPERS" type="add">Support in XWPF for handling headers and footers</action>
|
||||
<action dev="POI-DEVELOPERS" type="add">45592 - Improve XWPF text extraction to include tables always, and picture text where possible</action>
|
||||
|
@ -23,6 +23,7 @@ import org.apache.poi.POIXMLDocument;
|
||||
import org.apache.poi.POIXMLTextExtractor;
|
||||
import org.apache.poi.xwpf.XWPFDocument;
|
||||
import org.apache.poi.xwpf.model.XWPFCommentsDecorator;
|
||||
import org.apache.poi.xwpf.model.XWPFHeaderFooterPolicy;
|
||||
import org.apache.poi.xwpf.model.XWPFHyperlinkDecorator;
|
||||
import org.apache.poi.xwpf.model.XWPFParagraphDecorator;
|
||||
import org.apache.poi.xwpf.usermodel.XWPFParagraph;
|
||||
@ -70,21 +71,46 @@ public class XWPFWordExtractor extends POIXMLTextExtractor {
|
||||
|
||||
public String getText() {
|
||||
StringBuffer text = new StringBuffer();
|
||||
XWPFHeaderFooterPolicy hfPolicy = document.getHeaderFooterPolicy();
|
||||
|
||||
|
||||
// Start out with all headers
|
||||
// TODO - put them in where they're needed
|
||||
if(hfPolicy.getFirstPageHeader() != null) {
|
||||
text.append( hfPolicy.getFirstPageHeader().getText() );
|
||||
}
|
||||
if(hfPolicy.getEvenPageHeader() != null) {
|
||||
text.append( hfPolicy.getEvenPageHeader().getText() );
|
||||
}
|
||||
if(hfPolicy.getDefaultHeader() != null) {
|
||||
text.append( hfPolicy.getDefaultHeader().getText() );
|
||||
}
|
||||
|
||||
// Next up, all our paragraph-based text
|
||||
Iterator<XWPFParagraph> i = document.getParagraphsIterator();
|
||||
while(i.hasNext()) {
|
||||
XWPFParagraphDecorator decorator = new XWPFCommentsDecorator(
|
||||
new XWPFHyperlinkDecorator(i.next(), null, fetchHyperlinks));
|
||||
text.append(decorator.getText()+"\n");
|
||||
}
|
||||
|
||||
|
||||
// Then our table based text
|
||||
Iterator<XWPFTable> j = document.getTablesIterator();
|
||||
while(j.hasNext())
|
||||
{
|
||||
while(j.hasNext()) {
|
||||
text.append(j.next().getText()+"\n");
|
||||
}
|
||||
|
||||
// Finish up with all the footers
|
||||
// TODO - put them in where they're needed
|
||||
if(hfPolicy.getFirstPageFooter() != null) {
|
||||
text.append( hfPolicy.getFirstPageFooter().getText() );
|
||||
}
|
||||
if(hfPolicy.getEvenPageFooter() != null) {
|
||||
text.append( hfPolicy.getEvenPageFooter().getText() );
|
||||
}
|
||||
if(hfPolicy.getDefaultFooter() != null) {
|
||||
text.append( hfPolicy.getDefaultFooter().getText() );
|
||||
}
|
||||
|
||||
return text.toString();
|
||||
}
|
||||
}
|
||||
|
@ -81,15 +81,21 @@ public abstract class XWPFHeaderFooter {
|
||||
XWPFParagraph[] paras = getParagraphs();
|
||||
for(int i=0; i<paras.length; i++) {
|
||||
if(! paras[i].isEmpty()) {
|
||||
t.append(paras[i].getText());
|
||||
t.append('\n');
|
||||
String text = paras[i].getText();
|
||||
if(text != null && text.length() > 0) {
|
||||
t.append(text);
|
||||
t.append('\n');
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
XWPFTable[] tables = getTables();
|
||||
for(int i=0; i<tables.length; i++) {
|
||||
t.append(tables[i].getText());
|
||||
t.append('\n');
|
||||
String text = tables[i].getText();
|
||||
if(text != null && text.length() > 0) {
|
||||
t.append(text);
|
||||
t.append('\n');
|
||||
}
|
||||
}
|
||||
|
||||
return t.toString();
|
||||
|
@ -37,12 +37,22 @@ public class TestXWPFWordExtractor extends TestCase {
|
||||
*/
|
||||
private XWPFDocument xmlB;
|
||||
private File fileB;
|
||||
/**
|
||||
* With a simplish header+footer
|
||||
*/
|
||||
private XWPFDocument xmlC;
|
||||
private File fileC;
|
||||
/**
|
||||
* With a different header+footer on the first page than on the rest
|
||||
*/
|
||||
private XWPFDocument xmlD;
|
||||
private File fileD;
|
||||
|
||||
/**
|
||||
* File with hyperlinks
|
||||
*/
|
||||
private XWPFDocument xmlC;
|
||||
private File fileC;
|
||||
private XWPFDocument xmlE;
|
||||
private File fileE;
|
||||
|
||||
protected void setUp() throws Exception {
|
||||
super.setUp();
|
||||
@ -56,16 +66,28 @@ public class TestXWPFWordExtractor extends TestCase {
|
||||
File.separator + "IllustrativeCases.docx"
|
||||
);
|
||||
fileC = new File(
|
||||
System.getProperty("HWPF.testdata.path") +
|
||||
File.separator + "ThreeColHeadFoot.docx"
|
||||
);
|
||||
fileD = new File(
|
||||
System.getProperty("HWPF.testdata.path") +
|
||||
File.separator + "DiffFirstPageHeadFoot.docx"
|
||||
);
|
||||
fileE = new File(
|
||||
System.getProperty("HWPF.testdata.path") +
|
||||
File.separator + "TestDocument.docx"
|
||||
);
|
||||
assertTrue(fileA.exists());
|
||||
assertTrue(fileB.exists());
|
||||
assertTrue(fileC.exists());
|
||||
assertTrue(fileD.exists());
|
||||
assertTrue(fileE.exists());
|
||||
|
||||
xmlA = new XWPFDocument(POIXMLDocument.openPackage(fileA.toString()));
|
||||
xmlB = new XWPFDocument(POIXMLDocument.openPackage(fileB.toString()));
|
||||
xmlC = new XWPFDocument(POIXMLDocument.openPackage(fileC.toString()));
|
||||
xmlD = new XWPFDocument(POIXMLDocument.openPackage(fileD.toString()));
|
||||
xmlE = new XWPFDocument(POIXMLDocument.openPackage(fileE.toString()));
|
||||
}
|
||||
|
||||
/**
|
||||
@ -135,7 +157,7 @@ public class TestXWPFWordExtractor extends TestCase {
|
||||
|
||||
public void testGetWithHyperlinks() throws Exception {
|
||||
XWPFWordExtractor extractor =
|
||||
new XWPFWordExtractor(xmlC);
|
||||
new XWPFWordExtractor(xmlE);
|
||||
extractor.getText();
|
||||
extractor.setFetchHyperlinks(true);
|
||||
extractor.getText();
|
||||
@ -160,4 +182,47 @@ public class TestXWPFWordExtractor extends TestCase {
|
||||
extractor.getText()
|
||||
);
|
||||
}
|
||||
|
||||
public void testHeadersFooters() throws Exception {
|
||||
XWPFWordExtractor extractor =
|
||||
new XWPFWordExtractor(xmlC);
|
||||
extractor.getText();
|
||||
|
||||
assertEquals(
|
||||
"First header column!\tMid header\tRight header!\n" +
|
||||
"This is a sample word document. It has two pages. It has a three column heading, and a three column footer\n" +
|
||||
"\n" +
|
||||
"HEADING TEXT\n" +
|
||||
"\n" +
|
||||
"More on page one\n" +
|
||||
"\n\n" +
|
||||
"End of page 1\n\n" +
|
||||
"This is page two. It also has a three column heading, and a three column footer.\n" +
|
||||
"Footer Left\tFooter Middle\tFooter Right\n",
|
||||
extractor.getText()
|
||||
);
|
||||
|
||||
|
||||
// Now another file, expect multiple headers
|
||||
// and multiple footers
|
||||
extractor =
|
||||
new XWPFWordExtractor(xmlD);
|
||||
extractor.getText();
|
||||
|
||||
assertEquals(
|
||||
"I am the header on the first page, and I" + '\u2019' + "m nice and simple\n" +
|
||||
"First header column!\tMid header\tRight header!\n" +
|
||||
"This is a sample word document. It has two pages. It has a simple header and footer, which is different to all the other pages.\n" +
|
||||
"\n" +
|
||||
"HEADING TEXT\n" +
|
||||
"\n" +
|
||||
"More on page one\n" +
|
||||
"\n\n" +
|
||||
"End of page 1\n\n" +
|
||||
"This is page two. It also has a three column heading, and a three column footer.\n" +
|
||||
"The footer of the first page\n" +
|
||||
"Footer Left\tFooter Middle\tFooter Right\n",
|
||||
extractor.getText()
|
||||
);
|
||||
}
|
||||
}
|
||||
|
@ -182,12 +182,12 @@ public class TestXWPFHeaderFooterPolicy extends TestCase {
|
||||
policy = oddEven.getHeaderFooterPolicy();
|
||||
|
||||
assertEquals(
|
||||
"\n[]ODD Page Header text\n\n",
|
||||
"[]ODD Page Header text\n\n",
|
||||
policy.getDefaultHeader().getText()
|
||||
);
|
||||
assertEquals(
|
||||
"\n[This is an Even Page, with a Header]\n\n",
|
||||
policy.getEvenPageHeader().getText()
|
||||
"[This is an Even Page, with a Header]\n\n",
|
||||
policy.getEvenPageHeader().getText()
|
||||
);
|
||||
}
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user