Have XWPFWordExtractor extract headers and footers
git-svn-id: https://svn.apache.org/repos/asf/poi/branches/ooxml@684276 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
0740316520
commit
d36f6623f1
@ -37,6 +37,7 @@
|
|||||||
|
|
||||||
<!-- Don't forget to update status.xml too! -->
|
<!-- Don't forget to update status.xml too! -->
|
||||||
<release version="3.5.1-beta2" date="2008-??-??">
|
<release version="3.5.1-beta2" date="2008-??-??">
|
||||||
|
<action dev="POI-DEVELOPERS" type="add">45539 - Improve XWPFWordExtractor to extract headers and footers</action>
|
||||||
<action dev="POI-DEVELOPERS" type="fix">Improve how XWPF handles paragraph text</action>
|
<action dev="POI-DEVELOPERS" type="fix">Improve how XWPF handles paragraph text</action>
|
||||||
<action dev="POI-DEVELOPERS" type="add">Support in XWPF for handling headers and footers</action>
|
<action dev="POI-DEVELOPERS" type="add">Support in XWPF for handling headers and footers</action>
|
||||||
<action dev="POI-DEVELOPERS" type="add">45592 - Improve XWPF text extraction to include tables always, and picture text where possible</action>
|
<action dev="POI-DEVELOPERS" type="add">45592 - Improve XWPF text extraction to include tables always, and picture text where possible</action>
|
||||||
|
@ -34,6 +34,7 @@
|
|||||||
<!-- Don't forget to update changes.xml too! -->
|
<!-- Don't forget to update changes.xml too! -->
|
||||||
<changes>
|
<changes>
|
||||||
<release version="3.5.1-beta2" date="2008-??-??">
|
<release version="3.5.1-beta2" date="2008-??-??">
|
||||||
|
<action dev="POI-DEVELOPERS" type="add">45539 - Improve XWPFWordExtractor to extract headers and footers</action>
|
||||||
<action dev="POI-DEVELOPERS" type="fix">Improve how XWPF handles paragraph text</action>
|
<action dev="POI-DEVELOPERS" type="fix">Improve how XWPF handles paragraph text</action>
|
||||||
<action dev="POI-DEVELOPERS" type="add">Support in XWPF for handling headers and footers</action>
|
<action dev="POI-DEVELOPERS" type="add">Support in XWPF for handling headers and footers</action>
|
||||||
<action dev="POI-DEVELOPERS" type="add">45592 - Improve XWPF text extraction to include tables always, and picture text where possible</action>
|
<action dev="POI-DEVELOPERS" type="add">45592 - Improve XWPF text extraction to include tables always, and picture text where possible</action>
|
||||||
|
@ -23,6 +23,7 @@ import org.apache.poi.POIXMLDocument;
|
|||||||
import org.apache.poi.POIXMLTextExtractor;
|
import org.apache.poi.POIXMLTextExtractor;
|
||||||
import org.apache.poi.xwpf.XWPFDocument;
|
import org.apache.poi.xwpf.XWPFDocument;
|
||||||
import org.apache.poi.xwpf.model.XWPFCommentsDecorator;
|
import org.apache.poi.xwpf.model.XWPFCommentsDecorator;
|
||||||
|
import org.apache.poi.xwpf.model.XWPFHeaderFooterPolicy;
|
||||||
import org.apache.poi.xwpf.model.XWPFHyperlinkDecorator;
|
import org.apache.poi.xwpf.model.XWPFHyperlinkDecorator;
|
||||||
import org.apache.poi.xwpf.model.XWPFParagraphDecorator;
|
import org.apache.poi.xwpf.model.XWPFParagraphDecorator;
|
||||||
import org.apache.poi.xwpf.usermodel.XWPFParagraph;
|
import org.apache.poi.xwpf.usermodel.XWPFParagraph;
|
||||||
@ -70,8 +71,21 @@ public class XWPFWordExtractor extends POIXMLTextExtractor {
|
|||||||
|
|
||||||
public String getText() {
|
public String getText() {
|
||||||
StringBuffer text = new StringBuffer();
|
StringBuffer text = new StringBuffer();
|
||||||
|
XWPFHeaderFooterPolicy hfPolicy = document.getHeaderFooterPolicy();
|
||||||
|
|
||||||
|
// Start out with all headers
|
||||||
|
// TODO - put them in where they're needed
|
||||||
|
if(hfPolicy.getFirstPageHeader() != null) {
|
||||||
|
text.append( hfPolicy.getFirstPageHeader().getText() );
|
||||||
|
}
|
||||||
|
if(hfPolicy.getEvenPageHeader() != null) {
|
||||||
|
text.append( hfPolicy.getEvenPageHeader().getText() );
|
||||||
|
}
|
||||||
|
if(hfPolicy.getDefaultHeader() != null) {
|
||||||
|
text.append( hfPolicy.getDefaultHeader().getText() );
|
||||||
|
}
|
||||||
|
|
||||||
|
// Next, all our paragraph-based text (the headers have already been added above)
|
||||||
Iterator<XWPFParagraph> i = document.getParagraphsIterator();
|
Iterator<XWPFParagraph> i = document.getParagraphsIterator();
|
||||||
while(i.hasNext()) {
|
while(i.hasNext()) {
|
||||||
XWPFParagraphDecorator decorator = new XWPFCommentsDecorator(
|
XWPFParagraphDecorator decorator = new XWPFCommentsDecorator(
|
||||||
@ -79,12 +93,24 @@ public class XWPFWordExtractor extends POIXMLTextExtractor {
|
|||||||
text.append(decorator.getText()+"\n");
|
text.append(decorator.getText()+"\n");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Then our table based text
|
||||||
Iterator<XWPFTable> j = document.getTablesIterator();
|
Iterator<XWPFTable> j = document.getTablesIterator();
|
||||||
while(j.hasNext())
|
while(j.hasNext()) {
|
||||||
{
|
|
||||||
text.append(j.next().getText()+"\n");
|
text.append(j.next().getText()+"\n");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Finish up with all the footers
|
||||||
|
// TODO - put them in where they're needed
|
||||||
|
if(hfPolicy.getFirstPageFooter() != null) {
|
||||||
|
text.append( hfPolicy.getFirstPageFooter().getText() );
|
||||||
|
}
|
||||||
|
if(hfPolicy.getEvenPageFooter() != null) {
|
||||||
|
text.append( hfPolicy.getEvenPageFooter().getText() );
|
||||||
|
}
|
||||||
|
if(hfPolicy.getDefaultFooter() != null) {
|
||||||
|
text.append( hfPolicy.getDefaultFooter().getText() );
|
||||||
|
}
|
||||||
|
|
||||||
return text.toString();
|
return text.toString();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -81,16 +81,22 @@ public abstract class XWPFHeaderFooter {
|
|||||||
XWPFParagraph[] paras = getParagraphs();
|
XWPFParagraph[] paras = getParagraphs();
|
||||||
for(int i=0; i<paras.length; i++) {
|
for(int i=0; i<paras.length; i++) {
|
||||||
if(! paras[i].isEmpty()) {
|
if(! paras[i].isEmpty()) {
|
||||||
t.append(paras[i].getText());
|
String text = paras[i].getText();
|
||||||
|
if(text != null && text.length() > 0) {
|
||||||
|
t.append(text);
|
||||||
t.append('\n');
|
t.append('\n');
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
XWPFTable[] tables = getTables();
|
XWPFTable[] tables = getTables();
|
||||||
for(int i=0; i<tables.length; i++) {
|
for(int i=0; i<tables.length; i++) {
|
||||||
t.append(tables[i].getText());
|
String text = tables[i].getText();
|
||||||
|
if(text != null && text.length() > 0) {
|
||||||
|
t.append(text);
|
||||||
t.append('\n');
|
t.append('\n');
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
return t.toString();
|
return t.toString();
|
||||||
}
|
}
|
||||||
|
@ -37,12 +37,22 @@ public class TestXWPFWordExtractor extends TestCase {
|
|||||||
*/
|
*/
|
||||||
private XWPFDocument xmlB;
|
private XWPFDocument xmlB;
|
||||||
private File fileB;
|
private File fileB;
|
||||||
|
/**
|
||||||
|
* With a simplish header+footer
|
||||||
|
*/
|
||||||
|
private XWPFDocument xmlC;
|
||||||
|
private File fileC;
|
||||||
|
/**
|
||||||
|
* With different header+footer on first/rest
|
||||||
|
*/
|
||||||
|
private XWPFDocument xmlD;
|
||||||
|
private File fileD;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* File with hyperlinks
|
* File with hyperlinks
|
||||||
*/
|
*/
|
||||||
private XWPFDocument xmlC;
|
private XWPFDocument xmlE;
|
||||||
private File fileC;
|
private File fileE;
|
||||||
|
|
||||||
protected void setUp() throws Exception {
|
protected void setUp() throws Exception {
|
||||||
super.setUp();
|
super.setUp();
|
||||||
@ -56,16 +66,28 @@ public class TestXWPFWordExtractor extends TestCase {
|
|||||||
File.separator + "IllustrativeCases.docx"
|
File.separator + "IllustrativeCases.docx"
|
||||||
);
|
);
|
||||||
fileC = new File(
|
fileC = new File(
|
||||||
|
System.getProperty("HWPF.testdata.path") +
|
||||||
|
File.separator + "ThreeColHeadFoot.docx"
|
||||||
|
);
|
||||||
|
fileD = new File(
|
||||||
|
System.getProperty("HWPF.testdata.path") +
|
||||||
|
File.separator + "DiffFirstPageHeadFoot.docx"
|
||||||
|
);
|
||||||
|
fileE = new File(
|
||||||
System.getProperty("HWPF.testdata.path") +
|
System.getProperty("HWPF.testdata.path") +
|
||||||
File.separator + "TestDocument.docx"
|
File.separator + "TestDocument.docx"
|
||||||
);
|
);
|
||||||
assertTrue(fileA.exists());
|
assertTrue(fileA.exists());
|
||||||
assertTrue(fileB.exists());
|
assertTrue(fileB.exists());
|
||||||
assertTrue(fileC.exists());
|
assertTrue(fileC.exists());
|
||||||
|
assertTrue(fileD.exists());
|
||||||
|
assertTrue(fileE.exists());
|
||||||
|
|
||||||
xmlA = new XWPFDocument(POIXMLDocument.openPackage(fileA.toString()));
|
xmlA = new XWPFDocument(POIXMLDocument.openPackage(fileA.toString()));
|
||||||
xmlB = new XWPFDocument(POIXMLDocument.openPackage(fileB.toString()));
|
xmlB = new XWPFDocument(POIXMLDocument.openPackage(fileB.toString()));
|
||||||
xmlC = new XWPFDocument(POIXMLDocument.openPackage(fileC.toString()));
|
xmlC = new XWPFDocument(POIXMLDocument.openPackage(fileC.toString()));
|
||||||
|
xmlD = new XWPFDocument(POIXMLDocument.openPackage(fileD.toString()));
|
||||||
|
xmlE = new XWPFDocument(POIXMLDocument.openPackage(fileE.toString()));
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@ -135,7 +157,7 @@ public class TestXWPFWordExtractor extends TestCase {
|
|||||||
|
|
||||||
public void testGetWithHyperlinks() throws Exception {
|
public void testGetWithHyperlinks() throws Exception {
|
||||||
XWPFWordExtractor extractor =
|
XWPFWordExtractor extractor =
|
||||||
new XWPFWordExtractor(xmlC);
|
new XWPFWordExtractor(xmlE);
|
||||||
extractor.getText();
|
extractor.getText();
|
||||||
extractor.setFetchHyperlinks(true);
|
extractor.setFetchHyperlinks(true);
|
||||||
extractor.getText();
|
extractor.getText();
|
||||||
@ -160,4 +182,47 @@ public class TestXWPFWordExtractor extends TestCase {
|
|||||||
extractor.getText()
|
extractor.getText()
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public void testHeadersFooters() throws Exception {
|
||||||
|
XWPFWordExtractor extractor =
|
||||||
|
new XWPFWordExtractor(xmlC);
|
||||||
|
extractor.getText();
|
||||||
|
|
||||||
|
assertEquals(
|
||||||
|
"First header column!\tMid header\tRight header!\n" +
|
||||||
|
"This is a sample word document. It has two pages. It has a three column heading, and a three column footer\n" +
|
||||||
|
"\n" +
|
||||||
|
"HEADING TEXT\n" +
|
||||||
|
"\n" +
|
||||||
|
"More on page one\n" +
|
||||||
|
"\n\n" +
|
||||||
|
"End of page 1\n\n" +
|
||||||
|
"This is page two. It also has a three column heading, and a three column footer.\n" +
|
||||||
|
"Footer Left\tFooter Middle\tFooter Right\n",
|
||||||
|
extractor.getText()
|
||||||
|
);
|
||||||
|
|
||||||
|
|
||||||
|
// Now another file, expect multiple headers
|
||||||
|
// and multiple footers
|
||||||
|
extractor =
|
||||||
|
new XWPFWordExtractor(xmlD);
|
||||||
|
extractor.getText();
|
||||||
|
|
||||||
|
assertEquals(
|
||||||
|
"I am the header on the first page, and I" + '\u2019' + "m nice and simple\n" +
|
||||||
|
"First header column!\tMid header\tRight header!\n" +
|
||||||
|
"This is a sample word document. It has two pages. It has a simple header and footer, which is different to all the other pages.\n" +
|
||||||
|
"\n" +
|
||||||
|
"HEADING TEXT\n" +
|
||||||
|
"\n" +
|
||||||
|
"More on page one\n" +
|
||||||
|
"\n\n" +
|
||||||
|
"End of page 1\n\n" +
|
||||||
|
"This is page two. It also has a three column heading, and a three column footer.\n" +
|
||||||
|
"The footer of the first page\n" +
|
||||||
|
"Footer Left\tFooter Middle\tFooter Right\n",
|
||||||
|
extractor.getText()
|
||||||
|
);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
@ -182,11 +182,11 @@ public class TestXWPFHeaderFooterPolicy extends TestCase {
|
|||||||
policy = oddEven.getHeaderFooterPolicy();
|
policy = oddEven.getHeaderFooterPolicy();
|
||||||
|
|
||||||
assertEquals(
|
assertEquals(
|
||||||
"\n[]ODD Page Header text\n\n",
|
"[]ODD Page Header text\n\n",
|
||||||
policy.getDefaultHeader().getText()
|
policy.getDefaultHeader().getText()
|
||||||
);
|
);
|
||||||
assertEquals(
|
assertEquals(
|
||||||
"\n[This is an Even Page, with a Header]\n\n",
|
"[This is an Even Page, with a Header]\n\n",
|
||||||
policy.getEvenPageHeader().getText()
|
policy.getEvenPageHeader().getText()
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user