From d36f6623f1040fe375361cb8aa2d1fa7786e5eca Mon Sep 17 00:00:00 2001 From: Nick Burch Date: Sat, 9 Aug 2008 15:08:11 +0000 Subject: [PATCH] Have XWPFWordExtractor extract headers and footers git-svn-id: https://svn.apache.org/repos/asf/poi/branches/ooxml@684276 13f79535-47bb-0310-9956-ffa450edef68 --- src/documentation/content/xdocs/changes.xml | 1 + src/documentation/content/xdocs/status.xml | 1 + .../poi/xwpf/extractor/XWPFWordExtractor.java | 34 +++++++-- .../poi/xwpf/usermodel/XWPFHeaderFooter.java | 14 ++-- .../xwpf/extractor/TestXWPFWordExtractor.java | 71 ++++++++++++++++++- .../model/TestXWPFHeaderFooterPolicy.java | 6 +- 6 files changed, 113 insertions(+), 14 deletions(-) diff --git a/src/documentation/content/xdocs/changes.xml b/src/documentation/content/xdocs/changes.xml index f1dd05950..8b9af3340 100644 --- a/src/documentation/content/xdocs/changes.xml +++ b/src/documentation/content/xdocs/changes.xml @@ -37,6 +37,7 @@ + 45539 - Improve XWPFWordExtractor to extract headers and footers Improve how XWPF handles paragraph text Support in XWPF handles headers and footers 45592 - Improve XWPF text extraction to include tables always, and picture text where possible diff --git a/src/documentation/content/xdocs/status.xml b/src/documentation/content/xdocs/status.xml index 60ad6c092..62b1dc4e4 100644 --- a/src/documentation/content/xdocs/status.xml +++ b/src/documentation/content/xdocs/status.xml @@ -34,6 +34,7 @@ + 45539 - Improve XWPFWordExtractor to extract headers and footers Improve how XWPF handles paragraph text Support in XWPF handles headers and footers 45592 - Improve XWPF text extraction to include tables always, and picture text where possible diff --git a/src/ooxml/java/org/apache/poi/xwpf/extractor/XWPFWordExtractor.java b/src/ooxml/java/org/apache/poi/xwpf/extractor/XWPFWordExtractor.java index 64c8e3f78..14031ebc8 100644 --- a/src/ooxml/java/org/apache/poi/xwpf/extractor/XWPFWordExtractor.java +++ b/src/ooxml/java/org/apache/poi/xwpf/extractor/XWPFWordExtractor.java @@ -23,6 +23,7 @@ import org.apache.poi.POIXMLDocument; import org.apache.poi.POIXMLTextExtractor; import org.apache.poi.xwpf.XWPFDocument; import org.apache.poi.xwpf.model.XWPFCommentsDecorator; +import org.apache.poi.xwpf.model.XWPFHeaderFooterPolicy; import org.apache.poi.xwpf.model.XWPFHyperlinkDecorator; import org.apache.poi.xwpf.model.XWPFParagraphDecorator; import org.apache.poi.xwpf.usermodel.XWPFParagraph; @@ -70,21 +71,46 @@ public class XWPFWordExtractor extends POIXMLTextExtractor { public String getText() { StringBuffer text = new StringBuffer(); + XWPFHeaderFooterPolicy hfPolicy = document.getHeaderFooterPolicy(); - + // Start out with all headers + // TODO - put them in where they're needed + if(hfPolicy.getFirstPageHeader() != null) { + text.append( hfPolicy.getFirstPageHeader().getText() ); + } + if(hfPolicy.getEvenPageHeader() != null) { + text.append( hfPolicy.getEvenPageHeader().getText() ); + } + if(hfPolicy.getDefaultHeader() != null) { + text.append( hfPolicy.getDefaultHeader().getText() ); + } + + // First up, all our paragraph based text Iterator i = document.getParagraphsIterator(); while(i.hasNext()) { XWPFParagraphDecorator decorator = new XWPFCommentsDecorator( new XWPFHyperlinkDecorator(i.next(), null, fetchHyperlinks)); text.append(decorator.getText()+"\n"); } - + + // Then our table based text Iterator j = document.getTablesIterator(); - while(j.hasNext()) - { + while(j.hasNext()) { text.append(j.next().getText()+"\n"); } + // Finish up with all the footers + // TODO - put them in where they're needed + if(hfPolicy.getFirstPageFooter() != null) { + text.append( hfPolicy.getFirstPageFooter().getText() ); + } + if(hfPolicy.getEvenPageFooter() != null) { + text.append( hfPolicy.getEvenPageFooter().getText() ); + } + if(hfPolicy.getDefaultFooter() != null) { + text.append( hfPolicy.getDefaultFooter().getText() ); + } + return text.toString(); } } diff --git a/src/ooxml/java/org/apache/poi/xwpf/usermodel/XWPFHeaderFooter.java b/src/ooxml/java/org/apache/poi/xwpf/usermodel/XWPFHeaderFooter.java index 36de22919..708944cf7 100644 --- a/src/ooxml/java/org/apache/poi/xwpf/usermodel/XWPFHeaderFooter.java +++ b/src/ooxml/java/org/apache/poi/xwpf/usermodel/XWPFHeaderFooter.java @@ -81,15 +81,21 @@ public abstract class XWPFHeaderFooter { XWPFParagraph[] paras = getParagraphs(); for(int i=0; i 0) { + t.append(text); + t.append('\n'); + } } } XWPFTable[] tables = getTables(); for(int i=0; i 0) { + t.append(text); + t.append('\n'); + } } return t.toString(); diff --git a/src/ooxml/testcases/org/apache/poi/xwpf/extractor/TestXWPFWordExtractor.java b/src/ooxml/testcases/org/apache/poi/xwpf/extractor/TestXWPFWordExtractor.java index 1b26bb58a..8fc83bc91 100644 --- a/src/ooxml/testcases/org/apache/poi/xwpf/extractor/TestXWPFWordExtractor.java +++ b/src/ooxml/testcases/org/apache/poi/xwpf/extractor/TestXWPFWordExtractor.java @@ -37,12 +37,22 @@ public class TestXWPFWordExtractor extends TestCase { */ private XWPFDocument xmlB; private File fileB; + /** + * With a simplish header+footer + */ + private XWPFDocument xmlC; + private File fileC; + /** + * With different header+footer on first/rest + */ + private XWPFDocument xmlD; + private File fileD; /** * File with hyperlinks */ - private XWPFDocument xmlC; - private File fileC; + private XWPFDocument xmlE; + private File fileE; protected void setUp() throws Exception { super.setUp(); @@ -56,16 +66,28 @@ public class TestXWPFWordExtractor extends TestCase { File.separator + "IllustrativeCases.docx" ); fileC = new File( + System.getProperty("HWPF.testdata.path") + + File.separator + "ThreeColHeadFoot.docx" + ); + fileD = new File( + System.getProperty("HWPF.testdata.path") + + File.separator + "DiffFirstPageHeadFoot.docx" + ); + fileE = new File( System.getProperty("HWPF.testdata.path") + File.separator + "TestDocument.docx" ); assertTrue(fileA.exists()); assertTrue(fileB.exists()); assertTrue(fileC.exists()); + assertTrue(fileD.exists()); + assertTrue(fileE.exists()); xmlA = new XWPFDocument(POIXMLDocument.openPackage(fileA.toString())); xmlB = new XWPFDocument(POIXMLDocument.openPackage(fileB.toString())); xmlC = new XWPFDocument(POIXMLDocument.openPackage(fileC.toString())); + xmlD = new XWPFDocument(POIXMLDocument.openPackage(fileD.toString())); + xmlE = new XWPFDocument(POIXMLDocument.openPackage(fileE.toString())); } /** @@ -135,7 +157,7 @@ public class TestXWPFWordExtractor extends TestCase { public void testGetWithHyperlinks() throws Exception { XWPFWordExtractor extractor = - new XWPFWordExtractor(xmlC); + new XWPFWordExtractor(xmlE); extractor.getText(); extractor.setFetchHyperlinks(true); extractor.getText(); @@ -160,4 +182,47 @@ public class TestXWPFWordExtractor extends TestCase { extractor.getText() ); } + + public void testHeadersFooters() throws Exception { + XWPFWordExtractor extractor = + new XWPFWordExtractor(xmlC); + extractor.getText(); + + assertEquals( + "First header column!\tMid header\tRight header!\n" + + "This is a sample word document. It has two pages. It has a three column heading, and a three column footer\n" + + "\n" + + "HEADING TEXT\n" + + "\n" + + "More on page one\n" + + "\n\n" + + "End of page 1\n\n" + + "This is page two. It also has a three column heading, and a three column footer.\n" + + "Footer Left\tFooter Middle\tFooter Right\n", + extractor.getText() + ); + + + // Now another file, expect multiple headers + // and multiple footers + extractor = + new XWPFWordExtractor(xmlD); + extractor.getText(); + + assertEquals( + "I am the header on the first page, and I" + '\u2019' + "m nice and simple\n" + + "First header column!\tMid header\tRight header!\n" + + "This is a sample word document. It has two pages. It has a simple header and footer, which is different to all the other pages.\n" + + "\n" + + "HEADING TEXT\n" + + "\n" + + "More on page one\n" + + "\n\n" + + "End of page 1\n\n" + + "This is page two. It also has a three column heading, and a three column footer.\n" + + "The footer of the first page\n" + + "Footer Left\tFooter Middle\tFooter Right\n", + extractor.getText() + ); + } } diff --git a/src/ooxml/testcases/org/apache/poi/xwpf/model/TestXWPFHeaderFooterPolicy.java b/src/ooxml/testcases/org/apache/poi/xwpf/model/TestXWPFHeaderFooterPolicy.java index b2269c290..9d0e96a17 100644 --- a/src/ooxml/testcases/org/apache/poi/xwpf/model/TestXWPFHeaderFooterPolicy.java +++ b/src/ooxml/testcases/org/apache/poi/xwpf/model/TestXWPFHeaderFooterPolicy.java @@ -182,12 +182,12 @@ public class TestXWPFHeaderFooterPolicy extends TestCase { policy = oddEven.getHeaderFooterPolicy(); assertEquals( - "\n[]ODD Page Header text\n\n", + "[]ODD Page Header text\n\n", policy.getDefaultHeader().getText() ); assertEquals( - "\n[This is an Even Page, with a Header]\n\n", - policy.getEvenPageHeader().getText() + "[This is an Even Page, with a Header]\n\n", + policy.getEvenPageHeader().getText() ); } }