Add header/footer support to HWPF WordExtractor
git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@684362 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
24131c58f5
commit
09fb97524b
@ -37,6 +37,9 @@
|
|||||||
|
|
||||||
<!-- Don't forget to update status.xml too! -->
|
<!-- Don't forget to update status.xml too! -->
|
||||||
<release version="3.1.1-alpha1" date="2008-??-??">
|
<release version="3.1.1-alpha1" date="2008-??-??">
|
||||||
|
<action dev="POI-DEVELOPERS" type="add">Include headers and footers in the extracted text from HWPF's WordExtractor</action>
|
||||||
|
<action dev="POI-DEVELOPERS" type="add">Added support to HWPF for headers and footers</action>
|
||||||
|
<action dev="POI-DEVELOPERS" type="fix">Improve how HWPF deals with unicode internally. Should avoid some odd behaviour when manipulating unicode text</action>
|
||||||
<action dev="POI-DEVELOPERS" type="add">45577 - Added implementations for Excel functions NOW and TODAY</action>
|
<action dev="POI-DEVELOPERS" type="add">45577 - Added implementations for Excel functions NOW and TODAY</action>
|
||||||
<action dev="POI-DEVELOPERS" type="fix">45582 - Fix for workbook streams with extra bytes trailing the EOFRecord</action>
|
<action dev="POI-DEVELOPERS" type="fix">45582 - Fix for workbook streams with extra bytes trailing the EOFRecord</action>
|
||||||
<action dev="POI-DEVELOPERS" type="add">45537 - Include headers and footers (of slides and notes) in the extracted text from HSLF</action>
|
<action dev="POI-DEVELOPERS" type="add">45537 - Include headers and footers (of slides and notes) in the extracted text from HSLF</action>
|
||||||
|
@ -34,6 +34,9 @@
|
|||||||
<!-- Don't forget to update changes.xml too! -->
|
<!-- Don't forget to update changes.xml too! -->
|
||||||
<changes>
|
<changes>
|
||||||
<release version="3.1.1-alpha1" date="2008-??-??">
|
<release version="3.1.1-alpha1" date="2008-??-??">
|
||||||
|
<action dev="POI-DEVELOPERS" type="add">Include headers and footers in the extracted text from HWPF's WordExtractor</action>
|
||||||
|
<action dev="POI-DEVELOPERS" type="add">Added support to HWPF for headers and footers</action>
|
||||||
|
<action dev="POI-DEVELOPERS" type="fix">Improve how HWPF deals with unicode internally. Should avoid some odd behaviour when manipulating unicode text</action>
|
||||||
<action dev="POI-DEVELOPERS" type="add">45577 - Added implementations for Excel functions NOW and TODAY</action>
|
<action dev="POI-DEVELOPERS" type="add">45577 - Added implementations for Excel functions NOW and TODAY</action>
|
||||||
<action dev="POI-DEVELOPERS" type="fix">45582 - Fix for workbook streams with extra bytes trailing the EOFRecord</action>
|
<action dev="POI-DEVELOPERS" type="fix">45582 - Fix for workbook streams with extra bytes trailing the EOFRecord</action>
|
||||||
<action dev="POI-DEVELOPERS" type="add">45537 - Include headers and footers (of slides and notes) in the extracted text from HSLF</action>
|
<action dev="POI-DEVELOPERS" type="add">45537 - Include headers and footers (of slides and notes) in the extracted text from HSLF</action>
|
||||||
|
@ -25,6 +25,7 @@ import java.util.Iterator;
|
|||||||
import org.apache.poi.POIOLE2TextExtractor;
|
import org.apache.poi.POIOLE2TextExtractor;
|
||||||
import org.apache.poi.hwpf.HWPFDocument;
|
import org.apache.poi.hwpf.HWPFDocument;
|
||||||
import org.apache.poi.hwpf.model.TextPiece;
|
import org.apache.poi.hwpf.model.TextPiece;
|
||||||
|
import org.apache.poi.hwpf.usermodel.HeaderStories;
|
||||||
import org.apache.poi.hwpf.usermodel.Paragraph;
|
import org.apache.poi.hwpf.usermodel.Paragraph;
|
||||||
import org.apache.poi.hwpf.usermodel.Range;
|
import org.apache.poi.hwpf.usermodel.Range;
|
||||||
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
|
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
|
||||||
@ -115,6 +116,65 @@ public class WordExtractor extends POIOLE2TextExtractor {
|
|||||||
return ret;
|
return ret;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Add the header/footer text, if it's not empty
|
||||||
|
*/
|
||||||
|
private void appendHeaderFooter(String text, StringBuffer out) {
|
||||||
|
if(text == null || text.length() == 0)
|
||||||
|
return;
|
||||||
|
|
||||||
|
text = text.replace('\r', '\n');
|
||||||
|
if(! text.endsWith("\n")) {
|
||||||
|
out.append(text);
|
||||||
|
out.append('\n');
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
if(text.endsWith("\n\n")) {
|
||||||
|
out.append(text.substring(0, text.length()-1));
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
out.append(text);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
/**
|
||||||
|
* Grab the text from the headers
|
||||||
|
*/
|
||||||
|
public String getHeaderText() {
|
||||||
|
HeaderStories hs = new HeaderStories(doc);
|
||||||
|
|
||||||
|
StringBuffer ret = new StringBuffer();
|
||||||
|
if(hs.getFirstHeader() != null) {
|
||||||
|
appendHeaderFooter(hs.getFirstHeader(), ret);
|
||||||
|
}
|
||||||
|
if(hs.getEvenHeader() != null) {
|
||||||
|
appendHeaderFooter(hs.getEvenHeader(), ret);
|
||||||
|
}
|
||||||
|
if(hs.getOddHeader() != null) {
|
||||||
|
appendHeaderFooter(hs.getOddHeader(), ret);
|
||||||
|
}
|
||||||
|
|
||||||
|
return ret.toString();
|
||||||
|
}
|
||||||
|
/**
|
||||||
|
* Grab the text from the footers
|
||||||
|
*/
|
||||||
|
public String getFooterText() {
|
||||||
|
HeaderStories hs = new HeaderStories(doc);
|
||||||
|
|
||||||
|
StringBuffer ret = new StringBuffer();
|
||||||
|
if(hs.getFirstFooter() != null) {
|
||||||
|
appendHeaderFooter(hs.getFirstFooter(), ret);
|
||||||
|
}
|
||||||
|
if(hs.getEvenFooter() != null) {
|
||||||
|
appendHeaderFooter(hs.getEvenFooter(), ret);
|
||||||
|
}
|
||||||
|
if(hs.getOddFooter() != null) {
|
||||||
|
appendHeaderFooter(hs.getOddFooter(), ret);
|
||||||
|
}
|
||||||
|
|
||||||
|
return ret.toString();
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Grab the text out of the text pieces. Might also include various
|
* Grab the text out of the text pieces. Might also include various
|
||||||
* bits of crud, but will work in cases where the text piece -> paragraph
|
* bits of crud, but will work in cases where the text piece -> paragraph
|
||||||
@ -158,10 +218,16 @@ public class WordExtractor extends POIOLE2TextExtractor {
|
|||||||
*/
|
*/
|
||||||
public String getText() {
|
public String getText() {
|
||||||
StringBuffer ret = new StringBuffer();
|
StringBuffer ret = new StringBuffer();
|
||||||
|
|
||||||
|
ret.append(getHeaderText());
|
||||||
|
|
||||||
String[] text = getParagraphText();
|
String[] text = getParagraphText();
|
||||||
for(int i=0; i<text.length; i++) {
|
for(int i=0; i<text.length; i++) {
|
||||||
ret.append(text[i]);
|
ret.append(text[i]);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
ret.append(getFooterText());
|
||||||
|
|
||||||
return ret.toString();
|
return ret.toString();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -55,6 +55,11 @@ public class TestWordExtractor extends TestCase {
|
|||||||
// A word doc embedded in an excel file
|
// A word doc embedded in an excel file
|
||||||
private String filename3;
|
private String filename3;
|
||||||
|
|
||||||
|
// With header and footer
|
||||||
|
private String filename4;
|
||||||
|
// With unicode header and footer
|
||||||
|
private String filename5;
|
||||||
|
|
||||||
protected void setUp() throws Exception {
|
protected void setUp() throws Exception {
|
||||||
String dirname = System.getProperty("HWPF.testdata.path");
|
String dirname = System.getProperty("HWPF.testdata.path");
|
||||||
String pdirname = System.getProperty("POIFS.testdata.path");
|
String pdirname = System.getProperty("POIFS.testdata.path");
|
||||||
@ -62,6 +67,9 @@ public class TestWordExtractor extends TestCase {
|
|||||||
String filename = dirname + "/test2.doc";
|
String filename = dirname + "/test2.doc";
|
||||||
String filename2 = dirname + "/test.doc";
|
String filename2 = dirname + "/test.doc";
|
||||||
filename3 = pdirname + "/excel_with_embeded.xls";
|
filename3 = pdirname + "/excel_with_embeded.xls";
|
||||||
|
filename4 = dirname + "/ThreeColHeadFoot.doc";
|
||||||
|
filename5 = dirname + "/HeaderFooterUnicode.doc";
|
||||||
|
|
||||||
extractor = new WordExtractor(new FileInputStream(filename));
|
extractor = new WordExtractor(new FileInputStream(filename));
|
||||||
extractor2 = new WordExtractor(new FileInputStream(filename2));
|
extractor2 = new WordExtractor(new FileInputStream(filename2));
|
||||||
|
|
||||||
@ -149,4 +157,72 @@ public class TestWordExtractor extends TestCase {
|
|||||||
assertEquals("Sample Doc 2", extractor3.getSummaryInformation().getTitle());
|
assertEquals("Sample Doc 2", extractor3.getSummaryInformation().getTitle());
|
||||||
assertEquals("Another Sample Test", extractor3.getSummaryInformation().getSubject());
|
assertEquals("Another Sample Test", extractor3.getSummaryInformation().getSubject());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public void testWithHeader() throws Exception {
|
||||||
|
// Non-unicode
|
||||||
|
HWPFDocument doc = new HWPFDocument(
|
||||||
|
new FileInputStream(filename4)
|
||||||
|
);
|
||||||
|
extractor = new WordExtractor(doc);
|
||||||
|
|
||||||
|
assertEquals(
|
||||||
|
"First header column!\tMid header Right header!\n",
|
||||||
|
extractor.getHeaderText()
|
||||||
|
);
|
||||||
|
|
||||||
|
String text = extractor.getText();
|
||||||
|
assertTrue(
|
||||||
|
text.indexOf("First header column!") > -1
|
||||||
|
);
|
||||||
|
|
||||||
|
|
||||||
|
// Unicode
|
||||||
|
doc = new HWPFDocument(
|
||||||
|
new FileInputStream(filename5)
|
||||||
|
);
|
||||||
|
extractor = new WordExtractor(doc);
|
||||||
|
|
||||||
|
assertEquals(
|
||||||
|
"\n\nThis is a simple header, with a \u20ac euro symbol in it.\n\n",
|
||||||
|
extractor.getHeaderText()
|
||||||
|
);
|
||||||
|
text = extractor.getText();
|
||||||
|
assertTrue(
|
||||||
|
text.indexOf("This is a simple header") > -1
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testWithFooter() throws Exception {
|
||||||
|
// Non-unicode
|
||||||
|
HWPFDocument doc = new HWPFDocument(
|
||||||
|
new FileInputStream(filename4)
|
||||||
|
);
|
||||||
|
extractor = new WordExtractor(doc);
|
||||||
|
|
||||||
|
assertEquals(
|
||||||
|
"Footer Left\tFooter Middle Footer Right\n",
|
||||||
|
extractor.getFooterText()
|
||||||
|
);
|
||||||
|
|
||||||
|
String text = extractor.getText();
|
||||||
|
assertTrue(
|
||||||
|
text.indexOf("Footer Left") > -1
|
||||||
|
);
|
||||||
|
|
||||||
|
|
||||||
|
// Unicode
|
||||||
|
doc = new HWPFDocument(
|
||||||
|
new FileInputStream(filename5)
|
||||||
|
);
|
||||||
|
extractor = new WordExtractor(doc);
|
||||||
|
|
||||||
|
assertEquals(
|
||||||
|
"\n\nThe footer, with Moli\u00e8re, has Unicode in it.\n",
|
||||||
|
extractor.getFooterText()
|
||||||
|
);
|
||||||
|
text = extractor.getText();
|
||||||
|
assertTrue(
|
||||||
|
text.indexOf("The footer, with") > -1
|
||||||
|
);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user