Fix for extracting paragraphs and sections from headers/footers with XWPFWordExtractor, see Bugzilla 47727

git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@809662 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Yegor Kozlov 2009-08-31 17:02:06 +00:00
parent f4f823b94c
commit 20ea9dce10
5 changed files with 81 additions and 30 deletions

View File

@ -33,7 +33,8 @@
<changes> <changes>
<release version="3.5-beta7" date="2009-??-??"> <release version="3.5-beta7" date="2009-??-??">
<action dev="POI-DEVELOPERS" type="fix">47773 - Support for extraction of header / footer images in HWPF</action> <action dev="POI-DEVELOPERS" type="fix">47773 - Fix for extraction paragraphs and sections from headers/footers with XWPFWordExtractor</action>
<action dev="POI-DEVELOPERS" type="fix">47727 - Support for extraction of header / footer images in HWPF</action>
<action dev="POI-DEVELOPERS" type="fix">moved all test data to a top-level directory</action> <action dev="POI-DEVELOPERS" type="fix">moved all test data to a top-level directory</action>
<action dev="POI-DEVELOPERS" type="add">47721 - Added implementation for INDIRECT()</action> <action dev="POI-DEVELOPERS" type="add">47721 - Added implementation for INDIRECT()</action>
<action dev="POI-DEVELOPERS" type="add">45583 - Avoid exception when reading ClipboardData packet in OLE property sets</action> <action dev="POI-DEVELOPERS" type="add">45583 - Avoid exception when reading ClipboardData packet in OLE property sets</action>

View File

@ -21,6 +21,7 @@ import java.util.Iterator;
import org.apache.poi.POIXMLDocument; import org.apache.poi.POIXMLDocument;
import org.apache.poi.POIXMLTextExtractor; import org.apache.poi.POIXMLTextExtractor;
import org.apache.poi.POIXMLException;
import org.apache.poi.openxml4j.exceptions.OpenXML4JException; import org.apache.poi.openxml4j.exceptions.OpenXML4JException;
import org.apache.poi.openxml4j.opc.OPCPackage; import org.apache.poi.openxml4j.opc.OPCPackage;
import org.apache.poi.xwpf.model.XWPFCommentsDecorator; import org.apache.poi.xwpf.model.XWPFCommentsDecorator;
@ -31,6 +32,7 @@ import org.apache.poi.xwpf.usermodel.XWPFDocument;
import org.apache.poi.xwpf.usermodel.XWPFParagraph; import org.apache.poi.xwpf.usermodel.XWPFParagraph;
import org.apache.poi.xwpf.usermodel.XWPFTable; import org.apache.poi.xwpf.usermodel.XWPFTable;
import org.apache.xmlbeans.XmlException; import org.apache.xmlbeans.XmlException;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTSectPr;
/** /**
* Helper class to extract text from an OOXML Word file * Helper class to extract text from an OOXML Word file
@ -74,43 +76,75 @@ public class XWPFWordExtractor extends POIXMLTextExtractor {
XWPFHeaderFooterPolicy hfPolicy = document.getHeaderFooterPolicy(); XWPFHeaderFooterPolicy hfPolicy = document.getHeaderFooterPolicy();
// Start out with all headers // Start out with all headers
// TODO - put them in where they're needed extractHeaders(text, hfPolicy);
if(hfPolicy.getFirstPageHeader() != null) {
text.append( hfPolicy.getFirstPageHeader().getText() );
}
if(hfPolicy.getEvenPageHeader() != null) {
text.append( hfPolicy.getEvenPageHeader().getText() );
}
if(hfPolicy.getDefaultHeader() != null) {
text.append( hfPolicy.getDefaultHeader().getText() );
}
// First up, all our paragraph based text // First up, all our paragraph based text
Iterator<XWPFParagraph> i = document.getParagraphsIterator(); Iterator<XWPFParagraph> i = document.getParagraphsIterator();
while(i.hasNext()) { while(i.hasNext()) {
XWPFParagraphDecorator decorator = new XWPFCommentsDecorator( XWPFParagraph paragraph = i.next();
new XWPFHyperlinkDecorator(i.next(), null, fetchHyperlinks));
text.append(decorator.getText()+"\n");
} try {
CTSectPr ctSectPr = null;
if (paragraph.getCTP().getPPr()!=null) {
ctSectPr = paragraph.getCTP().getPPr().getSectPr();
}
XWPFHeaderFooterPolicy headerFooterPolicy = null;
if (ctSectPr!=null) {
headerFooterPolicy = new XWPFHeaderFooterPolicy(document, ctSectPr);
extractHeaders(text, headerFooterPolicy);
}
XWPFParagraphDecorator decorator = new XWPFCommentsDecorator(
new XWPFHyperlinkDecorator(paragraph, null, fetchHyperlinks));
text.append(decorator.getText()).append('\n');
if (ctSectPr!=null) {
extractFooters(text, headerFooterPolicy);
}
} catch (IOException e) {
throw new POIXMLException(e);
} catch (XmlException e) {
throw new POIXMLException(e);
}
}
// Then our table based text // Then our table based text
Iterator<XWPFTable> j = document.getTablesIterator(); Iterator<XWPFTable> j = document.getTablesIterator();
while(j.hasNext()) { while(j.hasNext()) {
text.append(j.next().getText()+"\n"); text.append(j.next().getText()).append('\n');
} }
// Finish up with all the footers // Finish up with all the footers
// TODO - put them in where they're needed extractFooters(text, hfPolicy);
if(hfPolicy.getFirstPageFooter() != null) {
text.append( hfPolicy.getFirstPageFooter().getText() );
}
if(hfPolicy.getEvenPageFooter() != null) {
text.append( hfPolicy.getEvenPageFooter().getText() );
}
if(hfPolicy.getDefaultFooter() != null) {
text.append( hfPolicy.getDefaultFooter().getText() );
}
return text.toString(); return text.toString();
} }
/**
 * Appends the text of the footers defined by the given policy to the buffer.
 * The first-page, even-page and default footers are checked in that order,
 * and any footer the policy returns as <code>null</code> is skipped.
 *
 * @param text     buffer that receives the footer text
 * @param hfPolicy policy to read the footers from; it is dereferenced
 *                 without a null check
 */
private void extractFooters(StringBuffer text, XWPFHeaderFooterPolicy hfPolicy) {
if(hfPolicy.getFirstPageFooter() != null) {
text.append( hfPolicy.getFirstPageFooter().getText() );
}
if(hfPolicy.getEvenPageFooter() != null) {
text.append( hfPolicy.getEvenPageFooter().getText() );
}
if(hfPolicy.getDefaultFooter() != null) {
text.append( hfPolicy.getDefaultFooter().getText() );
}
}
/**
 * Appends the text of the headers defined by the given policy to the buffer.
 * The first-page, even-page and default headers are checked in that order,
 * and any header the policy returns as <code>null</code> is skipped.
 *
 * @param text     buffer that receives the header text
 * @param hfPolicy policy to read the headers from; it is dereferenced
 *                 without a null check
 */
private void extractHeaders(StringBuffer text, XWPFHeaderFooterPolicy hfPolicy) {
if(hfPolicy.getFirstPageHeader() != null) {
text.append( hfPolicy.getFirstPageHeader().getText() );
}
if(hfPolicy.getEvenPageHeader() != null) {
text.append( hfPolicy.getEvenPageHeader().getText() );
}
if(hfPolicy.getDefaultHeader() != null) {
text.append( hfPolicy.getDefaultHeader().getText() );
}
}
} }

View File

@ -83,19 +83,26 @@ public class XWPFHeaderFooterPolicy {
private XWPFHeader defaultHeader; private XWPFHeader defaultHeader;
private XWPFFooter defaultFooter; private XWPFFooter defaultFooter;
/**
* Figures out the policy for the given document,
* and creates any header and footer objects
* as required.
* <p>
* Uses the section properties found on the document body
* (<code>doc.getDocument().getBody().getSectPr()</code>) and
* delegates to the constructor that takes an explicit
* <code>CTSectPr</code>.
*
* @param doc the document whose body-level section properties are read
*/
public XWPFHeaderFooterPolicy(XWPFDocument doc) throws IOException, XmlException {
this(doc, doc.getDocument().getBody().getSectPr());
}
/** /**
* Figures out the policy for the given document, * Figures out the policy for the given document,
* and creates any header and footer objects * and creates any header and footer objects
* as required. * as required.
*/ */
public XWPFHeaderFooterPolicy(XWPFDocument doc) throws IOException, XmlException { public XWPFHeaderFooterPolicy(XWPFDocument doc, CTSectPr sectPr) throws IOException, XmlException {
// Grab what headers and footers have been defined // Grab what headers and footers have been defined
// For now, we don't care about different ranges, as it // For now, we don't care about different ranges, as it
// doesn't seem that .docx properly supports that // doesn't seem that .docx properly supports that
// feature of the file format yet // feature of the file format yet
this.doc = doc; this.doc = doc;
CTSectPr sectPr = doc.getDocument().getBody().getSectPr();
for(int i=0; i<sectPr.sizeOfHeaderReferenceArray(); i++) { for(int i=0; i<sectPr.sizeOfHeaderReferenceArray(); i++) {
// Get the header // Get the header
CTHdrFtrRef ref = sectPr.getHeaderReferenceArray(i); CTHdrFtrRef ref = sectPr.getHeaderReferenceArray(i);

View File

@ -198,4 +198,13 @@ public class TestXWPFWordExtractor extends TestCase {
assertTrue(extractor.getText().contains("extremely well")); assertTrue(extractor.getText().contains("extremely well"));
} }
/**
 * Checks that the text extracted from the "Headers.docx" sample contains
 * each of the markers "Section 1", "Section 2" and "Section 3".
 */
public void testParagraphHeader() {
    XWPFWordExtractor extractor =
            new XWPFWordExtractor(XWPFTestDataSamples.openSampleDocument("Headers.docx"));

    // The text is re-extracted for every marker, in the same order as before.
    String[] expectedMarkers = { "Section 1", "Section 2", "Section 3" };
    for (String marker : expectedMarkers) {
        assertTrue(extractor.getText().contains(marker));
    }
}
} }

BIN
test-data/document/Headers.docx Executable file

Binary file not shown.