diff --git a/src/documentation/content/xdocs/status.xml b/src/documentation/content/xdocs/status.xml index a27f2971d..019186771 100644 --- a/src/documentation/content/xdocs/status.xml +++ b/src/documentation/content/xdocs/status.xml @@ -33,7 +33,8 @@ - 47773 - Support for extraction of header / footer images in HWPF + 47773 - Fix for extraction paragraphs and sections from headers/footers with XWPFWordExtractor + 47727 - Support for extraction of header / footer images in HWPF moved all test data to a top-level directory 47721 - Added implementation for INDIRECT() 45583 - Avoid exception when reading ClipboardData packet in OLE property sets diff --git a/src/ooxml/java/org/apache/poi/xwpf/extractor/XWPFWordExtractor.java b/src/ooxml/java/org/apache/poi/xwpf/extractor/XWPFWordExtractor.java index 63059653f..2c604b60e 100644 --- a/src/ooxml/java/org/apache/poi/xwpf/extractor/XWPFWordExtractor.java +++ b/src/ooxml/java/org/apache/poi/xwpf/extractor/XWPFWordExtractor.java @@ -21,6 +21,7 @@ import java.util.Iterator; import org.apache.poi.POIXMLDocument; import org.apache.poi.POIXMLTextExtractor; +import org.apache.poi.POIXMLException; import org.apache.poi.openxml4j.exceptions.OpenXML4JException; import org.apache.poi.openxml4j.opc.OPCPackage; import org.apache.poi.xwpf.model.XWPFCommentsDecorator; @@ -31,6 +32,7 @@ import org.apache.poi.xwpf.usermodel.XWPFDocument; import org.apache.poi.xwpf.usermodel.XWPFParagraph; import org.apache.poi.xwpf.usermodel.XWPFTable; import org.apache.xmlbeans.XmlException; +import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTSectPr; /** * Helper class to extract text from an OOXML Word file @@ -72,45 +74,77 @@ public class XWPFWordExtractor extends POIXMLTextExtractor { public String getText() { StringBuffer text = new StringBuffer(); XWPFHeaderFooterPolicy hfPolicy = document.getHeaderFooterPolicy(); - + // Start out with all headers - // TODO - put them in where they're needed - if(hfPolicy.getFirstPageHeader() != null) { - text.append( hfPolicy.getFirstPageHeader().getText() ); - } - if(hfPolicy.getEvenPageHeader() != null) { - text.append( hfPolicy.getEvenPageHeader().getText() ); - } - if(hfPolicy.getDefaultHeader() != null) { - text.append( hfPolicy.getDefaultHeader().getText() ); - } + extractHeaders(text, hfPolicy); // First up, all our paragraph based text Iterator i = document.getParagraphsIterator(); while(i.hasNext()) { - XWPFParagraphDecorator decorator = new XWPFCommentsDecorator( - new XWPFHyperlinkDecorator(i.next(), null, fetchHyperlinks)); - text.append(decorator.getText()+"\n"); - } + XWPFParagraph paragraph = i.next(); + + + try { + CTSectPr ctSectPr = null; + if (paragraph.getCTP().getPPr()!=null) { + ctSectPr = paragraph.getCTP().getPPr().getSectPr(); + } + + XWPFHeaderFooterPolicy headerFooterPolicy = null; + + if (ctSectPr!=null) { + headerFooterPolicy = new XWPFHeaderFooterPolicy(document, ctSectPr); + + extractHeaders(text, headerFooterPolicy); + } + + XWPFParagraphDecorator decorator = new XWPFCommentsDecorator( + new XWPFHyperlinkDecorator(paragraph, null, fetchHyperlinks)); + text.append(decorator.getText()).append('\n'); + + if (ctSectPr!=null) { + extractFooters(text, headerFooterPolicy); + } + } catch (IOException e) { + throw new POIXMLException(e); + } catch (XmlException e) { + throw new POIXMLException(e); + } + } // Then our table based text Iterator j = document.getTablesIterator(); while(j.hasNext()) { - text.append(j.next().getText()+"\n"); + text.append(j.next().getText()).append('\n'); } // Finish up with all the footers - // TODO - put them in where they're needed - if(hfPolicy.getFirstPageFooter() != null) { - text.append( hfPolicy.getFirstPageFooter().getText() ); - } - if(hfPolicy.getEvenPageFooter() != null) { - text.append( hfPolicy.getEvenPageFooter().getText() ); - } - if(hfPolicy.getDefaultFooter() != null) { - text.append( hfPolicy.getDefaultFooter().getText() ); - } + extractFooters(text, hfPolicy); return text.toString(); } + + private void extractFooters(StringBuffer text, XWPFHeaderFooterPolicy hfPolicy) { + if(hfPolicy.getFirstPageFooter() != null) { + text.append( hfPolicy.getFirstPageFooter().getText() ); + } + if(hfPolicy.getEvenPageFooter() != null) { + text.append( hfPolicy.getEvenPageFooter().getText() ); + } + if(hfPolicy.getDefaultFooter() != null) { + text.append( hfPolicy.getDefaultFooter().getText() ); + } + } + + private void extractHeaders(StringBuffer text, XWPFHeaderFooterPolicy hfPolicy) { + if(hfPolicy.getFirstPageHeader() != null) { + text.append( hfPolicy.getFirstPageHeader().getText() ); + } + if(hfPolicy.getEvenPageHeader() != null) { + text.append( hfPolicy.getEvenPageHeader().getText() ); + } + if(hfPolicy.getDefaultHeader() != null) { + text.append( hfPolicy.getDefaultHeader().getText() ); + } + } } diff --git a/src/ooxml/java/org/apache/poi/xwpf/model/XWPFHeaderFooterPolicy.java b/src/ooxml/java/org/apache/poi/xwpf/model/XWPFHeaderFooterPolicy.java index b6bc5f15e..a0ffb87bf 100644 --- a/src/ooxml/java/org/apache/poi/xwpf/model/XWPFHeaderFooterPolicy.java +++ b/src/ooxml/java/org/apache/poi/xwpf/model/XWPFHeaderFooterPolicy.java @@ -83,19 +83,26 @@ public class XWPFHeaderFooterPolicy { private XWPFHeader defaultHeader; private XWPFFooter defaultFooter; - + /** + * Figures out the policy for the given document, + * and creates any header and footer objects + * as required. + */ + public XWPFHeaderFooterPolicy(XWPFDocument doc) throws IOException, XmlException { + this(doc, doc.getDocument().getBody().getSectPr()); + } + /** * Figures out the policy for the given document, * and creates any header and footer objects * as required. */ - public XWPFHeaderFooterPolicy(XWPFDocument doc) throws IOException, XmlException { + public XWPFHeaderFooterPolicy(XWPFDocument doc, CTSectPr sectPr) throws IOException, XmlException { // Grab what headers and footers have been defined // For now, we don't care about different ranges, as it // doesn't seem that .docx properly supports that // feature of the file format yet this.doc = doc; - CTSectPr sectPr = doc.getDocument().getBody().getSectPr(); for(int i=0; i