Fix extraction of paragraphs and sections from headers/footers with XWPFWordExtractor, see Bugzilla 47727
git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@809662 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
f4f823b94c
commit
20ea9dce10
@ -33,7 +33,8 @@
|
|||||||
|
|
||||||
<changes>
|
<changes>
|
||||||
<release version="3.5-beta7" date="2009-??-??">
|
<release version="3.5-beta7" date="2009-??-??">
|
||||||
<action dev="POI-DEVELOPERS" type="fix">47773 - Support for extraction of header / footer images in HWPF</action>
|
<action dev="POI-DEVELOPERS" type="fix">47727 - Fix for extraction of paragraphs and sections from headers/footers with XWPFWordExtractor</action>
|
||||||
|
<action dev="POI-DEVELOPERS" type="fix">47773 - Support for extraction of header / footer images in HWPF</action>
|
||||||
<action dev="POI-DEVELOPERS" type="fix">moved all test data to a top-level directory</action>
|
<action dev="POI-DEVELOPERS" type="fix">moved all test data to a top-level directory</action>
|
||||||
<action dev="POI-DEVELOPERS" type="add">47721 - Added implementation for INDIRECT()</action>
|
<action dev="POI-DEVELOPERS" type="add">47721 - Added implementation for INDIRECT()</action>
|
||||||
<action dev="POI-DEVELOPERS" type="add">45583 - Avoid exception when reading ClipboardData packet in OLE property sets</action>
|
<action dev="POI-DEVELOPERS" type="add">45583 - Avoid exception when reading ClipboardData packet in OLE property sets</action>
|
||||||
|
@ -21,6 +21,7 @@ import java.util.Iterator;
|
|||||||
|
|
||||||
import org.apache.poi.POIXMLDocument;
|
import org.apache.poi.POIXMLDocument;
|
||||||
import org.apache.poi.POIXMLTextExtractor;
|
import org.apache.poi.POIXMLTextExtractor;
|
||||||
|
import org.apache.poi.POIXMLException;
|
||||||
import org.apache.poi.openxml4j.exceptions.OpenXML4JException;
|
import org.apache.poi.openxml4j.exceptions.OpenXML4JException;
|
||||||
import org.apache.poi.openxml4j.opc.OPCPackage;
|
import org.apache.poi.openxml4j.opc.OPCPackage;
|
||||||
import org.apache.poi.xwpf.model.XWPFCommentsDecorator;
|
import org.apache.poi.xwpf.model.XWPFCommentsDecorator;
|
||||||
@ -31,6 +32,7 @@ import org.apache.poi.xwpf.usermodel.XWPFDocument;
|
|||||||
import org.apache.poi.xwpf.usermodel.XWPFParagraph;
|
import org.apache.poi.xwpf.usermodel.XWPFParagraph;
|
||||||
import org.apache.poi.xwpf.usermodel.XWPFTable;
|
import org.apache.poi.xwpf.usermodel.XWPFTable;
|
||||||
import org.apache.xmlbeans.XmlException;
|
import org.apache.xmlbeans.XmlException;
|
||||||
|
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTSectPr;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Helper class to extract text from an OOXML Word file
|
* Helper class to extract text from an OOXML Word file
|
||||||
@ -74,33 +76,55 @@ public class XWPFWordExtractor extends POIXMLTextExtractor {
|
|||||||
XWPFHeaderFooterPolicy hfPolicy = document.getHeaderFooterPolicy();
|
XWPFHeaderFooterPolicy hfPolicy = document.getHeaderFooterPolicy();
|
||||||
|
|
||||||
// Start out with all headers
|
// Start out with all headers
|
||||||
// TODO - put them in where they're needed
|
extractHeaders(text, hfPolicy);
|
||||||
if(hfPolicy.getFirstPageHeader() != null) {
|
|
||||||
text.append( hfPolicy.getFirstPageHeader().getText() );
|
|
||||||
}
|
|
||||||
if(hfPolicy.getEvenPageHeader() != null) {
|
|
||||||
text.append( hfPolicy.getEvenPageHeader().getText() );
|
|
||||||
}
|
|
||||||
if(hfPolicy.getDefaultHeader() != null) {
|
|
||||||
text.append( hfPolicy.getDefaultHeader().getText() );
|
|
||||||
}
|
|
||||||
|
|
||||||
// First up, all our paragraph based text
|
// First up, all our paragraph based text
|
||||||
Iterator<XWPFParagraph> i = document.getParagraphsIterator();
|
Iterator<XWPFParagraph> i = document.getParagraphsIterator();
|
||||||
while(i.hasNext()) {
|
while(i.hasNext()) {
|
||||||
|
XWPFParagraph paragraph = i.next();
|
||||||
|
|
||||||
|
|
||||||
|
try {
|
||||||
|
CTSectPr ctSectPr = null;
|
||||||
|
if (paragraph.getCTP().getPPr()!=null) {
|
||||||
|
ctSectPr = paragraph.getCTP().getPPr().getSectPr();
|
||||||
|
}
|
||||||
|
|
||||||
|
XWPFHeaderFooterPolicy headerFooterPolicy = null;
|
||||||
|
|
||||||
|
if (ctSectPr!=null) {
|
||||||
|
headerFooterPolicy = new XWPFHeaderFooterPolicy(document, ctSectPr);
|
||||||
|
|
||||||
|
extractHeaders(text, headerFooterPolicy);
|
||||||
|
}
|
||||||
|
|
||||||
XWPFParagraphDecorator decorator = new XWPFCommentsDecorator(
|
XWPFParagraphDecorator decorator = new XWPFCommentsDecorator(
|
||||||
new XWPFHyperlinkDecorator(i.next(), null, fetchHyperlinks));
|
new XWPFHyperlinkDecorator(paragraph, null, fetchHyperlinks));
|
||||||
text.append(decorator.getText()+"\n");
|
text.append(decorator.getText()).append('\n');
|
||||||
|
|
||||||
|
if (ctSectPr!=null) {
|
||||||
|
extractFooters(text, headerFooterPolicy);
|
||||||
|
}
|
||||||
|
} catch (IOException e) {
|
||||||
|
throw new POIXMLException(e);
|
||||||
|
} catch (XmlException e) {
|
||||||
|
throw new POIXMLException(e);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Then our table based text
|
// Then our table based text
|
||||||
Iterator<XWPFTable> j = document.getTablesIterator();
|
Iterator<XWPFTable> j = document.getTablesIterator();
|
||||||
while(j.hasNext()) {
|
while(j.hasNext()) {
|
||||||
text.append(j.next().getText()+"\n");
|
text.append(j.next().getText()).append('\n');
|
||||||
}
|
}
|
||||||
|
|
||||||
// Finish up with all the footers
|
// Finish up with all the footers
|
||||||
// TODO - put them in where they're needed
|
extractFooters(text, hfPolicy);
|
||||||
|
|
||||||
|
return text.toString();
|
||||||
|
}
|
||||||
|
|
||||||
|
private void extractFooters(StringBuffer text, XWPFHeaderFooterPolicy hfPolicy) {
|
||||||
if(hfPolicy.getFirstPageFooter() != null) {
|
if(hfPolicy.getFirstPageFooter() != null) {
|
||||||
text.append( hfPolicy.getFirstPageFooter().getText() );
|
text.append( hfPolicy.getFirstPageFooter().getText() );
|
||||||
}
|
}
|
||||||
@ -110,7 +134,17 @@ public class XWPFWordExtractor extends POIXMLTextExtractor {
|
|||||||
if(hfPolicy.getDefaultFooter() != null) {
|
if(hfPolicy.getDefaultFooter() != null) {
|
||||||
text.append( hfPolicy.getDefaultFooter().getText() );
|
text.append( hfPolicy.getDefaultFooter().getText() );
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
return text.toString();
|
private void extractHeaders(StringBuffer text, XWPFHeaderFooterPolicy hfPolicy) {
|
||||||
|
if(hfPolicy.getFirstPageHeader() != null) {
|
||||||
|
text.append( hfPolicy.getFirstPageHeader().getText() );
|
||||||
|
}
|
||||||
|
if(hfPolicy.getEvenPageHeader() != null) {
|
||||||
|
text.append( hfPolicy.getEvenPageHeader().getText() );
|
||||||
|
}
|
||||||
|
if(hfPolicy.getDefaultHeader() != null) {
|
||||||
|
text.append( hfPolicy.getDefaultHeader().getText() );
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -83,19 +83,26 @@ public class XWPFHeaderFooterPolicy {
|
|||||||
private XWPFHeader defaultHeader;
|
private XWPFHeader defaultHeader;
|
||||||
private XWPFFooter defaultFooter;
|
private XWPFFooter defaultFooter;
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Figures out the policy for the given document,
|
* Figures out the policy for the given document,
|
||||||
* and creates any header and footer objects
|
* and creates any header and footer objects
|
||||||
* as required.
|
* as required.
|
||||||
*/
|
*/
|
||||||
public XWPFHeaderFooterPolicy(XWPFDocument doc) throws IOException, XmlException {
|
public XWPFHeaderFooterPolicy(XWPFDocument doc) throws IOException, XmlException {
|
||||||
|
this(doc, doc.getDocument().getBody().getSectPr());
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Figures out the policy for the given section properties
|
||||||
|
* of the document, and creates any header and footer
|
||||||
|
* objects as required.
|
||||||
|
*/
|
||||||
|
public XWPFHeaderFooterPolicy(XWPFDocument doc, CTSectPr sectPr) throws IOException, XmlException {
|
||||||
// Grab what headers and footers have been defined
|
// Grab what headers and footers have been defined
|
||||||
// For now, we don't care about different ranges, as it
|
// For now, we don't care about different ranges, as it
|
||||||
// doesn't seem that .docx properly supports that
|
// doesn't seem that .docx properly supports that
|
||||||
// feature of the file format yet
|
// feature of the file format yet
|
||||||
this.doc = doc;
|
this.doc = doc;
|
||||||
CTSectPr sectPr = doc.getDocument().getBody().getSectPr();
|
|
||||||
for(int i=0; i<sectPr.sizeOfHeaderReferenceArray(); i++) {
|
for(int i=0; i<sectPr.sizeOfHeaderReferenceArray(); i++) {
|
||||||
// Get the header
|
// Get the header
|
||||||
CTHdrFtrRef ref = sectPr.getHeaderReferenceArray(i);
|
CTHdrFtrRef ref = sectPr.getHeaderReferenceArray(i);
|
||||||
|
@ -198,4 +198,13 @@ public class TestXWPFWordExtractor extends TestCase {
|
|||||||
assertTrue(extractor.getText().contains("extremely well"));
|
assertTrue(extractor.getText().contains("extremely well"));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public void testParagraphHeader() {
|
||||||
|
XWPFDocument doc = XWPFTestDataSamples.openSampleDocument("Headers.docx");
|
||||||
|
XWPFWordExtractor extractor = new XWPFWordExtractor(doc);
|
||||||
|
|
||||||
|
assertTrue(extractor.getText().contains("Section 1"));
|
||||||
|
assertTrue(extractor.getText().contains("Section 2"));
|
||||||
|
assertTrue(extractor.getText().contains("Section 3"));
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
BIN
test-data/document/Headers.docx
Executable file
BIN
test-data/document/Headers.docx
Executable file
Binary file not shown.
Loading…
Reference in New Issue
Block a user