2007-12-30 11:53:42 -05:00
|
|
|
/* ====================================================================
|
|
|
|
Licensed to the Apache Software Foundation (ASF) under one or more
|
|
|
|
contributor license agreements. See the NOTICE file distributed with
|
|
|
|
this work for additional information regarding copyright ownership.
|
|
|
|
The ASF licenses this file to You under the Apache License, Version 2.0
|
|
|
|
(the "License"); you may not use this file except in compliance with
|
|
|
|
the License. You may obtain a copy of the License at
|
|
|
|
|
|
|
|
http://www.apache.org/licenses/LICENSE-2.0
|
|
|
|
|
|
|
|
Unless required by applicable law or agreed to in writing, software
|
|
|
|
distributed under the License is distributed on an "AS IS" BASIS,
|
|
|
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
|
|
See the License for the specific language governing permissions and
|
|
|
|
limitations under the License.
|
|
|
|
==================================================================== */
|
2008-03-09 10:39:36 -04:00
|
|
|
package org.apache.poi.xwpf.extractor;
|
2007-12-30 11:53:42 -05:00
|
|
|
|
|
|
|
import java.io.IOException;
|
2008-04-27 12:36:51 -04:00
|
|
|
import java.util.Iterator;
|
2007-12-30 11:53:42 -05:00
|
|
|
|
2008-03-09 10:39:36 -04:00
|
|
|
import org.apache.poi.POIXMLDocument;
|
2007-12-30 11:53:42 -05:00
|
|
|
import org.apache.poi.POIXMLTextExtractor;
|
2009-08-31 13:02:06 -04:00
|
|
|
import org.apache.poi.POIXMLException;
|
2009-03-18 14:54:01 -04:00
|
|
|
import org.apache.poi.openxml4j.exceptions.OpenXML4JException;
|
|
|
|
import org.apache.poi.openxml4j.opc.OPCPackage;
|
2008-04-27 12:36:51 -04:00
|
|
|
import org.apache.poi.xwpf.model.XWPFCommentsDecorator;
|
2008-08-09 11:08:11 -04:00
|
|
|
import org.apache.poi.xwpf.model.XWPFHeaderFooterPolicy;
|
2008-04-27 12:36:51 -04:00
|
|
|
import org.apache.poi.xwpf.model.XWPFHyperlinkDecorator;
|
|
|
|
import org.apache.poi.xwpf.model.XWPFParagraphDecorator;
|
2009-03-18 14:54:01 -04:00
|
|
|
import org.apache.poi.xwpf.usermodel.XWPFDocument;
|
2008-04-27 12:36:51 -04:00
|
|
|
import org.apache.poi.xwpf.usermodel.XWPFParagraph;
|
|
|
|
import org.apache.poi.xwpf.usermodel.XWPFTable;
|
2007-12-30 11:53:42 -05:00
|
|
|
import org.apache.xmlbeans.XmlException;
|
2009-08-31 13:02:06 -04:00
|
|
|
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTSectPr;
|
2007-12-30 11:53:42 -05:00
|
|
|
|
|
|
|
/**
|
|
|
|
* Helper class to extract text from an OOXML Word file
|
|
|
|
*/
|
2008-03-09 10:39:36 -04:00
|
|
|
public class XWPFWordExtractor extends POIXMLTextExtractor {
|
|
|
|
private XWPFDocument document;
|
2008-04-09 08:22:23 -04:00
|
|
|
private boolean fetchHyperlinks = false;
|
2007-12-30 11:53:42 -05:00
|
|
|
|
2009-03-18 14:54:01 -04:00
|
|
|
public XWPFWordExtractor(OPCPackage container) throws XmlException, OpenXML4JException, IOException {
|
2008-03-09 10:39:36 -04:00
|
|
|
this(new XWPFDocument(container));
|
2007-12-30 11:53:42 -05:00
|
|
|
}
|
2008-03-09 10:39:36 -04:00
|
|
|
public XWPFWordExtractor(XWPFDocument document) {
|
2007-12-30 11:53:42 -05:00
|
|
|
super(document);
|
|
|
|
this.document = document;
|
|
|
|
}
|
2008-04-27 12:36:51 -04:00
|
|
|
|
|
|
|
/**
|
|
|
|
* Should we also fetch the hyperlinks, when fetching
|
|
|
|
* the text content? Default is to only output the
|
|
|
|
* hyperlink label, and not the contents
|
|
|
|
*/
|
|
|
|
public void setFetchHyperlinks(boolean fetch) {
|
|
|
|
fetchHyperlinks = fetch;
|
|
|
|
}
|
2007-12-30 11:53:42 -05:00
|
|
|
|
|
|
|
public static void main(String[] args) throws Exception {
|
|
|
|
if(args.length < 1) {
|
|
|
|
System.err.println("Use:");
|
2008-05-23 11:05:12 -04:00
|
|
|
System.err.println(" HXFWordExtractor <filename.docx>");
|
2007-12-30 11:53:42 -05:00
|
|
|
System.exit(1);
|
|
|
|
}
|
|
|
|
POIXMLTextExtractor extractor =
|
2008-03-09 10:39:36 -04:00
|
|
|
new XWPFWordExtractor(POIXMLDocument.openPackage(
|
|
|
|
args[0]
|
2007-12-30 11:53:42 -05:00
|
|
|
));
|
|
|
|
System.out.println(extractor.getText());
|
|
|
|
}
|
2008-04-09 08:22:23 -04:00
|
|
|
|
2007-12-30 11:53:42 -05:00
|
|
|
public String getText() {
|
|
|
|
StringBuffer text = new StringBuffer();
|
2008-08-09 11:08:11 -04:00
|
|
|
XWPFHeaderFooterPolicy hfPolicy = document.getHeaderFooterPolicy();
|
2009-08-31 13:02:06 -04:00
|
|
|
|
2008-08-09 11:08:11 -04:00
|
|
|
// Start out with all headers
|
2009-08-31 13:02:06 -04:00
|
|
|
extractHeaders(text, hfPolicy);
|
2008-08-09 11:08:11 -04:00
|
|
|
|
|
|
|
// First up, all our paragraph based text
|
2008-04-27 12:36:51 -04:00
|
|
|
Iterator<XWPFParagraph> i = document.getParagraphsIterator();
|
|
|
|
while(i.hasNext()) {
|
2009-08-31 13:02:06 -04:00
|
|
|
XWPFParagraph paragraph = i.next();
|
|
|
|
|
|
|
|
|
|
|
|
try {
|
|
|
|
CTSectPr ctSectPr = null;
|
|
|
|
if (paragraph.getCTP().getPPr()!=null) {
|
|
|
|
ctSectPr = paragraph.getCTP().getPPr().getSectPr();
|
|
|
|
}
|
|
|
|
|
|
|
|
XWPFHeaderFooterPolicy headerFooterPolicy = null;
|
|
|
|
|
|
|
|
if (ctSectPr!=null) {
|
|
|
|
headerFooterPolicy = new XWPFHeaderFooterPolicy(document, ctSectPr);
|
|
|
|
|
|
|
|
extractHeaders(text, headerFooterPolicy);
|
|
|
|
}
|
|
|
|
|
|
|
|
XWPFParagraphDecorator decorator = new XWPFCommentsDecorator(
|
|
|
|
new XWPFHyperlinkDecorator(paragraph, null, fetchHyperlinks));
|
|
|
|
text.append(decorator.getText()).append('\n');
|
|
|
|
|
|
|
|
if (ctSectPr!=null) {
|
|
|
|
extractFooters(text, headerFooterPolicy);
|
|
|
|
}
|
|
|
|
} catch (IOException e) {
|
|
|
|
throw new POIXMLException(e);
|
|
|
|
} catch (XmlException e) {
|
|
|
|
throw new POIXMLException(e);
|
|
|
|
}
|
|
|
|
}
|
2008-08-09 11:08:11 -04:00
|
|
|
|
|
|
|
// Then our table based text
|
2008-04-27 12:36:51 -04:00
|
|
|
Iterator<XWPFTable> j = document.getTablesIterator();
|
2008-08-09 11:08:11 -04:00
|
|
|
while(j.hasNext()) {
|
2009-08-31 13:02:06 -04:00
|
|
|
text.append(j.next().getText()).append('\n');
|
2007-12-30 11:53:42 -05:00
|
|
|
}
|
|
|
|
|
2008-08-09 11:08:11 -04:00
|
|
|
// Finish up with all the footers
|
2009-08-31 13:02:06 -04:00
|
|
|
extractFooters(text, hfPolicy);
|
2008-08-09 11:08:11 -04:00
|
|
|
|
2007-12-30 11:53:42 -05:00
|
|
|
return text.toString();
|
|
|
|
}
|
2009-08-31 13:02:06 -04:00
|
|
|
|
|
|
|
private void extractFooters(StringBuffer text, XWPFHeaderFooterPolicy hfPolicy) {
|
|
|
|
if(hfPolicy.getFirstPageFooter() != null) {
|
|
|
|
text.append( hfPolicy.getFirstPageFooter().getText() );
|
|
|
|
}
|
|
|
|
if(hfPolicy.getEvenPageFooter() != null) {
|
|
|
|
text.append( hfPolicy.getEvenPageFooter().getText() );
|
|
|
|
}
|
|
|
|
if(hfPolicy.getDefaultFooter() != null) {
|
|
|
|
text.append( hfPolicy.getDefaultFooter().getText() );
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
private void extractHeaders(StringBuffer text, XWPFHeaderFooterPolicy hfPolicy) {
|
|
|
|
if(hfPolicy.getFirstPageHeader() != null) {
|
|
|
|
text.append( hfPolicy.getFirstPageHeader().getText() );
|
|
|
|
}
|
|
|
|
if(hfPolicy.getEvenPageHeader() != null) {
|
|
|
|
text.append( hfPolicy.getEvenPageHeader().getText() );
|
|
|
|
}
|
|
|
|
if(hfPolicy.getDefaultHeader() != null) {
|
|
|
|
text.append( hfPolicy.getDefaultHeader().getText() );
|
|
|
|
}
|
|
|
|
}
|
2007-12-30 11:53:42 -05:00
|
|
|
}
|