Friendly wrapper on HWPF for extracting text from Word Documents

git-svn-id: https://svn.apache.org/repos/asf/jakarta/poi/trunk@377371 13f79535-47bb-0310-9956-ffa450edef68
Nick Burch 2006-02-13 12:58:52 +00:00
parent dde8b84f03
commit c7793b1b6d


@ -0,0 +1,123 @@
package org.apache.poi.hwpf.extractor;

import java.io.IOException;
import java.io.InputStream;
import java.io.UnsupportedEncodingException;
import java.util.Iterator;

import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.model.TextPiece;
import org.apache.poi.hwpf.usermodel.Paragraph;
import org.apache.poi.hwpf.usermodel.Range;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;

/**
* Class to extract the text from a Word Document.
*
* You should use either getParagraphText() or getText() unless
* you have a strong reason otherwise.
*
* @author Nick Burch (nick at torchbox dot com)
*/
public class WordExtractor {
    private POIFSFileSystem fs;
    private HWPFDocument doc;

    /**
     * Create a new Word Extractor
     * @param is InputStream containing the word file
     */
    public WordExtractor(InputStream is) throws IOException {
        this(new POIFSFileSystem(is));
    }

    /**
     * Create a new Word Extractor
     * @param fs POIFSFileSystem containing the word file
     */
    public WordExtractor(POIFSFileSystem fs) throws IOException {
        this.fs = fs;
        doc = new HWPFDocument(fs);
    }

    /**
     * Get the text from the word file, as an array with one String
     * per paragraph
     */
    public String[] getParagraphText() {
        String[] ret;

        // Extract using the model code
        try {
            Range r = doc.getRange();
            ret = new String[r.numParagraphs()];
            for(int i=0; i<ret.length; i++) {
                Paragraph p = r.getParagraph(i);
                ret[i] = p.text();

                // Fix the line ending
                if(ret[i].endsWith("\r")) {
                    ret[i] = ret[i] + "\n";
                }
            }
        } catch(Exception e) {
            // Something's up with turning the text pieces into paragraphs
            // Fall back to ripping out the text pieces
            ret = new String[1];
            ret[0] = getTextFromPieces();
        }

        return ret;
    }

    /**
     * Grab the text out of the text pieces. Might also include various
     * bits of crud, but will work in cases where the text piece -> paragraph
     * mapping is broken. Fast too.
     */
    public String getTextFromPieces() {
        StringBuffer textBuf = new StringBuffer();

        Iterator textPieces = doc.getTextTable().getTextPieces().iterator();
        while (textPieces.hasNext()) {
            TextPiece piece = (TextPiece) textPieces.next();

            // Word stores each text piece either as single byte (Cp1252)
            // or as 16 bit unicode (UTF-16LE) text
            String encoding = "Cp1252";
            if (piece.usesUnicode()) {
                encoding = "UTF-16LE";
            }
            try {
                String text = new String(piece.getRawBytes(), encoding);
                textBuf.append(text);
            } catch(UnsupportedEncodingException e) {
                throw new InternalError("Standard Encoding " + encoding + " not found, JVM broken");
            }
        }

        String text = textBuf.toString();

        // Fix line endings (note - won't get all of them)
        text = text.replaceAll("\r\r\r", "\r\n\r\n\r\n");
        text = text.replaceAll("\r\r", "\r\n\r\n");
        if(text.endsWith("\r")) {
            text += "\n";
        }

        return text;
    }

    /**
     * Grab the text, based on the paragraphs. Shouldn't include any crud,
     * but slightly slower than getTextFromPieces()
     */
    public String getText() {
        StringBuffer ret = new StringBuffer();

        String[] text = getParagraphText();
        for(int i=0; i<text.length; i++) {
            ret.append(text[i]);
        }

        return ret.toString();
    }
}
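
A minimal usage sketch of the new class, not part of the commit itself: the class name WordExtractorDemo and the path "test.doc" are placeholders, and the calls match the API added above - getText() for the whole document, getParagraphText() for one String per paragraph.

import java.io.FileInputStream;
import java.io.IOException;

import org.apache.poi.hwpf.extractor.WordExtractor;

public class WordExtractorDemo {
    public static void main(String[] args) throws IOException {
        // "test.doc" is just a placeholder path for this sketch
        FileInputStream fin = new FileInputStream("test.doc");
        WordExtractor extractor = new WordExtractor(fin);

        // Whole document as a single String
        System.out.println(extractor.getText());

        // Or one String per paragraph
        String[] paragraphs = extractor.getParagraphText();
        System.out.println("Found " + paragraphs.length + " paragraphs");

        fin.close();
    }
}

If the text piece to paragraph mapping in a particular file is broken, getParagraphText() falls back internally, so callers can also reach for getTextFromPieces() directly when they want the faster, cruder extraction.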