Friendly wrapper on HWPF for extracting text from Word Documents
git-svn-id: https://svn.apache.org/repos/asf/jakarta/poi/trunk@377371 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
dde8b84f03
commit
c7793b1b6d
@ -0,0 +1,123 @@
|
||||
package org.apache.poi.hwpf.extractor;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.io.UnsupportedEncodingException;
|
||||
import java.util.Iterator;
|
||||
|
||||
import org.apache.poi.hwpf.HWPFDocument;
|
||||
import org.apache.poi.hwpf.model.TextPiece;
|
||||
import org.apache.poi.hwpf.usermodel.Paragraph;
|
||||
import org.apache.poi.hwpf.usermodel.Range;
|
||||
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
|
||||
|
||||
/**
|
||||
* Class to extract the text from a Word Document.
|
||||
*
|
||||
* You should use either getParagraphText() or getText() unless
|
||||
* you have a strong reason otherwise.
|
||||
*
|
||||
* @author Nick Burch (nick at torchbox dot com)
|
||||
*/
|
||||
public class WordExtractor {
|
||||
private POIFSFileSystem fs;
|
||||
private HWPFDocument doc;
|
||||
|
||||
/**
|
||||
* Create a new Word Extractor
|
||||
* @param is InputStream containing the word file
|
||||
*/
|
||||
public WordExtractor(InputStream is) throws IOException {
|
||||
this(new POIFSFileSystem(is));
|
||||
}
|
||||
|
||||
/**
|
||||
* Create a new Word Extractor
|
||||
* @param fs POIFSFileSystem containing the word file
|
||||
*/
|
||||
public WordExtractor(POIFSFileSystem fs) throws IOException {
|
||||
this.fs = fs;
|
||||
doc = new HWPFDocument(fs);
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the text from the word file, as an array with one String
|
||||
* per paragraph
|
||||
*/
|
||||
public String[] getParagraphText() {
|
||||
String[] ret;
|
||||
|
||||
// Extract using the model code
|
||||
try {
|
||||
Range r = doc.getRange();
|
||||
|
||||
ret = new String[r.numParagraphs()];
|
||||
for(int i=0; i<ret.length; i++) {
|
||||
Paragraph p = r.getParagraph(i);
|
||||
ret[i] = p.text();
|
||||
|
||||
// Fix the line ending
|
||||
if(ret[i].endsWith("\r")) {
|
||||
ret[i] = ret[i] + "\n";
|
||||
}
|
||||
}
|
||||
} catch(Exception e) {
|
||||
// Something's up with turning the text pieces into paragraphs
|
||||
// Fall back to ripping out the text pieces
|
||||
ret = new String[1];
|
||||
ret[0] = getTextFromPieces();
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
/**
|
||||
* Grab the text out of the text pieces. Might also include various
|
||||
* bits of crud, but will work in cases where the text piece -> paragraph
|
||||
* mapping is broken. Fast too.
|
||||
*/
|
||||
public String getTextFromPieces() {
|
||||
StringBuffer textBuf = new StringBuffer();
|
||||
|
||||
Iterator textPieces = doc.getTextTable().getTextPieces().iterator();
|
||||
while (textPieces.hasNext()) {
|
||||
TextPiece piece = (TextPiece) textPieces.next();
|
||||
|
||||
String encoding = "Cp1252";
|
||||
if (piece.usesUnicode()) {
|
||||
encoding = "UTF-16LE";
|
||||
}
|
||||
try {
|
||||
String text = new String(piece.getRawBytes(), encoding);
|
||||
textBuf.append(text);
|
||||
} catch(UnsupportedEncodingException e) {
|
||||
throw new InternalError("Standard Encoding " + encoding + " not found, JVM broken");
|
||||
}
|
||||
}
|
||||
|
||||
String text = textBuf.toString();
|
||||
|
||||
// Fix line endings (Note - won't get all of them
|
||||
text = text.replaceAll("\r\r\r", "\r\n\r\n\r\n");
|
||||
text = text.replaceAll("\r\r", "\r\n\r\n");
|
||||
|
||||
if(text.endsWith("\r")) {
|
||||
text += "\n";
|
||||
}
|
||||
|
||||
return text;
|
||||
}
|
||||
|
||||
/*
|
||||
* Grab the text, based on the paragraphs. Shouldn't include any crud,
|
||||
* but slightly slower than getTextFromPieces()
|
||||
*/
|
||||
public String getText() {
|
||||
StringBuffer ret = new StringBuffer();
|
||||
String[] text = getParagraphText();
|
||||
for(int i=0; i<text.length; i++) {
|
||||
ret.append(text[i]);
|
||||
}
|
||||
return ret.toString();
|
||||
}
|
||||
}
|
Loading…
Reference in New Issue
Block a user