Friendly wrapper on HWPF for extracting text from Word Documents
git-svn-id: https://svn.apache.org/repos/asf/jakarta/poi/trunk@377372 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
c7793b1b6d
commit
1e6596b0e0
BIN
src/scratchpad/testcases/org/apache/poi/hwpf/data/test2.doc
Executable file
BIN
src/scratchpad/testcases/org/apache/poi/hwpf/data/test2.doc
Executable file
Binary file not shown.
@ -0,0 +1,87 @@
|
|||||||
|
package org.apache.poi.hwpf.extractor;
|
||||||
|
|
||||||
|
import java.io.FileInputStream;
|
||||||
|
import java.util.Iterator;
|
||||||
|
|
||||||
|
import org.apache.poi.hwpf.HWPFDocument;
|
||||||
|
import org.apache.poi.hwpf.model.TextPiece;
|
||||||
|
import org.apache.poi.hwpf.usermodel.Paragraph;
|
||||||
|
import org.apache.poi.hwpf.usermodel.Range;
|
||||||
|
|
||||||
|
import junit.framework.TestCase;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Test the different routes to extracting text
|
||||||
|
*
|
||||||
|
* @author Nick Burch (nick at torchbox dot com)
|
||||||
|
*/
|
||||||
|
public class TestDifferentRoutes extends TestCase {
|
||||||
|
private String[] p_text = new String[] {
|
||||||
|
"This is a simple word document\r",
|
||||||
|
"\r",
|
||||||
|
"It has a number of paragraphs in it\r",
|
||||||
|
"\r",
|
||||||
|
"Some of them even feature bold, italic and underlined text\r",
|
||||||
|
"\r",
|
||||||
|
"\r",
|
||||||
|
"This bit is in a different font and size\r",
|
||||||
|
"\r",
|
||||||
|
"\r",
|
||||||
|
"This bit features some red text.\r",
|
||||||
|
"\r",
|
||||||
|
"\r",
|
||||||
|
"It is otherwise very very boring.\r"
|
||||||
|
};
|
||||||
|
|
||||||
|
private HWPFDocument doc;
|
||||||
|
|
||||||
|
protected void setUp() throws Exception {
|
||||||
|
String dirname = System.getProperty("HWPF.testdata.path");
|
||||||
|
|
||||||
|
String filename = dirname + "/test2.doc";
|
||||||
|
doc = new HWPFDocument(new FileInputStream(filename));
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Test model based extraction
|
||||||
|
*/
|
||||||
|
public void testExtractFromModel() {
|
||||||
|
Range r = doc.getRange();
|
||||||
|
|
||||||
|
String[] text = new String[r.numParagraphs()];
|
||||||
|
for(int i=0; i < r.numParagraphs(); i++) {
|
||||||
|
Paragraph p = r.getParagraph(i);
|
||||||
|
text[i] = p.text();
|
||||||
|
}
|
||||||
|
|
||||||
|
assertEquals(p_text.length, text.length);
|
||||||
|
for(int i=0; i<p_text.length; i++) {
|
||||||
|
assertEquals(p_text[i], text[i]);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Test textPieces based extraction
|
||||||
|
*/
|
||||||
|
public void testExtractFromTextPieces() throws Exception {
|
||||||
|
StringBuffer textBuf = new StringBuffer();
|
||||||
|
|
||||||
|
Iterator textPieces = doc.getTextTable().getTextPieces().iterator();
|
||||||
|
while (textPieces.hasNext()) {
|
||||||
|
TextPiece piece = (TextPiece) textPieces.next();
|
||||||
|
|
||||||
|
String encoding = "Cp1252";
|
||||||
|
if (piece.usesUnicode()) {
|
||||||
|
encoding = "UTF-16LE";
|
||||||
|
}
|
||||||
|
String text = new String(piece.getRawBytes(), encoding);
|
||||||
|
textBuf.append(text);
|
||||||
|
}
|
||||||
|
|
||||||
|
StringBuffer exp = new StringBuffer();
|
||||||
|
for(int i=0; i<p_text.length; i++) {
|
||||||
|
exp.append(p_text[i]);
|
||||||
|
}
|
||||||
|
assertEquals(exp.toString(), textBuf.toString());
|
||||||
|
}
|
||||||
|
}
|
@ -0,0 +1,88 @@
|
|||||||
|
package org.apache.poi.hwpf.extractor;
|
||||||
|
|
||||||
|
import java.io.FileInputStream;
|
||||||
|
import java.util.Iterator;
|
||||||
|
|
||||||
|
import org.apache.poi.hwpf.HWPFDocument;
|
||||||
|
import org.apache.poi.hwpf.model.TextPiece;
|
||||||
|
import org.apache.poi.hwpf.usermodel.Paragraph;
|
||||||
|
import org.apache.poi.hwpf.usermodel.Range;
|
||||||
|
|
||||||
|
import junit.framework.TestCase;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Test the different routes to extracting text
|
||||||
|
*
|
||||||
|
* @author Nick Burch (nick at torchbox dot com)
|
||||||
|
*/
|
||||||
|
public class TestWordExtractor extends TestCase {
|
||||||
|
private String[] p_text1 = new String[] {
|
||||||
|
"This is a simple word document\r\n",
|
||||||
|
"\r\n",
|
||||||
|
"It has a number of paragraphs in it\r\n",
|
||||||
|
"\r\n",
|
||||||
|
"Some of them even feature bold, italic and underlined text\r\n",
|
||||||
|
"\r\n",
|
||||||
|
"\r\n",
|
||||||
|
"This bit is in a different font and size\r\n",
|
||||||
|
"\r\n",
|
||||||
|
"\r\n",
|
||||||
|
"This bit features some red text.\r\n",
|
||||||
|
"\r\n",
|
||||||
|
"\r\n",
|
||||||
|
"It is otherwise very very boring.\r\n"
|
||||||
|
};
|
||||||
|
private String p_text1_block = new String();
|
||||||
|
|
||||||
|
// Well behaved document
|
||||||
|
private WordExtractor extractor;
|
||||||
|
// Corrupted document - can't do paragraph based stuff
|
||||||
|
private WordExtractor extractor2;
|
||||||
|
|
||||||
|
protected void setUp() throws Exception {
|
||||||
|
String dirname = System.getProperty("HWPF.testdata.path");
|
||||||
|
|
||||||
|
String filename = dirname + "/test2.doc";
|
||||||
|
String filename2 = dirname + "/test.doc";
|
||||||
|
extractor = new WordExtractor(new FileInputStream(filename));
|
||||||
|
extractor2 = new WordExtractor(new FileInputStream(filename2));
|
||||||
|
|
||||||
|
// Build splat'd out text version
|
||||||
|
for(int i=0; i<p_text1.length; i++) {
|
||||||
|
p_text1_block += p_text1[i];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Test paragraph based extraction
|
||||||
|
*/
|
||||||
|
public void testExtractFromParagraphs() {
|
||||||
|
String[] text = extractor.getParagraphText();
|
||||||
|
|
||||||
|
assertEquals(p_text1.length, text.length);
|
||||||
|
for(int i=0; i<p_text1.length; i++) {
|
||||||
|
assertEquals(p_text1[i], text[i]);
|
||||||
|
}
|
||||||
|
|
||||||
|
// On second one, should fall back
|
||||||
|
assertEquals(1, extractor2.getParagraphText().length);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Test the paragraph -> flat extraction
|
||||||
|
*/
|
||||||
|
public void testGetText() {
|
||||||
|
assertEquals(p_text1_block, extractor.getText());
|
||||||
|
|
||||||
|
// On second one, should fall back to text piece
|
||||||
|
assertEquals(extractor2.getTextFromPieces(), extractor2.getText());
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Test textPieces based extraction
|
||||||
|
*/
|
||||||
|
public void testExtractFromTextPieces() throws Exception {
|
||||||
|
String text = extractor.getTextFromPieces();
|
||||||
|
assertEquals(p_text1_block, text);
|
||||||
|
}
|
||||||
|
}
|
Loading…
Reference in New Issue
Block a user