Friendly wrapper on HWPF for extracting text from Word Documents

git-svn-id: https://svn.apache.org/repos/asf/jakarta/poi/trunk@377372 13f79535-47bb-0310-9956-ffa450edef68
2006-02-13 12:59:00 +00:00 · 2006-02-13 12:59:00 +00:00 · 1e6596b0e0
commit 1e6596b0e0
parent c7793b1b6d
3 changed files with 175 additions and 0 deletions
--- a/src/scratchpad/testcases/org/apache/poi/hwpf/data/test2.doc
+++ b/src/scratchpad/testcases/org/apache/poi/hwpf/data/test2.doc
--- a/src/scratchpad/testcases/org/apache/poi/hwpf/extractor/TestDifferentRoutes.java
+++ b/src/scratchpad/testcases/org/apache/poi/hwpf/extractor/TestDifferentRoutes.java
@ -0,0 +1,87 @@
 package org.apache.poi.hwpf.extractor;
 import java.io.FileInputStream;
 import java.util.Iterator;
 import org.apache.poi.hwpf.HWPFDocument;
 import org.apache.poi.hwpf.model.TextPiece;
 import org.apache.poi.hwpf.usermodel.Paragraph;
 import org.apache.poi.hwpf.usermodel.Range;
 import junit.framework.TestCase;
 /**
 * Test the different routes to extracting text from a Word document:
 * through the paragraph model, and through the raw text pieces.
 * Both routes are checked against the same expected text for test2.doc.
 *
 * @author Nick Burch (nick at torchbox dot com)
 */
 public class TestDifferentRoutes extends TestCase {
 	/**
 	 * Expected text of each paragraph of test2.doc, in document order.
 	 * Every entry (including the empty paragraphs) ends in a carriage return.
 	 */
 	private String[] p_text = new String[] {
 			"This is a simple word document\r",
 			"\r",
 			"It has a number of paragraphs in it\r",
 			"\r",
 			"Some of them even feature bold, italic and underlined text\r",
 			"\r",
 			"\r",
 			"This bit is in a different font and size\r",
 			"\r",
 			"\r",
 			"This bit features some red text.\r",
 			"\r",
 			"\r",
 			"It is otherwise very very boring.\r"
 	};
 	/** Document under test, freshly loaded from test2.doc before each test */
 	private HWPFDocument doc;
 	/**
 	 * Loads test2.doc from the directory named by the
 	 * HWPF.testdata.path system property.
 	 *
 	 * @throws Exception if the property is unset, or the file cannot be opened or parsed
 	 */
 	protected void setUp() throws Exception {
 		String dirname = System.getProperty("HWPF.testdata.path");
 		// Fail with a clear message rather than trying to open "null/test2.doc"
 		if (dirname == null) {
 			throw new IllegalStateException(
 					"System property HWPF.testdata.path must point at the HWPF test data directory");
 		}
 		String filename = dirname + "/test2.doc";
 		FileInputStream in = new FileInputStream(filename);
 		try {
 			doc = new HWPFDocument(in);
 		} finally {
 			// Release the file handle whether or not the document parsed
 			in.close();
 		}
 	}
 	/**
 	 * Test model based extraction: the text of each paragraph in the
 	 * document's range must match the expected paragraph text.
 	 */
 	public void testExtractFromModel() {
 		Range r = doc.getRange();
 		String[] text = new String[r.numParagraphs()];
 		for(int i=0; i < r.numParagraphs(); i++) {
 			Paragraph p = r.getParagraph(i);
 			text[i] = p.text();
 		}
 		assertEquals("Number of paragraphs", p_text.length, text.length);
 		for(int i=0; i<p_text.length; i++) {
 			assertEquals("Text of paragraph " + i, p_text[i], text[i]);
 		}
 	}
 	/**
 	 * Test textPieces based extraction: decoding the raw bytes of every
 	 * text piece and concatenating them must give the expected paragraphs
 	 * joined together.
 	 *
 	 * @throws Exception if a piece's bytes cannot be decoded
 	 */
 	public void testExtractFromTextPieces() throws Exception {
 		StringBuffer textBuf = new StringBuffer();
 		Iterator textPieces = doc.getTextTable().getTextPieces().iterator();
 		while (textPieces.hasNext()) {
 			TextPiece piece = (TextPiece) textPieces.next();
 			// Pieces flagged as unicode are decoded as UTF-16LE, all others as Cp1252
 			String encoding = "Cp1252";
 			if (piece.usesUnicode()) {
 				encoding = "UTF-16LE";
 			}
 			String text = new String(piece.getRawBytes(), encoding);
 			textBuf.append(text);
 		}
 		// Flatten the expected paragraphs into a single string to compare against
 		StringBuffer exp = new StringBuffer();
 		for(int i=0; i<p_text.length; i++) {
 			exp.append(p_text[i]);
 		}
 		assertEquals(exp.toString(), textBuf.toString());
 	}
 }
--- a/src/scratchpad/testcases/org/apache/poi/hwpf/extractor/TestWordExtractor.java
+++ b/src/scratchpad/testcases/org/apache/poi/hwpf/extractor/TestWordExtractor.java
@ -0,0 +1,88 @@
 package org.apache.poi.hwpf.extractor;
 import java.io.FileInputStream;
 import java.util.Iterator;
 import org.apache.poi.hwpf.HWPFDocument;
 import org.apache.poi.hwpf.model.TextPiece;
 import org.apache.poi.hwpf.usermodel.Paragraph;
 import org.apache.poi.hwpf.usermodel.Range;
 import junit.framework.TestCase;
 /**
 * Test the WordExtractor wrapper: paragraph based extraction, flat text
 * extraction, and text piece based extraction, including the fall back
 * behaviour on a document that can't be processed by paragraph.
 *
 * @author Nick Burch (nick at torchbox dot com)
 */
 public class TestWordExtractor extends TestCase {
 	/**
 	 * Expected text of each paragraph of test2.doc as returned by the
 	 * extractor, in document order. Every entry ends in "\r\n".
 	 */
 	private String[] p_text1 = new String[] {
 			"This is a simple word document\r\n",
 			"\r\n",
 			"It has a number of paragraphs in it\r\n",
 			"\r\n",
 			"Some of them even feature bold, italic and underlined text\r\n",
 			"\r\n",
 			"\r\n",
 			"This bit is in a different font and size\r\n",
 			"\r\n",
 			"\r\n",
 			"This bit features some red text.\r\n",
 			"\r\n",
 			"\r\n",
 			"It is otherwise very very boring.\r\n"
 	};
 	/** All of p_text1 joined into one string; built in setUp */
 	private String p_text1_block = "";
 	// Well behaved document
 	private WordExtractor extractor;
 	// Corrupted document - can't do paragraph based stuff
 	private WordExtractor extractor2;
 	/**
 	 * Creates extractors for test2.doc and test.doc from the directory named
 	 * by the HWPF.testdata.path system property, and builds the flattened
 	 * expected text.
 	 *
 	 * @throws Exception if the property is unset, or a file cannot be opened or parsed
 	 */
 	protected void setUp() throws Exception {
 		String dirname = System.getProperty("HWPF.testdata.path");
 		// Fail with a clear message rather than trying to open "null/test2.doc"
 		if (dirname == null) {
 			throw new IllegalStateException(
 					"System property HWPF.testdata.path must point at the HWPF test data directory");
 		}
 		String filename = dirname + "/test2.doc";
 		String filename2 = dirname + "/test.doc";
 		extractor = openExtractor(filename);
 		extractor2 = openExtractor(filename2);
 		// Build splat'd out text version
 		StringBuffer block = new StringBuffer();
 		for(int i=0; i<p_text1.length; i++) {
 			block.append(p_text1[i]);
 		}
 		p_text1_block = block.toString();
 	}
 	/**
 	 * Creates a WordExtractor for the given file, closing the underlying
 	 * file stream once the extractor has been constructed.
 	 */
 	private static WordExtractor openExtractor(String filename) throws Exception {
 		FileInputStream in = new FileInputStream(filename);
 		try {
 			return new WordExtractor(in);
 		} finally {
 			// Release the file handle whether or not the document parsed
 			in.close();
 		}
 	}
 	/**
 	 * Test paragraph based extraction
 	 */
 	public void testExtractFromParagraphs() {
 		String[] text = extractor.getParagraphText();
 		assertEquals("Number of paragraphs", p_text1.length, text.length);
 		for(int i=0; i<p_text1.length; i++) {
 			assertEquals("Text of paragraph " + i, p_text1[i], text[i]);
 		}
 		// On second one, should fall back
 		assertEquals(1, extractor2.getParagraphText().length);
 	}
 	/**
 	 * Test the paragraph -> flat extraction
 	 */
 	public void testGetText() {
 		assertEquals(p_text1_block, extractor.getText());
 		// On second one, should fall back to text piece
 		assertEquals(extractor2.getTextFromPieces(), extractor2.getText());
 	}
 	/**
 	 * Test textPieces based extraction
 	 *
 	 * @throws Exception if the text pieces cannot be extracted
 	 */
 	public void testExtractFromTextPieces() throws Exception {
 		String text = extractor.getTextFromPieces();
 		assertEquals(p_text1_block, text);
 	}
 }