Add Word-to-Text converter and use it as replacement for WordExtractor

git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@1155336 13f79535-47bb-0310-9956-ffa450edef68
2011-08-09 12:38:52 +00:00 · 2011-08-09 12:38:52 +00:00 · 1d9900c184
commit 1d9900c184
parent b47081db42
26 changed files with 1117 additions and 488 deletions
--- a/src/documentation/content/xdocs/status.xml
+++ b/src/documentation/content/xdocs/status.xml
@ -34,6 +34,7 @@

    <changes>
        <release version="3.8-beta4" date="2011-??-??">
+           <action dev="poi-developers" type="add">Add Word-to-Text converter and use it as replacement for WordExtractor</action>
           <action dev="poi-developers" type="fix">51604 - replace text fails for doc ( poi 3.8 beta release from download site )</action>
           <action dev="poi-developers" type="fix">Fixed incorrect encoding of non-breaking space (0xA0) in SXSSF</action>
           <action dev="poi-developers" type="add">Support for conditional formatting in XSSF</action>
--- a/src/java/org/apache/poi/POIOLE2TextExtractor.java
+++ b/src/java/org/apache/poi/POIOLE2TextExtractor.java
@ -19,6 +19,7 @@ package org.apache.poi;
 import org.apache.poi.hpsf.DocumentSummaryInformation;
 import org.apache.poi.hpsf.SummaryInformation;
 import org.apache.poi.hpsf.extractor.HPSFPropertiesExtractor;
+import org.apache.poi.poifs.filesystem.DirectoryEntry;
 import org.apache.poi.poifs.filesystem.POIFSFileSystem;

 /**
@ -61,11 +62,19 @@ public abstract class POIOLE2TextExtractor extends POITextExtractor {
 		return new HPSFPropertiesExtractor(this);
 	}

+    public DirectoryEntry getRoot()
+    {
+        return document.directory;
+    }
+
    /**
-	 * Return the underlying POIFS FileSystem of
-	 *  this document.
+     * Return the underlying POIFS FileSystem of this document.
+     *
+     * @deprecated Use {@link #getRoot()} instead
     */
-	public POIFSFileSystem getFileSystem() {
+    @Deprecated
+    public POIFSFileSystem getFileSystem()
+    {
        return document.directory.getFileSystem();
    }
 }
--- a/src/java/org/apache/poi/hssf/extractor/EventBasedExcelExtractor.java
+++ b/src/java/org/apache/poi/hssf/extractor/EventBasedExcelExtractor.java
@ -61,17 +61,27 @@ import org.apache.poi.poifs.filesystem.POIFSFileSystem;
 */
 public class EventBasedExcelExtractor extends POIOLE2TextExtractor {
   private DirectoryNode _dir;
-	private POIFSFileSystem _fs;
 	boolean _includeSheetNames = true;
 	boolean _formulasNotResults = false;

-	public EventBasedExcelExtractor(DirectoryNode dir, POIFSFileSystem fs) {
+    /**
+     * @deprecated Use {@link #EventBasedExcelExtractor(DirectoryNode)} instead
+     */
+    @Deprecated
+    @SuppressWarnings( "unused" )
+    public EventBasedExcelExtractor( DirectoryNode dir, POIFSFileSystem fs )
+    {
+        this( dir );
+    }
+
+    public EventBasedExcelExtractor( DirectoryNode dir )
+    {
        super( null );
        _dir = dir;
-		_fs = fs;
    }
+
   public EventBasedExcelExtractor(POIFSFileSystem fs) {
-      this(fs.getRoot(), fs);
+      this(fs.getRoot());
   }

   /**
@ -79,7 +89,7 @@ public class EventBasedExcelExtractor extends POIOLE2TextExtractor {
    *  this document.
    */
   public POIFSFileSystem getFileSystem() {
-      return _fs;
+      return _dir.getFileSystem();
   }

 	/**
--- a/src/java/org/apache/poi/hssf/extractor/ExcelExtractor.java
+++ b/src/java/org/apache/poi/hssf/extractor/ExcelExtractor.java
@ -24,7 +24,6 @@ import java.io.InputStream;
 import java.io.PrintStream;

 import org.apache.poi.POIOLE2TextExtractor;
-import org.apache.poi.ss.formula.eval.ErrorEval;
 import org.apache.poi.hssf.usermodel.HSSFCell;
 import org.apache.poi.hssf.usermodel.HSSFCellStyle;
 import org.apache.poi.hssf.usermodel.HSSFComment;
@ -35,6 +34,7 @@ import org.apache.poi.hssf.usermodel.HSSFSheet;
 import org.apache.poi.hssf.usermodel.HSSFWorkbook;
 import org.apache.poi.poifs.filesystem.DirectoryNode;
 import org.apache.poi.poifs.filesystem.POIFSFileSystem;
+import org.apache.poi.ss.formula.eval.ErrorEval;
 import org.apache.poi.ss.usermodel.HeaderFooter;

 /**
@ -66,10 +66,18 @@ public class ExcelExtractor extends POIOLE2TextExtractor implements org.apache.p
 		_formatter = new HSSFDataFormatter();
 	}
 	public ExcelExtractor(POIFSFileSystem fs) throws IOException {
-		this(fs.getRoot(), fs);
+		this(fs.getRoot());
 	}
+	/**
+     * @deprecated Use {@link #ExcelExtractor(DirectoryNode)} instead
+     */
+    @Deprecated
+    @SuppressWarnings( "unused" )
    public ExcelExtractor(DirectoryNode dir, POIFSFileSystem fs) throws IOException {
-		this(new HSSFWorkbook(dir, fs, true));
+        this( dir );
+    }
+    public ExcelExtractor(DirectoryNode dir) throws IOException {
+		this(new HSSFWorkbook(dir, true));
 	}

 	private static final class CommandParseException extends Exception {
--- a/src/java/org/apache/poi/poifs/filesystem/DirectoryEntry.java
+++ b/src/java/org/apache/poi/poifs/filesystem/DirectoryEntry.java
@ -19,9 +19,10 @@

 package org.apache.poi.poifs.filesystem;

-import java.io.*;
-
-import java.util.*;
+import java.io.FileNotFoundException;
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Iterator;

 import org.apache.poi.hpsf.ClassID;

@ -67,6 +68,12 @@ public interface DirectoryEntry

    public int getEntryCount();

+    /**
+     * Checks whether an entry with the specified name is present
+     */
+
+    public boolean hasEntry( final String name );
+
    /**
     * get a specified Entry by name
     *
--- a/src/java/org/apache/poi/poifs/filesystem/DirectoryNode.java
+++ b/src/java/org/apache/poi/poifs/filesystem/DirectoryNode.java
@ -342,6 +342,11 @@ public class DirectoryNode
        return _entries.size();
    }

+    public boolean hasEntry( String name )
+    {
+        return name != null && _byname.containsKey( name );
+    }
+
    /**
     * get a specified Entry by name
     *
--- a/src/ooxml/java/org/apache/poi/extractor/ExtractorFactory.java
+++ b/src/ooxml/java/org/apache/poi/extractor/ExtractorFactory.java
@ -193,59 +193,73 @@ public class ExtractorFactory {

 	public static POIOLE2TextExtractor createExtractor(POIFSFileSystem fs) throws IOException, InvalidFormatException, OpenXML4JException, XmlException {
 	   // Only ever an OLE2 one from the root of the FS
-		return (POIOLE2TextExtractor)createExtractor(fs.getRoot(), fs);
+		return (POIOLE2TextExtractor)createExtractor(fs.getRoot());
 	}
-	public static POITextExtractor createExtractor(DirectoryNode poifsDir, POIFSFileSystem fs) throws IOException, InvalidFormatException, OpenXML4JException, XmlException {
+
+    /**
+     * @deprecated Use {@link #createExtractor(DirectoryNode)} instead
+     */
+    @Deprecated
+    @SuppressWarnings("unused")
+    public static POITextExtractor createExtractor(DirectoryNode poifsDir, POIFSFileSystem fs)
+            throws IOException, InvalidFormatException, OpenXML4JException, XmlException
+    {
+        return createExtractor(poifsDir);
+    }
+
+    public static POITextExtractor createExtractor(DirectoryNode poifsDir) throws IOException,
+            InvalidFormatException, OpenXML4JException, XmlException
+    {
        // Look for certain entries in the stream, to figure it
        // out from
+        if (poifsDir.hasEntry("Workbook")) {
+            if (getPreferEventExtractor()) {
+                return new EventBasedExcelExtractor(poifsDir);
+            }
+            return new ExcelExtractor(poifsDir);
+        }
+
+        if (poifsDir.hasEntry("WordDocument")) {
+            // Old or new style word document?
+            try {
+                return new WordExtractor(poifsDir);
+            } catch (OldWordFileFormatException e) {
+                return new Word6Extractor(poifsDir);
+            }
+        }
+
+        if (poifsDir.hasEntry("PowerPoint Document")) {
+            return new PowerPointExtractor(poifsDir);
+        }
+
+        if (poifsDir.hasEntry("VisioDocument")) {
+            return new VisioTextExtractor(poifsDir);
+        }
+
+        if (poifsDir.hasEntry("Quill")) {
+            return new PublisherTextExtractor(poifsDir);
+        }
+
+        if (poifsDir.hasEntry("__substg1.0_1000001E") || poifsDir.hasEntry("__substg1.0_1000001F")
+                || poifsDir.hasEntry("__substg1.0_0047001E")
+                || poifsDir.hasEntry("__substg1.0_0047001F")
+                || poifsDir.hasEntry("__substg1.0_0037001E")
+                || poifsDir.hasEntry("__substg1.0_0037001F"))
+        {
+            return new OutlookTextExtactor(poifsDir);
+        }
+
        for (Iterator<Entry> entries = poifsDir.getEntries(); entries.hasNext();) {
            Entry entry = entries.next();

-			if(entry.getName().equals("Workbook")) {
-			   if(getPreferEventExtractor()) {
-               return new EventBasedExcelExtractor(poifsDir, fs);
-			   } else {
-			      return new ExcelExtractor(poifsDir, fs);
-			   }
-			}
-			if(entry.getName().equals("WordDocument")) {
-			    // Old or new style word document?
-			    try {
-			        return new WordExtractor(poifsDir, fs);
-			    } catch(OldWordFileFormatException e) {
-			        return new Word6Extractor(poifsDir, fs);
-			    }
-			}
-			if(entry.getName().equals("PowerPoint Document")) {
-				return new PowerPointExtractor(poifsDir, fs);
-			}
-			if(entry.getName().equals("VisioDocument")) {
-				return new VisioTextExtractor(poifsDir, fs);
-			}
-         if(entry.getName().equals("Quill")) {
-            return new PublisherTextExtractor(poifsDir, fs);
-         }
-			if(
-                entry.getName().equals("__substg1.0_1000001E") ||
-                entry.getName().equals("__substg1.0_1000001F") ||
-                entry.getName().equals("__substg1.0_0047001E") ||
-                entry.getName().equals("__substg1.0_0047001F") ||
-                entry.getName().equals("__substg1.0_0037001E") ||
-                entry.getName().equals("__substg1.0_0037001F")
-			) {
-			   return new OutlookTextExtactor(poifsDir, fs);
-			}
            if (entry.getName().equals("Package")) {
-			   OPCPackage pkg = OPCPackage.open(
-			         poifsDir.createDocumentInputStream(entry.getName())
-			   );
+                OPCPackage pkg = OPCPackage.open(poifsDir.createDocumentInputStream("Package"));
                return createExtractor(pkg);
            }
        }
        throw new IllegalArgumentException("No supported documents found in the OLE2 stream");
    }

-	
 	/**
 	 * Returns an array of text extractors, one for each of
 	 *  the embeded documents in the file (if there are any).
@ -260,14 +274,14 @@ public class ExtractorFactory {
 		ArrayList<InputStream> nonPOIFS = new ArrayList<InputStream>();

      // Find all the embeded directories
-		POIFSFileSystem fs = ext.getFileSystem();
-		if(fs == null) {
+		DirectoryEntry root = ext.getRoot();
+		if(root == null) {
 			throw new IllegalStateException("The extractor didn't know which POIFS it came from!");
 		}

 		if(ext instanceof ExcelExtractor) {
 			// These are in MBD... under the root
-			Iterator<Entry> it = fs.getRoot().getEntries();
+			Iterator<Entry> it = root.getEntries();
 			while(it.hasNext()) {
 				Entry entry = it.next();
 				if(entry.getName().startsWith("MBD")) {
@ -278,7 +292,7 @@ public class ExtractorFactory {
 			// These are in ObjectPool -> _... under the root
 			try {
 				DirectoryEntry op = (DirectoryEntry)
-					fs.getRoot().getEntry("ObjectPool");
+				        root.getEntry("ObjectPool");
 				Iterator<Entry> it = op.getEntries();
 				while(it.hasNext()) {
 					Entry entry = it.next();
@ -314,7 +328,7 @@ public class ExtractorFactory {
 		ArrayList<POITextExtractor> e = new ArrayList<POITextExtractor>();
 		for(int i=0; i<dirs.size(); i++) {
 			e.add( createExtractor(
-					(DirectoryNode)dirs.get(i), ext.getFileSystem()
+					(DirectoryNode)dirs.get(i)
 			) );
 		}
 		for(int i=0; i<nonPOIFS.size(); i++) {
--- a/src/scratchpad/src/org/apache/poi/hwpf/HWPFDocument.java
+++ b/src/scratchpad/src/org/apache/poi/hwpf/HWPFDocument.java
@ -23,6 +23,8 @@ import java.io.IOException;
 import java.io.InputStream;
 import java.io.OutputStream;

+import org.apache.poi.hwpf.usermodel.ObjectPoolImpl;
+
 import org.apache.poi.hwpf.model.BookmarksTables;
 import org.apache.poi.hwpf.model.CHPBinTable;
 import org.apache.poi.hwpf.model.CPSplitCalculator;
@ -190,7 +192,9 @@ public final class HWPFDocument extends HWPFDocumentCore
   * @param pfilesystem The POIFSFileSystem that contains the Word document.
   * @throws IOException If there is an unexpected IOException from the passed
   *         in POIFSFileSystem.
+   * @deprecated Use {@link #HWPFDocument(DirectoryNode)} instead
   */
+  @Deprecated
  public HWPFDocument(DirectoryNode directory, POIFSFileSystem pfilesystem) throws IOException
  {
     this(directory);
--- a/src/scratchpad/src/org/apache/poi/hwpf/HWPFDocumentCore.java
+++ b/src/scratchpad/src/org/apache/poi/hwpf/HWPFDocumentCore.java
@ -17,10 +17,17 @@

 package org.apache.poi.hwpf;

+import java.io.FileNotFoundException;
 import java.io.IOException;
 import java.io.InputStream;
 import java.io.PushbackInputStream;

+import org.apache.poi.hwpf.usermodel.ObjectsPool;
+
+import org.apache.poi.poifs.filesystem.DirectoryEntry;
+
+import org.apache.poi.hwpf.usermodel.ObjectPoolImpl;
+
 import org.apache.poi.EncryptedDocumentException;
 import org.apache.poi.POIDocument;
 import org.apache.poi.hwpf.model.CHPBinTable;
@ -46,6 +53,9 @@ import org.apache.poi.util.Internal;
 */
 public abstract class HWPFDocumentCore extends POIDocument
 {
+  /** Holds OLE2 objects */
+  protected ObjectPoolImpl _objectPool;
+
  /** The FIB */
  protected FileInformationBlock _fib;

@ -148,6 +158,20 @@ public abstract class HWPFDocumentCore extends POIDocument
    if(_fib.isFEncrypted()) {
    	throw new EncryptedDocumentException("Cannot process encrypted word files!");
    }
+
+        {
+            DirectoryEntry objectPoolEntry;
+            try
+            {
+                objectPoolEntry = (DirectoryEntry) directory
+                        .getEntry( "ObjectPool" );
+            }
+            catch ( FileNotFoundException exc )
+            {
+                objectPoolEntry = directory.createDirectory( "ObjectPool" );
+            }
+            _objectPool = new ObjectPoolImpl( objectPoolEntry );
+        }
    }

    /**
@ -211,5 +235,10 @@ public abstract class HWPFDocumentCore extends POIDocument
    return _fib;
  }

+    public ObjectsPool getObjectsPool()
+    {
+        return _objectPool;
+    }
+
    public abstract TextPieceTable getTextTable();
 }
--- a/src/scratchpad/src/org/apache/poi/hwpf/HWPFOldDocument.java
+++ b/src/scratchpad/src/org/apache/poi/hwpf/HWPFOldDocument.java
@ -44,6 +44,7 @@ public class HWPFOldDocument extends HWPFDocumentCore {
        this(fs.getRoot());
    }

+    @Deprecated
    public HWPFOldDocument(DirectoryNode directory, POIFSFileSystem fs)
            throws IOException {
       this(directory);
--- a/src/scratchpad/src/org/apache/poi/hwpf/converter/AbstractWordConverter.java
+++ b/src/scratchpad/src/org/apache/poi/hwpf/converter/AbstractWordConverter.java
@ -47,6 +47,7 @@ import org.apache.poi.hwpf.usermodel.Section;
 import org.apache.poi.hwpf.usermodel.Table;
 import org.apache.poi.hwpf.usermodel.TableCell;
 import org.apache.poi.hwpf.usermodel.TableRow;
+import org.apache.poi.poifs.filesystem.Entry;
 import org.apache.poi.util.Beta;
 import org.apache.poi.util.POILogFactory;
 import org.apache.poi.util.POILogger;
@ -56,6 +57,32 @@ import org.w3c.dom.Element;
@Beta
 public abstract class AbstractWordConverter
 {
+    private static final class Structure implements Comparable<Structure>
+    {
+        final int end;
+        final int start;
+        final Object structure;
+
+        Structure( Bookmark bookmark )
+        {
+            this.start = bookmark.getStart();
+            this.end = bookmark.getEnd();
+            this.structure = bookmark;
+        }
+
+        Structure( Field field )
+        {
+            this.start = field.getFieldStartOffset();
+            this.end = field.getFieldEndOffset();
+            this.structure = field;
+        }
+
+        public int compareTo( Structure o )
+        {
+            return start < o.start ? -1 : start == o.start ? 0 : 1;
+        }
+    }
+
    private static final byte BEL_MARK = 7;

    private static final byte FIELD_BEGIN_MARK = 19;
@ -396,6 +423,13 @@ public abstract class AbstractWordConverter
                    processDrawnObject( doc, characterRun, block );
                    continue;
                }
+                if ( characterRun.isOle2()
+                        && ( wordDocument instanceof HWPFDocument ) )
+                {
+                    HWPFDocument doc = (HWPFDocument) wordDocument;
+                    processOle2( doc, characterRun, block );
+                    continue;
+                }
            }

            if ( text.getBytes()[0] == FIELD_BEGIN_MARK )
@ -613,10 +647,11 @@ public abstract class AbstractWordConverter
            CharacterRun characterRun, OfficeDrawing officeDrawing,
            String path, Element block );

-    protected abstract void processEndnoteAutonumbered( HWPFDocument wordDocument,
-            int noteIndex, Element block, Range endnoteTextRange );
+    protected abstract void processEndnoteAutonumbered(
+            HWPFDocument wordDocument, int noteIndex, Element block,
+            Range endnoteTextRange );

-    protected void processField( HWPFDocument hwpfDocument, Range parentRange,
+    protected void processField( HWPFDocument wordDocument, Range parentRange,
            int currentTableLevel, Field field, Element currentBlock )
    {
        switch ( field.getType() )
@ -633,7 +668,7 @@ public abstract class AbstractWordConverter
                if ( matcher.find() )
                {
                    String pageref = matcher.group( 1 );
-                    processPageref( hwpfDocument, currentBlock,
+                    processPageref( wordDocument, currentBlock,
                            field.secondSubrange( parentRange ),
                            currentTableLevel, pageref );
                    return;
@ -641,6 +676,36 @@ public abstract class AbstractWordConverter
            }
            break;
        }
+        case 58: // Embedded Object
+        {
+            if ( !field.hasSeparator() )
+            {
+                logger.log( POILogger.WARN, parentRange + " contains " + field
+                        + " with 'Embedded Object' but without separator mark" );
+                return;
+            }
+
+            CharacterRun separator = field
+                    .getMarkSeparatorCharacterRun( parentRange );
+
+            if ( separator.isOle2() )
+            {
+                // OLE2 is the only embedded object kind supported so far
+                boolean processed = processOle2( wordDocument, separator,
+                        currentBlock );
+
+                // if we didn't output OLE - output field value
+                if ( !processed )
+                {
+                    processCharacters( wordDocument, currentTableLevel,
+                            field.secondSubrange( parentRange ), currentBlock );
+                }
+
+                return;
+            }
+
+            break;
+        }
        case 88: // hyperlink
        {
            final Range firstSubrange = field.firstSubrange( parentRange );
@ -653,7 +718,7 @@ public abstract class AbstractWordConverter
                if ( matcher.find() )
                {
                    String hyperlink = matcher.group( 1 );
-                    processHyperlink( hwpfDocument, currentBlock,
+                    processHyperlink( wordDocument, currentBlock,
                            field.secondSubrange( parentRange ),
                            currentTableLevel, hyperlink );
                    return;
@ -665,12 +730,13 @@ public abstract class AbstractWordConverter

        logger.log( POILogger.WARN, parentRange + " contains " + field
                + " with unsupported type or format" );
-        processCharacters( hwpfDocument, currentTableLevel,
+        processCharacters( wordDocument, currentTableLevel,
                field.secondSubrange( parentRange ), currentBlock );
    }

-    protected abstract void processFootnoteAutonumbered( HWPFDocument wordDocument,
-            int noteIndex, Element block, Range footnoteTextRange );
+    protected abstract void processFootnoteAutonumbered(
+            HWPFDocument wordDocument, int noteIndex, Element block,
+            Range footnoteTextRange );

    protected abstract void processHyperlink( HWPFDocumentCore wordDocument,
            Element currentBlock, Range textRange, int currentTableLevel,
@ -732,6 +798,40 @@ public abstract class AbstractWordConverter
        }
    }

+    private boolean processOle2( HWPFDocument doc, CharacterRun characterRun,
+            Element block )
+    {
+        Entry entry = doc.getObjectsPool().getObjectById(
+                "_" + characterRun.getPicOffset() );
+        if ( entry == null )
+        {
+            logger.log( POILogger.WARN, "Referenced OLE2 object '",
+                    Integer.valueOf( characterRun.getPicOffset() ),
+                    "' not found in ObjectPool" );
+            return false;
+        }
+
+        try
+        {
+            return processOle2( doc, block, entry );
+        }
+        catch ( Exception exc )
+        {
+            logger.log( POILogger.WARN,
+                    "Unable to convert internal OLE2 object '",
+                    Integer.valueOf( characterRun.getPicOffset() ), "': ", exc,
+                    exc );
+            return false;
+        }
+    }
+
+    @SuppressWarnings( "unused" )
+    protected boolean processOle2( HWPFDocument wordDocument, Element block,
+            Entry entry ) throws Exception
+    {
+        return false;
+    }
+
    protected abstract void processPageref( HWPFDocumentCore wordDocument,
            Element currentBlock, Range textRange, int currentTableLevel,
            String pageref );
@ -896,30 +996,4 @@ public abstract class AbstractWordConverter
        return endMark;
    }

-    private static final class Structure implements Comparable<Structure>
-    {
-        final int end;
-        final int start;
-        final Object structure;
-
-        Structure( Bookmark bookmark )
-        {
-            this.start = bookmark.getStart();
-            this.end = bookmark.getEnd();
-            this.structure = bookmark;
-        }
-
-        Structure( Field field )
-        {
-            this.start = field.getFieldStartOffset();
-            this.end = field.getFieldEndOffset();
-            this.structure = field;
-        }
-
-        public int compareTo( Structure o )
-        {
-            return start < o.start ? -1 : start == o.start ? 0 : 1;
-        }
-    }
-
 }
--- a/src/scratchpad/src/org/apache/poi/hwpf/converter/AbstractWordUtils.java
+++ b/src/scratchpad/src/org/apache/poi/hwpf/converter/AbstractWordUtils.java
@ -34,6 +34,7 @@ import org.apache.poi.hwpf.usermodel.Paragraph;
 import org.apache.poi.hwpf.usermodel.Table;
 import org.apache.poi.hwpf.usermodel.TableCell;
 import org.apache.poi.hwpf.usermodel.TableRow;
+import org.apache.poi.poifs.filesystem.DirectoryNode;
 import org.apache.poi.poifs.filesystem.POIFSFileSystem;
 import org.apache.poi.util.Beta;
 import org.apache.poi.util.IOUtils;
@ -422,6 +423,19 @@ public class AbstractWordUtils
        return !isEmpty( str );
    }

+    public static HWPFDocumentCore loadDoc( final DirectoryNode root )
+            throws IOException
+    {
+        try
+        {
+            return new HWPFDocument( root );
+        }
+        catch ( OldWordFileFormatException exc )
+        {
+            return new HWPFOldDocument( root );
+        }
+    }
+
    public static HWPFDocumentCore loadDoc( File docFile ) throws IOException
    {
        final FileInputStream istream = new FileInputStream( docFile );
@ -438,16 +452,13 @@ public class AbstractWordUtils
    public static HWPFDocumentCore loadDoc( InputStream inputStream )
            throws IOException
    {
-        final POIFSFileSystem poifsFileSystem = HWPFDocumentCore
-                .verifyAndBuildPOIFS( inputStream );
-        try
-        {
-            return new HWPFDocument( poifsFileSystem );
+        return loadDoc( HWPFDocumentCore.verifyAndBuildPOIFS( inputStream ) );
    }
-        catch ( OldWordFileFormatException exc )
+
+    public static HWPFDocumentCore loadDoc(
+            final POIFSFileSystem poifsFileSystem ) throws IOException
    {
-            return new HWPFOldDocument( poifsFileSystem );
-        }
+        return loadDoc( poifsFileSystem.getRoot() );
    }

    static String substringBeforeLast( String str, String separator )
--- a/src/scratchpad/src/org/apache/poi/hwpf/converter/WordToFoConverter.java
+++ b/src/scratchpad/src/org/apache/poi/hwpf/converter/WordToFoConverter.java
@ -276,8 +276,8 @@ public class WordToFoConverter extends AbstractWordConverter
    }

    @Override
-    protected void processEndnoteAutonumbered( HWPFDocument wordDocument, int noteIndex,
-            Element block, Range endnoteTextRange )
+    protected void processEndnoteAutonumbered( HWPFDocument wordDocument,
+            int noteIndex, Element block, Range endnoteTextRange )
    {
        final String textIndex = String.valueOf( internalLinkCounter
                .incrementAndGet() );
@ -297,7 +297,8 @@ public class WordToFoConverter extends AbstractWordConverter
        setId( backwardLink, forwardLinkName );
        endnote.appendChild( backwardLink );

-        processCharacters( wordDocument, Integer.MIN_VALUE, endnoteTextRange, endnote );
+        processCharacters( wordDocument, Integer.MIN_VALUE, endnoteTextRange,
+                endnote );

        WordToFoUtils.compactInlines( endnote );
        this.endnotes.add( endnote );
--- a/src/scratchpad/src/org/apache/poi/hwpf/converter/WordToHtmlConverter.java
+++ b/src/scratchpad/src/org/apache/poi/hwpf/converter/WordToHtmlConverter.java
@ -63,7 +63,6 @@ import static org.apache.poi.hwpf.converter.AbstractWordUtils.TWIPS_PER_INCH;
@Beta
 public class WordToHtmlConverter extends AbstractWordConverter
 {
-
    /**
     * Holds properties values, applied to current <tt>p</tt> element. Those
     * properties shall not be doubled in children <tt>span</tt> elements.
@ -282,10 +281,11 @@ public class WordToHtmlConverter extends AbstractWordConverter
    }

    @Override
-    protected void processEndnoteAutonumbered( HWPFDocument wordDocument, int noteIndex,
-            Element block, Range endnoteTextRange )
+    protected void processEndnoteAutonumbered( HWPFDocument wordDocument,
+            int noteIndex, Element block, Range endnoteTextRange )
    {
-        processNoteAutonumbered( wordDocument, "end", noteIndex, block, endnoteTextRange );
+        processNoteAutonumbered( wordDocument, "end", noteIndex, block,
+                endnoteTextRange );
    }

    @Override
--- a/src/scratchpad/src/org/apache/poi/hwpf/converter/WordToTextConverter.java
+++ b/src/scratchpad/src/org/apache/poi/hwpf/converter/WordToTextConverter.java
@ -2,10 +2,14 @@ package org.apache.poi.hwpf.converter;

 import java.io.File;
 import java.io.FileWriter;
+import java.io.StringWriter;
+import java.lang.reflect.Method;
 import java.util.List;
 import java.util.concurrent.atomic.AtomicInteger;

+import javax.xml.parsers.DocumentBuilder;
 import javax.xml.parsers.DocumentBuilderFactory;
+import javax.xml.parsers.ParserConfigurationException;
 import javax.xml.transform.OutputKeys;
 import javax.xml.transform.Transformer;
 import javax.xml.transform.TransformerFactory;
@ -25,6 +29,8 @@ import org.apache.poi.hwpf.usermodel.Section;
 import org.apache.poi.hwpf.usermodel.Table;
 import org.apache.poi.hwpf.usermodel.TableCell;
 import org.apache.poi.hwpf.usermodel.TableRow;
+import org.apache.poi.poifs.filesystem.DirectoryNode;
+import org.apache.poi.poifs.filesystem.Entry;
 import org.apache.poi.util.Beta;
 import org.w3c.dom.Document;
 import org.w3c.dom.Element;
@ -33,6 +39,29 @@ import org.w3c.dom.Element;
 public class WordToTextConverter extends AbstractWordConverter
 {

+    public static String getText( DirectoryNode root ) throws Exception
+    {
+        final HWPFDocumentCore wordDocument = AbstractWordUtils.loadDoc( root );
+        return getText( wordDocument );
+    }
+
+    public static String getText( File docFile ) throws Exception
+    {
+        final HWPFDocumentCore wordDocument = AbstractWordUtils
+                .loadDoc( docFile );
+        return getText( wordDocument );
+    }
+
+    public static String getText( final HWPFDocumentCore wordDocument )
+            throws Exception
+    {
+        WordToTextConverter wordToTextConverter = new WordToTextConverter(
+                DocumentBuilderFactory.newInstance().newDocumentBuilder()
+                        .newDocument() );
+        wordToTextConverter.processDocument( wordDocument );
+        return wordToTextConverter.getText();
+    }
+
    /**
     * Java main() interface to interact with {@link WordToTextConverter}
     * 
@ -91,8 +120,24 @@ public class WordToTextConverter extends AbstractWordConverter

    private Element notes = null;

+    private boolean outputSummaryInformation = false;
+
    private final TextDocumentFacade textDocumentFacade;

+    /**
+     * Creates a new instance of {@link WordToTextConverter}. Can be used to
+     * output several {@link HWPFDocument}s into a single text document.
+     * 
+     * @throws ParserConfigurationException
+     *             if an internal {@link DocumentBuilder} cannot be created
+     */
+    public WordToTextConverter() throws ParserConfigurationException
+    {
+        this.textDocumentFacade = new TextDocumentFacade(
+                DocumentBuilderFactory.newInstance().newDocumentBuilder()
+                        .newDocument() );
+    }
+
    /**
     * Creates new instance of {@link WordToTextConverter}. Can be used for
     * output several {@link HWPFDocument}s into single text document.
@ -110,6 +155,28 @@ public class WordToTextConverter extends AbstractWordConverter
        return textDocumentFacade.getDocument();
    }

+    public String getText() throws Exception
+    {
+        StringWriter stringWriter = new StringWriter();
+        DOMSource domSource = new DOMSource( getDocument() );
+        StreamResult streamResult = new StreamResult( stringWriter );
+
+        TransformerFactory tf = TransformerFactory.newInstance();
+        Transformer serializer = tf.newTransformer();
+        // TODO set encoding from a command argument
+        serializer.setOutputProperty( OutputKeys.ENCODING, "UTF-8" );
+        serializer.setOutputProperty( OutputKeys.INDENT, "no" );
+        serializer.setOutputProperty( OutputKeys.METHOD, "text" );
+        serializer.transform( domSource, streamResult );
+
+        return stringWriter.toString();
+    }
+
+    public boolean isOutputSummaryInformation()
+    {
+        return outputSummaryInformation;
+    }
+
    @Override
    protected void outputCharacters( Element block, CharacterRun characterRun,
            String text )
@ -137,6 +204,8 @@ public class WordToTextConverter extends AbstractWordConverter
    @Override
    protected void processDocumentInformation(
            SummaryInformation summaryInformation )
+    {
+        if ( isOutputSummaryInformation() )
        {
            if ( AbstractWordUtils.isNotEmpty( summaryInformation.getTitle() ) )
                textDocumentFacade.setTitle( summaryInformation.getTitle() );
@ -144,12 +213,16 @@ public class WordToTextConverter extends AbstractWordConverter
            if ( AbstractWordUtils.isNotEmpty( summaryInformation.getAuthor() ) )
                textDocumentFacade.addAuthor( summaryInformation.getAuthor() );

-        if ( AbstractWordUtils.isNotEmpty( summaryInformation.getComments() ) )
-            textDocumentFacade
-                    .addDescription( summaryInformation.getComments() );
+            if ( AbstractWordUtils
+                    .isNotEmpty( summaryInformation.getComments() ) )
+                textDocumentFacade.addDescription( summaryInformation
+                        .getComments() );

-        if ( AbstractWordUtils.isNotEmpty( summaryInformation.getKeywords() ) )
-            textDocumentFacade.addKeywords( summaryInformation.getKeywords() );
+            if ( AbstractWordUtils
+                    .isNotEmpty( summaryInformation.getKeywords() ) )
+                textDocumentFacade.addKeywords( summaryInformation
+                        .getKeywords() );
+        }
    }

    @Override
@ -222,6 +295,48 @@ public class WordToTextConverter extends AbstractWordConverter
        note.appendChild( textDocumentFacade.createText( "\n" ) );
    }

+    /**
+     * Appends the text of an embedded OLE2 object to the given block, wrapped
+     * in {@link #UNICODECHAR_ZERO_WIDTH_SPACE} on both sides.
+     * <p>
+     * Directory nodes with a "WordDocument" entry are converted by
+     * {@link WordToTextConverter} itself. For any other directory node
+     * <tt>org.apache.poi.extractor.ExtractorFactory</tt> is looked up and
+     * called through reflection.
+     * 
+     * @return <tt>true</tt> if text was appended to the block, <tt>false</tt>
+     *         if the entry is not a {@link DirectoryNode} or the
+     *         ExtractorFactory class cannot be found
+     * @throws Exception
+     *             any failure other than the missing ExtractorFactory class
+     */
+    @Override
+    protected boolean processOle2( HWPFDocument wordDocument, Element block,
+            Entry entry ) throws Exception
+    {
+        if ( !( entry instanceof DirectoryNode ) )
+            return false;
+        DirectoryNode directoryNode = (DirectoryNode) entry;
+
+        String embeddedText;
+        if ( directoryNode.hasEntry( "WordDocument" ) )
+        {
+            // even if no ExtractorFactory in classpath
+            embeddedText = WordToTextConverter.getText( directoryNode );
+        }
+        else
+        {
+            try
+            {
+                Class<?> factoryClass = Class
+                        .forName( "org.apache.poi.extractor.ExtractorFactory" );
+                Method createExtractor = factoryClass.getMethod(
+                        "createExtractor", DirectoryNode.class );
+                Object extractor = createExtractor.invoke( null,
+                        directoryNode );
+
+                Method getText = extractor.getClass().getMethod( "getText" );
+                embeddedText = (String) getText.invoke( extractor );
+            }
+            catch ( ClassNotFoundException exc )
+            {
+                // no extractor in classpath
+                return false;
+            }
+        }
+
+        block.appendChild( textDocumentFacade
+                .createText( UNICODECHAR_ZERO_WIDTH_SPACE + embeddedText
+                        + UNICODECHAR_ZERO_WIDTH_SPACE ) );
+        return true;
+    }
+
    @Override
    protected void processPageref( HWPFDocumentCore wordDocument,
            Element currentBlock, Range textRange, int currentTableLevel,
@ -254,7 +369,7 @@ public class WordToTextConverter extends AbstractWordConverter
        textDocumentFacade.body.appendChild( sectionElement );
    }

-    protected void processTable( HWPFDocumentCore hwpfDocument, Element flow,
+    protected void processTable( HWPFDocumentCore wordDocument, Element flow,
            Table table )
    {
        final int tableRows = table.numRows();
@ -275,8 +390,8 @@ public class WordToTextConverter extends AbstractWordConverter
                    tableCellElement.appendChild( textDocumentFacade
                            .createText( "\t" ) );

-                processParagraphes( hwpfDocument, tableCellElement, tableCell,
-                        table.getTableLevel() );
+                processCharacters( wordDocument, table.getTableLevel(),
+                        tableCell, tableCellElement );
                tableRowElement.appendChild( tableCellElement );
            }

@ -285,4 +400,9 @@ public class WordToTextConverter extends AbstractWordConverter
        }
    }

+    /**
+     * Sets the flag returned by {@link #isOutputSummaryInformation()}.
+     * 
+     * @param outputDocumentInformation
+     *            new value of the <tt>outputSummaryInformation</tt> field
+     */
+    public void setOutputSummaryInformation( boolean outputDocumentInformation )
+    {
+        this.outputSummaryInformation = outputDocumentInformation;
+    }
+
 }
--- a/src/scratchpad/src/org/apache/poi/hwpf/extractor/Word6Extractor.java
+++ b/src/scratchpad/src/org/apache/poi/hwpf/extractor/Word6Extractor.java
@ -19,6 +19,10 @@ package org.apache.poi.hwpf.extractor;

 import java.io.IOException;
 import java.io.InputStream;
+import java.io.StringWriter;
+
+import org.apache.poi.hwpf.converter.WordToTextConverter;
+import org.apache.poi.hwpf.usermodel.HeaderStories;

 import org.apache.poi.POIOLE2TextExtractor;
 import org.apache.poi.hwpf.HWPFOldDocument;
@ -49,13 +53,29 @@ public final class Word6Extractor extends POIOLE2TextExtractor {

    /**
     * Create a new Word Extractor
-	 * @param fs POIFSFileSystem containing the word file
+     * 
+     * @param fs
+     *            POIFSFileSystem containing the word file
     */
-	public Word6Extractor(POIFSFileSystem fs) throws IOException {
-		this(fs.getRoot(), fs);
+    public Word6Extractor( POIFSFileSystem fs ) throws IOException
+    {
+        this( fs.getRoot() );
    }
-	public Word6Extractor(DirectoryNode dir, POIFSFileSystem fs) throws IOException {
-	    this(new HWPFOldDocument(dir,fs));
+
+    /**
+     * Create a new Word Extractor
+     * 
+     * @param dir
+     *            directory node handed on to
+     *            {@link #Word6Extractor(DirectoryNode)}
+     * @param fs
+     *            ignored, only the directory node is used
+     * @deprecated Use {@link #Word6Extractor(DirectoryNode)} instead
+     */
+    @Deprecated
+    @SuppressWarnings( "unused" )
+    public Word6Extractor( DirectoryNode dir, POIFSFileSystem fs )
+            throws IOException
+    {
+        this( dir );
+    }
+
+    /**
+     * Create a new Word Extractor
+     * 
+     * @param dir
+     *            directory node the underlying {@link HWPFOldDocument} is
+     *            constructed from
+     */
+    public Word6Extractor( DirectoryNode dir ) throws IOException
+    {
+        this( new HWPFOldDocument( dir ) );
+    }

 	/**
@ -71,6 +91,7 @@ public final class Word6Extractor extends POIOLE2TextExtractor {
     * Get the text from the word file, as an array with one String
     *  per paragraph
     */
+	@Deprecated
 	public String[] getParagraphText() {
 	    String[] ret;

@ -95,13 +116,25 @@ public final class Word6Extractor extends POIOLE2TextExtractor {
 	    return ret;
 	}

-    public String getText() {
+    public String getText()
+    {
+        try
+        {
+            WordToTextConverter wordToTextConverter = new WordToTextConverter();
+            wordToTextConverter.processDocument( doc );
+            return wordToTextConverter.getText();
+        }
+        catch ( Exception exc )
+        {
+            // fall-back
            StringBuffer text = new StringBuffer();

-        for(String t : getParagraphText()) {
+            for ( String t : getParagraphText() )
+            {
                text.append( t );
            }

            return text.toString();
        }
    }
+}
--- a/src/scratchpad/src/org/apache/poi/hwpf/extractor/WordExtractor.java
+++ b/src/scratchpad/src/org/apache/poi/hwpf/extractor/WordExtractor.java
@ -20,9 +20,12 @@ package org.apache.poi.hwpf.extractor;
 import java.io.FileInputStream;
 import java.io.IOException;
 import java.io.InputStream;
+import java.io.StringWriter;
 import java.util.ArrayList;
 import java.util.Arrays;

+import org.apache.poi.hwpf.converter.WordToTextConverter;
+
 import org.apache.poi.POIOLE2TextExtractor;
 import org.apache.poi.hwpf.HWPFDocument;
 import org.apache.poi.hwpf.usermodel.HeaderStories;
@ -34,53 +37,75 @@ import org.apache.poi.poifs.filesystem.POIFSFileSystem;
 /**
 * Class to extract the text from a Word Document.
 * 
- * You should use either getParagraphText() or getText() unless
- *  you have a strong reason otherwise.
+ * You should use either getParagraphText() or getText() unless you have a
+ * strong reason otherwise.
 * 
 * @author Nick Burch
 */
-public final class WordExtractor extends POIOLE2TextExtractor {
-	private POIFSFileSystem fs;
+public final class WordExtractor extends POIOLE2TextExtractor
+{
    private HWPFDocument doc;

    /**
     * Create a new Word Extractor
-	 * @param is InputStream containing the word file
+     * 
+     * @param is
+     *            InputStream containing the word file
     */
-	public WordExtractor(InputStream is) throws IOException {
+    public WordExtractor( InputStream is ) throws IOException
+    {
        this( HWPFDocument.verifyAndBuildPOIFS( is ) );
    }

    /**
     * Create a new Word Extractor
-	 * @param fs POIFSFileSystem containing the word file
+     * 
+     * @param fs
+     *            POIFSFileSystem containing the word file
     */
-	public WordExtractor(POIFSFileSystem fs) throws IOException {
+    public WordExtractor( POIFSFileSystem fs ) throws IOException
+    {
        this( new HWPFDocument( fs ) );
-		this.fs = fs;
    }
-	public WordExtractor(DirectoryNode dir, POIFSFileSystem fs) throws IOException {
-		this(new HWPFDocument(dir, fs));
-		this.fs = fs;
+
+    /**
+     * Create a new Word Extractor
+     * 
+     * @param dir
+     *            directory node handed on to
+     *            {@link #WordExtractor(DirectoryNode)}
+     * @param fs
+     *            ignored, only the directory node is used
+     * @deprecated Use {@link #WordExtractor(DirectoryNode)} instead
+     */
+    @Deprecated
+    public WordExtractor( DirectoryNode dir, POIFSFileSystem fs )
+            throws IOException
+    {
+        this( dir );
+    }
+
+    /**
+     * Create a new Word Extractor
+     * 
+     * @param dir
+     *            directory node the underlying {@link HWPFDocument} is
+     *            constructed from
+     */
+    public WordExtractor( DirectoryNode dir ) throws IOException
+    {
+        this( new HWPFDocument( dir ) );
+    }

    /**
     * Create a new Word Extractor
-	 * @param doc The HWPFDocument to extract from
+     * 
+     * @param doc
+     *            The HWPFDocument to extract from
     */
-	public WordExtractor(HWPFDocument doc) {
+    public WordExtractor( HWPFDocument doc )
+    {
        super( doc );
        this.doc = doc;
    }

    /**
-	 * Command line extractor, so people will stop moaning that
-	 *  they can't just run this.
+     * Command line extractor, so people will stop moaning that they can't just
+     * run this.
     */
-	public static void main(String[] args) throws IOException {
-		if(args.length == 0) {
+    public static void main( String[] args ) throws IOException
+    {
+        if ( args.length == 0 )
+        {
            System.err.println( "Use:" );
-			System.err.println("   java org.apache.poi.hwpf.extractor.WordExtractor <filename>");
+            System.err
+                    .println( "   java org.apache.poi.hwpf.extractor.WordExtractor <filename>" );
            System.exit( 1 );
        }

@ -91,18 +116,22 @@ public final class WordExtractor extends POIOLE2TextExtractor {
    }

    /**
-	 * Get the text from the word file, as an array with one String
-	 *  per paragraph
+     * Get the text from the word file, as an array with one String per
+     * paragraph
     */
-        public String[] getParagraphText() {
+    public String[] getParagraphText()
+    {
        String[] ret;

        // Extract using the model code
-                try {
+        try
+        {
            Range r = doc.getRange();

            ret = getParagraphText( r );
-                } catch (Exception e) {
+        }
+        catch ( Exception e )
+        {
            // Something's up with turning the text pieces into paragraphs
            // Fall back to ripping out the text pieces
            ret = new String[1];
@ -112,39 +141,46 @@ public final class WordExtractor extends POIOLE2TextExtractor {
        return ret;
    }

-        public String[] getFootnoteText() {
+    public String[] getFootnoteText()
+    {
        Range r = doc.getFootnoteRange();

        return getParagraphText( r );
    }

-        public String[] getMainTextboxText() {
+    public String[] getMainTextboxText()
+    {
        Range r = doc.getMainTextboxRange();

        return getParagraphText( r );
    }

-        public String[] getEndnoteText() {
+    public String[] getEndnoteText()
+    {
        Range r = doc.getEndnoteRange();

        return getParagraphText( r );
    }

-        public String[] getCommentsText() {
+    public String[] getCommentsText()
+    {
        Range r = doc.getCommentsRange();

        return getParagraphText( r );
    }

-        protected static String[] getParagraphText(Range r) {
+    protected static String[] getParagraphText( Range r )
+    {
        String[] ret;
        ret = new String[r.numParagraphs()];
-                for (int i = 0; i < ret.length; i++) {
+        for ( int i = 0; i < ret.length; i++ )
+        {
            Paragraph p = r.getParagraph( i );
            ret[i] = p.text();

            // Fix the line ending
-                        if (ret[i].endsWith("\r")) {
+            if ( ret[i].endsWith( "\r" ) )
+            {
                ret[i] = ret[i] + "\n";
            }
        }
@ -154,56 +190,71 @@ public final class WordExtractor extends POIOLE2TextExtractor {
    /**
     * Add the header/footer text, if it's not empty
     */
-	private void appendHeaderFooter(String text, StringBuffer out) {
+    private void appendHeaderFooter( String text, StringBuffer out )
+    {
        if ( text == null || text.length() == 0 )
            return;

        text = text.replace( '\r', '\n' );
-		if(! text.endsWith("\n")) {
+        if ( !text.endsWith( "\n" ) )
+        {
            out.append( text );
            out.append( '\n' );
            return;
        }
-		if(text.endsWith("\n\n")) {
+        if ( text.endsWith( "\n\n" ) )
+        {
            out.append( text.substring( 0, text.length() - 1 ) );
            return;
        }
        out.append( text );
        return;
    }
+
    /**
     * Grab the text from the headers
     */
-	public String getHeaderText() {
+    @Deprecated
+    public String getHeaderText()
+    {
        HeaderStories hs = new HeaderStories( doc );

        StringBuffer ret = new StringBuffer();
-		if(hs.getFirstHeader() != null) {
+        if ( hs.getFirstHeader() != null )
+        {
            appendHeaderFooter( hs.getFirstHeader(), ret );
        }
-		if(hs.getEvenHeader() != null) {
+        if ( hs.getEvenHeader() != null )
+        {
            appendHeaderFooter( hs.getEvenHeader(), ret );
        }
-		if(hs.getOddHeader() != null) {
+        if ( hs.getOddHeader() != null )
+        {
            appendHeaderFooter( hs.getOddHeader(), ret );
        }

        return ret.toString();
    }
+
    /**
     * Grab the text from the footers
     */
-	public String getFooterText() {
+    @Deprecated
+    public String getFooterText()
+    {
        HeaderStories hs = new HeaderStories( doc );

        StringBuffer ret = new StringBuffer();
-		if(hs.getFirstFooter() != null) {
+        if ( hs.getFirstFooter() != null )
+        {
            appendHeaderFooter( hs.getFirstFooter(), ret );
        }
-		if(hs.getEvenFooter() != null) {
+        if ( hs.getEvenFooter() != null )
+        {
            appendHeaderFooter( hs.getEvenFooter(), ret );
        }
-		if(hs.getOddFooter() != null) {
+        if ( hs.getOddFooter() != null )
+        {
            appendHeaderFooter( hs.getOddFooter(), ret );
        }

@ -211,18 +262,20 @@ public final class WordExtractor extends POIOLE2TextExtractor {
    }

    /**
-	 * Grab the text out of the text pieces. Might also include various
-	 *  bits of crud, but will work in cases where the text piece -> paragraph
-	 *  mapping is broken. Fast too.
+     * Grab the text out of the text pieces. Might also include various bits of
+     * crud, but will work in cases where the text piece -> paragraph mapping is
+     * broken. Fast too.
     */
-	public String getTextFromPieces() {
+    public String getTextFromPieces()
+    {
        String text = doc.getDocumentText();

        // Fix line endings (Note - won't get all of them
        text = text.replaceAll( "\r\r\r", "\r\n\r\n\r\n" );
        text = text.replaceAll( "\r\r", "\r\n\r\n" );

-    	if(text.endsWith("\r")) {
+        if ( text.endsWith( "\r" ) )
+        {
            text += "\n";
        }

@ -230,34 +283,53 @@ public final class WordExtractor extends POIOLE2TextExtractor {
    }

    /**
-	 * Grab the text, based on the paragraphs. Shouldn't include any crud,
-	 *  but slightly slower than getTextFromPieces().
+     * Grab the text, based on the WordToTextConverter. Shouldn't include any
+     * crud, but slower than getTextFromPieces().
     */
-	public String getText() {
-	   StringBuffer ret = new StringBuffer();
+    public String getText()
+    {
+        try
+        {
+            final StringWriter stringWriter = new StringWriter();
+            @SuppressWarnings( "unused" )
+            WordToTextConverter wordToTextConverter = new WordToTextConverter()
+            {
+                {
+                    HeaderStories hs = new HeaderStories( doc );

-	   ret.append(getHeaderText());
+                    if ( hs.getFirstHeaderSubrange() != null )
+                        processDocumentPart( doc, hs.getFirstHeaderSubrange() );
+                    if ( hs.getEvenHeaderSubrange() != null )
+                        processDocumentPart( doc, hs.getEvenHeaderSubrange() );
+                    if ( hs.getOddHeaderSubrange() != null )
+                        processDocumentPart( doc, hs.getOddHeaderSubrange() );

-	   ArrayList<String> text = new ArrayList<String>();
-	   text.addAll(Arrays.asList(getParagraphText()));
-	   text.addAll(Arrays.asList(getMainTextboxText()));
-	   text.addAll(Arrays.asList(getFootnoteText()));
-	   text.addAll(Arrays.asList(getEndnoteText()));
+                    processDocument( doc );
+                    processDocumentPart( doc, doc.getMainTextboxRange() );

-	   for(String p : text) {
-	      ret.append(p);
+                    if ( hs.getFirstFooterSubrange() != null )
+                        processDocumentPart( doc, hs.getFirstFooterSubrange() );
+                    if ( hs.getEvenFooterSubrange() != null )
+                        processDocumentPart( doc, hs.getEvenFooterSubrange() );
+                    if ( hs.getOddFooterSubrange() != null )
+                        processDocumentPart( doc, hs.getOddFooterSubrange() );
+
+                    stringWriter.append( getText() );
+                }
+            };
+            return stringWriter.toString();
+        }
+        catch ( Exception exc )
+        {
+            throw new RuntimeException( exc );
        }
-
-	   ret.append(getFooterText());
-
-	   return ret.toString();
    }

    /**
-	 * Removes any fields (eg macros, page markers etc)
-	 *  from the string.
+     * Removes any fields (eg macros, page markers etc) from the string.
     */
-	public static String stripFields(String text) {
+    public static String stripFields( String text )
+    {
        return Range.stripFields( text );
    }
 }
--- a/src/scratchpad/src/org/apache/poi/hwpf/usermodel/Field.java
+++ b/src/scratchpad/src/org/apache/poi/hwpf/usermodel/Field.java
@ -17,17 +17,23 @@ public interface Field
     */
    int getFieldStartOffset();

+    /**
+     * @param parent
+     *            range the lookup is performed in
+     * @return character run located at the end field mark position
+     */
+    CharacterRun getMarkEndCharacterRun( Range parent );
+
    /**
     * @return character position of end field mark
     */
    int getMarkEndOffset();

+    /**
+     * @param parent
+     *            range the lookup is performed in
+     * @return character run located at the separator field mark position;
+     *         the FieldImpl implementation returns <tt>null</tt> if the
+     *         field has no separator
+     */
+    CharacterRun getMarkSeparatorCharacterRun( Range parent );
+
    /**
     * @return character position of separator field mark (if present,
     *         {@link NullPointerException} otherwise)
     */
    int getMarkSeparatorOffset();

+    /**
+     * @param parent
+     *            range the lookup is performed in
+     * @return character run located at the start field mark position
+     */
+    CharacterRun getMarkStartCharacterRun( Range parent );
+
    /**
     * @return character position of start field mark
     */
--- a/src/scratchpad/src/org/apache/poi/hwpf/usermodel/FieldImpl.java
+++ b/src/scratchpad/src/org/apache/poi/hwpf/usermodel/FieldImpl.java
@ -112,6 +112,12 @@ class FieldImpl implements Field
        return startPlex.getFcStart();
    }

+    /**
+     * @param parent
+     *            parent of the one-character range created around the mark
+     * @return first character run of the range that starts at the end field
+     *         mark offset and is one character long
+     */
+    public CharacterRun getMarkEndCharacterRun( Range parent )
+    {
+        final int markOffset = getMarkEndOffset();
+        final Range markRange = new Range( markOffset, markOffset + 1, parent );
+        return markRange.getCharacterRun( 0 );
+    }
+
    /**
     * @return character position of end field mark
     */
@ -120,6 +126,15 @@ class FieldImpl implements Field
        return endPlex.getFcStart();
    }

+    /**
+     * @param parent
+     *            parent of the one-character range created around the mark
+     * @return first character run of the range that starts at the separator
+     *         field mark offset and is one character long, or <tt>null</tt>
+     *         if this field has no separator
+     */
+    public CharacterRun getMarkSeparatorCharacterRun( Range parent )
+    {
+        if ( hasSeparator() )
+        {
+            final int markOffset = getMarkSeparatorOffset();
+            final Range markRange = new Range( markOffset, markOffset + 1,
+                    parent );
+            return markRange.getCharacterRun( 0 );
+        }
+
+        return null;
+    }
+
    /**
     * @return character position of separator field mark (if present,
     *         {@link NullPointerException} otherwise)
@ -129,6 +144,12 @@ class FieldImpl implements Field
        return separatorPlex.getFcStart();
    }

+    /**
+     * @param parent
+     *            parent of the one-character range created around the mark
+     * @return first character run of the range that starts at the start field
+     *         mark offset and is one character long
+     */
+    public CharacterRun getMarkStartCharacterRun( Range parent )
+    {
+        final int markOffset = getMarkStartOffset();
+        final Range markRange = new Range( markOffset, markOffset + 1, parent );
+        return markRange.getCharacterRun( 0 );
+    }
+
    /**
     * @return character position of start field mark
     */
--- a/src/scratchpad/src/org/apache/poi/hwpf/usermodel/HeaderStories.java
+++ b/src/scratchpad/src/org/apache/poi/hwpf/usermodel/HeaderStories.java
@ -82,35 +82,96 @@ public final class HeaderStories {
                fib.getPlcfHddSize(), 0 );
    }

-	public String getFootnoteSeparator() {
+    @Deprecated
+    public String getFootnoteSeparator()
+    {
        return getAt( 0 );
    }
-	public String getFootnoteContSeparator() {
+
+    @Deprecated
+    public String getFootnoteContSeparator()
+    {
        return getAt( 1 );
    }
-	public String getFootnoteContNote() {
+
+    @Deprecated
+    public String getFootnoteContNote()
+    {
        return getAt( 2 );
    }
-	public String getEndnoteSeparator() {
+
+    @Deprecated
+    public String getEndnoteSeparator()
+    {
        return getAt( 3 );
    }
-	public String getEndnoteContSeparator() {
+
+    @Deprecated
+    public String getEndnoteContSeparator()
+    {
        return getAt( 4 );
    }
-	public String getEndnoteContNote() {
+
+    @Deprecated
+    public String getEndnoteContNote()
+    {
        return getAt( 5 );
    }

+    /**
+     * @return the footnote separator story (plcfHdd index 0) as a
+     *         {@link Range}, or <tt>null</tt> if there is no plcfHdd or the
+     *         story is empty or broken
+     */
+    public Range getFootnoteSeparatorSubrange()
+    {
+        return getSubrangeAt( 0 );
+    }

+    /**
+     * @return the footnote continuation separator story (plcfHdd index 1) as
+     *         a {@link Range}, or <tt>null</tt> if there is no plcfHdd or the
+     *         story is empty or broken
+     */
+    public Range getFootnoteContSeparatorSubrange()
+    {
+        return getSubrangeAt( 1 );
+    }
+
+    /**
+     * @return the footnote continuation note story (plcfHdd index 2) as a
+     *         {@link Range}, or <tt>null</tt> if there is no plcfHdd or the
+     *         story is empty or broken
+     */
+    public Range getFootnoteContNoteSubrange()
+    {
+        return getSubrangeAt( 2 );
+    }
+
+    /**
+     * @return the endnote separator story (plcfHdd index 3) as a
+     *         {@link Range}, or <tt>null</tt> if there is no plcfHdd or the
+     *         story is empty or broken
+     */
+    public Range getEndnoteSeparatorSubrange()
+    {
+        return getSubrangeAt( 3 );
+    }
+
+    /**
+     * @return the endnote continuation separator story (plcfHdd index 4) as a
+     *         {@link Range}, or <tt>null</tt> if there is no plcfHdd or the
+     *         story is empty or broken
+     */
+    public Range getEndnoteContSeparatorSubrange()
+    {
+        return getSubrangeAt( 4 );
+    }
+
+    /**
+     * @return the endnote continuation note story (plcfHdd index 5) as a
+     *         {@link Range}, or <tt>null</tt> if there is no plcfHdd or the
+     *         story is empty or broken
+     */
+    public Range getEndnoteContNoteSubrange()
+    {
+        return getSubrangeAt( 5 );
+    }
+
+	@Deprecated
 	public String getEvenHeader() {
 		return getAt(6+0);
 	}
+    @Deprecated
 	public String getOddHeader() {
 		return getAt(6+1);
 	}
+    @Deprecated
 	public String getFirstHeader() {
 		return getAt(6+4);
 	}
+	
+
+    /**
+     * @return the even header story (plcfHdd index 6 + 0) as a
+     *         {@link Range}, or <tt>null</tt> if there is no plcfHdd or the
+     *         story is empty or broken
+     */
+    public Range getEvenHeaderSubrange() {
+        return getSubrangeAt(6+0);
+    }
+    /**
+     * @return the odd header story (plcfHdd index 6 + 1) as a
+     *         {@link Range}, or <tt>null</tt> if there is no plcfHdd or the
+     *         story is empty or broken
+     */
+    public Range getOddHeaderSubrange() {
+        return getSubrangeAt(6+1);
+    }
+    /**
+     * @return the first header story (plcfHdd index 6 + 4) as a
+     *         {@link Range}, or <tt>null</tt> if there is no plcfHdd or the
+     *         story is empty or broken
+     */
+    public Range getFirstHeaderSubrange() {
+        return getSubrangeAt(6+4);
+    }
+    
 	/**
 	 * Returns the correct, defined header for the given
 	 *  one based page
@ -135,16 +196,39 @@ public final class HeaderStories {
 		return getOddHeader();
 	}

-
-	public String getEvenFooter() {
+	@Deprecated
+    public String getEvenFooter()
+    {
        return getAt( 6 + 2 );
    }
-	public String getOddFooter() {
+
+    @Deprecated
+    public String getOddFooter()
+    {
        return getAt( 6 + 3 );
    }
-	public String getFirstFooter() {
+
+    @Deprecated
+    public String getFirstFooter()
+    {
        return getAt( 6 + 5 );
    }
+
+    /**
+     * @return the even footer story (plcfHdd index 6 + 2) as a
+     *         {@link Range}, or <tt>null</tt> if there is no plcfHdd or the
+     *         story is empty or broken
+     */
+    public Range getEvenFooterSubrange()
+    {
+        return getSubrangeAt( 6 + 2 );
+    }
+
+    /**
+     * @return the odd footer story (plcfHdd index 6 + 3) as a
+     *         {@link Range}, or <tt>null</tt> if there is no plcfHdd or the
+     *         story is empty or broken
+     */
+    public Range getOddFooterSubrange()
+    {
+        return getSubrangeAt( 6 + 3 );
+    }
+
+    /**
+     * @return the first footer story (plcfHdd index 6 + 5) as a
+     *         {@link Range}, or <tt>null</tt> if there is no plcfHdd or the
+     *         story is empty or broken
+     */
+    public Range getFirstFooterSubrange()
+    {
+        return getSubrangeAt( 6 + 5 );
+    }
+
 	/**
 	 * Returns the correct, defined footer for the given
 	 *  one based page
@ -174,6 +258,7 @@ public final class HeaderStories {
 	 * Get the string that's pointed to by the
 	 *  given plcfHdd index
 	 */
+    @Deprecated
 	private String getAt(int plcfHddIndex) {
 		if(plcfHdd == null) return null;

@ -209,6 +294,32 @@ public final class HeaderStories {
 		return text;
 	}

+    /**
+     * Get the part of the header stories range that's pointed to by the
+     * given plcfHdd index
+     * 
+     * @return the story as a {@link Range} with both bounds clamped to the
+     *         length of the header stories range, or <tt>null</tt> if there
+     *         is no plcfHdd or the story is empty or broken
+     */
+    private Range getSubrangeAt( int plcfHddIndex )
+    {
+        if ( plcfHdd == null )
+            return null;
+
+        final GenericPropertyNode story = plcfHdd.getProperty( plcfHddIndex );
+        final int storyStart = story.getStart();
+        final int storyEnd = story.getEnd();
+
+        // Empty story (start == end) or broken properties (end < start)
+        if ( storyEnd <= storyStart )
+            return null;
+
+        // Story offsets are relative to the start of the header stories and
+        // must not run past their end
+        final int base = headerStories.getStartOffset();
+        final int headersLength = headerStories.getEndOffset() - base;
+        final int clampedStart = Math.min( storyStart, headersLength );
+        final int clampedEnd = Math.min( storyEnd, headersLength );
+
+        return new Range( base + clampedStart, base + clampedEnd,
+                headerStories );
+    }
+
 	public Range getRange() {
 		return headerStories;
 	}
--- a/src/scratchpad/src/org/apache/poi/hwpf/usermodel/ObjectPoolImpl.java
+++ b/src/scratchpad/src/org/apache/poi/hwpf/usermodel/ObjectPoolImpl.java
@ -0,0 +1,34 @@
+package org.apache.poi.hwpf.usermodel;
+
+import java.io.FileNotFoundException;
+
+import org.apache.poi.poifs.filesystem.DirectoryEntry;
+import org.apache.poi.poifs.filesystem.Entry;
+import org.apache.poi.util.Internal;
+
+/**
+ * {@link ObjectsPool} implementation that resolves objects as entries of a
+ * {@link DirectoryEntry}.
+ */
+@Internal
+public class ObjectPoolImpl implements ObjectsPool
+{
+    /** Directory the objects are looked up in, may be <tt>null</tt> */
+    private final DirectoryEntry _objectPool;
+
+    /**
+     * @param _objectPool
+     *            directory the objects are looked up in; if <tt>null</tt>,
+     *            every lookup returns <tt>null</tt>
+     */
+    public ObjectPoolImpl( DirectoryEntry _objectPool )
+    {
+        this._objectPool = _objectPool;
+    }
+
+    /**
+     * @param objId
+     *            name of the entry to look up in the directory
+     * @return the entry with the given name, or <tt>null</tt> if there is no
+     *         directory or the directory has no such entry
+     */
+    public Entry getObjectById( String objId )
+    {
+        if ( _objectPool == null )
+            return null;
+
+        try
+        {
+            return _objectPool.getEntry( objId );
+        }
+        catch ( FileNotFoundException exc )
+        {
+            // a missing entry is reported as null, not as an exception
+            return null;
+        }
+    }
+}
--- a/src/scratchpad/src/org/apache/poi/hwpf/usermodel/ObjectsPool.java
+++ b/src/scratchpad/src/org/apache/poi/hwpf/usermodel/ObjectsPool.java
@ -0,0 +1,8 @@
+package org.apache.poi.hwpf.usermodel;
+
+import org.apache.poi.poifs.filesystem.Entry;
+
+/**
+ * Gives access to pooled objects by their identifier.
+ */
+public interface ObjectsPool
+{
+    /**
+     * @param objId
+     *            identifier of the object to look up
+     * @return entry for the given identifier; the ObjectPoolImpl
+     *         implementation returns <tt>null</tt> if there is no such entry
+     */
+    Entry getObjectById( String objId );
+}
--- a/src/scratchpad/src/org/apache/poi/hwpf/usermodel/Range.java
+++ b/src/scratchpad/src/org/apache/poi/hwpf/usermodel/Range.java
@ -36,6 +36,8 @@ import org.apache.poi.hwpf.sprm.CharacterSprmCompressor;
 import org.apache.poi.hwpf.sprm.ParagraphSprmCompressor;
 import org.apache.poi.hwpf.sprm.SprmBuffer;
 import org.apache.poi.util.LittleEndian;
+import org.apache.poi.util.POILogFactory;
+import org.apache.poi.util.POILogger;

 /**
 * This class is the central class of the HWPF object model. All properties that
@ -52,6 +54,8 @@ import org.apache.poi.util.LittleEndian;
 */
 public class Range { // TODO -instantiable superclass

+    // shared by all instances, so creating a Range does not look up a logger
+    private static final POILogger logger = POILogFactory.getLogger( Range.class );
+    
 	public static final int TYPE_PARAGRAPH = 0;
 	public static final int TYPE_CHARACTER = 1;
 	public static final int TYPE_SECTION = 2;
@ -888,9 +892,12 @@ public class Range { // TODO -instantiable superclass
        initAll();
        if ( tableEndInclusive >= this._parEnd )
        {
-            throw new ArrayIndexOutOfBoundsException(
-                    "The table's bounds fall outside of this Range" );
+            logger.log( POILogger.WARN, "The table's bounds ", "["
+                    + this._parStart + "; " + tableEndInclusive + ")",
+                    " fall outside of this Range paragraphs numbers ", "["
+                            + this._parStart + "; " + this._parEnd + ")" );
        }
+
        if ( tableEndInclusive < 0 )
        {
            throw new ArrayIndexOutOfBoundsException(
--- a/src/scratchpad/testcases/org/apache/poi/hwpf/converter/TestWordToTextConverter.java
+++ b/src/scratchpad/testcases/org/apache/poi/hwpf/converter/TestWordToTextConverter.java
@ -0,0 +1,22 @@
+package org.apache.poi.hwpf.converter;
+
+import junit.framework.TestCase;
+import org.apache.poi.hwpf.HWPFDocument;
+import org.apache.poi.hwpf.HWPFTestDataSamples;
+
+/**
+ * Tests for the plain text output of {@link WordToTextConverter}.
+ */
+public class TestWordToTextConverter extends TestCase
+{
+
+    /**
+     * [FAILING] Bug 47731 - Word Extractor considers text copied from some
+     * website as an embedded object
+     */
+    public void testBug47731() throws Exception
+    {
+        HWPFDocument doc = HWPFTestDataSamples.openSampleFile( "Bug47731.doc" );
+        String foundText = WordToTextConverter.getText( doc );
+
+        // the sentence must be present in the converted text
+        assertTrue( foundText
+                .contains( "Soak the rice in water for three to four hours" ) );
+    }
+}
--- a/src/scratchpad/testcases/org/apache/poi/hwpf/extractor/TestWordExtractor.java
+++ b/src/scratchpad/testcases/org/apache/poi/hwpf/extractor/TestWordExtractor.java
@ -33,6 +33,16 @@ import org.apache.poi.poifs.filesystem.POIFSFileSystem;
 * @author Nick Burch (nick at torchbox dot com)
 */
 public final class TestWordExtractor extends TestCase {
+
+    /**
+     * Compares two strings ignoring the line ending style and surrounding
+     * whitespace: CR LF pairs and single CRs are replaced with LF and both
+     * values are trimmed before they are handed to
+     * {@link TestCase#assertEquals(String, String)}. <tt>null</tt> values are
+     * passed through unchanged, so they end in a regular assertion failure
+     * instead of a {@link NullPointerException}.
+     */
+    public static void assertEquals( String expected, String actual )
+    {
+        String newExpected = expected == null ? null : expected
+                .replace( "\r\n", "\n" ).replace( '\r', '\n' ).trim();
+        String newActual = actual == null ? null : actual
+                .replace( "\r\n", "\n" ).replace( '\r', '\n' ).trim();
+        TestCase.assertEquals( newExpected, newActual );
+    }
+
 	private String[] p_text1 = new String[] {
 			"This is a simple word document\r\n",
 			"\r\n",
@ -109,9 +119,11 @@ public final class TestWordExtractor extends TestCase {

        // For the 2nd, should give similar answers for
        // the two methods, differing only in line endings
-		assertEquals(
-		      extractor2.getTextFromPieces().replaceAll("[\\r\\n]", ""), 
-		      extractor2.getText().replaceAll("[\\r\\n]", ""));
+
+        // Not asserted any more: getTextFromPieces() may include garbage that
+        // getText() leaves out, so the two results legitimately differ
+        // assertEquals(
+        // extractor2.getTextFromPieces().replaceAll("[\\r\\n]", ""),
+        // extractor2.getText().replaceAll("[\\r\\n]", ""));
    }

 	/**
@ -330,7 +342,7 @@ public final class TestWordExtractor extends TestCase {
       
       // Open directly 
       for(DirectoryNode dir : files) {
-          WordExtractor extractor = new WordExtractor(dir, null);
+          WordExtractor extractor = new WordExtractor(dir);
          assertEquals(p_text1_block, extractor.getText());
       }

--- a/src/scratchpad/testcases/org/apache/poi/hwpf/usermodel/TestBugs.java
+++ b/src/scratchpad/testcases/org/apache/poi/hwpf/usermodel/TestBugs.java
@ -43,6 +43,15 @@ import org.apache.poi.util.IOUtils;
 public class TestBugs extends TestCase
 {

+    /**
+     * Compares two strings ignoring the line ending style and surrounding
+     * whitespace: CR LF pairs and single CRs are replaced with LF and both
+     * values are trimmed before they are handed to
+     * {@link TestCase#assertEquals(String, String)}. <tt>null</tt> values are
+     * passed through unchanged, so they end in a regular assertion failure
+     * instead of a {@link NullPointerException}.
+     */
+    public static void assertEquals( String expected, String actual )
+    {
+        String newExpected = expected == null ? null : expected
+                .replace( "\r\n", "\n" ).replace( '\r', '\n' ).trim();
+        String newActual = actual == null ? null : actual
+                .replace( "\r\n", "\n" ).replace( '\r', '\n' ).trim();
+        TestCase.assertEquals( newExpected, newActual );
+    }
+
    private static void assertTableStructures( Range expected, Range actual )
    {
        assertEquals( expected.numParagraphs(), actual.numParagraphs() );