Add Word-to-Text converter and use it as replacement for WordExtractor

git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@1155336 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Sergey Vladimirov 2011-08-09 12:38:52 +00:00
parent b47081db42
commit 1d9900c184
26 changed files with 1117 additions and 488 deletions

View File

@ -34,6 +34,7 @@
<changes>
<release version="3.8-beta4" date="2011-??-??">
<action dev="poi-developers" type="add">Add Word-to-Text converter and use it as replacement for WordExtractor</action>
<action dev="poi-developers" type="fix">51604 - replace text fails for doc ( poi 3.8 beta release from download site )</action>
<action dev="poi-developers" type="fix">Fixed incorrect encoding of non-breaking space (0xA0) in SXSSF</action>
<action dev="poi-developers" type="add">Support for conditional formatting in XSSF</action>

View File

@ -19,6 +19,7 @@ package org.apache.poi;
import org.apache.poi.hpsf.DocumentSummaryInformation;
import org.apache.poi.hpsf.SummaryInformation;
import org.apache.poi.hpsf.extractor.HPSFPropertiesExtractor;
import org.apache.poi.poifs.filesystem.DirectoryEntry;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
/**
@ -61,11 +62,19 @@ public abstract class POIOLE2TextExtractor extends POITextExtractor {
return new HPSFPropertiesExtractor(this);
}
/**
 * Returns the directory entry that the underlying document was read from.
 *
 * @return the document's directory, or <code>null</code> when this
 *         extractor holds no document
 */
public DirectoryEntry getRoot()
{
    // Subclasses are allowed to hand null to the super constructor (the
    // event based Excel extractor does so), and callers test the result
    // for null. Return null instead of failing with a
    // NullPointerException in that case.
    return document == null ? null : document.directory;
}
/**
* Return the underlying POIFS FileSystem of
* this document.
* Return the underlying POIFS FileSystem of this document.
*
* @deprecated Use {@link #getRoot()} instead
*/
public POIFSFileSystem getFileSystem() {
@Deprecated
public POIFSFileSystem getFileSystem()
{
return document.directory.getFileSystem();
}
}

View File

@ -61,17 +61,27 @@ import org.apache.poi.poifs.filesystem.POIFSFileSystem;
*/
public class EventBasedExcelExtractor extends POIOLE2TextExtractor {
private DirectoryNode _dir;
private POIFSFileSystem _fs;
boolean _includeSheetNames = true;
boolean _formulasNotResults = false;
public EventBasedExcelExtractor(DirectoryNode dir, POIFSFileSystem fs) {
/**
* Creates an extractor for the given directory. The <code>fs</code>
* argument is not used; this constructor only delegates to
* {@link #EventBasedExcelExtractor(DirectoryNode)}.
*
* @param dir directory node handed on to the single-argument constructor
* @param fs ignored
* @deprecated Use {@link #EventBasedExcelExtractor(DirectoryNode)} instead
*/
@Deprecated
@SuppressWarnings( "unused" )
public EventBasedExcelExtractor( DirectoryNode dir, POIFSFileSystem fs )
{
this( dir );
}
public EventBasedExcelExtractor( DirectoryNode dir )
{
super( null );
_dir = dir;
_fs = fs;
}
public EventBasedExcelExtractor(POIFSFileSystem fs) {
this(fs.getRoot(), fs);
this(fs.getRoot());
}
/**
@ -79,7 +89,7 @@ public class EventBasedExcelExtractor extends POIOLE2TextExtractor {
* this document.
*/
public POIFSFileSystem getFileSystem() {
return _fs;
return _dir.getFileSystem();
}
/**

View File

@ -24,7 +24,6 @@ import java.io.InputStream;
import java.io.PrintStream;
import org.apache.poi.POIOLE2TextExtractor;
import org.apache.poi.ss.formula.eval.ErrorEval;
import org.apache.poi.hssf.usermodel.HSSFCell;
import org.apache.poi.hssf.usermodel.HSSFCellStyle;
import org.apache.poi.hssf.usermodel.HSSFComment;
@ -35,6 +34,7 @@ import org.apache.poi.hssf.usermodel.HSSFSheet;
import org.apache.poi.hssf.usermodel.HSSFWorkbook;
import org.apache.poi.poifs.filesystem.DirectoryNode;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
import org.apache.poi.ss.formula.eval.ErrorEval;
import org.apache.poi.ss.usermodel.HeaderFooter;
/**
@ -66,10 +66,18 @@ public class ExcelExtractor extends POIOLE2TextExtractor implements org.apache.p
_formatter = new HSSFDataFormatter();
}
public ExcelExtractor(POIFSFileSystem fs) throws IOException {
this(fs.getRoot(), fs);
this(fs.getRoot());
}
/**
* @deprecated Use {@link #ExcelExtractor(DirectoryNode)} instead
*/
@Deprecated
@SuppressWarnings( "unused" )
public ExcelExtractor(DirectoryNode dir, POIFSFileSystem fs) throws IOException {
this(new HSSFWorkbook(dir, fs, true));
this( dir );
}
/**
* Creates an extractor for the workbook stored in the given directory. An
* {@link HSSFWorkbook} is built from the directory and handed to the
* workbook based constructor.
*
* @param dir directory node the workbook is loaded from
* @throws IOException declared because building the workbook from the
* directory may fail
*/
public ExcelExtractor(DirectoryNode dir) throws IOException {
this(new HSSFWorkbook(dir, true));
}
private static final class CommandParseException extends Exception {

View File

@ -19,9 +19,10 @@
package org.apache.poi.poifs.filesystem;
import java.io.*;
import java.util.*;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.util.Iterator;
import org.apache.poi.hpsf.ClassID;
@ -67,6 +68,12 @@ public interface DirectoryEntry
public int getEntryCount();
/**
* Checks whether an entry with the specified name is present in this
* directory.
*
* @param name the name of the entry to look for
* @return <code>true</code> if an entry with that name is present
*/
public boolean hasEntry( final String name );
/**
* get a specified Entry by name
*

View File

@ -342,6 +342,11 @@ public class DirectoryNode
return _entries.size();
}
/**
 * Tells whether this directory contains an entry with the given name.
 *
 * @param name entry name to look up; may be <code>null</code>
 * @return <code>true</code> if the name is not <code>null</code> and is a
 *         key of the by-name index, <code>false</code> otherwise
 */
public boolean hasEntry( String name )
{
    if ( name == null )
    {
        // a null name never matches an entry
        return false;
    }
    return _byname.containsKey( name );
}
/**
* get a specified Entry by name
*

View File

@ -193,59 +193,73 @@ public class ExtractorFactory {
public static POIOLE2TextExtractor createExtractor(POIFSFileSystem fs) throws IOException, InvalidFormatException, OpenXML4JException, XmlException {
// Only ever an OLE2 one from the root of the FS
return (POIOLE2TextExtractor)createExtractor(fs.getRoot(), fs);
return (POIOLE2TextExtractor)createExtractor(fs.getRoot());
}
public static POITextExtractor createExtractor(DirectoryNode poifsDir, POIFSFileSystem fs) throws IOException, InvalidFormatException, OpenXML4JException, XmlException {
/**
* Creates a text extractor for the document stored in the given directory.
* The <code>fs</code> argument is not used; the call is forwarded to
* {@link #createExtractor(DirectoryNode)}.
*
* @param poifsDir directory that is inspected for a supported document
* @param fs ignored
* @return the extractor returned by {@link #createExtractor(DirectoryNode)}
* @deprecated Use {@link #createExtractor(DirectoryNode)} instead
*/
@Deprecated
@SuppressWarnings("unused")
public static POITextExtractor createExtractor(DirectoryNode poifsDir, POIFSFileSystem fs)
throws IOException, InvalidFormatException, OpenXML4JException, XmlException
{
return createExtractor(poifsDir);
}
public static POITextExtractor createExtractor(DirectoryNode poifsDir) throws IOException,
InvalidFormatException, OpenXML4JException, XmlException
{
// Look for certain entries in the stream, to figure it
// out from
if (poifsDir.hasEntry("Workbook")) {
if (getPreferEventExtractor()) {
return new EventBasedExcelExtractor(poifsDir);
}
return new ExcelExtractor(poifsDir);
}
if (poifsDir.hasEntry("WordDocument")) {
// Old or new style word document?
try {
return new WordExtractor(poifsDir);
} catch (OldWordFileFormatException e) {
return new Word6Extractor(poifsDir);
}
}
if (poifsDir.hasEntry("PowerPoint Document")) {
return new PowerPointExtractor(poifsDir);
}
if (poifsDir.hasEntry("VisioDocument")) {
return new VisioTextExtractor(poifsDir);
}
if (poifsDir.hasEntry("Quill")) {
return new PublisherTextExtractor(poifsDir);
}
if (poifsDir.hasEntry("__substg1.0_1000001E") || poifsDir.hasEntry("__substg1.0_1000001F")
|| poifsDir.hasEntry("__substg1.0_0047001E")
|| poifsDir.hasEntry("__substg1.0_0047001F")
|| poifsDir.hasEntry("__substg1.0_0037001E")
|| poifsDir.hasEntry("__substg1.0_0037001F"))
{
return new OutlookTextExtactor(poifsDir);
}
for (Iterator<Entry> entries = poifsDir.getEntries(); entries.hasNext();) {
Entry entry = entries.next();
if(entry.getName().equals("Workbook")) {
if(getPreferEventExtractor()) {
return new EventBasedExcelExtractor(poifsDir, fs);
} else {
return new ExcelExtractor(poifsDir, fs);
}
}
if(entry.getName().equals("WordDocument")) {
// Old or new style word document?
try {
return new WordExtractor(poifsDir, fs);
} catch(OldWordFileFormatException e) {
return new Word6Extractor(poifsDir, fs);
}
}
if(entry.getName().equals("PowerPoint Document")) {
return new PowerPointExtractor(poifsDir, fs);
}
if(entry.getName().equals("VisioDocument")) {
return new VisioTextExtractor(poifsDir, fs);
}
if(entry.getName().equals("Quill")) {
return new PublisherTextExtractor(poifsDir, fs);
}
if(
entry.getName().equals("__substg1.0_1000001E") ||
entry.getName().equals("__substg1.0_1000001F") ||
entry.getName().equals("__substg1.0_0047001E") ||
entry.getName().equals("__substg1.0_0047001F") ||
entry.getName().equals("__substg1.0_0037001E") ||
entry.getName().equals("__substg1.0_0037001F")
) {
return new OutlookTextExtactor(poifsDir, fs);
}
if (entry.getName().equals("Package")) {
OPCPackage pkg = OPCPackage.open(
poifsDir.createDocumentInputStream(entry.getName())
);
OPCPackage pkg = OPCPackage.open(poifsDir.createDocumentInputStream("Package"));
return createExtractor(pkg);
}
}
throw new IllegalArgumentException("No supported documents found in the OLE2 stream");
}
/**
* Returns an array of text extractors, one for each of
* the embedded documents in the file (if there are any).
@ -260,14 +274,14 @@ public class ExtractorFactory {
ArrayList<InputStream> nonPOIFS = new ArrayList<InputStream>();
// Find all the embeded directories
POIFSFileSystem fs = ext.getFileSystem();
if(fs == null) {
DirectoryEntry root = ext.getRoot();
if(root == null) {
throw new IllegalStateException("The extractor didn't know which POIFS it came from!");
}
if(ext instanceof ExcelExtractor) {
// These are in MBD... under the root
Iterator<Entry> it = fs.getRoot().getEntries();
Iterator<Entry> it = root.getEntries();
while(it.hasNext()) {
Entry entry = it.next();
if(entry.getName().startsWith("MBD")) {
@ -278,7 +292,7 @@ public class ExtractorFactory {
// These are in ObjectPool -> _... under the root
try {
DirectoryEntry op = (DirectoryEntry)
fs.getRoot().getEntry("ObjectPool");
root.getEntry("ObjectPool");
Iterator<Entry> it = op.getEntries();
while(it.hasNext()) {
Entry entry = it.next();
@ -314,7 +328,7 @@ public class ExtractorFactory {
ArrayList<POITextExtractor> e = new ArrayList<POITextExtractor>();
for(int i=0; i<dirs.size(); i++) {
e.add( createExtractor(
(DirectoryNode)dirs.get(i), ext.getFileSystem()
(DirectoryNode)dirs.get(i)
) );
}
for(int i=0; i<nonPOIFS.size(); i++) {

View File

@ -23,6 +23,8 @@ import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import org.apache.poi.hwpf.usermodel.ObjectPoolImpl;
import org.apache.poi.hwpf.model.BookmarksTables;
import org.apache.poi.hwpf.model.CHPBinTable;
import org.apache.poi.hwpf.model.CPSplitCalculator;
@ -190,7 +192,9 @@ public final class HWPFDocument extends HWPFDocumentCore
* @param pfilesystem The POIFSFileSystem that contains the Word document.
* @throws IOException If there is an unexpected IOException from the passed
* in POIFSFileSystem.
* @deprecated Use {@link #HWPFDocument(DirectoryNode)} instead
*/
@Deprecated
public HWPFDocument(DirectoryNode directory, POIFSFileSystem pfilesystem) throws IOException
{
this(directory);

View File

@ -17,10 +17,17 @@
package org.apache.poi.hwpf;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.io.PushbackInputStream;
import org.apache.poi.hwpf.usermodel.ObjectsPool;
import org.apache.poi.poifs.filesystem.DirectoryEntry;
import org.apache.poi.hwpf.usermodel.ObjectPoolImpl;
import org.apache.poi.EncryptedDocumentException;
import org.apache.poi.POIDocument;
import org.apache.poi.hwpf.model.CHPBinTable;
@ -46,6 +53,9 @@ import org.apache.poi.util.Internal;
*/
public abstract class HWPFDocumentCore extends POIDocument
{
/** Holds OLE2 objects */
protected ObjectPoolImpl _objectPool;
/** The FIB */
protected FileInformationBlock _fib;
@ -148,6 +158,20 @@ public abstract class HWPFDocumentCore extends POIDocument
if(_fib.isFEncrypted()) {
throw new EncryptedDocumentException("Cannot process encrypted word files!");
}
{
DirectoryEntry objectPoolEntry;
try
{
objectPoolEntry = (DirectoryEntry) directory
.getEntry( "ObjectPool" );
}
catch ( FileNotFoundException exc )
{
objectPoolEntry = directory.createDirectory( "ObjectPool" );
}
_objectPool = new ObjectPoolImpl( objectPoolEntry );
}
}
/**
@ -211,5 +235,10 @@ public abstract class HWPFDocumentCore extends POIDocument
return _fib;
}
/**
* Returns the pool that holds the OLE2 objects of this document.
*
* @return the object pool kept in the <code>_objectPool</code> field
*/
public ObjectsPool getObjectsPool()
{
return _objectPool;
}
public abstract TextPieceTable getTextTable();
}

View File

@ -44,6 +44,7 @@ public class HWPFOldDocument extends HWPFDocumentCore {
this(fs.getRoot());
}
@Deprecated
public HWPFOldDocument(DirectoryNode directory, POIFSFileSystem fs)
throws IOException {
this(directory);

View File

@ -47,6 +47,7 @@ import org.apache.poi.hwpf.usermodel.Section;
import org.apache.poi.hwpf.usermodel.Table;
import org.apache.poi.hwpf.usermodel.TableCell;
import org.apache.poi.hwpf.usermodel.TableRow;
import org.apache.poi.poifs.filesystem.Entry;
import org.apache.poi.util.Beta;
import org.apache.poi.util.POILogFactory;
import org.apache.poi.util.POILogger;
@ -56,6 +57,32 @@ import org.w3c.dom.Element;
@Beta
public abstract class AbstractWordConverter
{
/**
 * Wraps a document structure (a bookmark or a field) together with its
 * start and end offsets. Instances are ordered by start offset only.
 */
private static final class Structure implements Comparable<Structure>
{
    /** End offset reported by the wrapped structure. */
    final int end;

    /** Start offset reported by the wrapped structure; the sort key. */
    final int start;

    /** The wrapped {@link Bookmark} or {@link Field}. */
    final Object structure;

    /** Wraps a bookmark, using its own start and end positions. */
    Structure( Bookmark bookmark )
    {
        this.start = bookmark.getStart();
        this.end = bookmark.getEnd();
        this.structure = bookmark;
    }

    /** Wraps a field, using its start and end offsets. */
    Structure( Field field )
    {
        this.start = field.getFieldStartOffset();
        this.end = field.getFieldEndOffset();
        this.structure = field;
    }

    /**
     * Compares by start offset.
     *
     * @return -1, 0 or 1 when this structure starts before, at the same
     *         offset as, or after the other one
     */
    public int compareTo( Structure o )
    {
        if ( start < o.start )
        {
            return -1;
        }
        if ( start > o.start )
        {
            return 1;
        }
        return 0;
    }
}
private static final byte BEL_MARK = 7;
private static final byte FIELD_BEGIN_MARK = 19;
@ -396,6 +423,13 @@ public abstract class AbstractWordConverter
processDrawnObject( doc, characterRun, block );
continue;
}
if ( characterRun.isOle2()
&& ( wordDocument instanceof HWPFDocument ) )
{
HWPFDocument doc = (HWPFDocument) wordDocument;
processOle2( doc, characterRun, block );
continue;
}
}
if ( text.getBytes()[0] == FIELD_BEGIN_MARK )
@ -613,10 +647,11 @@ public abstract class AbstractWordConverter
CharacterRun characterRun, OfficeDrawing officeDrawing,
String path, Element block );
protected abstract void processEndnoteAutonumbered( HWPFDocument wordDocument,
int noteIndex, Element block, Range endnoteTextRange );
protected abstract void processEndnoteAutonumbered(
HWPFDocument wordDocument, int noteIndex, Element block,
Range endnoteTextRange );
protected void processField( HWPFDocument hwpfDocument, Range parentRange,
protected void processField( HWPFDocument wordDocument, Range parentRange,
int currentTableLevel, Field field, Element currentBlock )
{
switch ( field.getType() )
@ -633,7 +668,7 @@ public abstract class AbstractWordConverter
if ( matcher.find() )
{
String pageref = matcher.group( 1 );
processPageref( hwpfDocument, currentBlock,
processPageref( wordDocument, currentBlock,
field.secondSubrange( parentRange ),
currentTableLevel, pageref );
return;
@ -641,6 +676,36 @@ public abstract class AbstractWordConverter
}
break;
}
case 58: // Embedded Object
{
if ( !field.hasSeparator() )
{
logger.log( POILogger.WARN, parentRange + " contains " + field
+ " with 'Embedded Object' but without separator mark" );
return;
}
CharacterRun separator = field
.getMarkSeparatorCharacterRun( parentRange );
if ( separator.isOle2() )
{
// the only supported so far
boolean processed = processOle2( wordDocument, separator,
currentBlock );
// if we didn't output OLE - output field value
if ( !processed )
{
processCharacters( wordDocument, currentTableLevel,
field.secondSubrange( parentRange ), currentBlock );
}
return;
}
break;
}
case 88: // hyperlink
{
final Range firstSubrange = field.firstSubrange( parentRange );
@ -653,7 +718,7 @@ public abstract class AbstractWordConverter
if ( matcher.find() )
{
String hyperlink = matcher.group( 1 );
processHyperlink( hwpfDocument, currentBlock,
processHyperlink( wordDocument, currentBlock,
field.secondSubrange( parentRange ),
currentTableLevel, hyperlink );
return;
@ -665,12 +730,13 @@ public abstract class AbstractWordConverter
logger.log( POILogger.WARN, parentRange + " contains " + field
+ " with unsupported type or format" );
processCharacters( hwpfDocument, currentTableLevel,
processCharacters( wordDocument, currentTableLevel,
field.secondSubrange( parentRange ), currentBlock );
}
protected abstract void processFootnoteAutonumbered( HWPFDocument wordDocument,
int noteIndex, Element block, Range footnoteTextRange );
protected abstract void processFootnoteAutonumbered(
HWPFDocument wordDocument, int noteIndex, Element block,
Range footnoteTextRange );
protected abstract void processHyperlink( HWPFDocumentCore wordDocument,
Element currentBlock, Range textRange, int currentTableLevel,
@ -732,6 +798,40 @@ public abstract class AbstractWordConverter
}
}
/**
 * Looks up the OLE2 object referenced by the character run in the
 * document's object pool and passes it to
 * {@link #processOle2(HWPFDocument, Element, Entry)}.
 *
 * @param doc document whose object pool is searched
 * @param characterRun run whose picture offset identifies the object
 * @param block element that receives the converted output
 * @return the result of the conversion hook, or <code>false</code> when
 *         the object is missing or the conversion threw an exception
 *         (both cases are logged as warnings)
 */
private boolean processOle2( HWPFDocument doc, CharacterRun characterRun,
        Element block )
{
    // pool entries are looked up by "_" followed by the picture offset
    final String objectId = "_" + characterRun.getPicOffset();
    final Entry poolEntry = doc.getObjectsPool().getObjectById( objectId );

    if ( poolEntry != null )
    {
        try
        {
            return processOle2( doc, block, poolEntry );
        }
        catch ( Exception failure )
        {
            // best effort: a broken embedded object must not stop the
            // conversion of the surrounding document
            logger.log( POILogger.WARN,
                    "Unable to convert internal OLE2 object '",
                    Integer.valueOf( characterRun.getPicOffset() ), "': ",
                    failure, failure );
            return false;
        }
    }

    logger.log( POILogger.WARN, "Referenced OLE2 object '",
            Integer.valueOf( characterRun.getPicOffset() ),
            "' not found in ObjectPool" );
    return false;
}
/**
* Conversion hook for an embedded OLE2 object. This default implementation
* outputs nothing and reports the object as not processed; subclasses may
* override it.
*
* @param wordDocument document that contains the object
* @param block element that would receive the converted output
* @param entry object pool entry of the embedded object
* @return <code>true</code> if the object was output; always
* <code>false</code> here
* @throws Exception declared so that overriding implementations may fail
*/
@SuppressWarnings( "unused" )
protected boolean processOle2( HWPFDocument wordDocument, Element block,
Entry entry ) throws Exception
{
return false;
}
protected abstract void processPageref( HWPFDocumentCore wordDocument,
Element currentBlock, Range textRange, int currentTableLevel,
String pageref );
@ -896,30 +996,4 @@ public abstract class AbstractWordConverter
return endMark;
}
/**
 * Wraps a document structure (a bookmark or a field) together with its
 * start and end offsets. Instances are ordered by start offset only.
 */
private static final class Structure implements Comparable<Structure>
{
    /** End offset reported by the wrapped structure. */
    final int end;

    /** Start offset reported by the wrapped structure; the sort key. */
    final int start;

    /** The wrapped {@link Bookmark} or {@link Field}. */
    final Object structure;

    /** Wraps a bookmark, using its own start and end positions. */
    Structure( Bookmark bookmark )
    {
        this.start = bookmark.getStart();
        this.end = bookmark.getEnd();
        this.structure = bookmark;
    }

    /** Wraps a field, using its start and end offsets. */
    Structure( Field field )
    {
        this.start = field.getFieldStartOffset();
        this.end = field.getFieldEndOffset();
        this.structure = field;
    }

    /**
     * Compares by start offset.
     *
     * @return -1, 0 or 1 when this structure starts before, at the same
     *         offset as, or after the other one
     */
    public int compareTo( Structure o )
    {
        if ( start < o.start )
        {
            return -1;
        }
        if ( start > o.start )
        {
            return 1;
        }
        return 0;
    }
}
}

View File

@ -34,6 +34,7 @@ import org.apache.poi.hwpf.usermodel.Paragraph;
import org.apache.poi.hwpf.usermodel.Table;
import org.apache.poi.hwpf.usermodel.TableCell;
import org.apache.poi.hwpf.usermodel.TableRow;
import org.apache.poi.poifs.filesystem.DirectoryNode;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
import org.apache.poi.util.Beta;
import org.apache.poi.util.IOUtils;
@ -422,6 +423,19 @@ public class AbstractWordUtils
return !isEmpty( str );
}
/**
 * Loads a Word document from the given directory. The directory is first
 * opened as an {@link HWPFDocument}; if that is rejected with an
 * {@link OldWordFileFormatException}, it is opened as an
 * {@link HWPFOldDocument} instead.
 *
 * @param root directory that holds the Word document
 * @return the loaded document
 * @throws IOException declared because either document constructor may
 *         fail while reading
 */
public static HWPFDocumentCore loadDoc( final DirectoryNode root )
        throws IOException
{
    HWPFDocumentCore loaded;
    try
    {
        loaded = new HWPFDocument( root );
    }
    catch ( OldWordFileFormatException oldFormat )
    {
        // old file format: retry with the old-document model
        loaded = new HWPFOldDocument( root );
    }
    return loaded;
}
public static HWPFDocumentCore loadDoc( File docFile ) throws IOException
{
final FileInputStream istream = new FileInputStream( docFile );
@ -438,16 +452,13 @@ public class AbstractWordUtils
public static HWPFDocumentCore loadDoc( InputStream inputStream )
throws IOException
{
final POIFSFileSystem poifsFileSystem = HWPFDocumentCore
.verifyAndBuildPOIFS( inputStream );
try
{
return new HWPFDocument( poifsFileSystem );
return loadDoc( HWPFDocumentCore.verifyAndBuildPOIFS( inputStream ) );
}
catch ( OldWordFileFormatException exc )
public static HWPFDocumentCore loadDoc(
final POIFSFileSystem poifsFileSystem ) throws IOException
{
return new HWPFOldDocument( poifsFileSystem );
}
return loadDoc( poifsFileSystem.getRoot() );
}
static String substringBeforeLast( String str, String separator )

View File

@ -276,8 +276,8 @@ public class WordToFoConverter extends AbstractWordConverter
}
@Override
protected void processEndnoteAutonumbered( HWPFDocument wordDocument, int noteIndex,
Element block, Range endnoteTextRange )
protected void processEndnoteAutonumbered( HWPFDocument wordDocument,
int noteIndex, Element block, Range endnoteTextRange )
{
final String textIndex = String.valueOf( internalLinkCounter
.incrementAndGet() );
@ -297,7 +297,8 @@ public class WordToFoConverter extends AbstractWordConverter
setId( backwardLink, forwardLinkName );
endnote.appendChild( backwardLink );
processCharacters( wordDocument, Integer.MIN_VALUE, endnoteTextRange, endnote );
processCharacters( wordDocument, Integer.MIN_VALUE, endnoteTextRange,
endnote );
WordToFoUtils.compactInlines( endnote );
this.endnotes.add( endnote );

View File

@ -63,7 +63,6 @@ import static org.apache.poi.hwpf.converter.AbstractWordUtils.TWIPS_PER_INCH;
@Beta
public class WordToHtmlConverter extends AbstractWordConverter
{
/**
* Holds properties values, applied to current <tt>p</tt> element. Those
* properties shall not be doubled in children <tt>span</tt> elements.
@ -282,10 +281,11 @@ public class WordToHtmlConverter extends AbstractWordConverter
}
@Override
protected void processEndnoteAutonumbered( HWPFDocument wordDocument, int noteIndex,
Element block, Range endnoteTextRange )
protected void processEndnoteAutonumbered( HWPFDocument wordDocument,
int noteIndex, Element block, Range endnoteTextRange )
{
processNoteAutonumbered( wordDocument, "end", noteIndex, block, endnoteTextRange );
processNoteAutonumbered( wordDocument, "end", noteIndex, block,
endnoteTextRange );
}
@Override

View File

@ -2,10 +2,14 @@ package org.apache.poi.hwpf.converter;
import java.io.File;
import java.io.FileWriter;
import java.io.StringWriter;
import java.lang.reflect.Method;
import java.util.List;
import java.util.concurrent.atomic.AtomicInteger;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.transform.OutputKeys;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerFactory;
@ -25,6 +29,8 @@ import org.apache.poi.hwpf.usermodel.Section;
import org.apache.poi.hwpf.usermodel.Table;
import org.apache.poi.hwpf.usermodel.TableCell;
import org.apache.poi.hwpf.usermodel.TableRow;
import org.apache.poi.poifs.filesystem.DirectoryNode;
import org.apache.poi.poifs.filesystem.Entry;
import org.apache.poi.util.Beta;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
@ -33,6 +39,29 @@ import org.w3c.dom.Element;
public class WordToTextConverter extends AbstractWordConverter
{
/**
 * Loads the Word document stored in the given directory and converts it
 * to plain text.
 *
 * @param root directory that holds the Word document
 * @return the text of the document
 * @throws Exception if loading or converting the document fails
 */
public static String getText( DirectoryNode root ) throws Exception
{
    return getText( AbstractWordUtils.loadDoc( root ) );
}
/**
 * Loads the Word document from the given file and converts it to plain
 * text.
 *
 * @param docFile file that holds the Word document
 * @return the text of the document
 * @throws Exception if loading or converting the document fails
 */
public static String getText( File docFile ) throws Exception
{
    return getText( AbstractWordUtils.loadDoc( docFile ) );
}
/**
 * Converts an already loaded Word document to plain text, using a fresh
 * converter that writes into a newly created DOM document.
 *
 * @param wordDocument document to convert
 * @return the text of the document
 * @throws Exception if the DOM document cannot be created or the
 *         conversion fails
 */
public static String getText( final HWPFDocumentCore wordDocument )
        throws Exception
{
    DocumentBuilder builder = DocumentBuilderFactory.newInstance()
            .newDocumentBuilder();
    Document target = builder.newDocument();

    WordToTextConverter converter = new WordToTextConverter( target );
    converter.processDocument( wordDocument );
    return converter.getText();
}
/**
* Java main() interface to interact with {@link WordToTextConverter}
*
@ -91,8 +120,24 @@ public class WordToTextConverter extends AbstractWordConverter
private Element notes = null;
private boolean outputSummaryInformation = false;
private final TextDocumentFacade textDocumentFacade;
/**
* Creates new instance of {@link WordToTextConverter}. Can be used for
* output several {@link HWPFDocument}s into single text document.
*
* @throws ParserConfigurationException
* if an internal {@link DocumentBuilder} cannot be created
*/
public WordToTextConverter() throws ParserConfigurationException
{
this.textDocumentFacade = new TextDocumentFacade(
DocumentBuilderFactory.newInstance().newDocumentBuilder()
.newDocument() );
}
/**
* Creates new instance of {@link WordToTextConverter}. Can be used for
* output several {@link HWPFDocument}s into single text document.
@ -110,6 +155,28 @@ public class WordToTextConverter extends AbstractWordConverter
return textDocumentFacade.getDocument();
}
/**
 * Serializes the document built so far into a string, using an XSLT
 * transformer with the "text" output method and no indentation.
 *
 * @return the text content of the converted document
 * @throws Exception if the transformer cannot be created or the
 *         transformation fails
 */
public String getText() throws Exception
{
    final DOMSource source = new DOMSource( getDocument() );
    final StringWriter buffer = new StringWriter();
    final StreamResult target = new StreamResult( buffer );

    final Transformer transformer = TransformerFactory.newInstance()
            .newTransformer();
    // TODO set encoding from a command argument
    transformer.setOutputProperty( OutputKeys.ENCODING, "UTF-8" );
    transformer.setOutputProperty( OutputKeys.INDENT, "no" );
    transformer.setOutputProperty( OutputKeys.METHOD, "text" );

    transformer.transform( source, target );
    return buffer.toString();
}
/**
* Returns the value of the <code>outputSummaryInformation</code> flag.
*
* @return the current flag value
*/
public boolean isOutputSummaryInformation()
{
return outputSummaryInformation;
}
@Override
protected void outputCharacters( Element block, CharacterRun characterRun,
String text )
@ -137,6 +204,8 @@ public class WordToTextConverter extends AbstractWordConverter
@Override
protected void processDocumentInformation(
SummaryInformation summaryInformation )
{
if ( isOutputSummaryInformation() )
{
if ( AbstractWordUtils.isNotEmpty( summaryInformation.getTitle() ) )
textDocumentFacade.setTitle( summaryInformation.getTitle() );
@ -144,12 +213,16 @@ public class WordToTextConverter extends AbstractWordConverter
if ( AbstractWordUtils.isNotEmpty( summaryInformation.getAuthor() ) )
textDocumentFacade.addAuthor( summaryInformation.getAuthor() );
if ( AbstractWordUtils.isNotEmpty( summaryInformation.getComments() ) )
textDocumentFacade
.addDescription( summaryInformation.getComments() );
if ( AbstractWordUtils
.isNotEmpty( summaryInformation.getComments() ) )
textDocumentFacade.addDescription( summaryInformation
.getComments() );
if ( AbstractWordUtils.isNotEmpty( summaryInformation.getKeywords() ) )
textDocumentFacade.addKeywords( summaryInformation.getKeywords() );
if ( AbstractWordUtils
.isNotEmpty( summaryInformation.getKeywords() ) )
textDocumentFacade.addKeywords( summaryInformation
.getKeywords() );
}
}
@Override
@ -222,6 +295,48 @@ public class WordToTextConverter extends AbstractWordConverter
note.appendChild( textDocumentFacade.createText( "\n" ) );
}
/**
 * Converts an embedded OLE2 object to text and appends that text, wrapped
 * in zero-width spaces, to the given block.
 * <p>
 * Embedded Word documents are converted directly by this class. Every
 * other object is handed to
 * <code>org.apache.poi.extractor.ExtractorFactory</code>, which is looked
 * up reflectively so that this class still works when the factory is not
 * available at runtime.
 *
 * @param wordDocument document that contains the embedded object
 * @param block element that receives the extracted text
 * @param entry object pool entry of the embedded object
 * @return <code>true</code> if text was appended to the block;
 *         <code>false</code> if the entry is not a directory or no
 *         extractor could be loaded
 * @throws Exception if the embedded document cannot be converted or the
 *         reflective extractor call fails
 */
@Override
protected boolean processOle2( HWPFDocument wordDocument, Element block,
        Entry entry ) throws Exception
{
    if ( !( entry instanceof DirectoryNode ) )
        return false;

    DirectoryNode directoryNode = (DirectoryNode) entry;

    // even if no ExtractorFactory in classpath
    if ( directoryNode.hasEntry( "WordDocument" ) )
    {
        String text = WordToTextConverter.getText( directoryNode );
        block.appendChild( textDocumentFacade
                .createText( UNICODECHAR_ZERO_WIDTH_SPACE + text
                        + UNICODECHAR_ZERO_WIDTH_SPACE ) );
        return true;
    }

    try
    {
        Class<?> cls = Class
                .forName( "org.apache.poi.extractor.ExtractorFactory" );
        Method createExtractor = cls.getMethod( "createExtractor",
                DirectoryNode.class );
        Object extractor = createExtractor.invoke( null, directoryNode );

        Method getText = extractor.getClass().getMethod( "getText" );
        String text = (String) getText.invoke( extractor );

        block.appendChild( textDocumentFacade
                .createText( UNICODECHAR_ZERO_WIDTH_SPACE + text
                        + UNICODECHAR_ZERO_WIDTH_SPACE ) );
        return true;
    }
    catch ( ClassNotFoundException exc )
    {
        // no extractor in classpath
    }
    catch ( LinkageError err )
    {
        // The factory class is present but one of its own dependencies is
        // missing, or its initialization failed (NoClassDefFoundError,
        // ExceptionInInitializerError). This is an Error, so the
        // catch ( Exception ) of the caller would not stop it and the
        // whole conversion would abort. Treat it like a missing extractor.
    }

    return false;
}
@Override
protected void processPageref( HWPFDocumentCore wordDocument,
Element currentBlock, Range textRange, int currentTableLevel,
@ -254,7 +369,7 @@ public class WordToTextConverter extends AbstractWordConverter
textDocumentFacade.body.appendChild( sectionElement );
}
protected void processTable( HWPFDocumentCore hwpfDocument, Element flow,
protected void processTable( HWPFDocumentCore wordDocument, Element flow,
Table table )
{
final int tableRows = table.numRows();
@ -275,8 +390,8 @@ public class WordToTextConverter extends AbstractWordConverter
tableCellElement.appendChild( textDocumentFacade
.createText( "\t" ) );
processParagraphes( hwpfDocument, tableCellElement, tableCell,
table.getTableLevel() );
processCharacters( wordDocument, table.getTableLevel(),
tableCell, tableCellElement );
tableRowElement.appendChild( tableCellElement );
}
@ -285,4 +400,9 @@ public class WordToTextConverter extends AbstractWordConverter
}
}
/**
* Sets the <code>outputSummaryInformation</code> flag.
*
* @param outputDocumentInformation new value of the flag
*/
public void setOutputSummaryInformation( boolean outputDocumentInformation )
{
this.outputSummaryInformation = outputDocumentInformation;
}
}

View File

@ -19,6 +19,10 @@ package org.apache.poi.hwpf.extractor;
import java.io.IOException;
import java.io.InputStream;
import java.io.StringWriter;
import org.apache.poi.hwpf.converter.WordToTextConverter;
import org.apache.poi.hwpf.usermodel.HeaderStories;
import org.apache.poi.POIOLE2TextExtractor;
import org.apache.poi.hwpf.HWPFOldDocument;
@ -49,13 +53,29 @@ public final class Word6Extractor extends POIOLE2TextExtractor {
/**
* Create a new Word Extractor
* @param fs POIFSFileSystem containing the word file
*
* @param fs
* POIFSFileSystem containing the word file
*/
public Word6Extractor(POIFSFileSystem fs) throws IOException {
this(fs.getRoot(), fs);
public Word6Extractor( POIFSFileSystem fs ) throws IOException
{
this( fs.getRoot() );
}
public Word6Extractor(DirectoryNode dir, POIFSFileSystem fs) throws IOException {
this(new HWPFOldDocument(dir,fs));
/**
* Creates a new extractor for the given directory. The <code>fs</code>
* argument is not used; this constructor only delegates to
* {@link #Word6Extractor(DirectoryNode)}.
*
* @param dir directory node handed on to the single-argument constructor
* @param fs ignored
* @deprecated Use {@link #Word6Extractor(DirectoryNode)} instead
*/
@Deprecated
@SuppressWarnings( "unused" )
public Word6Extractor( DirectoryNode dir, POIFSFileSystem fs )
throws IOException
{
this( dir );
}
/**
* Creates a new extractor for the Word document stored in the given
* directory. An {@link HWPFOldDocument} is built from the directory and
* handed to the document based constructor.
*
* @param dir directory node the document is loaded from
* @throws IOException declared because building the document from the
* directory may fail
*/
public Word6Extractor( DirectoryNode dir ) throws IOException
{
this( new HWPFOldDocument( dir ) );
}
/**
@ -71,6 +91,7 @@ public final class Word6Extractor extends POIOLE2TextExtractor {
* Get the text from the word file, as an array with one String
* per paragraph
*/
@Deprecated
public String[] getParagraphText() {
String[] ret;
@ -95,13 +116,25 @@ public final class Word6Extractor extends POIOLE2TextExtractor {
return ret;
}
public String getText() {
public String getText()
{
try
{
WordToTextConverter wordToTextConverter = new WordToTextConverter();
wordToTextConverter.processDocument( doc );
return wordToTextConverter.getText();
}
catch ( Exception exc )
{
// fall-back
StringBuffer text = new StringBuffer();
for(String t : getParagraphText()) {
for ( String t : getParagraphText() )
{
text.append( t );
}
return text.toString();
}
}
}

View File

@ -20,9 +20,12 @@ package org.apache.poi.hwpf.extractor;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.StringWriter;
import java.util.ArrayList;
import java.util.Arrays;
import org.apache.poi.hwpf.converter.WordToTextConverter;
import org.apache.poi.POIOLE2TextExtractor;
import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.usermodel.HeaderStories;
@ -34,53 +37,75 @@ import org.apache.poi.poifs.filesystem.POIFSFileSystem;
/**
* Class to extract the text from a Word Document.
*
* You should use either getParagraphText() or getText() unless
* you have a strong reason otherwise.
* You should use either getParagraphText() or getText() unless you have a
* strong reason otherwise.
*
* @author Nick Burch
*/
public final class WordExtractor extends POIOLE2TextExtractor {
private POIFSFileSystem fs;
public final class WordExtractor extends POIOLE2TextExtractor
{
private HWPFDocument doc;
/**
* Create a new Word Extractor
* @param is InputStream containing the word file
*
* @param is
* InputStream containing the word file
*/
public WordExtractor(InputStream is) throws IOException {
public WordExtractor( InputStream is ) throws IOException
{
this( HWPFDocument.verifyAndBuildPOIFS( is ) );
}
/**
* Create a new Word Extractor
* @param fs POIFSFileSystem containing the word file
*
* @param fs
* POIFSFileSystem containing the word file
*/
public WordExtractor(POIFSFileSystem fs) throws IOException {
public WordExtractor( POIFSFileSystem fs ) throws IOException
{
this( new HWPFDocument( fs ) );
this.fs = fs;
}
public WordExtractor(DirectoryNode dir, POIFSFileSystem fs) throws IOException {
this(new HWPFDocument(dir, fs));
this.fs = fs;
/**
* Creates a new extractor for the given directory. The <code>fs</code>
* argument is not used; this constructor only delegates to
* {@link #WordExtractor(DirectoryNode)}.
*
* @param dir directory node handed on to the single-argument constructor
* @param fs ignored
* @deprecated Use {@link #WordExtractor(DirectoryNode)} instead
*/
@Deprecated
public WordExtractor( DirectoryNode dir, POIFSFileSystem fs )
throws IOException
{
this( dir );
}
/**
* Creates a new extractor for the Word document stored in the given
* directory. An {@link HWPFDocument} is built from the directory and
* handed to the document based constructor.
*
* @param dir directory node the document is loaded from
* @throws IOException declared because building the document from the
* directory may fail
*/
public WordExtractor( DirectoryNode dir ) throws IOException
{
this( new HWPFDocument( dir ) );
}
/**
* Create a new Word Extractor
* @param doc The HWPFDocument to extract from
*
* @param doc
* The HWPFDocument to extract from
*/
public WordExtractor(HWPFDocument doc) {
public WordExtractor( HWPFDocument doc )
{
super( doc );
this.doc = doc;
}
/**
* Command line extractor, so people will stop moaning that
* they can't just run this.
* Command line extractor, so people will stop moaning that they can't just
* run this.
*/
public static void main(String[] args) throws IOException {
if(args.length == 0) {
public static void main( String[] args ) throws IOException
{
if ( args.length == 0 )
{
System.err.println( "Use:" );
System.err.println(" java org.apache.poi.hwpf.extractor.WordExtractor <filename>");
System.err
.println( " java org.apache.poi.hwpf.extractor.WordExtractor <filename>" );
System.exit( 1 );
}
@ -91,18 +116,22 @@ public final class WordExtractor extends POIOLE2TextExtractor {
}
/**
* Get the text from the word file, as an array with one String
* per paragraph
* Get the text from the word file, as an array with one String per
* paragraph
*/
public String[] getParagraphText() {
public String[] getParagraphText()
{
String[] ret;
// Extract using the model code
try {
try
{
Range r = doc.getRange();
ret = getParagraphText( r );
} catch (Exception e) {
}
catch ( Exception e )
{
// Something's up with turning the text pieces into paragraphs
// Fall back to ripping out the text pieces
ret = new String[1];
@ -112,39 +141,46 @@ public final class WordExtractor extends POIOLE2TextExtractor {
return ret;
}
public String[] getFootnoteText() {
public String[] getFootnoteText()
{
Range r = doc.getFootnoteRange();
return getParagraphText( r );
}
public String[] getMainTextboxText() {
public String[] getMainTextboxText()
{
Range r = doc.getMainTextboxRange();
return getParagraphText( r );
}
public String[] getEndnoteText() {
public String[] getEndnoteText()
{
Range r = doc.getEndnoteRange();
return getParagraphText( r );
}
public String[] getCommentsText() {
public String[] getCommentsText()
{
Range r = doc.getCommentsRange();
return getParagraphText( r );
}
protected static String[] getParagraphText(Range r) {
protected static String[] getParagraphText( Range r )
{
String[] ret;
ret = new String[r.numParagraphs()];
for (int i = 0; i < ret.length; i++) {
for ( int i = 0; i < ret.length; i++ )
{
Paragraph p = r.getParagraph( i );
ret[i] = p.text();
// Fix the line ending
if (ret[i].endsWith("\r")) {
if ( ret[i].endsWith( "\r" ) )
{
ret[i] = ret[i] + "\n";
}
}
@ -154,56 +190,71 @@ public final class WordExtractor extends POIOLE2TextExtractor {
/**
* Add the header/footer text, if it's not empty
*/
private void appendHeaderFooter(String text, StringBuffer out) {
private void appendHeaderFooter( String text, StringBuffer out )
{
if ( text == null || text.length() == 0 )
return;
text = text.replace( '\r', '\n' );
if(! text.endsWith("\n")) {
if ( !text.endsWith( "\n" ) )
{
out.append( text );
out.append( '\n' );
return;
}
if(text.endsWith("\n\n")) {
if ( text.endsWith( "\n\n" ) )
{
out.append( text.substring( 0, text.length() - 1 ) );
return;
}
out.append( text );
return;
}
/**
* Grab the text from the headers
*/
public String getHeaderText() {
@Deprecated
public String getHeaderText()
{
HeaderStories hs = new HeaderStories( doc );
StringBuffer ret = new StringBuffer();
if(hs.getFirstHeader() != null) {
if ( hs.getFirstHeader() != null )
{
appendHeaderFooter( hs.getFirstHeader(), ret );
}
if(hs.getEvenHeader() != null) {
if ( hs.getEvenHeader() != null )
{
appendHeaderFooter( hs.getEvenHeader(), ret );
}
if(hs.getOddHeader() != null) {
if ( hs.getOddHeader() != null )
{
appendHeaderFooter( hs.getOddHeader(), ret );
}
return ret.toString();
}
/**
* Grab the text from the footers
*/
public String getFooterText() {
@Deprecated
public String getFooterText()
{
HeaderStories hs = new HeaderStories( doc );
StringBuffer ret = new StringBuffer();
if(hs.getFirstFooter() != null) {
if ( hs.getFirstFooter() != null )
{
appendHeaderFooter( hs.getFirstFooter(), ret );
}
if(hs.getEvenFooter() != null) {
if ( hs.getEvenFooter() != null )
{
appendHeaderFooter( hs.getEvenFooter(), ret );
}
if(hs.getOddFooter() != null) {
if ( hs.getOddFooter() != null )
{
appendHeaderFooter( hs.getOddFooter(), ret );
}
@ -211,18 +262,20 @@ public final class WordExtractor extends POIOLE2TextExtractor {
}
/**
* Grab the text out of the text pieces. Might also include various
* bits of crud, but will work in cases where the text piece -> paragraph
* mapping is broken. Fast too.
* Grab the text out of the text pieces. Might also include various bits of
* crud, but will work in cases where the text piece -> paragraph mapping is
* broken. Fast too.
*/
public String getTextFromPieces() {
public String getTextFromPieces()
{
String text = doc.getDocumentText();
// Fix line endings (Note - won't get all of them)
text = text.replaceAll( "\r\r\r", "\r\n\r\n\r\n" );
text = text.replaceAll( "\r\r", "\r\n\r\n" );
if(text.endsWith("\r")) {
if ( text.endsWith( "\r" ) )
{
text += "\n";
}
@ -230,34 +283,53 @@ public final class WordExtractor extends POIOLE2TextExtractor {
}
/**
* Grab the text, based on the paragraphs. Shouldn't include any crud,
* but slightly slower than getTextFromPieces().
* Grab the text, based on the WordToTextConverter. Shouldn't include any
* crud, but slower than getTextFromPieces().
*/
public String getText() {
StringBuffer ret = new StringBuffer();
public String getText()
{
try
{
final StringWriter stringWriter = new StringWriter();
@SuppressWarnings( "unused" )
WordToTextConverter wordToTextConverter = new WordToTextConverter()
{
{
HeaderStories hs = new HeaderStories( doc );
ret.append(getHeaderText());
if ( hs.getFirstHeaderSubrange() != null )
processDocumentPart( doc, hs.getFirstHeaderSubrange() );
if ( hs.getEvenHeaderSubrange() != null )
processDocumentPart( doc, hs.getEvenHeaderSubrange() );
if ( hs.getOddHeaderSubrange() != null )
processDocumentPart( doc, hs.getOddHeaderSubrange() );
ArrayList<String> text = new ArrayList<String>();
text.addAll(Arrays.asList(getParagraphText()));
text.addAll(Arrays.asList(getMainTextboxText()));
text.addAll(Arrays.asList(getFootnoteText()));
text.addAll(Arrays.asList(getEndnoteText()));
processDocument( doc );
processDocumentPart( doc, doc.getMainTextboxRange() );
for(String p : text) {
ret.append(p);
if ( hs.getFirstFooterSubrange() != null )
processDocumentPart( doc, hs.getFirstFooterSubrange() );
if ( hs.getEvenFooterSubrange() != null )
processDocumentPart( doc, hs.getEvenFooterSubrange() );
if ( hs.getOddFooterSubrange() != null )
processDocumentPart( doc, hs.getOddFooterSubrange() );
stringWriter.append( getText() );
}
};
return stringWriter.toString();
}
catch ( Exception exc )
{
throw new RuntimeException( exc );
}
ret.append(getFooterText());
return ret.toString();
}
/**
* Removes any fields (eg macros, page markers etc)
* from the string.
* Removes any fields (eg macros, page markers etc) from the string.
*/
public static String stripFields(String text) {
public static String stripFields( String text )
{
return Range.stripFields( text );
}
}

View File

@ -17,17 +17,23 @@ public interface Field
*/
int getFieldStartOffset();
/**
 * @param parent
 *            range used as the parent when locating the character run
 * @return character run located at the end field mark
 */
CharacterRun getMarkEndCharacterRun( Range parent );
/**
* @return character position of end field mark
*/
int getMarkEndOffset();
/**
 * @param parent
 *            range used as the parent when locating the character run
 * @return character run located at the separator field mark (the FieldImpl
 *         implementation returns <tt>null</tt> when the field has no
 *         separator)
 */
CharacterRun getMarkSeparatorCharacterRun( Range parent );
/**
 * @return character position of the separator field mark
 * @throws NullPointerException
 *             if the field has no separator mark
 */
int getMarkSeparatorOffset();
/**
 * @param parent
 *            range used as the parent when locating the character run
 * @return character run located at the start field mark
 */
CharacterRun getMarkStartCharacterRun( Range parent );
/**
* @return character position of start field mark
*/

View File

@ -112,6 +112,12 @@ class FieldImpl implements Field
return startPlex.getFcStart();
}
/**
 * @param parent
 *            range used as the parent of the one-character range built
 *            around the end field mark
 * @return first character run of the one-character range that starts at
 *         the end field mark offset
 */
public CharacterRun getMarkEndCharacterRun( Range parent )
{
    final int markOffset = getMarkEndOffset();
    final Range markRange = new Range( markOffset, markOffset + 1, parent );
    return markRange.getCharacterRun( 0 );
}
/**
* @return character position of end field mark
*/
@ -120,6 +126,15 @@ class FieldImpl implements Field
return endPlex.getFcStart();
}
/**
 * @param parent
 *            range used as the parent of the one-character range built
 *            around the separator field mark
 * @return first character run of the one-character range that starts at
 *         the separator field mark offset, or <tt>null</tt> if this field
 *         has no separator
 */
public CharacterRun getMarkSeparatorCharacterRun( Range parent )
{
    if ( hasSeparator() )
    {
        final int markOffset = getMarkSeparatorOffset();
        final Range markRange = new Range( markOffset, markOffset + 1,
                parent );
        return markRange.getCharacterRun( 0 );
    }

    // No separator mark in this field
    return null;
}
/**
* @return character position of separator field mark (if present,
* {@link NullPointerException} otherwise)
@ -129,6 +144,12 @@ class FieldImpl implements Field
return separatorPlex.getFcStart();
}
/**
 * @param parent
 *            range used as the parent of the one-character range built
 *            around the start field mark
 * @return first character run of the one-character range that starts at
 *         the start field mark offset
 */
public CharacterRun getMarkStartCharacterRun( Range parent )
{
    final int markOffset = getMarkStartOffset();
    final Range markRange = new Range( markOffset, markOffset + 1, parent );
    return markRange.getCharacterRun( 0 );
}
/**
* @return character position of start field mark
*/

View File

@ -82,35 +82,96 @@ public final class HeaderStories {
fib.getPlcfHddSize(), 0 );
}
public String getFootnoteSeparator() {
@Deprecated
public String getFootnoteSeparator()
{
return getAt( 0 );
}
public String getFootnoteContSeparator() {
@Deprecated
public String getFootnoteContSeparator()
{
return getAt( 1 );
}
public String getFootnoteContNote() {
@Deprecated
public String getFootnoteContNote()
{
return getAt( 2 );
}
public String getEndnoteSeparator() {
@Deprecated
public String getEndnoteSeparator()
{
return getAt( 3 );
}
public String getEndnoteContSeparator() {
@Deprecated
public String getEndnoteContSeparator()
{
return getAt( 4 );
}
public String getEndnoteContNote() {
@Deprecated
public String getEndnoteContNote()
{
return getAt( 5 );
}
/**
 * Footnote separator story (plcfHdd index 0) as a {@link Range}.
 *
 * @return the story range, or <tt>null</tt> if there is no plcfHdd or the
 *         story is empty or has broken bounds
 */
public Range getFootnoteSeparatorSubrange()
{
return getSubrangeAt( 0 );
}
/**
 * Footnote continuation separator story (plcfHdd index 1) as a
 * {@link Range}.
 *
 * @return the story range, or <tt>null</tt> if there is no plcfHdd or the
 *         story is empty or has broken bounds
 */
public Range getFootnoteContSeparatorSubrange()
{
return getSubrangeAt( 1 );
}
/**
 * Footnote continuation note story (plcfHdd index 2) as a {@link Range}.
 *
 * @return the story range, or <tt>null</tt> if there is no plcfHdd or the
 *         story is empty or has broken bounds
 */
public Range getFootnoteContNoteSubrange()
{
return getSubrangeAt( 2 );
}
/**
 * Endnote separator story (plcfHdd index 3) as a {@link Range}.
 *
 * @return the story range, or <tt>null</tt> if there is no plcfHdd or the
 *         story is empty or has broken bounds
 */
public Range getEndnoteSeparatorSubrange()
{
return getSubrangeAt( 3 );
}
/**
 * Endnote continuation separator story (plcfHdd index 4) as a
 * {@link Range}.
 *
 * @return the story range, or <tt>null</tt> if there is no plcfHdd or the
 *         story is empty or has broken bounds
 */
public Range getEndnoteContSeparatorSubrange()
{
return getSubrangeAt( 4 );
}
/**
 * Endnote continuation note story (plcfHdd index 5) as a {@link Range}.
 *
 * @return the story range, or <tt>null</tt> if there is no plcfHdd or the
 *         story is empty or has broken bounds
 */
public Range getEndnoteContNoteSubrange()
{
return getSubrangeAt( 5 );
}
/**
 * Text of the even header story (plcfHdd index 6+0).
 *
 * @deprecated Use {@link #getEvenHeaderSubrange()} instead
 */
@Deprecated
public String getEvenHeader() {
return getAt(6+0);
}
/**
 * Text of the odd header story (plcfHdd index 6+1).
 *
 * @deprecated Use {@link #getOddHeaderSubrange()} instead
 */
@Deprecated
public String getOddHeader() {
return getAt(6+1);
}
/**
 * Text of the first header story (plcfHdd index 6+4).
 *
 * @deprecated Use {@link #getFirstHeaderSubrange()} instead
 */
@Deprecated
public String getFirstHeader() {
return getAt(6+4);
}
/**
 * Even header story (plcfHdd index 6+0) as a {@link Range}.
 *
 * @return the story range, or <tt>null</tt> if there is no plcfHdd or the
 *         story is empty or has broken bounds
 */
public Range getEvenHeaderSubrange() {
return getSubrangeAt(6+0);
}
/**
 * Odd header story (plcfHdd index 6+1) as a {@link Range}.
 *
 * @return the story range, or <tt>null</tt> if there is no plcfHdd or the
 *         story is empty or has broken bounds
 */
public Range getOddHeaderSubrange() {
return getSubrangeAt(6+1);
}
/**
 * First header story (plcfHdd index 6+4) as a {@link Range}.
 *
 * @return the story range, or <tt>null</tt> if there is no plcfHdd or the
 *         story is empty or has broken bounds
 */
public Range getFirstHeaderSubrange() {
return getSubrangeAt(6+4);
}
/**
* Returns the correct, defined header for the given
* one based page
@ -135,16 +196,39 @@ public final class HeaderStories {
return getOddHeader();
}
public String getEvenFooter() {
@Deprecated
public String getEvenFooter()
{
return getAt( 6 + 2 );
}
public String getOddFooter() {
@Deprecated
public String getOddFooter()
{
return getAt( 6 + 3 );
}
public String getFirstFooter() {
@Deprecated
public String getFirstFooter()
{
return getAt( 6 + 5 );
}
/**
 * Even footer story (plcfHdd index 6+2) as a {@link Range}.
 *
 * @return the story range, or <tt>null</tt> if there is no plcfHdd or the
 *         story is empty or has broken bounds
 */
public Range getEvenFooterSubrange()
{
return getSubrangeAt( 6 + 2 );
}
/**
 * Odd footer story (plcfHdd index 6+3) as a {@link Range}.
 *
 * @return the story range, or <tt>null</tt> if there is no plcfHdd or the
 *         story is empty or has broken bounds
 */
public Range getOddFooterSubrange()
{
return getSubrangeAt( 6 + 3 );
}
/**
 * First footer story (plcfHdd index 6+5) as a {@link Range}.
 *
 * @return the story range, or <tt>null</tt> if there is no plcfHdd or the
 *         story is empty or has broken bounds
 */
public Range getFirstFooterSubrange()
{
return getSubrangeAt( 6 + 5 );
}
/**
* Returns the correct, defined footer for the given
* one based page
@ -174,6 +258,7 @@ public final class HeaderStories {
* Get the string that's pointed to by the
* given plcfHdd index
*/
@Deprecated
private String getAt(int plcfHddIndex) {
if(plcfHdd == null) return null;
@ -209,6 +294,32 @@ public final class HeaderStories {
return text;
}
/**
 * Get the range that's pointed to by the given plcfHdd index
 *
 * @param plcfHddIndex
 *            index of the story property inside plcfHdd
 * @return the story as a sub-range of the header stories range, or
 *         <tt>null</tt> if there is no plcfHdd, the story is empty, or its
 *         end lies before its start
 */
private Range getSubrangeAt( int plcfHddIndex )
{
    if ( plcfHdd == null )
        return null;

    final GenericPropertyNode story = plcfHdd.getProperty( plcfHddIndex );
    final int storyStart = story.getStart();
    final int storyEnd = story.getEnd();

    // Empty story (end == start) or broken properties (end < start)
    if ( storyEnd <= storyStart )
        return null;

    // Story offsets are relative to the header stories range; clamp them
    // to its length before turning them into absolute offsets
    final int base = headerStories.getStartOffset();
    final int maxLength = headerStories.getEndOffset() - base;
    final int from = base + Math.min( storyStart, maxLength );
    final int to = base + Math.min( storyEnd, maxLength );

    return new Range( from, to, headerStories );
}
public Range getRange() {
return headerStories;
}

View File

@ -0,0 +1,34 @@
package org.apache.poi.hwpf.usermodel;
import java.io.FileNotFoundException;
import org.apache.poi.poifs.filesystem.DirectoryEntry;
import org.apache.poi.poifs.filesystem.Entry;
import org.apache.poi.util.Internal;
/**
 * {@link ObjectsPool} implementation that looks objects up as entries of a
 * POIFS directory.
 */
@Internal
public class ObjectPoolImpl implements ObjectsPool
{
    /** Directory the objects are looked up in; may be <tt>null</tt> */
    private final DirectoryEntry _objectPool;

    /**
     * @param _objectPool
     *            directory to look entries up in; if <tt>null</tt>, every
     *            lookup returns <tt>null</tt>
     */
    public ObjectPoolImpl( DirectoryEntry _objectPool )
    {
        this._objectPool = _objectPool;
    }

    /**
     * Looks up an entry of the object pool directory by name.
     *
     * @param objId
     *            name of the entry inside the object pool directory
     * @return the entry, or <tt>null</tt> if there is no object pool
     *         directory or it has no entry with that name
     */
    public Entry getObjectById( String objId )
    {
        if ( _objectPool == null )
            return null;

        try
        {
            return _objectPool.getEntry( objId );
        }
        catch ( FileNotFoundException ignored )
        {
            // A missing entry is reported to the caller as null
            return null;
        }
    }
}

View File

@ -0,0 +1,8 @@
package org.apache.poi.hwpf.usermodel;
import org.apache.poi.poifs.filesystem.Entry;
/**
 * Lookup of pooled objects by their identifier.
 */
public interface ObjectsPool
{
    /**
     * @param objId
     *            identifier of the object to look up
     * @return the entry registered under the identifier (the ObjectPoolImpl
     *         implementation returns <tt>null</tt> when nothing matches)
     */
    Entry getObjectById( String objId );
}

View File

@ -36,6 +36,8 @@ import org.apache.poi.hwpf.sprm.CharacterSprmCompressor;
import org.apache.poi.hwpf.sprm.ParagraphSprmCompressor;
import org.apache.poi.hwpf.sprm.SprmBuffer;
import org.apache.poi.util.LittleEndian;
import org.apache.poi.util.POILogFactory;
import org.apache.poi.util.POILogger;
/**
* This class is the central class of the HWPF object model. All properties that
@ -52,6 +54,8 @@ import org.apache.poi.util.LittleEndian;
*/
public class Range { // TODO -instantiable superclass
/** Keyed by Range.class, so a single logger is shared by every instance */
private static final POILogger logger = POILogFactory.getLogger( Range.class );
public static final int TYPE_PARAGRAPH = 0;
public static final int TYPE_CHARACTER = 1;
public static final int TYPE_SECTION = 2;
@ -888,9 +892,12 @@ public class Range { // TODO -instantiable superclass
initAll();
if ( tableEndInclusive >= this._parEnd )
{
throw new ArrayIndexOutOfBoundsException(
"The table's bounds fall outside of this Range" );
logger.log( POILogger.WARN, "The table's bounds ", "["
+ this._parStart + "; " + tableEndInclusive + ")",
" fall outside of this Range paragraphs numbers ", "["
+ this._parStart + "; " + this._parEnd + ")" );
}
if ( tableEndInclusive < 0 )
{
throw new ArrayIndexOutOfBoundsException(

View File

@ -0,0 +1,22 @@
package org.apache.poi.hwpf.converter;
import junit.framework.TestCase;
import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.HWPFTestDataSamples;
/**
 * Tests for the text produced by {@link WordToTextConverter}.
 */
public class TestWordToTextConverter extends TestCase
{
    /**
     * [FAILING] Bug 47731 - Word Extractor considers text copied from some
     * website as an embedded object
     */
    public void testBug47731() throws Exception
    {
        final String expectedFragment = "Soak the rice in water for three to four hours";

        HWPFDocument document = HWPFTestDataSamples
                .openSampleFile( "Bug47731.doc" );
        String extracted = WordToTextConverter.getText( document );

        assertTrue( extracted.contains( expectedFragment ) );
    }
}

View File

@ -33,6 +33,16 @@ import org.apache.poi.poifs.filesystem.POIFSFileSystem;
* @author Nick Burch (nick at torchbox dot com)
*/
public final class TestWordExtractor extends TestCase {
/**
 * Compares two strings ignoring the line-ending style and surrounding
 * whitespace: "\r\n" and lone "\r" are both converted to "\n" and both
 * values are trimmed before they are compared.
 * <p>
 * If either argument is <tt>null</tt>, both are passed unchanged to
 * {@link TestCase#assertEquals(String, String)}, so a null value is
 * reported as a regular assertion result instead of a
 * NullPointerException raised by this helper.
 *
 * @param expected
 *            expected text
 * @param actual
 *            text produced by the code under test
 */
public static void assertEquals( String expected, String actual )
{
    if ( expected == null || actual == null )
    {
        // Nothing to normalise; let JUnit decide (null/null passes)
        TestCase.assertEquals( expected, actual );
        return;
    }

    String newExpected = expected.replaceAll( "\r\n", "\n" )
            .replaceAll( "\r", "\n" ).trim();
    String newActual = actual.replaceAll( "\r\n", "\n" )
            .replaceAll( "\r", "\n" ).trim();
    TestCase.assertEquals( newExpected, newActual );
}
private String[] p_text1 = new String[] {
"This is a simple word document\r\n",
"\r\n",
@ -109,9 +119,11 @@ public final class TestWordExtractor extends TestCase {
// For the 2nd, should give similar answers for
// the two methods, differing only in line endings
assertEquals(
extractor2.getTextFromPieces().replaceAll("[\\r\\n]", ""),
extractor2.getText().replaceAll("[\\r\\n]", ""));
// nope, they must have different results, because of garbage
// assertEquals(
// extractor2.getTextFromPieces().replaceAll("[\\r\\n]", ""),
// extractor2.getText().replaceAll("[\\r\\n]", ""));
}
/**
@ -330,7 +342,7 @@ public final class TestWordExtractor extends TestCase {
// Open directly
for(DirectoryNode dir : files) {
WordExtractor extractor = new WordExtractor(dir, null);
WordExtractor extractor = new WordExtractor(dir);
assertEquals(p_text1_block, extractor.getText());
}

View File

@ -43,6 +43,15 @@ import org.apache.poi.util.IOUtils;
public class TestBugs extends TestCase
{
/**
 * Compares two strings ignoring the line-ending style and surrounding
 * whitespace: "\r\n" and lone "\r" are both converted to "\n" and both
 * values are trimmed before they are compared.
 * <p>
 * If either argument is <tt>null</tt>, both are passed unchanged to
 * {@link TestCase#assertEquals(String, String)}, so a null value is
 * reported as a regular assertion result instead of a
 * NullPointerException raised by this helper.
 *
 * @param expected
 *            expected text
 * @param actual
 *            text produced by the code under test
 */
public static void assertEquals( String expected, String actual )
{
    if ( expected == null || actual == null )
    {
        // Nothing to normalise; let JUnit decide (null/null passes)
        TestCase.assertEquals( expected, actual );
        return;
    }

    String newExpected = expected.replaceAll( "\r\n", "\n" )
            .replaceAll( "\r", "\n" ).trim();
    String newActual = actual.replaceAll( "\r\n", "\n" )
            .replaceAll( "\r", "\n" ).trim();
    TestCase.assertEquals( newExpected, newActual );
}
private static void assertTableStructures( Range expected, Range actual )
{
assertEquals( expected.numParagraphs(), actual.numParagraphs() );