replace ComplexFileTable with single-element-one right after load; replace text piece table as well

git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@1150675 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Sergey Vladimirov 2011-07-25 12:58:09 +00:00
parent 960c02c10c
commit c10da4c31a
15 changed files with 323 additions and 391 deletions

View File

@ -23,8 +23,6 @@ import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.util.Iterator;
import java.util.List;
import org.apache.poi.hwpf.model.BookmarksTables;
import org.apache.poi.hwpf.model.CHPBinTable;
@ -40,6 +38,7 @@ import org.apache.poi.hwpf.model.NoteType;
import org.apache.poi.hwpf.model.NotesTables;
import org.apache.poi.hwpf.model.PAPBinTable;
import org.apache.poi.hwpf.model.PicturesTable;
import org.apache.poi.hwpf.model.PieceDescriptor;
import org.apache.poi.hwpf.model.RevisionMarkAuthorTable;
import org.apache.poi.hwpf.model.SavedByTable;
import org.apache.poi.hwpf.model.SectionTable;
@ -92,7 +91,7 @@ public final class HWPFDocument extends HWPFDocumentCore
* structure*/
protected ComplexFileTable _cft;
protected TextPieceTable _tpt;
protected final StringBuilder _text;
/** Holds the save history for this document. */
protected SavedByTable _sbt;
@ -139,6 +138,7 @@ public final class HWPFDocument extends HWPFDocumentCore
protected HWPFDocument()
{
super();
this._text = new StringBuilder("\r");
}
/**
@ -246,15 +246,35 @@ public final class HWPFDocument extends HWPFDocumentCore
// Start to load up our standard structures.
_dop = new DocumentProperties(_tableStream, _fib.getFcDop());
_cft = new ComplexFileTable(_mainStream, _tableStream, _fib.getFcClx(), fcMin);
_tpt = _cft.getTextPieceTable();
TextPieceTable _tpt = _cft.getTextPieceTable();
// Now load the rest of the properties, which need to be adjusted
// for where the text really begins
_cbt = new CHPBinTable(_mainStream, _tableStream, _fib.getFcPlcfbteChpx(), _fib.getLcbPlcfbteChpx(), _tpt);
_pbt = new PAPBinTable(_mainStream, _tableStream, _dataStream, _fib.getFcPlcfbtePapx(), _fib.getLcbPlcfbtePapx(), _tpt);
_text = _tpt.getText();
_cbt.rebuild( _cft );
_pbt.rebuild( _dataStream, _cft );
_pbt.rebuild( _text, _dataStream, _cft );
boolean preserve = false;
try
{
preserve = Boolean.parseBoolean( System
.getProperty( "org.apache.poi.hwpf.preserveTextTable" ) );
}
catch ( Exception exc )
{
// ignore;
}
if ( !preserve )
{
_cft = new ComplexFileTable();
_tpt = _cft.getTextPieceTable();
_tpt.add( new TextPiece( 0, _text.length(), _text.toString()
.getBytes( "UTF-16LE" ), new PieceDescriptor( new byte[8],
0 ) ) );
}
// Read FSPA and Escher information
_fspa = new FSPATable(_tableStream, _fib.getFcPlcspaMom(), _fib.getLcbPlcspaMom(), getTextTable().getTextPieces());
@ -314,6 +334,12 @@ public final class HWPFDocument extends HWPFDocumentCore
return _cft.getTextPieceTable();
}
/**
 * Gives access to the text buffer held in the {@code _text} field.
 *
 * @return the buffer itself, not a copy
 */
@Override
public StringBuilder getText()
{
    return this._text;
}
@Deprecated
public CPSplitCalculator getCPSplitCalculator()
{
@ -326,10 +352,7 @@ public final class HWPFDocument extends HWPFDocumentCore
}
public Range getOverallRange() {
// hack to get the ending cp of the document, Have to revisit this.
TextPiece p = _tpt.getTextPieces().get(_tpt.getTextPieces().size() - 1);
return new Range(0, p.getEnd(), this);
return new Range(0, _text.length(), this);
}
/**
@ -445,16 +468,7 @@ public final class HWPFDocument extends HWPFDocumentCore
*/
public int characterLength()
{
List<TextPiece> textPieces = _tpt.getTextPieces();
Iterator<TextPiece> textIt = textPieces.iterator();
int length = 0;
while(textIt.hasNext())
{
TextPiece tp = textIt.next();
length += tp.characterLength();
}
return length;
return _text.length();
}
/**
@ -643,7 +657,7 @@ public final class HWPFDocument extends HWPFDocumentCore
// write out the PAPBinTable.
_fib.setFcPlcfbtePapx(tableOffset);
_pbt.writeTo(docSys, fcMin);
_pbt.writeTo(docSys, fcMin, _cft.getTextPieceTable());
_fib.setLcbPlcfbtePapx(tableStream.getOffset() - tableOffset);
tableOffset = tableStream.getOffset();

View File

@ -35,6 +35,7 @@ import org.apache.poi.hwpf.usermodel.Range;
import org.apache.poi.poifs.filesystem.DirectoryNode;
import org.apache.poi.poifs.filesystem.DocumentEntry;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
import org.apache.poi.util.Internal;
/**
@ -161,8 +162,20 @@ public abstract class HWPFDocumentCore extends POIDocument
*/
public abstract Range getOverallRange();
public abstract TextPieceTable getTextTable();
/**
 * Returns the document text, i.e. the text information from all text
 * pieces, including OLE descriptions and field codes.
 *
 * @return the current content of {@link #getText()} converted to a
 *         {@link String}
 */
public String getDocumentText()
{
    final StringBuilder buffer = getText();
    return buffer.toString();
}
/**
 * Internal accessor for the document text. Unlike
 * {@link #getDocumentText()}, which returns a {@link String}, this method
 * returns a {@link StringBuilder}.
 *
 * @return the document text as a {@link StringBuilder}
 */
@Internal
public abstract StringBuilder getText();
public CHPBinTable getCharacterTable()
{
return _cbt;
@ -197,4 +210,6 @@ public abstract class HWPFDocumentCore extends POIDocument
{
return _fib;
}
public abstract TextPieceTable getTextTable();
}

View File

@ -38,6 +38,8 @@ import org.apache.poi.util.LittleEndian;
public class HWPFOldDocument extends HWPFDocumentCore {
private TextPieceTable tpt;
private StringBuilder _text;
public HWPFOldDocument(POIFSFileSystem fs) throws IOException {
this(fs.getRoot());
}
@ -88,13 +90,15 @@ public class HWPFOldDocument extends HWPFDocumentCore {
byte[] textData = new byte[_fib.getFcMac()-_fib.getFcMin()];
System.arraycopy(_mainStream, _fib.getFcMin(), textData, 0, textData.length);
TextPiece tp = new TextPiece(
0, textData.length, textData, pd, 0
0, textData.length, textData, pd
);
tpt.add(tp);
text.append(tp.getStringBuffer());
}
_text = tpt.getText();
// Now we can fetch the character and paragraph properties
_cbt = new OldCHPBinTable(
_mainStream, chpTableOffset, chpTableSize,
@ -126,6 +130,12 @@ public class HWPFOldDocument extends HWPFDocumentCore {
return tpt;
}
/**
 * Gives access to the text buffer held in the {@code _text} field.
 *
 * @return the buffer itself, not a copy
 */
@Override
public StringBuilder getText()
{
    return this._text;
}
@Override
public void write(OutputStream out) throws IOException {
throw new IllegalStateException("Writing is not available for the older file formats");

View File

@ -23,9 +23,7 @@ import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
@ -37,10 +35,7 @@ import org.apache.poi.hwpf.OldWordFileFormatException;
import org.apache.poi.hwpf.model.CHPX;
import org.apache.poi.hwpf.model.FieldsDocumentPart;
import org.apache.poi.hwpf.model.FileInformationBlock;
import org.apache.poi.hwpf.model.GenericPropertyNode;
import org.apache.poi.hwpf.model.PAPFormattedDiskPage;
import org.apache.poi.hwpf.model.PAPX;
import org.apache.poi.hwpf.model.PlexOfCps;
import org.apache.poi.hwpf.model.StyleSheet;
import org.apache.poi.hwpf.model.TextPiece;
import org.apache.poi.hwpf.sprm.SprmIterator;
@ -51,10 +46,8 @@ import org.apache.poi.hwpf.usermodel.Field;
import org.apache.poi.hwpf.usermodel.Paragraph;
import org.apache.poi.hwpf.usermodel.Picture;
import org.apache.poi.hwpf.usermodel.Range;
import org.apache.poi.poifs.common.POIFSConstants;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
import org.apache.poi.util.IOUtils;
import org.apache.poi.util.LittleEndian;
/**
* Used by developers to list out key information on a HWPF file. End users will
@ -241,13 +234,10 @@ public final class HWPFLister
private LinkedHashMap<Integer, String> paragraphs;
private String text;
public HWPFLister( HWPFDocumentCore doc )
{
_doc = doc;
buildText();
buildParagraphs();
}
@ -256,6 +246,7 @@ public final class HWPFLister
paragraphs = new LinkedHashMap<Integer, String>();
StringBuilder part = new StringBuilder();
String text = _doc.getDocumentText();
for ( int charIndex = 0; charIndex < text.length(); charIndex++ )
{
char c = text.charAt( charIndex );
@ -268,24 +259,6 @@ public final class HWPFLister
}
}
private void buildText()
{
StringBuilder builder = new StringBuilder();
for ( TextPiece textPiece : _doc.getTextTable().getTextPieces() )
{
String toAppend = textPiece.getStringBuffer().toString();
if ( toAppend.length() != ( textPiece.getEnd() - textPiece
.getStart() ) )
{
throw new AssertionError();
}
builder.replace( textPiece.getStart(), textPiece.getEnd(), toAppend );
}
this.text = builder.toString();
}
private void dumpBookmarks()
{
if ( !( _doc instanceof HWPFDocument ) )
@ -379,69 +352,69 @@ public final class HWPFLister
public void dumpPapx( boolean withProperties ) throws Exception
{
if ( _doc instanceof HWPFDocument )
{
System.out.println( "binary PAP pages " );
HWPFDocument doc = (HWPFDocument) _doc;
java.lang.reflect.Field fMainStream = HWPFDocumentCore.class
.getDeclaredField( "_mainStream" );
fMainStream.setAccessible( true );
byte[] mainStream = (byte[]) fMainStream.get( _doc );
PlexOfCps binTable = new PlexOfCps( doc.getTableStream(), doc
.getFileInformationBlock().getFcPlcfbtePapx(), doc
.getFileInformationBlock().getLcbPlcfbtePapx(), 4 );
List<PAPX> papxs = new ArrayList<PAPX>();
int length = binTable.length();
for ( int x = 0; x < length; x++ )
{
GenericPropertyNode node = binTable.getProperty( x );
int pageNum = LittleEndian.getInt( node.getBytes() );
int pageOffset = POIFSConstants.SMALLER_BIG_BLOCK_SIZE
* pageNum;
PAPFormattedDiskPage pfkp = new PAPFormattedDiskPage(
mainStream, doc.getDataStream(), pageOffset,
doc.getTextTable() );
System.out.println( "* PFKP: " + pfkp );
for ( PAPX papx : pfkp.getPAPXs() )
{
System.out.println( "** " + papx );
papxs.add( papx );
if ( papx != null && true )
{
SprmIterator sprmIt = new SprmIterator(
papx.getGrpprl(), 2 );
while ( sprmIt.hasNext() )
{
SprmOperation sprm = sprmIt.next();
System.out.println( "*** " + sprm.toString() );
}
}
}
}
Collections.sort( papxs );
System.out.println( "* Sorted by END" );
for ( PAPX papx : papxs )
{
System.out.println( "** " + papx );
SprmIterator sprmIt = new SprmIterator( papx.getGrpprl(), 2 );
while ( sprmIt.hasNext() )
{
SprmOperation sprm = sprmIt.next();
System.out.println( "*** " + sprm.toString() );
}
}
}
// if ( _doc instanceof HWPFDocument )
// {
// System.out.println( "binary PAP pages " );
//
// HWPFDocument doc = (HWPFDocument) _doc;
//
// java.lang.reflect.Field fMainStream = HWPFDocumentCore.class
// .getDeclaredField( "_mainStream" );
// fMainStream.setAccessible( true );
// byte[] mainStream = (byte[]) fMainStream.get( _doc );
//
// PlexOfCps binTable = new PlexOfCps( doc.getTableStream(), doc
// .getFileInformationBlock().getFcPlcfbtePapx(), doc
// .getFileInformationBlock().getLcbPlcfbtePapx(), 4 );
//
// List<PAPX> papxs = new ArrayList<PAPX>();
//
// int length = binTable.length();
// for ( int x = 0; x < length; x++ )
// {
// GenericPropertyNode node = binTable.getProperty( x );
//
// int pageNum = LittleEndian.getInt( node.getBytes() );
// int pageOffset = POIFSConstants.SMALLER_BIG_BLOCK_SIZE
// * pageNum;
//
// PAPFormattedDiskPage pfkp = new PAPFormattedDiskPage(
// mainStream, doc.getDataStream(), pageOffset,
// doc.getTextTable() );
//
// System.out.println( "* PFKP: " + pfkp );
//
// for ( PAPX papx : pfkp.getPAPXs() )
// {
// System.out.println( "** " + papx );
// papxs.add( papx );
// if ( papx != null && true )
// {
// SprmIterator sprmIt = new SprmIterator(
// papx.getGrpprl(), 2 );
// while ( sprmIt.hasNext() )
// {
// SprmOperation sprm = sprmIt.next();
// System.out.println( "*** " + sprm.toString() );
// }
// }
//
// }
// }
//
// Collections.sort( papxs );
// System.out.println( "* Sorted by END" );
// for ( PAPX papx : papxs )
// {
// System.out.println( "** " + papx );
// SprmIterator sprmIt = new SprmIterator( papx.getGrpprl(), 2 );
// while ( sprmIt.hasNext() )
// {
// SprmOperation sprm = sprmIt.next();
// System.out.println( "*** " + sprm.toString() );
// }
// }
// }
// for ( PAPX papx : _doc.getParagraphTable().getParagraphs() )
// {

View File

@ -20,13 +20,11 @@ package org.apache.poi.hwpf.extractor;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.UnsupportedEncodingException;
import java.util.ArrayList;
import java.util.Arrays;
import org.apache.poi.POIOLE2TextExtractor;
import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.model.TextPiece;
import org.apache.poi.hwpf.usermodel.HeaderStories;
import org.apache.poi.hwpf.usermodel.Paragraph;
import org.apache.poi.hwpf.usermodel.Range;
@ -218,22 +216,7 @@ public final class WordExtractor extends POIOLE2TextExtractor {
* mapping is broken. Fast too.
*/
public String getTextFromPieces() {
StringBuffer textBuf = new StringBuffer();
for(TextPiece piece : doc.getTextTable().getTextPieces()) {
String encoding = "Cp1252";
if (piece.isUnicode()) {
encoding = "UTF-16LE";
}
try {
String text = new String(piece.getRawBytes(), encoding);
textBuf.append(text);
} catch(UnsupportedEncodingException e) {
throw new InternalError("Standard Encoding " + encoding + " not found, JVM broken");
}
}
String text = textBuf.toString();
String text = doc.getDocumentText();
// Fix line endings (Note - won't get all of them)
text = text.replaceAll("\r\r\r", "\r\n\r\n\r\n");

View File

@ -179,34 +179,6 @@ public class CHPBinTable
start = System.currentTimeMillis();
}
// rebuild document paragraphs structure
StringBuilder docText = new StringBuilder();
for ( TextPiece textPiece : tpt.getTextPieces() )
{
String toAppend = textPiece.getStringBuffer().toString();
int toAppendLength = toAppend.length();
if ( toAppendLength != textPiece.getEnd() - textPiece.getStart() )
{
logger.log(
POILogger.WARN,
"Text piece has boundaries [",
Integer.valueOf( textPiece.getStart() ),
"; ",
Integer.valueOf( textPiece.getEnd() ),
") but length ",
Integer.valueOf( textPiece.getEnd()
- textPiece.getStart() ) );
}
docText.replace( textPiece.getStart(), textPiece.getStart()
+ toAppendLength, toAppend );
}
logger.log( POILogger.DEBUG, "Document text rebuilded in ",
Long.valueOf( System.currentTimeMillis() - start ), " ms (",
Integer.valueOf( docText.length() ), " chars)" );
start = System.currentTimeMillis();
List<CHPX> oldChpxSortedByStartPos = new ArrayList<CHPX>( _textRuns );
Collections.sort( oldChpxSortedByStartPos,
PropertyNode.StartComparator.instance );

View File

@ -54,9 +54,6 @@ public class PAPBinTable
protected ArrayList<PAPX> _paragraphs = new ArrayList<PAPX>();
byte[] _dataStream;
/** So we can know if things are unicode or not */
private TextPieceTable tpt;
public PAPBinTable()
{
}
@ -81,7 +78,6 @@ public class PAPBinTable
{
PlexOfCps binTable = new PlexOfCps( tableStream, offset, size, 4 );
this.tpt = tpt;
int length = binTable.length();
for ( int x = 0; x < length; x++ )
@ -112,7 +108,8 @@ public class PAPBinTable
Integer.valueOf( _paragraphs.size() ), " elements)" );
}
public void rebuild( byte[] dataStream, ComplexFileTable complexFileTable )
public void rebuild( final StringBuilder docText, byte[] dataStream,
ComplexFileTable complexFileTable )
{
long start = System.currentTimeMillis();
@ -121,7 +118,8 @@ public class PAPBinTable
SprmBuffer[] sprmBuffers = complexFileTable.getGrpprls();
// adding PAPX from fast-saved SPRMs
for ( TextPiece textPiece : tpt.getTextPieces() )
for ( TextPiece textPiece : complexFileTable.getTextPieceTable()
.getTextPieces() )
{
PropertyModifier prm = textPiece.getPieceDescriptor().getPrm();
if ( !prm.isComplex() )
@ -167,34 +165,6 @@ public class PAPBinTable
start = System.currentTimeMillis();
}
// rebuild document paragraphs structure
StringBuilder docText = new StringBuilder();
for ( TextPiece textPiece : tpt.getTextPieces() )
{
String toAppend = textPiece.getStringBuffer().toString();
int toAppendLength = toAppend.length();
if ( toAppendLength != textPiece.getEnd() - textPiece.getStart() )
{
logger.log(
POILogger.WARN,
"Text piece has boundaries [",
Integer.valueOf( textPiece.getStart() ),
"; ",
Integer.valueOf( textPiece.getEnd() ),
") but length ",
Integer.valueOf( textPiece.getEnd()
- textPiece.getStart() ) );
}
docText.replace( textPiece.getStart(), textPiece.getStart()
+ toAppendLength, toAppend );
}
logger.log( POILogger.DEBUG, "Document text rebuilded in ",
Long.valueOf( System.currentTimeMillis() - start ), " ms (",
Integer.valueOf( docText.length() ), " chars)" );
start = System.currentTimeMillis();
List<PAPX> oldPapxSortedByEndPos = new ArrayList<PAPX>( _paragraphs );
Collections.sort( oldPapxSortedByEndPos,
PropertyNode.EndComparator.instance );
@ -274,7 +244,8 @@ public class PAPBinTable
{
// can we reuse existing?
PAPX existing = papxs.get( 0 );
if ( existing.getStart() == startInclusive && existing.getEnd() == endExclusive )
if ( existing.getStart() == startInclusive
&& existing.getEnd() == endExclusive )
{
newPapxs.add( existing );
lastParStart = endExclusive;
@ -311,7 +282,8 @@ public class PAPBinTable
this._paragraphs = new ArrayList<PAPX>( newPapxs );
logger.log( POILogger.DEBUG, "PAPX rebuilded from document text in ",
Long.valueOf( System.currentTimeMillis() - start ), " ms" );
Long.valueOf( System.currentTimeMillis() - start ), " ms (",
Integer.valueOf( _paragraphs.size() ), " elements)" );
start = System.currentTimeMillis();
_dataStream = dataStream;
@ -320,7 +292,7 @@ public class PAPBinTable
public void insert(int listIndex, int cpStart, SprmBuffer buf)
{
PAPX forInsert = new PAPX(0, 0, tpt, buf, _dataStream);
PAPX forInsert = new PAPX(0, 0, buf, _dataStream);
// Ensure character offsets are really characters
forInsert.setStart(cpStart);
@ -350,7 +322,7 @@ public class PAPBinTable
// Original, until insert at point
// New one
// Clone of original, on to the old end
PAPX clone = new PAPX(0, 0, tpt, clonedBuf, _dataStream);
PAPX clone = new PAPX(0, 0, clonedBuf, _dataStream);
// Again ensure contains character based offsets no matter what
clone.setStart(cpStart);
clone.setEnd(currentPap.getEnd());
@ -427,9 +399,8 @@ public class PAPBinTable
return _paragraphs;
}
public void writeTo(HWPFFileSystem sys, int fcMin)
throws IOException
{
public void writeTo( HWPFFileSystem sys, int fcMin, CharIndexTranslator translator ) throws IOException
{
HWPFOutputStream docStream = sys.getStream("WordDocument");
OutputStream tableStream = sys.getStream("1Table");
@ -463,7 +434,7 @@ public class PAPBinTable
PAPFormattedDiskPage pfkp = new PAPFormattedDiskPage(_dataStream);
pfkp.fill(overflow);
byte[] bufFkp = pfkp.toByteArray(tpt, fcMin);
byte[] bufFkp = pfkp.toByteArray(translator, fcMin);
docStream.write(bufFkp);
overflow = pfkp.getOverflow();

View File

@ -19,6 +19,9 @@ package org.apache.poi.hwpf.model;
import java.io.UnsupportedEncodingException;
import org.apache.poi.util.Internal;
/**
* Lightweight representation of a text piece.
* Works in the character domain, not the byte domain, so you
@ -27,19 +30,39 @@ import java.io.UnsupportedEncodingException;
*
* @author Ryan Ackley
*/
@Internal
public final class TextPiece extends PropertyNode<TextPiece>
{
private boolean _usesUnicode;
private PieceDescriptor _pd;
/**
* @param start Beginning offset in main document stream, in characters.
* @param end Ending offset in main document stream, in characters.
* @param text The raw bytes of our text
*/
public TextPiece(int start, int end, byte[] text, PieceDescriptor pd, int cpStart) {
/**
* @param start
* Beginning offset in main document stream, in characters.
* @param end
* Ending offset in main document stream, in characters.
* @param text
* The raw bytes of our text
* @deprecated Use {@link #TextPiece(int,int,byte[],PieceDescriptor)}
* instead
*/
public TextPiece( int start, int end, byte[] text, PieceDescriptor pd,
int cpStart )
{
this( start, end, text, pd );
}
/**
* @param start
* Beginning offset in main document stream, in characters.
* @param end
* Ending offset in main document stream, in characters.
* @param text
* The raw bytes of our text
*/
public TextPiece( int start, int end, byte[] text, PieceDescriptor pd )
{
super(start, end, buildInitSB(text, pd));
_usesUnicode = pd.isUnicode();
_pd = pd;

View File

@ -24,6 +24,8 @@ import java.util.List;
import org.apache.poi.hwpf.model.io.HWPFOutputStream;
import org.apache.poi.poifs.common.POIFSConstants;
import org.apache.poi.util.POILogFactory;
import org.apache.poi.util.POILogger;
/**
* The piece table for matching up character positions to bits of text. This
@ -34,6 +36,9 @@ import org.apache.poi.poifs.common.POIFSConstants;
*/
public class TextPieceTable implements CharIndexTranslator
{
private static final POILogger logger = POILogFactory
.getLogger( TextPieceTable.class );
// int _multiple;
int _cpMin;
protected ArrayList<TextPiece> _textPieces = new ArrayList<TextPiece>();
@ -101,7 +106,7 @@ public class TextPieceTable implements CharIndexTranslator
// And now build the piece
_textPieces.add( new TextPiece( nodeStartChars, nodeEndChars, buf,
pieces[x], node.getStart() ) );
pieces[x] ) );
}
// In the interest of our sanity, now sort the text pieces
@ -251,6 +256,41 @@ public class TextPieceTable implements CharIndexTranslator
return _cpMin;
}
/**
 * Assembles the document text from the text pieces held by this table.
 * <p>
 * Each piece's characters are written into the result starting at the
 * piece's start offset. A piece whose decoded character count differs from
 * its declared boundaries is still written in full, and a warning is
 * logged.
 *
 * @return a newly created {@link StringBuilder} with the assembled text
 */
public StringBuilder getText()
{
    final long start = System.currentTimeMillis();

    // rebuild the document text from the individual text pieces
    StringBuilder docText = new StringBuilder();
    for ( TextPiece textPiece : _textPieces )
    {
        String toAppend = textPiece.getStringBuffer().toString();
        int toAppendLength = toAppend.length();

        if ( toAppendLength != textPiece.getEnd() - textPiece.getStart() )
        {
            // Report the actual decoded length: the boundary span is
            // already implied by the two offsets and would hide the
            // mismatch that triggered this warning.
            logger.log(
                    POILogger.WARN,
                    "Text piece has boundaries [",
                    Integer.valueOf( textPiece.getStart() ),
                    "; ",
                    Integer.valueOf( textPiece.getEnd() ),
                    ") but length ",
                    Integer.valueOf( toAppendLength ) );
        }

        // NOTE(review): StringBuilder.replace() throws if the piece starts
        // beyond the current end of the buffer - presumably pieces arrive
        // sorted and without gaps; confirm against the loader.
        docText.replace( textPiece.getStart(), textPiece.getStart()
                + toAppendLength, toAppend );
    }

    logger.log( POILogger.DEBUG, "Document text were rebuilded in ",
            Long.valueOf( System.currentTimeMillis() - start ), " ms (",
            Integer.valueOf( docText.length() ), " chars)" );

    return docText;
}
public List<TextPiece> getTextPieces()
{
return _textPieces;

View File

@ -31,7 +31,7 @@ import org.apache.poi.hwpf.model.PropertyNode;
import org.apache.poi.hwpf.model.SEPX;
import org.apache.poi.hwpf.model.StyleSheet;
import org.apache.poi.hwpf.model.SubdocumentType;
import org.apache.poi.hwpf.model.TextPiece;
import org.apache.poi.hwpf.model.TextPieceTable;
import org.apache.poi.hwpf.sprm.CharacterSprmCompressor;
import org.apache.poi.hwpf.sprm.ParagraphSprmCompressor;
import org.apache.poi.hwpf.sprm.SprmBuffer;
@ -108,18 +108,8 @@ public class Range { // TODO -instantiable superclass
/** The end index in the characterRuns list for this Range. */
protected int _charEnd;
/** Have we loaded the Text indexes yet */
protected boolean _textRangeFound;
/** All text pieces that belong to the document this Range belongs to. */
protected List<TextPiece> _text;
/** The start index in the text list for this Range. */
protected int _textStart;
/** The end index in the text list for this Range. */
protected int _textEnd;
protected StringBuilder _text;
// protected Range()
// {
//
@ -144,7 +134,7 @@ public class Range { // TODO -instantiable superclass
_sections = _doc.getSectionTable().getSections();
_paragraphs = _doc.getParagraphTable().getParagraphs();
_characters = _doc.getCharacterTable().getTextRuns();
_text = _doc.getTextTable().getTextPieces();
_text = _doc.getText();
_parent = new WeakReference<Range>(null);
sanityCheckStartEnd();
@ -171,6 +161,7 @@ public class Range { // TODO -instantiable superclass
_parent = new WeakReference<Range>(parent);
sanityCheckStartEnd();
assert sanityCheck();
}
/**
@ -212,23 +203,17 @@ public class Range { // TODO -instantiable superclass
}
}
/**
* Does any <code>TextPiece</code> in this Range use unicode?
*
* @return true if it does and false if it doesn't
*/
public boolean usesUnicode() {
initText();
for (int i = _textStart; i < _textEnd; i++) {
TextPiece piece = _text.get(i);
if (piece.isUnicode())
return true;
}
return false;
}
/**
 * Reports whether this Range uses unicode text. The answer is a constant.
 *
 * @return always {@code true}
 * @deprecated Range is not linked to any text piece anymore, so to check if
 *             unicode is used please access {@link TextPieceTable} during
 *             document load time
 */
@Deprecated
public boolean usesUnicode()
{
return true;
}
/**
* Gets the text that this Range contains.
@ -236,29 +221,7 @@ public class Range { // TODO -instantiable superclass
* @return The text for this range.
*/
public String text() {
initText();
StringBuffer sb = new StringBuffer();
for (int x = _textStart; x < _textEnd; x++) {
TextPiece piece = _text.get(x);
// Figure out where in this piece the text
// we're after lives
int rStart = 0;
int rEnd = piece.characterLength();
if (_start > piece.getStart()) {
rStart = _start - piece.getStart();
}
if (_end < piece.getEnd()) {
rEnd -= (piece.getEnd() - _end);
}
// Luckily TextPieces work in characters, so we don't
// need to worry about unicode here
sb.append(piece.substring(rStart, rEnd));
}
return sb.toString();
return _text.substring( _start, _end );
}
/**
@ -346,67 +309,52 @@ public class Range { // TODO -instantiable superclass
return _charEnd - _charStart;
}
/**
* Inserts text into the front of this range.
*
* @param text
* The text to insert
* @return The character run that text was inserted into.
*/
public CharacterRun insertBefore(String text)
// throws UnsupportedEncodingException
{
initAll();
/**
* Inserts text into the front of this range.
*
* @param text
* The text to insert
* @return The character run that text was inserted into.
*/
public CharacterRun insertBefore( String text )
{
initAll();
TextPiece tp = _text.get(_textStart);
StringBuffer sb = tp.getStringBuffer();
_text.insert( _start, text );
_doc.getCharacterTable().adjustForInsert( _charStart, text.length() );
_doc.getParagraphTable().adjustForInsert( _parStart, text.length() );
_doc.getSectionTable().adjustForInsert( _sectionStart, text.length() );
adjustForInsert( text.length() );
// Since this is the first item in our list, it is safe to assume that
// _start >= tp.getStart()
int insertIndex = _start - tp.getStart();
sb.insert(insertIndex, text);
// update the FIB.CCPText + friends fields
adjustFIB( text.length() );
int adjustedLength = _doc.getTextTable().adjustForInsert(_textStart, text.length());
_doc.getCharacterTable().adjustForInsert(_charStart, adjustedLength);
_doc.getParagraphTable().adjustForInsert(_parStart, adjustedLength);
_doc.getSectionTable().adjustForInsert(_sectionStart, adjustedLength);
adjustForInsert(adjustedLength);
assert sanityCheck();
// update the FIB.CCPText + friends fields
adjustFIB(text.length());
return getCharacterRun( 0 );
}
return getCharacterRun(0);
}
/**
* Inserts text onto the end of this range
*
* @param text
* The text to insert
* @return The character run the text was inserted into.
*/
public CharacterRun insertAfter( String text )
{
initAll();
/**
* Inserts text onto the end of this range
*
* @param text
* The text to insert
* @return The character run the text was inserted into.
*/
public CharacterRun insertAfter(String text) {
initAll();
_text.insert( _end, text );
int listIndex = _textEnd - 1;
TextPiece tp = _text.get(listIndex);
StringBuffer sb = tp.getStringBuffer();
_doc.getCharacterTable().adjustForInsert( _charEnd - 1, text.length() );
_doc.getParagraphTable().adjustForInsert( _parEnd - 1, text.length() );
_doc.getSectionTable().adjustForInsert( _sectionEnd - 1, text.length() );
adjustForInsert( text.length() );
int insertIndex = _end - tp.getStart();
if (tp.getStringBuffer().charAt(_end - 1) == '\r' && text.charAt(0) != '\u0007') {
insertIndex--;
}
sb.insert(insertIndex, text);
int adjustedLength = _doc.getTextTable().adjustForInsert(listIndex, text.length());
_doc.getCharacterTable().adjustForInsert(_charEnd - 1, adjustedLength);
_doc.getParagraphTable().adjustForInsert(_parEnd - 1, adjustedLength);
_doc.getSectionTable().adjustForInsert(_sectionEnd - 1, adjustedLength);
adjustForInsert(text.length());
return getCharacterRun(numCharacterRuns() - 1);
}
assert sanityCheck();
return getCharacterRun( numCharacterRuns() - 1 );
}
/**
* Inserts text into the front of this range and it gives that text the
@ -580,7 +528,6 @@ public class Range { // TODO -instantiable superclass
int numSections = _sections.size();
int numRuns = _characters.size();
int numParagraphs = _paragraphs.size();
int numTextPieces = _text.size();
for (int x = _charStart; x < numRuns; x++) {
CHPX chpx = _characters.get(x);
@ -605,10 +552,12 @@ public class Range { // TODO -instantiable superclass
// + " -> " + sepx.getEnd());
}
for (int x = _textStart; x < numTextPieces; x++) {
TextPiece piece = _text.get(x);
piece.adjustForDelete(_start, _end - _start);
}
_text.delete( _start, _end );
Range parent = _parent.get();
if ( parent != null )
{
parent.adjustForInsert( -( _end - _start ) );
}
// update the FIB.CCPText + friends field
adjustFIB(-(_end - _start));
@ -623,7 +572,7 @@ public class Range { // TODO -instantiable superclass
* @param rows
* The number of rows.
* @return The empty Table that is now part of the document.
* @deprecated Use code shall not work with {@link ParagraphProperties}
* @deprecated User code shall not work with {@link TableProperties}
*/
@Deprecated
public Table insertBefore(TableProperties props, int rows) {
@ -631,19 +580,28 @@ public class Range { // TODO -instantiable superclass
parProps.setFInTable(true);
parProps.setItap( 1 );
final int oldEnd = this._end;
int columns = props.getItcMac();
for (int x = 0; x < rows; x++) {
Paragraph cell = this.insertBefore(parProps, StyleSheet.NIL_STYLE);
cell.insertAfter(String.valueOf('\u0007'));
for (int y = 1; y < columns; y++) {
cell = cell.insertAfter(parProps, StyleSheet.NIL_STYLE);
cell.insertAfter(String.valueOf('\u0007'));
}
cell = cell.insertAfter(parProps, StyleSheet.NIL_STYLE, String.valueOf('\u0007'));
cell.setTableRowEnd(props);
}
return new Table(_start, _start + (rows * (columns + 1)) * 2, this, 1);
}
for ( int x = 0; x < rows; x++ )
{
Paragraph cell = this.insertBefore( parProps, StyleSheet.NIL_STYLE );
cell.insertAfter( String.valueOf( '\u0007' ) );
for ( int y = 1; y < columns; y++ )
{
cell = cell.insertAfter( parProps, StyleSheet.NIL_STYLE );
cell.insertAfter( String.valueOf( '\u0007' ) );
}
cell = cell.insertAfter( parProps, StyleSheet.NIL_STYLE,
String.valueOf( '\u0007' ) );
cell.setTableRowEnd( props );
}
final int newEnd = this._end;
final int diff = newEnd - oldEnd;
return new Table( _start, _start + diff, this, 1 );
}
/**
* Inserts a list into the beginning of this range.
@ -715,23 +673,14 @@ public class Range { // TODO -instantiable superclass
*/
public void replaceText(String pPlaceHolder, String pValue, int pOffset) {
int absPlaceHolderIndex = getStartOffset() + pOffset;
Range subRange = new Range(absPlaceHolderIndex, (absPlaceHolderIndex + pPlaceHolder
.length()), getDocument());
// this Range isn't a proper parent of the subRange() so we'll have to
// keep
// track of an updated endOffset on our own
int previousEndOffset = subRange.getEndOffset();
.length()), this);
subRange.insertBefore(pValue);
if (subRange.getEndOffset() != previousEndOffset) {
adjustForInsert(subRange.getEndOffset() - previousEndOffset);
}
// re-create the sub-range so we can delete it
subRange = new Range((absPlaceHolderIndex + pValue.length()), (absPlaceHolderIndex
+ pPlaceHolder.length() + pValue.length()), getDocument());
+ pPlaceHolder.length() + pValue.length()), this);
// deletes are automagically propagated
subRange.delete();
@ -921,7 +870,6 @@ public class Range { // TODO -instantiable superclass
* loads all of the list indexes.
*/
protected void initAll() {
initText();
initCharacterRuns();
initParagraphs();
initSections();
@ -951,18 +899,6 @@ public class Range { // TODO -instantiable superclass
}
}
/**
* inits the text piece list indexes.
*/
private void initText() {
if (!_textRangeFound) {
int[] point = findRange(_text, _textStart, _start, _end);
_textStart = point[0];
_textEnd = point[1];
_textRangeFound = true;
}
}
/**
* inits the section list indexes.
*/
@ -1038,7 +974,6 @@ public class Range { // TODO -instantiable superclass
* resets the list indexes.
*/
protected void reset() {
_textRangeFound = false;
_charRangeFound = false;
_parRangeFound = false;
_sectionRangeFound = false;
@ -1153,8 +1088,19 @@ public class Range { // TODO -instantiable superclass
* Method for debug purposes. Checks that all resolved elements are inside
* of current range.
*/
public void sanityCheck()
public boolean sanityCheck()
{
if ( _start < 0 )
throw new AssertionError();
if ( _start >= _text.length() )
throw new AssertionError();
if ( _end < 0 )
throw new AssertionError();
if ( _end > _text.length() )
throw new AssertionError();
if ( _start > _end )
throw new AssertionError();
if ( _charRangeFound )
{
for ( int c = _charStart; c < _charEnd; c++ )
@ -1181,5 +1127,7 @@ public class Range { // TODO -instantiable superclass
throw new AssertionError();
}
}
return true;
}
}

View File

@ -17,16 +17,13 @@
package org.apache.poi.hwpf.extractor;
import java.util.Iterator;
import junit.framework.TestCase;
import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.HWPFTestDataSamples;
import org.apache.poi.hwpf.model.TextPiece;
import org.apache.poi.hwpf.usermodel.Paragraph;
import org.apache.poi.hwpf.usermodel.Range;
import junit.framework.TestCase;
/**
* Test the different routes to extracting text
*
@ -78,24 +75,10 @@ public final class TestDifferentRoutes extends TestCase {
* Test textPieces based extraction
*/
public void testExtractFromTextPieces() throws Exception {
// NOTE(review): this listing appears to interleave the pre-change and
// post-change versions of the method (two assertEquals calls) - confirm
// which statements are current before relying on it.
// Rebuild the document text by decoding the raw bytes of every text piece.
StringBuffer textBuf = new StringBuffer();
Iterator textPieces = doc.getTextTable().getTextPieces().iterator();
while (textPieces.hasNext()) {
TextPiece piece = (TextPiece) textPieces.next();
// Cp1252 by default; pieces flagged as Unicode are decoded as UTF-16LE.
String encoding = "Cp1252";
if (piece.isUnicode()) {
encoding = "UTF-16LE";
}
String text = new String(piece.getRawBytes(), encoding);
textBuf.append(text);
}
// Expected text is the concatenation of all p_text entries, in order.
StringBuffer exp = new StringBuffer();
for (int i = 0; i < p_text.length; i++) {
exp.append(p_text[i]);
}
// Compare the expected text with the text rebuilt from the pieces.
assertEquals(exp.toString(), textBuf.toString());
// Compare the expected text with what the document reports directly.
assertEquals(exp.toString(), doc.getDocumentText());
}
}

View File

@ -53,7 +53,7 @@ public final class TestPAPBinTable extends TestCase
HWPFFileSystem fileSys = new HWPFFileSystem();
_pAPBinTable.writeTo( fileSys, 0 );
_pAPBinTable.writeTo( fileSys, 0, fakeTPT );
ByteArrayOutputStream tableOut = fileSys.getStream( "1Table" );
ByteArrayOutputStream mainOut = fileSys.getStream( "WordDocument" );

View File

@ -169,6 +169,7 @@ public final class TestTextPieceTable extends TestCase {
throws Exception
{
super.setUp();
System.setProperty( "org.apache.poi.hwpf.preserveTextTable", Boolean.TRUE.toString() );
_hWPFDocFixture = new HWPFDocFixture(this, HWPFDocFixture.DEFAULT_TEST_FILE);
_hWPFDocFixture.setUp();
@ -178,8 +179,9 @@ public final class TestTextPieceTable extends TestCase {
throws Exception
{
_hWPFDocFixture.tearDown();
_hWPFDocFixture = null;
System.setProperty( "org.apache.poi.hwpf.preserveTextTable", Boolean.FALSE.toString() );
super.tearDown();
}

View File

@ -103,10 +103,6 @@ public final class TestProblems extends HWPFTestCase {
assertEquals("One paragraph is ok\7", r.getParagraph(3).text());
assertEquals("\7", r.getParagraph(4).text());
assertEquals("\r", r.getParagraph(5).text());
for(int i=0; i<=5; i++) {
assertFalse(r.getParagraph(i).usesUnicode());
}
// Get the table
Table t = r.getTable(p);
@ -304,9 +300,6 @@ public final class TestProblems extends HWPFTestCase {
assertEquals("Row 3/Cell 3\u0007", r.getParagraph(10).text());
assertEquals("\u0007", r.getParagraph(11).text());
assertEquals("\r", r.getParagraph(12).text());
for(int i=0; i<=12; i++) {
assertFalse(r.getParagraph(i).usesUnicode());
}
Paragraph p;
@ -791,7 +784,9 @@ public final class TestProblems extends HWPFTestCase {
Paragraph actParagraph = actual.getParagraph( p );
assertEquals( expParagraph.text(), actParagraph.text() );
assertEquals( expParagraph.isInTable(), actParagraph.isInTable() );
assertEquals( "Diffent isInTable flags for paragraphs #" + p
+ " -- " + expParagraph + " -- " + actParagraph + ".",
expParagraph.isInTable(), actParagraph.isInTable() );
assertEquals( expParagraph.isTableRowEnd(),
actParagraph.isTableRowEnd() );

View File

@ -150,6 +150,8 @@ public final class TestRangeDelete extends TestCase {
assertEquals(searchText, subRange.text());
subRange.delete();
daDoc.getOverallRange().sanityCheck();
daDoc.getRange().sanityCheck();
// we need to let the model re-calculate the Range before we evaluate it
range = daDoc.getRange();
@ -166,6 +168,7 @@ public final class TestRangeDelete extends TestCase {
// this can lead to a StringIndexOutOfBoundsException, so we will add it
// even though we don't have an assertion for it
Range daRange = daDoc.getRange();
daRange.sanityCheck();
daRange.text();
}