Allow document rebuilding to be disabled, and use that option in HWPF Lister

git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@1155262 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Sergey Vladimirov 2011-08-09 08:37:21 +00:00
parent 9d434a3667
commit 13068a00b2
2 changed files with 188 additions and 126 deletions

View File

@ -76,6 +76,9 @@ import org.apache.poi.util.Internal;
*/ */
public final class HWPFDocument extends HWPFDocumentCore public final class HWPFDocument extends HWPFDocumentCore
{ {
private static final String PROPERTY_PRESERVE_BIN_TABLES = "org.apache.poi.hwpf.preserveBinTables";
private static final String PROPERTY_PRESERVE_TEXT_TABLE = "org.apache.poi.hwpf.preserveTextTable";
/** And for making sense of CP lengths in the FIB */ /** And for making sense of CP lengths in the FIB */
@Deprecated @Deprecated
protected CPSplitCalculator _cpSplit; protected CPSplitCalculator _cpSplit;
@ -267,20 +270,43 @@ public final class HWPFDocument extends HWPFDocumentCore
_pbt = new PAPBinTable(_mainStream, _tableStream, _dataStream, _fib.getFcPlcfbtePapx(), _fib.getLcbPlcfbtePapx(), _tpt); _pbt = new PAPBinTable(_mainStream, _tableStream, _dataStream, _fib.getFcPlcfbtePapx(), _fib.getLcbPlcfbtePapx(), _tpt);
_text = _tpt.getText(); _text = _tpt.getText();
_cbt.rebuild( _cft );
_pbt.rebuild( _text, _cft );
boolean preserve = false; /*
* in this mode we preserve the PAPX/CHPX structure from the file, so text may
* be missing from the output, and the text order may be corrupted
*/
boolean preserveBinTables = false;
try try
{ {
preserve = Boolean.parseBoolean( System preserveBinTables = Boolean.parseBoolean( System
.getProperty( "org.apache.poi.hwpf.preserveTextTable" ) ); .getProperty( PROPERTY_PRESERVE_BIN_TABLES ) );
} }
catch ( Exception exc ) catch ( Exception exc )
{ {
// ignore; // ignore;
} }
if ( !preserve )
if ( !preserveBinTables )
{
_cbt.rebuild( _cft );
_pbt.rebuild( _text, _cft );
}
/*
* Property to disable text rebuilding. In this mode changing the text
* will lead to unpredictable behavior
*/
boolean preserveTextTable = false;
try
{
preserveTextTable = Boolean.parseBoolean( System
.getProperty( PROPERTY_PRESERVE_TEXT_TABLE ) );
}
catch ( Exception exc )
{
// ignore;
}
if ( !preserveTextTable )
{ {
_cft = new ComplexFileTable(); _cft = new ComplexFileTable();
_tpt = _cft.getTextPieceTable(); _tpt = _cft.getTextPieceTable();
@ -350,11 +376,13 @@ public final class HWPFDocument extends HWPFDocumentCore
_fields = new FieldsImpl(_fieldsTables); _fields = new FieldsImpl(_fieldsTables);
} }
@Internal
public TextPieceTable getTextTable() public TextPieceTable getTextTable()
{ {
return _cft.getTextPieceTable(); return _cft.getTextPieceTable();
} }
@Internal
@Override @Override
public StringBuilder getText() public StringBuilder getText()
{ {
@ -497,6 +525,7 @@ public final class HWPFDocument extends HWPFDocumentCore
* *
* @return the saved-by table. * @return the saved-by table.
*/ */
@Internal
public SavedByTable getSavedByTable() public SavedByTable getSavedByTable()
{ {
return _sbt; return _sbt;
@ -507,6 +536,7 @@ public final class HWPFDocument extends HWPFDocumentCore
* *
* @return the revision mark author table. * @return the revision mark author table.
*/ */
@Internal
public RevisionMarkAuthorTable getRevisionMarkAuthorTable() public RevisionMarkAuthorTable getRevisionMarkAuthorTable()
{ {
return _rmat; return _rmat;
@ -519,6 +549,7 @@ public final class HWPFDocument extends HWPFDocumentCore
return _pictures; return _pictures;
} }
@Internal
public EscherRecordHolder getEscherRecordHolder() { public EscherRecordHolder getEscherRecordHolder() {
return _escherRecordHolder; return _escherRecordHolder;
} }
@ -529,6 +560,7 @@ public final class HWPFDocument extends HWPFDocumentCore
* @deprecated use {@link #getOfficeDrawingsMain()} instead * @deprecated use {@link #getOfficeDrawingsMain()} instead
*/ */
@Deprecated @Deprecated
@Internal
public ShapesTable getShapesTable() public ShapesTable getShapesTable()
{ {
return _officeArts; return _officeArts;
@ -908,10 +940,12 @@ public final class HWPFDocument extends HWPFDocumentCore
this._dataStream = dataBuf; this._dataStream = dataBuf;
} }
@Internal
public byte[] getDataStream() public byte[] getDataStream()
{ {
return _dataStream; return _dataStream;
} }
@Internal
public byte[] getTableStream() public byte[] getTableStream()
{ {
return _tableStream; return _tableStream;

View File

@ -23,7 +23,9 @@ import java.io.File;
import java.io.FileInputStream; import java.io.FileInputStream;
import java.io.IOException; import java.io.IOException;
import java.io.InputStream; import java.io.InputStream;
import java.util.ArrayList;
import java.util.Arrays; import java.util.Arrays;
import java.util.Collections;
import java.util.LinkedHashMap; import java.util.LinkedHashMap;
import java.util.List; import java.util.List;
import java.util.Map; import java.util.Map;
@ -35,7 +37,10 @@ import org.apache.poi.hwpf.OldWordFileFormatException;
import org.apache.poi.hwpf.model.CHPX; import org.apache.poi.hwpf.model.CHPX;
import org.apache.poi.hwpf.model.FieldsDocumentPart; import org.apache.poi.hwpf.model.FieldsDocumentPart;
import org.apache.poi.hwpf.model.FileInformationBlock; import org.apache.poi.hwpf.model.FileInformationBlock;
import org.apache.poi.hwpf.model.GenericPropertyNode;
import org.apache.poi.hwpf.model.PAPFormattedDiskPage;
import org.apache.poi.hwpf.model.PAPX; import org.apache.poi.hwpf.model.PAPX;
import org.apache.poi.hwpf.model.PlexOfCps;
import org.apache.poi.hwpf.model.StyleSheet; import org.apache.poi.hwpf.model.StyleSheet;
import org.apache.poi.hwpf.model.TextPiece; import org.apache.poi.hwpf.model.TextPiece;
import org.apache.poi.hwpf.sprm.SprmIterator; import org.apache.poi.hwpf.sprm.SprmIterator;
@ -47,9 +52,11 @@ import org.apache.poi.hwpf.usermodel.OfficeDrawing;
import org.apache.poi.hwpf.usermodel.Paragraph; import org.apache.poi.hwpf.usermodel.Paragraph;
import org.apache.poi.hwpf.usermodel.Picture; import org.apache.poi.hwpf.usermodel.Picture;
import org.apache.poi.hwpf.usermodel.Range; import org.apache.poi.hwpf.usermodel.Range;
import org.apache.poi.poifs.common.POIFSConstants;
import org.apache.poi.poifs.filesystem.POIFSFileSystem; import org.apache.poi.poifs.filesystem.POIFSFileSystem;
import org.apache.poi.util.Beta; import org.apache.poi.util.Beta;
import org.apache.poi.util.IOUtils; import org.apache.poi.util.IOUtils;
import org.apache.poi.util.LittleEndian;
/** /**
* Used by developers to list out key information on a HWPF file. End users will * Used by developers to list out key information on a HWPF file. End users will
@ -94,16 +101,14 @@ public final class HWPFLister
if ( args.length == 0 ) if ( args.length == 0 )
{ {
System.err.println( "Use:" ); System.err.println( "Use:" );
System.err System.err.println( "\tHWPFLister <filename>\n"
.println( "\tHWPFLister <filename>\n"
+ "\t\t[--textPieces] [--textPiecesText]\n" + "\t\t[--textPieces] [--textPiecesText]\n"
+ "\t\t[--chpx] [--chpxProperties] [--chpxSprms]\n" + "\t\t[--chpx] [--chpxProperties] [--chpxSprms]\n"
+ "\t\t[--papx] [--papxProperties]\n" + "\t\t[--papx] [--papxProperties] [--papxSprms]\n"
+ "\t\t[--paragraphs] [--paragraphsSprms] [--paragraphsText]\n" + "\t\t[--paragraphs] [--paragraphsText]\n"
+ "\t\t[--bookmarks]\n" + "\t\t[--escher]\n" + "\t\t[--bookmarks]\n" + "\t\t[--escher]\n"
+ "\t\t[--fields]\n" + "\t\t[--pictures]\n" + "\t\t[--fields]\n" + "\t\t[--pictures]\n"
+ "\t\t[--officeDrawings]\n" + "\t\t[--officeDrawings]\n" + "\t\t[--writereadback]\n" );
+ "\t\t[--writereadback]\n" );
System.exit( 1 ); System.exit( 1 );
} }
@ -115,10 +120,10 @@ public final class HWPFLister
boolean outputChpxSprms = false; boolean outputChpxSprms = false;
boolean outputParagraphs = false; boolean outputParagraphs = false;
boolean outputParagraphsSprms = false;
boolean outputParagraphsText = false; boolean outputParagraphsText = false;
boolean outputPapx = false; boolean outputPapx = false;
boolean outputPapxSprms = false;
boolean outputPapxProperties = false; boolean outputPapxProperties = false;
boolean outputBookmarks = false; boolean outputBookmarks = false;
@ -145,8 +150,6 @@ public final class HWPFLister
if ( "--paragraphs".equals( arg ) ) if ( "--paragraphs".equals( arg ) )
outputParagraphs = true; outputParagraphs = true;
if ( "--paragraphsSprms".equals( arg ) )
outputParagraphsSprms = true;
if ( "--paragraphsText".equals( arg ) ) if ( "--paragraphsText".equals( arg ) )
outputParagraphsText = true; outputParagraphsText = true;
@ -154,6 +157,8 @@ public final class HWPFLister
outputPapx = true; outputPapx = true;
if ( "--papxProperties".equals( arg ) ) if ( "--papxProperties".equals( arg ) )
outputPapxProperties = true; outputPapxProperties = true;
if ( "--papxSprms".equals( arg ) )
outputPapxSprms = true;
if ( "--bookmarks".equals( arg ) ) if ( "--bookmarks".equals( arg ) )
outputBookmarks = true; outputBookmarks = true;
@ -174,65 +179,85 @@ public final class HWPFLister
if ( writereadback ) if ( writereadback )
doc = writeOutAndReadBack( doc ); doc = writeOutAndReadBack( doc );
HWPFLister lister = new HWPFLister( doc ); HWPFDocumentCore original;
lister.dumpFIB(); {
System.setProperty( "org.apache.poi.hwpf.preserveBinTables",
Boolean.TRUE.toString() );
System.setProperty( "org.apache.poi.hwpf.preserveTextTable",
Boolean.TRUE.toString() );
original = loadDoc( new File( args[0] ) );
if ( writereadback )
original = writeOutAndReadBack( original );
}
HWPFLister listerOriginal = new HWPFLister( original );
HWPFLister listerRebuilded = new HWPFLister( doc );
System.out.println( "== FIB (original) ==" );
listerOriginal.dumpFIB();
if ( outputTextPieces ) if ( outputTextPieces )
{ {
System.out.println( "== Text pieces ==" ); System.out.println( "== Text pieces (original) ==" );
lister.dumpTextPieces( outputTextPiecesText ); listerOriginal.dumpTextPieces( outputTextPiecesText );
} }
if ( outputChpx ) if ( outputChpx )
{ {
System.out.println( "== CHPX ==" ); System.out.println( "== CHPX (original) ==" );
lister.dumpChpx( outputChpxProperties, outputChpxSprms ); listerOriginal.dumpChpx( outputChpxProperties, outputChpxSprms );
System.out.println( "== CHPX (rebuilded) ==" );
listerRebuilded.dumpChpx( outputChpxProperties, outputChpxSprms );
} }
if ( outputPapx ) if ( outputPapx )
{ {
System.out.println( "== PAPX ==" ); System.out.println( "== PAPX (original) ==" );
lister.dumpPapx( outputPapxProperties ); listerOriginal.dumpPapx( outputPapxProperties, outputPapxSprms );
System.out.println( "== PAPX (rebuilded) ==" );
listerRebuilded.dumpPapx( outputPapxProperties, outputPapxSprms );
} }
if ( outputParagraphs ) if ( outputParagraphs )
{ {
System.out.println( "== Text paragraphs ==" ); System.out.println( "== Text paragraphs (original) ==" );
lister.dumpParagraphs( true ); listerRebuilded.dumpParagraphs( true );
System.out.println( "== DOM paragraphs ==" ); System.out.println( "== DOM paragraphs (rebuilded) ==" );
lister.dumpParagraphsDom( outputParagraphsSprms, outputPapx, listerRebuilded.dumpParagraphsDom( outputParagraphsText );
outputParagraphsText );
} }
if ( outputBookmarks ) if ( outputBookmarks )
{ {
System.out.println( "== BOOKMARKS ==" ); System.out.println( "== BOOKMARKS (rebuilded) ==" );
lister.dumpBookmarks(); listerRebuilded.dumpBookmarks();
} }
if ( outputEscher ) if ( outputEscher )
{ {
System.out.println( "== ESCHER PROPERTIES ==" ); System.out.println( "== ESCHER PROPERTIES (rebuilded) ==" );
lister.dumpEscher(); listerRebuilded.dumpEscher();
} }
if ( outputFields ) if ( outputFields )
{ {
System.out.println( "== FIELDS ==" ); System.out.println( "== FIELDS (rebuilded) ==" );
lister.dumpFields(); listerRebuilded.dumpFields();
} }
if ( outputOfficeDrawings ) if ( outputOfficeDrawings )
{ {
System.out.println( "== OFFICE DRAWINGS ==" ); System.out.println( "== OFFICE DRAWINGS (rebuilded) ==" );
lister.dumpOfficeDrawings(); listerRebuilded.dumpOfficeDrawings();
} }
if ( outputPictures ) if ( outputPictures )
{ {
System.out.println( "== PICTURES ==" ); System.out.println( "== PICTURES (rebuilded) ==" );
lister.dumpPictures(); listerRebuilded.dumpPictures();
} }
} }
@ -402,90 +427,94 @@ public final class HWPFLister
} }
} }
public void dumpPapx( boolean withProperties ) throws Exception public void dumpPapx( boolean withProperties, boolean withSprms )
throws Exception
{ {
// if ( _doc instanceof HWPFDocument ) if ( _doc instanceof HWPFDocument )
// { {
// System.out.println( "binary PAP pages " ); System.out.println( "binary PAP pages " );
//
// HWPFDocument doc = (HWPFDocument) _doc;
//
// java.lang.reflect.Field fMainStream = HWPFDocumentCore.class
// .getDeclaredField( "_mainStream" );
// fMainStream.setAccessible( true );
// byte[] mainStream = (byte[]) fMainStream.get( _doc );
//
// PlexOfCps binTable = new PlexOfCps( doc.getTableStream(), doc
// .getFileInformationBlock().getFcPlcfbtePapx(), doc
// .getFileInformationBlock().getLcbPlcfbtePapx(), 4 );
//
// List<PAPX> papxs = new ArrayList<PAPX>();
//
// int length = binTable.length();
// for ( int x = 0; x < length; x++ )
// {
// GenericPropertyNode node = binTable.getProperty( x );
//
// int pageNum = LittleEndian.getInt( node.getBytes() );
// int pageOffset = POIFSConstants.SMALLER_BIG_BLOCK_SIZE
// * pageNum;
//
// PAPFormattedDiskPage pfkp = new PAPFormattedDiskPage(
// mainStream, doc.getDataStream(), pageOffset,
// doc.getTextTable() );
//
// System.out.println( "* PFKP: " + pfkp );
//
// for ( PAPX papx : pfkp.getPAPXs() )
// {
// System.out.println( "** " + papx );
// papxs.add( papx );
// if ( papx != null && true )
// {
// SprmIterator sprmIt = new SprmIterator(
// papx.getGrpprl(), 2 );
// while ( sprmIt.hasNext() )
// {
// SprmOperation sprm = sprmIt.next();
// System.out.println( "*** " + sprm.toString() );
// }
// }
//
// }
// }
//
// Collections.sort( papxs );
// System.out.println( "* Sorted by END" );
// for ( PAPX papx : papxs )
// {
// System.out.println( "** " + papx );
// SprmIterator sprmIt = new SprmIterator( papx.getGrpprl(), 2 );
// while ( sprmIt.hasNext() )
// {
// SprmOperation sprm = sprmIt.next();
// System.out.println( "*** " + sprm.toString() );
// }
// }
// }
// for ( PAPX papx : _doc.getParagraphTable().getParagraphs() ) HWPFDocument doc = (HWPFDocument) _doc;
// {
// System.out.println( papx ); java.lang.reflect.Field fMainStream = HWPFDocumentCore.class
// .getDeclaredField( "_mainStream" );
// if ( withProperties ) fMainStream.setAccessible( true );
// System.out.println( papx.getParagraphProperties( _doc byte[] mainStream = (byte[]) fMainStream.get( _doc );
// .getStyleSheet() ) );
// PlexOfCps binTable = new PlexOfCps( doc.getTableStream(), doc
// if ( true ) .getFileInformationBlock().getFcPlcfbtePapx(), doc
// { .getFileInformationBlock().getLcbPlcfbtePapx(), 4 );
// SprmIterator sprmIt = new SprmIterator( papx.getGrpprl(), 2 );
// while ( sprmIt.hasNext() ) List<PAPX> papxs = new ArrayList<PAPX>();
// {
// SprmOperation sprm = sprmIt.next(); int length = binTable.length();
// System.out.println( "\t" + sprm.toString() ); for ( int x = 0; x < length; x++ )
// } {
// } GenericPropertyNode node = binTable.getProperty( x );
// }
int pageNum = LittleEndian.getInt( node.getBytes() );
int pageOffset = POIFSConstants.SMALLER_BIG_BLOCK_SIZE
* pageNum;
PAPFormattedDiskPage pfkp = new PAPFormattedDiskPage(
mainStream, doc.getDataStream(), pageOffset,
doc.getTextTable() );
System.out.println( "* PFKP: " + pfkp );
for ( PAPX papx : pfkp.getPAPXs() )
{
System.out.println( "** " + papx );
papxs.add( papx );
if ( papx != null && withSprms )
{
SprmIterator sprmIt = new SprmIterator(
papx.getGrpprl(), 2 );
while ( sprmIt.hasNext() )
{
SprmOperation sprm = sprmIt.next();
System.out.println( "*** " + sprm.toString() );
}
}
}
}
Collections.sort( papxs );
System.out.println( "* Sorted by END" );
for ( PAPX papx : papxs )
{
System.out.println( "** " + papx );
if ( papx != null && withSprms )
{
SprmIterator sprmIt = new SprmIterator( papx.getGrpprl(), 2 );
while ( sprmIt.hasNext() )
{
SprmOperation sprm = sprmIt.next();
System.out.println( "*** " + sprm.toString() );
}
}
}
}
for ( PAPX papx : _doc.getParagraphTable().getParagraphs() )
{
System.out.println( papx );
if ( withProperties )
System.out.println( papx.getParagraphProperties( _doc
.getStyleSheet() ) );
if ( true )
{
SprmIterator sprmIt = new SprmIterator( papx.getGrpprl(), 2 );
while ( sprmIt.hasNext() )
{
SprmOperation sprm = sprmIt.next();
System.out.println( "\t" + sprm.toString() );
}
}
}
} }
public void dumpParagraphs( boolean dumpAssotiatedPapx ) public void dumpParagraphs( boolean dumpAssotiatedPapx )
@ -526,8 +555,7 @@ public final class HWPFLister
} }
} }
public void dumpParagraphsDom( boolean withSprms, boolean withPapx, public void dumpParagraphsDom( boolean withText )
boolean withText )
{ {
Range range = _doc.getOverallRange(); Range range = _doc.getOverallRange();
for ( int p = 0; p < range.numParagraphs(); p++ ) for ( int p = 0; p < range.numParagraphs(); p++ )