different workarounds for old Word format

git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@1195133 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Sergey Vladimirov 2011-10-30 08:59:16 +00:00
parent ec46537ca0
commit 5a930ae36f
7 changed files with 55 additions and 16 deletions

View File

@ -80,7 +80,7 @@ import org.apache.poi.util.Internal;
*/ */
public final class HWPFDocument extends HWPFDocumentCore public final class HWPFDocument extends HWPFDocumentCore
{ {
private static final String PROPERTY_PRESERVE_BIN_TABLES = "org.apache.poi.hwpf.preserveBinTables"; static final String PROPERTY_PRESERVE_BIN_TABLES = "org.apache.poi.hwpf.preserveBinTables";
private static final String PROPERTY_PRESERVE_TEXT_TABLE = "org.apache.poi.hwpf.preserveTextTable"; private static final String PROPERTY_PRESERVE_TEXT_TABLE = "org.apache.poi.hwpf.preserveTextTable";
private static final String STREAM_DATA = "Data"; private static final String STREAM_DATA = "Data";

View File

@ -66,9 +66,10 @@ public class HWPFOldDocument extends HWPFDocumentCore {
// We need to get hold of the text that makes up the // We need to get hold of the text that makes up the
// document, which might be regular or fast-saved // document, which might be regular or fast-saved
ComplexFileTable cft = null;
StringBuffer text = new StringBuffer(); StringBuffer text = new StringBuffer();
if(_fib.getFibBase().isFComplex()) { if(_fib.getFibBase().isFComplex()) {
ComplexFileTable cft = new ComplexFileTable( cft = new ComplexFileTable(
_mainStream, _mainStream, _mainStream, _mainStream,
complexTableOffset, _fib.getFibBase().getFcMin() complexTableOffset, _fib.getFibBase().getFcMin()
); );
@ -113,6 +114,27 @@ public class HWPFOldDocument extends HWPFDocumentCore {
_mainStream, sedTableOffset, sedTableSize, _mainStream, sedTableOffset, sedTableSize,
_fib.getFibBase().getFcMin(), tpt _fib.getFibBase().getFcMin(), tpt
); );
/*
* In this mode we preserve the PAPX/CHPX structure from the file, so text may
* be missing from the output, and the text order may be corrupted.
*/
boolean preserveBinTables = false;
try
{
preserveBinTables = Boolean.parseBoolean( System
.getProperty( HWPFDocument.PROPERTY_PRESERVE_BIN_TABLES ) );
}
catch ( Exception exc )
{
// ignore;
}
if ( !preserveBinTables )
{
_cbt.rebuild( cft );
_pbt.rebuild( _text, cft );
}
} }
public Range getOverallRange() public Range getOverallRange()

View File

@ -17,8 +17,6 @@
package org.apache.poi.hwpf.model; package org.apache.poi.hwpf.model;
import java.util.Collections;
import org.apache.poi.poifs.common.POIFSConstants; import org.apache.poi.poifs.common.POIFSConstants;
import org.apache.poi.util.Internal; import org.apache.poi.util.Internal;
import org.apache.poi.util.LittleEndian; import org.apache.poi.util.LittleEndian;
@ -57,7 +55,5 @@ public final class OldPAPBinTable extends PAPBinTable
_paragraphs.add( papx ); _paragraphs.add( papx );
} }
} }
Collections.sort( _paragraphs, PropertyNode.StartComparator.instance );
} }
} }

View File

@ -113,6 +113,12 @@ public class PAPBinTable
public void rebuild( final StringBuilder docText, public void rebuild( final StringBuilder docText,
ComplexFileTable complexFileTable ) ComplexFileTable complexFileTable )
{
rebuild( docText, complexFileTable, _paragraphs );
}
static void rebuild( final StringBuilder docText,
ComplexFileTable complexFileTable, List<PAPX> paragraphs )
{ {
long start = System.currentTimeMillis(); long start = System.currentTimeMillis();
@ -156,19 +162,19 @@ public class PAPBinTable
PAPX papx = new PAPX( textPiece.getStart(), PAPX papx = new PAPX( textPiece.getStart(),
textPiece.getEnd(), newSprmBuffer ); textPiece.getEnd(), newSprmBuffer );
_paragraphs.add( papx ); paragraphs.add( papx );
} }
} }
logger.log( POILogger.DEBUG, logger.log( POILogger.DEBUG,
"Merged (?) with PAPX from complex file table in ", "Merged (?) with PAPX from complex file table in ",
Long.valueOf( System.currentTimeMillis() - start ), Long.valueOf( System.currentTimeMillis() - start ),
" ms (", Integer.valueOf( _paragraphs.size() ), " ms (", Integer.valueOf( paragraphs.size() ),
" elements in total)" ); " elements in total)" );
start = System.currentTimeMillis(); start = System.currentTimeMillis();
} }
List<PAPX> oldPapxSortedByEndPos = new ArrayList<PAPX>( _paragraphs ); List<PAPX> oldPapxSortedByEndPos = new ArrayList<PAPX>( paragraphs );
Collections.sort( oldPapxSortedByEndPos, Collections.sort( oldPapxSortedByEndPos,
PropertyNode.EndComparator.instance ); PropertyNode.EndComparator.instance );
@ -179,7 +185,7 @@ public class PAPBinTable
final Map<PAPX, Integer> papxToFileOrder = new IdentityHashMap<PAPX, Integer>(); final Map<PAPX, Integer> papxToFileOrder = new IdentityHashMap<PAPX, Integer>();
{ {
int counter = 0; int counter = 0;
for ( PAPX papx : _paragraphs ) for ( PAPX papx : paragraphs )
{ {
papxToFileOrder.put( papx, Integer.valueOf( counter++ ) ); papxToFileOrder.put( papx, Integer.valueOf( counter++ ) );
} }
@ -270,6 +276,9 @@ public class PAPBinTable
SprmBuffer sprmBuffer = null; SprmBuffer sprmBuffer = null;
for ( PAPX papx : papxs ) for ( PAPX papx : papxs )
{ {
if ( papx.getGrpprl() == null || papx.getGrpprl().length == 0 )
continue;
if ( sprmBuffer == null ) if ( sprmBuffer == null )
try try
{ {
@ -281,7 +290,9 @@ public class PAPBinTable
throw new Error( e ); throw new Error( e );
} }
else else
{
sprmBuffer.append( papx.getGrpprl(), 2 ); sprmBuffer.append( papx.getGrpprl(), 2 );
}
} }
PAPX newPapx = new PAPX( startInclusive, endExclusive, sprmBuffer ); PAPX newPapx = new PAPX( startInclusive, endExclusive, sprmBuffer );
newPapxs.add( newPapx ); newPapxs.add( newPapx );
@ -289,11 +300,12 @@ public class PAPBinTable
lastParStart = endExclusive; lastParStart = endExclusive;
continue; continue;
} }
this._paragraphs = new ArrayList<PAPX>( newPapxs ); paragraphs.clear();
paragraphs.addAll( newPapxs );
logger.log( POILogger.DEBUG, "PAPX rebuilded from document text in ", logger.log( POILogger.DEBUG, "PAPX rebuilded from document text in ",
Long.valueOf( System.currentTimeMillis() - start ), " ms (", Long.valueOf( System.currentTimeMillis() - start ), " ms (",
Integer.valueOf( _paragraphs.size() ), " elements)" ); Integer.valueOf( paragraphs.size() ), " elements)" );
start = System.currentTimeMillis(); start = System.currentTimeMillis();
} }

View File

@ -112,11 +112,17 @@ public final class PAPX extends BytePropertyNode<PAPX> {
public byte[] getGrpprl() public byte[] getGrpprl()
{ {
if (_buf == null)
return new byte[0];
return ((SprmBuffer)_buf).toByteArray(); return ((SprmBuffer)_buf).toByteArray();
} }
public short getIstd() public short getIstd()
{ {
if ( _buf == null )
return 0;
byte[] buf = getGrpprl(); byte[] buf = getGrpprl();
if (buf.length == 0) if (buf.length == 0)
{ {

View File

@ -1101,7 +1101,7 @@ public class Range { // TODO -instantiable superclass
int endIndex = binarySearchEnd( rpl, startIndex, end ); int endIndex = binarySearchEnd( rpl, startIndex, end );
while ( endIndex < rpl.size() - 1 while ( endIndex < rpl.size() - 1
&& rpl.get( endIndex + 1 ).getEnd() <= end ) && rpl.get( endIndex + 1 ).getEnd() <= end )
endIndex--; endIndex++;
if ( startIndex < 0 || startIndex >= rpl.size() if ( startIndex < 0 || startIndex >= rpl.size()
|| startIndex > endIndex || endIndex < 0 || startIndex > endIndex || endIndex < 0

View File

@ -28,6 +28,8 @@ import java.util.List;
import junit.framework.TestCase; import junit.framework.TestCase;
import org.apache.poi.hwpf.converter.WordToTextConverter;
import org.apache.commons.codec.digest.DigestUtils; import org.apache.commons.codec.digest.DigestUtils;
import org.apache.poi.POIDataSamples; import org.apache.poi.POIDataSamples;
import org.apache.poi.hwpf.HWPFDocument; import org.apache.poi.hwpf.HWPFDocument;
@ -736,7 +738,8 @@ public class TestBugs extends TestCase
*/ */
public void testBug51944() throws Exception public void testBug51944() throws Exception
{ {
HWPFTestDataSamples.openOldSampleFile( "Bug51944.doc" ); HWPFOldDocument doc = HWPFTestDataSamples.openOldSampleFile( "Bug51944.doc" );
WordToTextConverter.getText( doc );
} }
/** /**