different workarounds for old Word format

git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@1195133 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Sergey Vladimirov 2011-10-30 08:59:16 +00:00
parent ec46537ca0
commit 5a930ae36f
7 changed files with 55 additions and 16 deletions

View File

@ -80,7 +80,7 @@ import org.apache.poi.util.Internal;
*/
public final class HWPFDocument extends HWPFDocumentCore
{
private static final String PROPERTY_PRESERVE_BIN_TABLES = "org.apache.poi.hwpf.preserveBinTables";
static final String PROPERTY_PRESERVE_BIN_TABLES = "org.apache.poi.hwpf.preserveBinTables";
private static final String PROPERTY_PRESERVE_TEXT_TABLE = "org.apache.poi.hwpf.preserveTextTable";
private static final String STREAM_DATA = "Data";

View File

@ -66,9 +66,10 @@ public class HWPFOldDocument extends HWPFDocumentCore {
// We need to get hold of the text that makes up the
// document, which might be regular or fast-saved
ComplexFileTable cft = null;
StringBuffer text = new StringBuffer();
if(_fib.getFibBase().isFComplex()) {
ComplexFileTable cft = new ComplexFileTable(
cft = new ComplexFileTable(
_mainStream, _mainStream,
complexTableOffset, _fib.getFibBase().getFcMin()
);
@ -113,6 +114,27 @@ public class HWPFOldDocument extends HWPFDocumentCore {
_mainStream, sedTableOffset, sedTableSize,
_fib.getFibBase().getFcMin(), tpt
);
/*
* in this mode we preserve the PAPX/CHPX structure from the file, so text
* may be missing from the output and the text order may be corrupted
*/
boolean preserveBinTables = false;
try
{
preserveBinTables = Boolean.parseBoolean( System
.getProperty( HWPFDocument.PROPERTY_PRESERVE_BIN_TABLES ) );
}
catch ( Exception exc )
{
// ignore;
}
if ( !preserveBinTables )
{
_cbt.rebuild( cft );
_pbt.rebuild( _text, cft );
}
}
public Range getOverallRange()

View File

@ -17,8 +17,6 @@
package org.apache.poi.hwpf.model;
import java.util.Collections;
import org.apache.poi.poifs.common.POIFSConstants;
import org.apache.poi.util.Internal;
import org.apache.poi.util.LittleEndian;
@ -57,7 +55,5 @@ public final class OldPAPBinTable extends PAPBinTable
_paragraphs.add( papx );
}
}
Collections.sort( _paragraphs, PropertyNode.StartComparator.instance );
}
}

View File

@ -113,6 +113,12 @@ public class PAPBinTable
public void rebuild( final StringBuilder docText,
ComplexFileTable complexFileTable )
{
// Instance-level entry point: forwards both arguments, together with this
// table's own _paragraphs list, to the three-argument rebuild overload.
rebuild( docText, complexFileTable, _paragraphs );
}
static void rebuild( final StringBuilder docText,
ComplexFileTable complexFileTable, List<PAPX> paragraphs )
{
long start = System.currentTimeMillis();
@ -156,19 +162,19 @@ public class PAPBinTable
PAPX papx = new PAPX( textPiece.getStart(),
textPiece.getEnd(), newSprmBuffer );
_paragraphs.add( papx );
paragraphs.add( papx );
}
}
logger.log( POILogger.DEBUG,
"Merged (?) with PAPX from complex file table in ",
Long.valueOf( System.currentTimeMillis() - start ),
" ms (", Integer.valueOf( _paragraphs.size() ),
" ms (", Integer.valueOf( paragraphs.size() ),
" elements in total)" );
start = System.currentTimeMillis();
}
List<PAPX> oldPapxSortedByEndPos = new ArrayList<PAPX>( _paragraphs );
List<PAPX> oldPapxSortedByEndPos = new ArrayList<PAPX>( paragraphs );
Collections.sort( oldPapxSortedByEndPos,
PropertyNode.EndComparator.instance );
@ -179,7 +185,7 @@ public class PAPBinTable
final Map<PAPX, Integer> papxToFileOrder = new IdentityHashMap<PAPX, Integer>();
{
int counter = 0;
for ( PAPX papx : _paragraphs )
for ( PAPX papx : paragraphs )
{
papxToFileOrder.put( papx, Integer.valueOf( counter++ ) );
}
@ -270,6 +276,9 @@ public class PAPBinTable
SprmBuffer sprmBuffer = null;
for ( PAPX papx : papxs )
{
if ( papx.getGrpprl() == null || papx.getGrpprl().length == 0 )
continue;
if ( sprmBuffer == null )
try
{
@ -281,19 +290,22 @@ public class PAPBinTable
throw new Error( e );
}
else
{
sprmBuffer.append( papx.getGrpprl(), 2 );
}
}
PAPX newPapx = new PAPX( startInclusive, endExclusive, sprmBuffer );
newPapxs.add( newPapx );
lastParStart = endExclusive;
continue;
}
this._paragraphs = new ArrayList<PAPX>( newPapxs );
paragraphs.clear();
paragraphs.addAll( newPapxs );
logger.log( POILogger.DEBUG, "PAPX rebuilded from document text in ",
Long.valueOf( System.currentTimeMillis() - start ), " ms (",
Integer.valueOf( _paragraphs.size() ), " elements)" );
Integer.valueOf( paragraphs.size() ), " elements)" );
start = System.currentTimeMillis();
}

View File

@ -112,11 +112,17 @@ public final class PAPX extends BytePropertyNode<PAPX> {
public byte[] getGrpprl()
{
// No buffer attached: hand back an empty array instead of dereferencing
// null, so callers can check length without a null test.
if (_buf == null)
return new byte[0];
// Otherwise _buf is cast to SprmBuffer and its toByteArray() result is returned.
return ((SprmBuffer)_buf).toByteArray();
}
public short getIstd()
{
if ( _buf == null )
return 0;
byte[] buf = getGrpprl();
if (buf.length == 0)
{

View File

@ -1101,7 +1101,7 @@ public class Range { // TODO -instantiable superclass
int endIndex = binarySearchEnd( rpl, startIndex, end );
while ( endIndex < rpl.size() - 1
&& rpl.get( endIndex + 1 ).getEnd() <= end )
endIndex--;
endIndex++;
if ( startIndex < 0 || startIndex >= rpl.size()
|| startIndex > endIndex || endIndex < 0

View File

@ -28,6 +28,8 @@ import java.util.List;
import junit.framework.TestCase;
import org.apache.poi.hwpf.converter.WordToTextConverter;
import org.apache.commons.codec.digest.DigestUtils;
import org.apache.poi.POIDataSamples;
import org.apache.poi.hwpf.HWPFDocument;
@ -736,7 +738,8 @@ public class TestBugs extends TestCase
*/
public void testBug51944() throws Exception
{
HWPFTestDataSamples.openOldSampleFile( "Bug51944.doc" );
HWPFOldDocument doc = HWPFTestDataSamples.openOldSampleFile( "Bug51944.doc" );
WordToTextConverter.getText( doc );
}
/**