different workarounds for old Word format
git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@1195133 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
ec46537ca0
commit
5a930ae36f
@ -80,7 +80,7 @@ import org.apache.poi.util.Internal;
|
|||||||
*/
|
*/
|
||||||
public final class HWPFDocument extends HWPFDocumentCore
|
public final class HWPFDocument extends HWPFDocumentCore
|
||||||
{
|
{
|
||||||
private static final String PROPERTY_PRESERVE_BIN_TABLES = "org.apache.poi.hwpf.preserveBinTables";
|
static final String PROPERTY_PRESERVE_BIN_TABLES = "org.apache.poi.hwpf.preserveBinTables";
|
||||||
private static final String PROPERTY_PRESERVE_TEXT_TABLE = "org.apache.poi.hwpf.preserveTextTable";
|
private static final String PROPERTY_PRESERVE_TEXT_TABLE = "org.apache.poi.hwpf.preserveTextTable";
|
||||||
|
|
||||||
private static final String STREAM_DATA = "Data";
|
private static final String STREAM_DATA = "Data";
|
||||||
|
@ -66,9 +66,10 @@ public class HWPFOldDocument extends HWPFDocumentCore {
|
|||||||
|
|
||||||
// We need to get hold of the text that makes up the
|
// We need to get hold of the text that makes up the
|
||||||
// document, which might be regular or fast-saved
|
// document, which might be regular or fast-saved
|
||||||
|
ComplexFileTable cft = null;
|
||||||
StringBuffer text = new StringBuffer();
|
StringBuffer text = new StringBuffer();
|
||||||
if(_fib.getFibBase().isFComplex()) {
|
if(_fib.getFibBase().isFComplex()) {
|
||||||
ComplexFileTable cft = new ComplexFileTable(
|
cft = new ComplexFileTable(
|
||||||
_mainStream, _mainStream,
|
_mainStream, _mainStream,
|
||||||
complexTableOffset, _fib.getFibBase().getFcMin()
|
complexTableOffset, _fib.getFibBase().getFcMin()
|
||||||
);
|
);
|
||||||
@ -113,6 +114,27 @@ public class HWPFOldDocument extends HWPFDocumentCore {
|
|||||||
_mainStream, sedTableOffset, sedTableSize,
|
_mainStream, sedTableOffset, sedTableSize,
|
||||||
_fib.getFibBase().getFcMin(), tpt
|
_fib.getFibBase().getFcMin(), tpt
|
||||||
);
|
);
|
||||||
|
|
||||||
|
/*
|
||||||
|
* in this mode we preserve the PAPX/CHPX structure from the file, so text may
|
||||||
|
* be missing from the output, and text order may be corrupted
|
||||||
|
*/
|
||||||
|
boolean preserveBinTables = false;
|
||||||
|
try
|
||||||
|
{
|
||||||
|
preserveBinTables = Boolean.parseBoolean( System
|
||||||
|
.getProperty( HWPFDocument.PROPERTY_PRESERVE_BIN_TABLES ) );
|
||||||
|
}
|
||||||
|
catch ( Exception exc )
|
||||||
|
{
|
||||||
|
// ignore;
|
||||||
|
}
|
||||||
|
|
||||||
|
if ( !preserveBinTables )
|
||||||
|
{
|
||||||
|
_cbt.rebuild( cft );
|
||||||
|
_pbt.rebuild( _text, cft );
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
public Range getOverallRange()
|
public Range getOverallRange()
|
||||||
|
@ -17,8 +17,6 @@
|
|||||||
|
|
||||||
package org.apache.poi.hwpf.model;
|
package org.apache.poi.hwpf.model;
|
||||||
|
|
||||||
import java.util.Collections;
|
|
||||||
|
|
||||||
import org.apache.poi.poifs.common.POIFSConstants;
|
import org.apache.poi.poifs.common.POIFSConstants;
|
||||||
import org.apache.poi.util.Internal;
|
import org.apache.poi.util.Internal;
|
||||||
import org.apache.poi.util.LittleEndian;
|
import org.apache.poi.util.LittleEndian;
|
||||||
@ -57,7 +55,5 @@ public final class OldPAPBinTable extends PAPBinTable
|
|||||||
_paragraphs.add( papx );
|
_paragraphs.add( papx );
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
Collections.sort( _paragraphs, PropertyNode.StartComparator.instance );
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -113,6 +113,12 @@ public class PAPBinTable
|
|||||||
|
|
||||||
public void rebuild( final StringBuilder docText,
|
public void rebuild( final StringBuilder docText,
|
||||||
ComplexFileTable complexFileTable )
|
ComplexFileTable complexFileTable )
|
||||||
|
{
|
||||||
|
rebuild( docText, complexFileTable, _paragraphs );
|
||||||
|
}
|
||||||
|
|
||||||
|
static void rebuild( final StringBuilder docText,
|
||||||
|
ComplexFileTable complexFileTable, List<PAPX> paragraphs )
|
||||||
{
|
{
|
||||||
long start = System.currentTimeMillis();
|
long start = System.currentTimeMillis();
|
||||||
|
|
||||||
@ -156,19 +162,19 @@ public class PAPBinTable
|
|||||||
|
|
||||||
PAPX papx = new PAPX( textPiece.getStart(),
|
PAPX papx = new PAPX( textPiece.getStart(),
|
||||||
textPiece.getEnd(), newSprmBuffer );
|
textPiece.getEnd(), newSprmBuffer );
|
||||||
_paragraphs.add( papx );
|
paragraphs.add( papx );
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
logger.log( POILogger.DEBUG,
|
logger.log( POILogger.DEBUG,
|
||||||
"Merged (?) with PAPX from complex file table in ",
|
"Merged (?) with PAPX from complex file table in ",
|
||||||
Long.valueOf( System.currentTimeMillis() - start ),
|
Long.valueOf( System.currentTimeMillis() - start ),
|
||||||
" ms (", Integer.valueOf( _paragraphs.size() ),
|
" ms (", Integer.valueOf( paragraphs.size() ),
|
||||||
" elements in total)" );
|
" elements in total)" );
|
||||||
start = System.currentTimeMillis();
|
start = System.currentTimeMillis();
|
||||||
}
|
}
|
||||||
|
|
||||||
List<PAPX> oldPapxSortedByEndPos = new ArrayList<PAPX>( _paragraphs );
|
List<PAPX> oldPapxSortedByEndPos = new ArrayList<PAPX>( paragraphs );
|
||||||
Collections.sort( oldPapxSortedByEndPos,
|
Collections.sort( oldPapxSortedByEndPos,
|
||||||
PropertyNode.EndComparator.instance );
|
PropertyNode.EndComparator.instance );
|
||||||
|
|
||||||
@ -179,7 +185,7 @@ public class PAPBinTable
|
|||||||
final Map<PAPX, Integer> papxToFileOrder = new IdentityHashMap<PAPX, Integer>();
|
final Map<PAPX, Integer> papxToFileOrder = new IdentityHashMap<PAPX, Integer>();
|
||||||
{
|
{
|
||||||
int counter = 0;
|
int counter = 0;
|
||||||
for ( PAPX papx : _paragraphs )
|
for ( PAPX papx : paragraphs )
|
||||||
{
|
{
|
||||||
papxToFileOrder.put( papx, Integer.valueOf( counter++ ) );
|
papxToFileOrder.put( papx, Integer.valueOf( counter++ ) );
|
||||||
}
|
}
|
||||||
@ -270,6 +276,9 @@ public class PAPBinTable
|
|||||||
SprmBuffer sprmBuffer = null;
|
SprmBuffer sprmBuffer = null;
|
||||||
for ( PAPX papx : papxs )
|
for ( PAPX papx : papxs )
|
||||||
{
|
{
|
||||||
|
if ( papx.getGrpprl() == null || papx.getGrpprl().length == 0 )
|
||||||
|
continue;
|
||||||
|
|
||||||
if ( sprmBuffer == null )
|
if ( sprmBuffer == null )
|
||||||
try
|
try
|
||||||
{
|
{
|
||||||
@ -281,7 +290,9 @@ public class PAPBinTable
|
|||||||
throw new Error( e );
|
throw new Error( e );
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
|
{
|
||||||
sprmBuffer.append( papx.getGrpprl(), 2 );
|
sprmBuffer.append( papx.getGrpprl(), 2 );
|
||||||
|
}
|
||||||
}
|
}
|
||||||
PAPX newPapx = new PAPX( startInclusive, endExclusive, sprmBuffer );
|
PAPX newPapx = new PAPX( startInclusive, endExclusive, sprmBuffer );
|
||||||
newPapxs.add( newPapx );
|
newPapxs.add( newPapx );
|
||||||
@ -289,11 +300,12 @@ public class PAPBinTable
|
|||||||
lastParStart = endExclusive;
|
lastParStart = endExclusive;
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
this._paragraphs = new ArrayList<PAPX>( newPapxs );
|
paragraphs.clear();
|
||||||
|
paragraphs.addAll( newPapxs );
|
||||||
|
|
||||||
logger.log( POILogger.DEBUG, "PAPX rebuilded from document text in ",
|
logger.log( POILogger.DEBUG, "PAPX rebuilded from document text in ",
|
||||||
Long.valueOf( System.currentTimeMillis() - start ), " ms (",
|
Long.valueOf( System.currentTimeMillis() - start ), " ms (",
|
||||||
Integer.valueOf( _paragraphs.size() ), " elements)" );
|
Integer.valueOf( paragraphs.size() ), " elements)" );
|
||||||
start = System.currentTimeMillis();
|
start = System.currentTimeMillis();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -112,11 +112,17 @@ public final class PAPX extends BytePropertyNode<PAPX> {
|
|||||||
|
|
||||||
public byte[] getGrpprl()
|
public byte[] getGrpprl()
|
||||||
{
|
{
|
||||||
|
if (_buf == null)
|
||||||
|
return new byte[0];
|
||||||
|
|
||||||
return ((SprmBuffer)_buf).toByteArray();
|
return ((SprmBuffer)_buf).toByteArray();
|
||||||
}
|
}
|
||||||
|
|
||||||
public short getIstd()
|
public short getIstd()
|
||||||
{
|
{
|
||||||
|
if ( _buf == null )
|
||||||
|
return 0;
|
||||||
|
|
||||||
byte[] buf = getGrpprl();
|
byte[] buf = getGrpprl();
|
||||||
if (buf.length == 0)
|
if (buf.length == 0)
|
||||||
{
|
{
|
||||||
|
@ -1101,7 +1101,7 @@ public class Range { // TODO -instantiable superclass
|
|||||||
int endIndex = binarySearchEnd( rpl, startIndex, end );
|
int endIndex = binarySearchEnd( rpl, startIndex, end );
|
||||||
while ( endIndex < rpl.size() - 1
|
while ( endIndex < rpl.size() - 1
|
||||||
&& rpl.get( endIndex + 1 ).getEnd() <= end )
|
&& rpl.get( endIndex + 1 ).getEnd() <= end )
|
||||||
endIndex--;
|
endIndex++;
|
||||||
|
|
||||||
if ( startIndex < 0 || startIndex >= rpl.size()
|
if ( startIndex < 0 || startIndex >= rpl.size()
|
||||||
|| startIndex > endIndex || endIndex < 0
|
|| startIndex > endIndex || endIndex < 0
|
||||||
|
@ -28,6 +28,8 @@ import java.util.List;
|
|||||||
|
|
||||||
import junit.framework.TestCase;
|
import junit.framework.TestCase;
|
||||||
|
|
||||||
|
import org.apache.poi.hwpf.converter.WordToTextConverter;
|
||||||
|
|
||||||
import org.apache.commons.codec.digest.DigestUtils;
|
import org.apache.commons.codec.digest.DigestUtils;
|
||||||
import org.apache.poi.POIDataSamples;
|
import org.apache.poi.POIDataSamples;
|
||||||
import org.apache.poi.hwpf.HWPFDocument;
|
import org.apache.poi.hwpf.HWPFDocument;
|
||||||
@ -736,7 +738,8 @@ public class TestBugs extends TestCase
|
|||||||
*/
|
*/
|
||||||
public void testBug51944() throws Exception
|
public void testBug51944() throws Exception
|
||||||
{
|
{
|
||||||
HWPFTestDataSamples.openOldSampleFile( "Bug51944.doc" );
|
HWPFOldDocument doc = HWPFTestDataSamples.openOldSampleFile( "Bug51944.doc" );
|
||||||
|
WordToTextConverter.getText( doc );
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
Loading…
Reference in New Issue
Block a user