fix Bug 51772 - IllegalArgumentException Parsing MS Word 97 - 2003;

Replace byte->char translation with byte range -> char range_S_ translation for PAPX / CHPX tables

git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@1166144 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Sergey Vladimirov 2011-09-07 12:12:17 +00:00
parent 130a5fc04b
commit d1aeec0d78
8 changed files with 188 additions and 57 deletions

View File

@ -34,6 +34,7 @@
<changes> <changes>
<release version="3.8-beta5" date="2011-??-??"> <release version="3.8-beta5" date="2011-??-??">
<action dev="poi-developers" type="fix">51772 - IllegalArgumentException Parsing MS Word 97 - 2003</action>
<action dev="poi-developers" type="add">XSLFPowerPointExtractor support for including comment authors with comment text</action> <action dev="poi-developers" type="add">XSLFPowerPointExtractor support for including comment authors with comment text</action>
<action dev="poi-developers" type="fix">Converted XSLFPowerPointExtractor to use UserModel for all text extraction</action> <action dev="poi-developers" type="fix">Converted XSLFPowerPointExtractor to use UserModel for all text extraction</action>
<action dev="poi-developers" type="add">XSLF initial UserModel support for Notes and Comments for Slides</action> <action dev="poi-developers" type="add">XSLF initial UserModel support for Notes and Comments for Slides</action>

View File

@ -101,18 +101,21 @@ public class CHPBinTable
CHPFormattedDiskPage cfkp = new CHPFormattedDiskPage(documentStream, CHPFormattedDiskPage cfkp = new CHPFormattedDiskPage(documentStream,
pageOffset, translator); pageOffset, translator);
int fkpSize = cfkp.size(); for ( CHPX chpx : cfkp.getCHPXs() )
{
for (int y = 0; y < fkpSize; y++) if ( chpx != null )
{ _textRuns.add( chpx );
final CHPX chpx = cfkp.getCHPX(y); }
if (chpx != null)
_textRuns.add(chpx);
}
} }
logger.log( POILogger.DEBUG, "CHPX FKPs loaded in ", logger.log( POILogger.DEBUG, "CHPX FKPs loaded in ",
Long.valueOf( System.currentTimeMillis() - start ), " ms (", Long.valueOf( System.currentTimeMillis() - start ), " ms (",
Integer.valueOf( _textRuns.size() ), " elements)" ); Integer.valueOf( _textRuns.size() ), " elements)" );
if ( _textRuns.isEmpty() )
{
logger.log( POILogger.WARN, "CHPX FKPs are empty" );
_textRuns.add( new CHPX( 0, 0, new SprmBuffer( 0 ) ) );
}
} }
public void rebuild( ComplexFileTable complexFileTable ) public void rebuild( ComplexFileTable complexFileTable )

View File

@ -18,6 +18,7 @@
package org.apache.poi.hwpf.model; package org.apache.poi.hwpf.model;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.Collections;
import java.util.List; import java.util.List;
import org.apache.poi.hwpf.sprm.SprmBuffer; import org.apache.poi.hwpf.sprm.SprmBuffer;
@ -82,15 +83,17 @@ public final class CHPFormattedDiskPage extends FormattedDiskPage
int bytesStartAt = getStart( x ); int bytesStartAt = getStart( x );
int bytesEndAt = getEnd( x ); int bytesEndAt = getEnd( x );
int charStartAt = translator.getCharIndex( bytesStartAt ); // int charStartAt = translator.getCharIndex( bytesStartAt );
int charEndAt = translator.getCharIndex( bytesEndAt, charStartAt ); // int charEndAt = translator.getCharIndex( bytesEndAt, charStartAt
// );
// TODO: CHECK! for ( int[] range : translator.getCharIndexRanges( bytesStartAt,
// CHPX chpx = new CHPX( bytesStartAt, bytesEndAt, tpt, getGrpprl( x bytesEndAt ) )
// ) ); {
CHPX chpx = new CHPX( charStartAt, charEndAt, new SprmBuffer( CHPX chpx = new CHPX( range[0], range[1], new SprmBuffer(
getGrpprl( x ), 0 ) ); getGrpprl( x ), 0 ) );
_chpxList.add( chpx ); _chpxList.add( chpx );
}
} }
} }
@ -99,6 +102,11 @@ public final class CHPFormattedDiskPage extends FormattedDiskPage
return _chpxList.get(index); return _chpxList.get(index);
} }
/**
 * Gives read-only access to the CHPX entries collected by this page.
 *
 * @return an unmodifiable view over the internal CHPX list
 */
public List<CHPX> getCHPXs()
{
    final List<CHPX> readOnlyView = Collections.unmodifiableList( _chpxList );
    return readOnlyView;
}
public void fill(List<CHPX> filler) public void fill(List<CHPX> filler)
{ {
_chpxList.addAll(filler); _chpxList.addAll(filler);

View File

@ -31,12 +31,16 @@ public interface CharIndexTranslator {
int getByteIndex( int charPos ); int getByteIndex( int charPos );
/** /**
* Calculates the char index of the given byte index. * Calculates the char index of the given byte index. Look forward if index
* Look forward if index is not in table * is not in table
* *
* @param bytePos The character offset to check * @param bytePos
* The character offset to check
* @return the char index * @return the char index
* @deprecated This API was based on the incorrect assumption that a single
* byte offset corresponds to a single char offset
*/ */
@Deprecated
int getCharIndex(int bytePos); int getCharIndex(int bytePos);
/** /**
@ -46,16 +50,29 @@ public interface CharIndexTranslator {
* @param bytePos The character offset to check * @param bytePos The character offset to check
* @param startCP look from this characted position * @param startCP look from this characted position
* @return the char index * @return the char index
* @deprecated This API was based on the incorrect assumption that a single
* byte offset corresponds to a single char offset
*/ */
@Deprecated
int getCharIndex(int bytePos, int startCP); int getCharIndex(int bytePos, int startCP);
/**
 * Finds the character ranges that correspond to the specified byte range.
 * A single byte range may map to several character ranges, so the result
 * is a list of ranges rather than a single start/end pair.
 *
 * @param startBytePosInclusive
 *            start of the byte range (inclusive)
 * @param endBytePosExclusive
 *            end of the byte range (exclusive)
 * @return the matching character ranges, each given as a two-element
 *         array of start and end character positions
 */
int[][] getCharIndexRanges( int startBytePosInclusive,
        int endBytePosExclusive );
/** /**
* Check if index is in table * Check if index is in table
* *
* @param bytePos * @param bytePos
* @return true if index in table, false if not * @return true if index in table, false if not
*/ */
boolean isIndexInTable(int bytePos); boolean isIndexInTable(int bytePos);
/** /**

View File

@ -92,12 +92,8 @@ public class PAPBinTable
documentStream, dataStream, pageOffset, documentStream, dataStream, pageOffset,
charIndexTranslator ); charIndexTranslator );
int fkpSize = pfkp.size(); for ( PAPX papx : pfkp.getPAPXs() )
for ( int y = 0; y < fkpSize; y++ )
{ {
PAPX papx = pfkp.getPAPX( y );
if ( papx != null ) if ( papx != null )
_paragraphs.add( papx ); _paragraphs.add( papx );
} }
@ -107,6 +103,12 @@ public class PAPBinTable
logger.log( POILogger.DEBUG, "PAPX tables loaded in ", logger.log( POILogger.DEBUG, "PAPX tables loaded in ",
Long.valueOf( System.currentTimeMillis() - start ), " ms (", Long.valueOf( System.currentTimeMillis() - start ), " ms (",
Integer.valueOf( _paragraphs.size() ), " elements)" ); Integer.valueOf( _paragraphs.size() ), " elements)" );
if ( _paragraphs.isEmpty() )
{
logger.log( POILogger.WARN, "PAPX FKPs are empty" );
_paragraphs.add( new PAPX( 0, 0, new SprmBuffer( 2 ) ) );
}
} }
public void rebuild( final StringBuilder docText, public void rebuild( final StringBuilder docText,

View File

@ -23,6 +23,8 @@ import java.util.Arrays;
import java.util.Collections; import java.util.Collections;
import java.util.List; import java.util.List;
import org.apache.poi.hwpf.sprm.SprmBuffer;
import org.apache.poi.hwpf.model.io.HWPFOutputStream; import org.apache.poi.hwpf.model.io.HWPFOutputStream;
import org.apache.poi.util.Internal; import org.apache.poi.util.Internal;
import org.apache.poi.util.LittleEndian; import org.apache.poi.util.LittleEndian;
@ -88,12 +90,20 @@ public final class PAPFormattedDiskPage extends FormattedDiskPage {
int bytesStartAt = getStart( x ); int bytesStartAt = getStart( x );
int bytesEndAt = getEnd( x ); int bytesEndAt = getEnd( x );
int charStartAt = translator.getCharIndex( bytesStartAt ); // int charStartAt = translator.getCharIndex( bytesStartAt );
int charEndAt = translator.getCharIndex( bytesEndAt, charStartAt ); // int charEndAt = translator.getCharIndex( bytesEndAt, charStartAt
// );
// PAPX papx = new PAPX( charStartAt, charEndAt, getGrpprl( x ),
// getParagraphHeight( x ), dataStream );
// _papxList.add( papx );
PAPX papx = new PAPX( charStartAt, charEndAt, getGrpprl( x ), for ( int[] range : translator.getCharIndexRanges( bytesStartAt,
getParagraphHeight( x ), dataStream ); bytesEndAt ) )
_papxList.add( papx ); {
PAPX papx = new PAPX( range[0], range[1], getGrpprl( x ),
getParagraphHeight( x ), dataStream );
_papxList.add( papx );
}
} }
_fkp = null; _fkp = null;
} }

View File

@ -20,6 +20,7 @@ import java.io.IOException;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.Collections; import java.util.Collections;
import java.util.Comparator; import java.util.Comparator;
import java.util.LinkedList;
import java.util.List; import java.util.List;
import org.apache.poi.hwpf.model.io.HWPFOutputStream; import org.apache.poi.hwpf.model.io.HWPFOutputStream;
@ -107,8 +108,10 @@ public class TextPieceTable implements CharIndexTranslator
System.arraycopy( documentStream, start, buf, 0, textSizeBytes ); System.arraycopy( documentStream, start, buf, 0, textSizeBytes );
// And now build the piece // And now build the piece
_textPieces.add( new TextPiece( nodeStartChars, nodeEndChars, buf, final TextPiece newTextPiece = new TextPiece( nodeStartChars, nodeEndChars, buf,
pieces[x] ) ); pieces[x] );
_textPieces.add( newTextPiece );
} }
// In the interest of our sanity, now sort the text pieces // In the interest of our sanity, now sort the text pieces
@ -201,11 +204,13 @@ public class TextPieceTable implements CharIndexTranslator
return byteCount; return byteCount;
} }
@Deprecated
public int getCharIndex( int bytePos ) public int getCharIndex( int bytePos )
{ {
return getCharIndex( bytePos, 0 ); return getCharIndex( bytePos, 0 );
} }
@Deprecated
public int getCharIndex( int startBytePos, int startCP ) public int getCharIndex( int startBytePos, int startCP )
{ {
int charCount = 0; int charCount = 0;
@ -253,6 +258,42 @@ public class TextPieceTable implements CharIndexTranslator
return charCount; return charCount;
} }
/**
 * Translates a byte range into character ranges, producing one character
 * range for every text piece the byte range touches.
 *
 * @param startBytePosInclusive
 *            first byte of the requested range
 * @param endBytePosExclusive
 *            byte position just past the requested range
 * @return one <tt>{ startCp, endCp }</tt> pair per touched text piece, in
 *         the iteration order of <tt>_textPiecesFCOrder</tt>; empty if no
 *         piece is touched
 */
public int[][] getCharIndexRanges( int startBytePosInclusive,
        int endBytePosExclusive )
{
    final List<int[]> ranges = new ArrayList<int[]>();

    // NOTE(review): the early "break" below presumes the pieces are sorted
    // by file position - confirm where _textPiecesFCOrder is built
    for ( TextPiece piece : _textPiecesFCOrder )
    {
        final int pieceFirstByte = piece.getPieceDescriptor()
                .getFilePosition();
        final int pieceEndByte = pieceFirstByte + piece.bytesLength();

        // requested range starts beyond this piece: look at the next one
        if ( startBytePosInclusive > pieceEndByte )
            continue;

        // requested range ends before this piece starts: nothing more to do
        if ( endBytePosExclusive < pieceFirstByte )
            break;

        // clip the requested range to the bounds of the current piece
        final int fromByte = Math.max( pieceFirstByte,
                startBytePosInclusive );
        final int toByte = Math.min( pieceEndByte, endBytePosExclusive );
        if ( fromByte > toByte )
            continue;

        // unicode pieces use two bytes per character, others use one
        final int bytesPerChar = piece.isUnicode() ? 2 : 1;
        final int fromCp = piece.getStart() + ( fromByte - pieceFirstByte )
                / bytesPerChar;
        final int toCp = fromCp + ( toByte - fromByte ) / bytesPerChar;

        ranges.add( new int[] { fromCp, toCp } );
    }

    return ranges.toArray( new int[ranges.size()][] );
}
public int getCpMin() public int getCpMin()
{ {
return _cpMin; return _cpMin;
@ -377,24 +418,42 @@ public class TextPieceTable implements CharIndexTranslator
public int lookIndexForward( final int startBytePos ) public int lookIndexForward( final int startBytePos )
{ {
int bytePos = startBytePos; if ( _textPiecesFCOrder.isEmpty() )
for ( TextPiece tp : _textPiecesFCOrder ) throw new IllegalStateException( "Text pieces table is empty" );
if ( _textPiecesFCOrder.get( 0 ).getPieceDescriptor().getFilePosition() > startBytePos )
return _textPiecesFCOrder.get( 0 ).getPieceDescriptor().getFilePosition();
if ( _textPiecesFCOrder.get( _textPiecesFCOrder.size() - 1 )
.getPieceDescriptor().getFilePosition() <= startBytePos )
return startBytePos;
int low = 0;
int high = _textPiecesFCOrder.size() - 1;
while ( low <= high )
{ {
int pieceStart = tp.getPieceDescriptor().getFilePosition(); int mid = ( low + high ) >>> 1;
final TextPiece textPiece = _textPiecesFCOrder.get( mid );
int midVal = textPiece.getPieceDescriptor().getFilePosition();
if ( bytePos >= pieceStart + tp.bytesLength() ) if ( midVal < startBytePos )
{ low = mid + 1;
continue; else if ( midVal > startBytePos )
} high = mid - 1;
else
if ( pieceStart > bytePos ) // found piece with exact start
{ return textPiece.getPieceDescriptor().getFilePosition();
bytePos = pieceStart;
}
break;
} }
return bytePos; assert low == high;
assert _textPiecesFCOrder.get( low ).getPieceDescriptor()
.getFilePosition() < startBytePos;
// last line can't be current, can it?
assert _textPiecesFCOrder.get( low + 1 ).getPieceDescriptor()
.getFilePosition() > startBytePos;
// shifting to next piece start
return _textPiecesFCOrder.get( low + 1 ).getPieceDescriptor().getFilePosition();
} }
public byte[] writeTo( HWPFOutputStream docStream ) throws IOException public byte[] writeTo( HWPFOutputStream docStream ) throws IOException

View File

@ -226,6 +226,36 @@ public class TestBugs extends TestCase
assertEquals( extractor1.getText(), extractor2.getText() ); assertEquals( extractor1.getText(), extractor2.getText() );
} }
/**
 * Bug 44431 - HWPFDocument.write destroys fields
 * <p>
 * Checks the text extracted from the unmodified sample document against
 * the expected content.
 */
public void test44431_2()
{
    final String expectedText = "File name=FieldsTest.doc\n" +
            "\n" +
            "\n" +
            "STYLEREF test\n" +
            "\n" +
            "\n" +
            "\n" +
            "TEST TABLE OF CONTENTS\n" +
            "\n" +
            "Heading paragraph in next page\t2\n" +
            "Another heading paragraph in further page\t3\n" +
            "Another heading paragraph in further page\t3\n" +
            "\n" +
            "\n" +
            "Heading paragraph in next page\n" +
            "Another heading paragraph in further page\n" +
            "\n" +
            "\n" +
            "\n" +
            "Page 3 of 3";

    HWPFDocument sampleDoc = HWPFTestDataSamples.openSampleFile( "Bug44431.doc" );
    WordExtractor sampleExtractor = new WordExtractor( sampleDoc );

    assertEquals( expectedText, sampleExtractor.getText() );
}
/** /**
* Bug 45473 - HWPF cannot read file after save * Bug 45473 - HWPF cannot read file after save
*/ */
@ -640,19 +670,20 @@ public class TestBugs extends TestCase
hwpfDocument.write( new ByteArrayOutputStream() ); hwpfDocument.write( new ByteArrayOutputStream() );
} }
/** /**
* Bug 51678 - Extracting text from Bug51524.zip is slow * Bug 51678 - Extracting text from Bug51524.zip is slow Bug 51524 -
* Bug 51524 - PapBinTable constructor is slow * PapBinTable constructor is slow
*/ */
public void test51678And51524() public void test51678And51524()
{ {
// YK: the test will run only if the poi.test.remote system property is set. // YK: the test will run only if the poi.test.remote system property is
// set.
// TODO: refactor into something nicer! // TODO: refactor into something nicer!
if(System.getProperty("poi.test.remote") != null) { if ( System.getProperty( "poi.test.remote" ) != null )
{
String href = "http://domex.nps.edu/corp/files/govdocs1/007/007488.doc"; String href = "http://domex.nps.edu/corp/files/govdocs1/007/007488.doc";
HWPFDocument hwpfDocument = HWPFTestDataSamples.openRemoteFile( href ); HWPFDocument hwpfDocument = HWPFTestDataSamples
.openRemoteFile( href );
WordExtractor wordExtractor = new WordExtractor( hwpfDocument ); WordExtractor wordExtractor = new WordExtractor( hwpfDocument );
wordExtractor.getText(); wordExtractor.getText();