fix Bug 51772 - IllegalArgumentException Parsing MS Word 97 - 2003;

Replace byte->char translation with byte range -> char range_S_ translation for PAPX / CHPX tables git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@1166144 13f79535-47bb-0310-9956-ffa450edef68
2011-09-07 12:12:17 +00:00 · 2011-09-07 12:12:17 +00:00 · d1aeec0d78
commit d1aeec0d78
parent 130a5fc04b
8 changed files with 188 additions and 57 deletions
--- a/src/documentation/content/xdocs/status.xml
+++ b/src/documentation/content/xdocs/status.xml
@ -34,6 +34,7 @@
    <changes>
        <release version="3.8-beta5" date="2011-??-??">
           <action dev="poi-developers" type="fix">51772 - IllegalArgumentException Parsing MS Word 97 - 2003</action>
           <action dev="poi-developers" type="add">XSLFPowerPointExtractor support for including comment authors with comment text</action>
           <action dev="poi-developers" type="fix">Converted XSLFPowerPointExtractor to use UserModel for all text extraction</action>
           <action dev="poi-developers" type="add">XSLF initial UserModel support for Notes and Comments for Slides</action>
--- a/src/scratchpad/src/org/apache/poi/hwpf/model/CHPBinTable.java
+++ b/src/scratchpad/src/org/apache/poi/hwpf/model/CHPBinTable.java
@ -101,18 +101,21 @@ public class CHPBinTable
      CHPFormattedDiskPage cfkp = new CHPFormattedDiskPage(documentStream,
        pageOffset, translator);
-      int fkpSize = cfkp.size();
+            for ( CHPX chpx : cfkp.getCHPXs() )
-
+            {
-      for (int y = 0; y < fkpSize; y++)
+                if ( chpx != null )
-      {
+                    _textRuns.add( chpx );
-        final CHPX chpx = cfkp.getCHPX(y);
+            }
        if (chpx != null)
            _textRuns.add(chpx);
      }
    }
        logger.log( POILogger.DEBUG, "CHPX FKPs loaded in ",
                Long.valueOf( System.currentTimeMillis() - start ), " ms (",
                Integer.valueOf( _textRuns.size() ), " elements)" );
        if ( _textRuns.isEmpty() )
        {
            logger.log( POILogger.WARN, "CHPX FKPs are empty" );
            _textRuns.add( new CHPX( 0, 0, new SprmBuffer( 0 ) ) );
        }
    }
    public void rebuild( ComplexFileTable complexFileTable )
--- a/src/scratchpad/src/org/apache/poi/hwpf/model/CHPFormattedDiskPage.java
+++ b/src/scratchpad/src/org/apache/poi/hwpf/model/CHPFormattedDiskPage.java
@ -18,6 +18,7 @@
 package org.apache.poi.hwpf.model;
 import java.util.ArrayList;
 import java.util.Collections;
 import java.util.List;
 import org.apache.poi.hwpf.sprm.SprmBuffer;
@ -82,15 +83,17 @@ public final class CHPFormattedDiskPage extends FormattedDiskPage
            int bytesStartAt = getStart( x );
            int bytesEndAt = getEnd( x );
-            int charStartAt = translator.getCharIndex( bytesStartAt );
+            // int charStartAt = translator.getCharIndex( bytesStartAt );
-            int charEndAt = translator.getCharIndex( bytesEndAt, charStartAt );
+            // int charEndAt = translator.getCharIndex( bytesEndAt, charStartAt
            // );
-            // TODO: CHECK!
+            for ( int[] range : translator.getCharIndexRanges( bytesStartAt,
-            // CHPX chpx = new CHPX( bytesStartAt, bytesEndAt, tpt, getGrpprl( x
+                    bytesEndAt ) )
-            // ) );
+            {
-            CHPX chpx = new CHPX( charStartAt, charEndAt, new SprmBuffer(
+                CHPX chpx = new CHPX( range[0], range[1], new SprmBuffer(
-                    getGrpprl( x ), 0 ) );
+                        getGrpprl( x ), 0 ) );
-            _chpxList.add( chpx );
+                _chpxList.add( chpx );
            }
        }
    }
@ -99,6 +102,11 @@ public final class CHPFormattedDiskPage extends FormattedDiskPage
      return _chpxList.get(index);
    }
    /**
     * Exposes the CHPX entries held by this page as a read-only list.
     *
     * @return an unmodifiable view over the internal CHPX list
     */
    public List<CHPX> getCHPXs()
    {
        final List<CHPX> readOnlyView = Collections
                .unmodifiableList( _chpxList );
        return readOnlyView;
    }
    public void fill(List<CHPX> filler)
    {
      _chpxList.addAll(filler);
--- a/src/scratchpad/src/org/apache/poi/hwpf/model/CharIndexTranslator.java
+++ b/src/scratchpad/src/org/apache/poi/hwpf/model/CharIndexTranslator.java
@ -31,12 +31,16 @@ public interface CharIndexTranslator {
    int getByteIndex( int charPos );
    /**
-     * Calculates the char index of the given byte index.
+     * Calculates the char index of the given byte index. Look forward if index
-     * Look forward if index is not in table
+     * is not in table
     * 
-     * @param bytePos The character offset to check 
+     * @param bytePos
     *            The character offset to check
     * @return the char index
     * @deprecated This API was based on the incorrect assumption that a single
     *             byte offset corresponds to a single char offset
     */
    @Deprecated
    int getCharIndex(int bytePos);
    /**
@ -46,16 +50,29 @@ public interface CharIndexTranslator {
     * @param bytePos The character offset to check
     * @param startCP look from this character position
     * @return the char index
     * @deprecated This API was based on the incorrect assumption that a single
     *             byte offset corresponds to a single char offset
     */
    @Deprecated
    int getCharIndex(int bytePos, int startCP);
    /**
     * Finds the character ranges that correspond to the specified byte range.
     * One byte range may translate into several character ranges, which is
     * why an array of ranges is returned instead of a single start/end pair.
     *
     * @param startBytePosInclusive
     *            byte offset at which the range starts (inclusive)
     * @param endBytePosExclusive
     *            byte offset at which the range ends (exclusive)
     * @return an array of two-element arrays; element 0 is the start character
     *         index and element 1 is the end character index of one range
     */
    int[][] getCharIndexRanges( int startBytePosInclusive,
            int endBytePosExclusive );
    /**
     * Check if index is in table
     * 
     * @param bytePos
     * @return true if index in table, false if not
     */
    boolean isIndexInTable(int bytePos);
    /**
--- a/src/scratchpad/src/org/apache/poi/hwpf/model/PAPBinTable.java
+++ b/src/scratchpad/src/org/apache/poi/hwpf/model/PAPBinTable.java
@ -92,12 +92,8 @@ public class PAPBinTable
                        documentStream, dataStream, pageOffset,
                        charIndexTranslator );
-                int fkpSize = pfkp.size();
+                for ( PAPX papx : pfkp.getPAPXs() )
                for ( int y = 0; y < fkpSize; y++ )
                {
                    PAPX papx = pfkp.getPAPX( y );
                    if ( papx != null )
                        _paragraphs.add( papx );
                }
@ -107,6 +103,12 @@ public class PAPBinTable
        logger.log( POILogger.DEBUG, "PAPX tables loaded in ",
                Long.valueOf( System.currentTimeMillis() - start ), " ms (",
                Integer.valueOf( _paragraphs.size() ), " elements)" );
        if ( _paragraphs.isEmpty() )
        {
            logger.log( POILogger.WARN, "PAPX FKPs are empty" );
            _paragraphs.add( new PAPX( 0, 0, new SprmBuffer( 2 ) ) );
        }
    }
    public void rebuild( final StringBuilder docText,
--- a/src/scratchpad/src/org/apache/poi/hwpf/model/PAPFormattedDiskPage.java
+++ b/src/scratchpad/src/org/apache/poi/hwpf/model/PAPFormattedDiskPage.java
@ -23,6 +23,8 @@ import java.util.Arrays;
 import java.util.Collections;
 import java.util.List;
 import org.apache.poi.hwpf.sprm.SprmBuffer;
 import org.apache.poi.hwpf.model.io.HWPFOutputStream;
 import org.apache.poi.util.Internal;
 import org.apache.poi.util.LittleEndian;
@ -88,12 +90,20 @@ public final class PAPFormattedDiskPage extends FormattedDiskPage {
            int bytesStartAt = getStart( x );
            int bytesEndAt = getEnd( x );
-            int charStartAt = translator.getCharIndex( bytesStartAt );
+            // int charStartAt = translator.getCharIndex( bytesStartAt );
-            int charEndAt = translator.getCharIndex( bytesEndAt, charStartAt );
+            // int charEndAt = translator.getCharIndex( bytesEndAt, charStartAt
            // );
            // PAPX papx = new PAPX( charStartAt, charEndAt, getGrpprl( x ),
            // getParagraphHeight( x ), dataStream );
            // _papxList.add( papx );
-            PAPX papx = new PAPX( charStartAt, charEndAt, getGrpprl( x ),
+            for ( int[] range : translator.getCharIndexRanges( bytesStartAt,
-                    getParagraphHeight( x ), dataStream );
+                    bytesEndAt ) )
-            _papxList.add( papx );
+            {
                PAPX papx = new PAPX( range[0], range[1], getGrpprl( x ),
                        getParagraphHeight( x ), dataStream );
                _papxList.add( papx );
            }
        }
        _fkp = null;
    }
--- a/src/scratchpad/src/org/apache/poi/hwpf/model/TextPieceTable.java
+++ b/src/scratchpad/src/org/apache/poi/hwpf/model/TextPieceTable.java
@ -20,6 +20,7 @@ import java.io.IOException;
 import java.util.ArrayList;
 import java.util.Collections;
 import java.util.Comparator;
 import java.util.LinkedList;
 import java.util.List;
 import org.apache.poi.hwpf.model.io.HWPFOutputStream;
@ -107,8 +108,10 @@ public class TextPieceTable implements CharIndexTranslator
            System.arraycopy( documentStream, start, buf, 0, textSizeBytes );
            // And now build the piece
-            _textPieces.add( new TextPiece( nodeStartChars, nodeEndChars, buf,
+            final TextPiece newTextPiece = new TextPiece( nodeStartChars, nodeEndChars, buf,
-                    pieces[x] ) );
+                    pieces[x] );
            _textPieces.add( newTextPiece );
        }
        // In the interest of our sanity, now sort the text pieces
@ -201,11 +204,13 @@ public class TextPieceTable implements CharIndexTranslator
        return byteCount;
    }
    /**
     * Translates a byte position into a character index by delegating to
     * {@link #getCharIndex(int, int)} with a starting character position of 0.
     *
     * @param bytePos
     *            byte position to translate
     * @return the character index reported by the two-argument overload
     */
    @Deprecated
    public int getCharIndex( int bytePos )
    {
        final int searchFromCp = 0;
        return getCharIndex( bytePos, searchFromCp );
    }
    @Deprecated
    public int getCharIndex( int startBytePos, int startCP )
    {
        int charCount = 0;
@ -253,6 +258,42 @@ public class TextPieceTable implements CharIndexTranslator
        return charCount;
    }
    /**
     * Translates a byte range into the character ranges it covers, producing
     * one entry per text piece that the byte range reaches.
     * <p>
     * Boundary comparisons are inclusive on both sides, so a piece that merely
     * touches the requested range at its first or last byte offset still
     * contributes a zero-length character range.
     *
     * @param startBytePosInclusive
     *            byte offset at which the range starts (inclusive)
     * @param endBytePosExclusive
     *            byte offset at which the range ends (exclusive)
     * @return an array of {start, end} character index pairs, in the order the
     *         text pieces were visited; empty if no piece is reached
     */
    public int[][] getCharIndexRanges( int startBytePosInclusive,
            int endBytePosExclusive )
    {
        final List<int[]> cpRanges = new LinkedList<int[]>();

        for ( TextPiece piece : _textPiecesFCOrder )
        {
            final int pieceFirstByte = piece.getPieceDescriptor()
                    .getFilePosition();
            final int pieceEndByte = pieceFirstByte + piece.bytesLength();

            // Requested range starts after this piece: try the next piece.
            if ( startBytePosInclusive > pieceEndByte )
            {
                continue;
            }

            // Requested range ends before this piece. NOTE(review): stopping
            // here presumably relies on the list being ordered by file
            // position - confirm where _textPiecesFCOrder is built.
            if ( endBytePosExclusive < pieceFirstByte )
            {
                break;
            }

            // Clip the requested range to the bytes owned by this piece.
            final int overlapFirstByte = Math.max( pieceFirstByte,
                    startBytePosInclusive );
            final int overlapEndByte = Math.min( pieceEndByte,
                    endBytePosExclusive );
            if ( overlapFirstByte > overlapEndByte )
            {
                continue;
            }

            // Unicode pieces use two bytes per character, others one.
            final int bytesPerChar;
            if ( piece.isUnicode() )
            {
                bytesPerChar = 2;
            }
            else
            {
                bytesPerChar = 1;
            }

            final int byteOffsetInPiece = overlapFirstByte - pieceFirstByte;
            final int overlapByteLength = overlapEndByte - overlapFirstByte;

            final int firstCp = piece.getStart() + byteOffsetInPiece
                    / bytesPerChar;
            final int endCp = firstCp + overlapByteLength / bytesPerChar;

            cpRanges.add( new int[] { firstCp, endCp } );
        }

        final int[][] asArray = new int[cpRanges.size()][];
        return cpRanges.toArray( asArray );
    }
    public int getCpMin()
    {
        return _cpMin;
@ -377,24 +418,42 @@ public class TextPieceTable implements CharIndexTranslator
    public int lookIndexForward( final int startBytePos )
    {
-        int bytePos = startBytePos;
+        if ( _textPiecesFCOrder.isEmpty() )
-        for ( TextPiece tp : _textPiecesFCOrder )
+            throw new IllegalStateException( "Text pieces table is empty" );
        if ( _textPiecesFCOrder.get( 0 ).getPieceDescriptor().getFilePosition() > startBytePos )
            return _textPiecesFCOrder.get( 0 ).getPieceDescriptor().getFilePosition();
        if ( _textPiecesFCOrder.get( _textPiecesFCOrder.size() - 1 )
                .getPieceDescriptor().getFilePosition() <= startBytePos )
            return startBytePos;
        int low = 0;
        int high = _textPiecesFCOrder.size() - 1;
        while ( low <= high )
        {
-            int pieceStart = tp.getPieceDescriptor().getFilePosition();
+            int mid = ( low + high ) >>> 1;
            final TextPiece textPiece = _textPiecesFCOrder.get( mid );
            int midVal = textPiece.getPieceDescriptor().getFilePosition();
-            if ( bytePos >= pieceStart + tp.bytesLength() )
+            if ( midVal < startBytePos )
-            {
+                low = mid + 1;
-                continue;
+            else if ( midVal > startBytePos )
-            }
+                high = mid - 1;
-
+            else
-            if ( pieceStart > bytePos )
+                // found piece with exact start
-            {
+                return textPiece.getPieceDescriptor().getFilePosition();
                bytePos = pieceStart;
            }
            break;
        }
-        return bytePos;
+        assert low == high;
        assert _textPiecesFCOrder.get( low ).getPieceDescriptor()
                .getFilePosition() < startBytePos;
        // last line can't be current, can it?
        assert _textPiecesFCOrder.get( low + 1 ).getPieceDescriptor()
                .getFilePosition() > startBytePos;
        // shifting to next piece start
        return _textPiecesFCOrder.get( low + 1 ).getPieceDescriptor().getFilePosition();
    }
    public byte[] writeTo( HWPFOutputStream docStream ) throws IOException
--- a/src/scratchpad/testcases/org/apache/poi/hwpf/usermodel/TestBugs.java
+++ b/src/scratchpad/testcases/org/apache/poi/hwpf/usermodel/TestBugs.java
@ -226,6 +226,36 @@ public class TestBugs extends TestCase
        assertEquals( extractor1.getText(), extractor2.getText() );
    }
    /**
     * Bug 44431 - HWPFDocument.write destroys fields. Opens the sample
     * document and compares the text produced by {@link WordExtractor} with
     * the expected content.
     */
    public void test44431_2()
    {
        final String expectedText = "File name=FieldsTest.doc\n"
                + "\n"
                + "\n"
                + "STYLEREF test\n"
                + "\n"
                + "\n"
                + "\n"
                + "TEST TABLE OF CONTENTS\n"
                + "\n"
                + "Heading paragraph in next page\t2\n"
                + "Another heading paragraph in further page\t3\n"
                + "Another heading paragraph in further page\t3\n"
                + "\n"
                + "\n"
                + "Heading paragraph in next page\n"
                + "Another heading paragraph in further page\n"
                + "\n"
                + "\n"
                + "\n"
                + "Page 3 of 3";

        final HWPFDocument sampleDocument = HWPFTestDataSamples
                .openSampleFile( "Bug44431.doc" );
        final WordExtractor textExtractor = new WordExtractor( sampleDocument );
        final String actualText = textExtractor.getText();

        assertEquals( expectedText, actualText );
    }
    /**
     * Bug 45473 - HWPF cannot read file after save
     */
@ -640,19 +670,20 @@ public class TestBugs extends TestCase
        hwpfDocument.write( new ByteArrayOutputStream() );
    }
    /**
-     * Bug 51678 - Extracting text from Bug51524.zip is slow
+     * Bug 51678 - Extracting text from Bug51524.zip is slow Bug 51524 -
-     * Bug 51524 - PapBinTable constructor is slow
+     * PapBinTable constructor is slow
     */
    public void test51678And51524()
    {
-        // YK: the test will run only if the poi.test.remote system property is set.
+        // YK: the test will run only if the poi.test.remote system property is
        // set.
        // TODO: refactor into something nicer!
-        if(System.getProperty("poi.test.remote") != null) {
+        if ( System.getProperty( "poi.test.remote" ) != null )
        {
            String href = "http://domex.nps.edu/corp/files/govdocs1/007/007488.doc";
-            HWPFDocument hwpfDocument = HWPFTestDataSamples.openRemoteFile( href );
+            HWPFDocument hwpfDocument = HWPFTestDataSamples
                    .openRemoteFile( href );
            WordExtractor wordExtractor = new WordExtractor( hwpfDocument );
            wordExtractor.getText();