fix Bug 51772 - IllegalArgumentException Parsing MS Word 97 - 2003;

Replace byte->char translation with byte range -> char range_S_ translation for PAPX / CHPX tables git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@1166144 13f79535-47bb-0310-9956-ffa450edef68
2011-09-07 12:12:17 +00:00 · 2011-09-07 12:12:17 +00:00 · d1aeec0d78
commit d1aeec0d78
parent 130a5fc04b
8 changed files with 188 additions and 57 deletions
--- a/src/documentation/content/xdocs/status.xml
+++ b/src/documentation/content/xdocs/status.xml
@ -34,6 +34,7 @@

    <changes>
        <release version="3.8-beta5" date="2011-??-??">
+           <action dev="poi-developers" type="fix">51772 - IllegalArgumentException Parsing MS Word 97 - 2003</action>
           <action dev="poi-developers" type="add">XSLFPowerPointExtractor support for including comment authors with comment text</action>
           <action dev="poi-developers" type="fix">Converted XSLFPowerPointExtractor to use UserModel for all text extraction</action>
           <action dev="poi-developers" type="add">XSLF initial UserModel support for Notes and Comments for Slides</action>
--- a/src/scratchpad/src/org/apache/poi/hwpf/model/CHPBinTable.java
+++ b/src/scratchpad/src/org/apache/poi/hwpf/model/CHPBinTable.java
@ -101,18 +101,21 @@ public class CHPBinTable
      CHPFormattedDiskPage cfkp = new CHPFormattedDiskPage(documentStream,
        pageOffset, translator);

-      int fkpSize = cfkp.size();
-
-      for (int y = 0; y < fkpSize; y++)
+            for ( CHPX chpx : cfkp.getCHPXs() )
            {
-        final CHPX chpx = cfkp.getCHPX(y);
-        if (chpx != null)
-            _textRuns.add(chpx);
+                if ( chpx != null )
+                    _textRuns.add( chpx );
            }
    }
        logger.log( POILogger.DEBUG, "CHPX FKPs loaded in ",
                Long.valueOf( System.currentTimeMillis() - start ), " ms (",
                Integer.valueOf( _textRuns.size() ), " elements)" );
+
+        if ( _textRuns.isEmpty() )
+        {
+            logger.log( POILogger.WARN, "CHPX FKPs are empty" );
+            _textRuns.add( new CHPX( 0, 0, new SprmBuffer( 0 ) ) );
+        }
    }

    public void rebuild( ComplexFileTable complexFileTable )
--- a/src/scratchpad/src/org/apache/poi/hwpf/model/CHPFormattedDiskPage.java
+++ b/src/scratchpad/src/org/apache/poi/hwpf/model/CHPFormattedDiskPage.java
@ -18,6 +18,7 @@
 package org.apache.poi.hwpf.model;

 import java.util.ArrayList;
+import java.util.Collections;
 import java.util.List;

 import org.apache.poi.hwpf.sprm.SprmBuffer;
@ -82,23 +83,30 @@ public final class CHPFormattedDiskPage extends FormattedDiskPage
            int bytesStartAt = getStart( x );
            int bytesEndAt = getEnd( x );

-            int charStartAt = translator.getCharIndex( bytesStartAt );
-            int charEndAt = translator.getCharIndex( bytesEndAt, charStartAt );
+            // int charStartAt = translator.getCharIndex( bytesStartAt );
+            // int charEndAt = translator.getCharIndex( bytesEndAt, charStartAt
+            // );

-            // TODO: CHECK!
-            // CHPX chpx = new CHPX( bytesStartAt, bytesEndAt, tpt, getGrpprl( x
-            // ) );
-            CHPX chpx = new CHPX( charStartAt, charEndAt, new SprmBuffer(
+            for ( int[] range : translator.getCharIndexRanges( bytesStartAt,
+                    bytesEndAt ) )
+            {
+                CHPX chpx = new CHPX( range[0], range[1], new SprmBuffer(
                        getGrpprl( x ), 0 ) );
                _chpxList.add( chpx );
            }
        }
+    }

    public CHPX getCHPX(int index)
    {
      return _chpxList.get(index);
    }

+    public List<CHPX> getCHPXs()
+    {
+        return Collections.unmodifiableList( _chpxList );
+    }
+
    public void fill(List<CHPX> filler)
    {
      _chpxList.addAll(filler);
--- a/src/scratchpad/src/org/apache/poi/hwpf/model/CharIndexTranslator.java
+++ b/src/scratchpad/src/org/apache/poi/hwpf/model/CharIndexTranslator.java
@ -31,12 +31,16 @@ public interface CharIndexTranslator {
    int getByteIndex( int charPos );

    /**
-     * Calculates the char index of the given byte index.
-     * Look forward if index is not in table
+     * Calculates the char index of the given byte index. Look forward if index
+     * is not in table
     * 
-     * @param bytePos The character offset to check 
+     * @param bytePos
+     *            The byte offset to check
     * @return the char index
+     * @deprecated This API was based on the incorrect assumption that a
+     *             single byte offset corresponds to a single char offset
     */
+    @Deprecated
    int getCharIndex(int bytePos);

    /**
@ -46,16 +50,29 @@ public interface CharIndexTranslator {
     * @param bytePos The character offset to check
     * @param startCP look from this characted position 
     * @return the char index
+     * @deprecated This API was based on the incorrect assumption that a
+     *             single byte offset corresponds to a single char offset
     */
+    @Deprecated
    int getCharIndex(int bytePos, int startCP);

+    /**
+     * Finds the character ranges that include the specified byte range.
+     * 
+     * @param startBytePosInclusive
+     *            start of the byte range (inclusive)
+     * @param endBytePosExclusive
+     *            end of the byte range (exclusive)
+     */
+    int[][] getCharIndexRanges( int startBytePosInclusive,
+            int endBytePosExclusive );
+
    /**
     * Check if index is in table
     * 
     * @param bytePos
     * @return true if index in table, false if not
     */
-
    boolean isIndexInTable(int bytePos);

    /**
--- a/src/scratchpad/src/org/apache/poi/hwpf/model/PAPBinTable.java
+++ b/src/scratchpad/src/org/apache/poi/hwpf/model/PAPBinTable.java
@ -92,12 +92,8 @@ public class PAPBinTable
                        documentStream, dataStream, pageOffset,
                        charIndexTranslator );

-                int fkpSize = pfkp.size();
-
-                for ( int y = 0; y < fkpSize; y++ )
+                for ( PAPX papx : pfkp.getPAPXs() )
                {
-                    PAPX papx = pfkp.getPAPX( y );
-
                    if ( papx != null )
                        _paragraphs.add( papx );
                }
@ -107,6 +103,12 @@ public class PAPBinTable
        logger.log( POILogger.DEBUG, "PAPX tables loaded in ",
                Long.valueOf( System.currentTimeMillis() - start ), " ms (",
                Integer.valueOf( _paragraphs.size() ), " elements)" );
+
+        if ( _paragraphs.isEmpty() )
+        {
+            logger.log( POILogger.WARN, "PAPX FKPs are empty" );
+            _paragraphs.add( new PAPX( 0, 0, new SprmBuffer( 2 ) ) );
+        }
    }

    public void rebuild( final StringBuilder docText,
--- a/src/scratchpad/src/org/apache/poi/hwpf/model/PAPFormattedDiskPage.java
+++ b/src/scratchpad/src/org/apache/poi/hwpf/model/PAPFormattedDiskPage.java
@ -23,6 +23,8 @@ import java.util.Arrays;
 import java.util.Collections;
 import java.util.List;

+import org.apache.poi.hwpf.sprm.SprmBuffer;
+
 import org.apache.poi.hwpf.model.io.HWPFOutputStream;
 import org.apache.poi.util.Internal;
 import org.apache.poi.util.LittleEndian;
@ -88,13 +90,21 @@ public final class PAPFormattedDiskPage extends FormattedDiskPage {
            int bytesStartAt = getStart( x );
            int bytesEndAt = getEnd( x );

-            int charStartAt = translator.getCharIndex( bytesStartAt );
-            int charEndAt = translator.getCharIndex( bytesEndAt, charStartAt );
+            // int charStartAt = translator.getCharIndex( bytesStartAt );
+            // int charEndAt = translator.getCharIndex( bytesEndAt, charStartAt
+            // );
+            // PAPX papx = new PAPX( charStartAt, charEndAt, getGrpprl( x ),
+            // getParagraphHeight( x ), dataStream );
+            // _papxList.add( papx );

-            PAPX papx = new PAPX( charStartAt, charEndAt, getGrpprl( x ),
+            for ( int[] range : translator.getCharIndexRanges( bytesStartAt,
+                    bytesEndAt ) )
+            {
+                PAPX papx = new PAPX( range[0], range[1], getGrpprl( x ),
                        getParagraphHeight( x ), dataStream );
                _papxList.add( papx );
            }
+        }
        _fkp = null;
    }

--- a/src/scratchpad/src/org/apache/poi/hwpf/model/TextPieceTable.java
+++ b/src/scratchpad/src/org/apache/poi/hwpf/model/TextPieceTable.java
@ -20,6 +20,7 @@ import java.io.IOException;
 import java.util.ArrayList;
 import java.util.Collections;
 import java.util.Comparator;
+import java.util.LinkedList;
 import java.util.List;

 import org.apache.poi.hwpf.model.io.HWPFOutputStream;
@ -107,8 +108,10 @@ public class TextPieceTable implements CharIndexTranslator
            System.arraycopy( documentStream, start, buf, 0, textSizeBytes );

            // And now build the piece
-            _textPieces.add( new TextPiece( nodeStartChars, nodeEndChars, buf,
-                    pieces[x] ) );
+            final TextPiece newTextPiece = new TextPiece( nodeStartChars, nodeEndChars, buf,
+                    pieces[x] );
+
+            _textPieces.add( newTextPiece );
        }

        // In the interest of our sanity, now sort the text pieces
@ -201,11 +204,13 @@ public class TextPieceTable implements CharIndexTranslator
        return byteCount;
    }

+    @Deprecated
    public int getCharIndex( int bytePos )
    {
        return getCharIndex( bytePos, 0 );
    }

+    @Deprecated
    public int getCharIndex( int startBytePos, int startCP )
    {
        int charCount = 0;
@ -253,6 +258,42 @@ public class TextPieceTable implements CharIndexTranslator
        return charCount;
    }

+    public int[][] getCharIndexRanges( int startBytePosInclusive,
+            int endBytePosExclusive )
+    {
+        List<int[]> result = new LinkedList<int[]>();
+        for ( TextPiece textPiece : _textPiecesFCOrder )
+        {
+            final int tpStart = textPiece.getPieceDescriptor()
+                    .getFilePosition();
+            final int tpEnd = textPiece.getPieceDescriptor().getFilePosition()
+                    + textPiece.bytesLength();
+            if ( startBytePosInclusive > tpEnd )
+                continue;
+            if ( endBytePosExclusive < tpStart )
+                break;
+            // (break above presumes pieces are ordered by file position - TODO confirm)
+            final int rangeStartBytes = Math.max( tpStart,
+                    startBytePosInclusive );
+            final int rangeEndBytes = Math.min( tpEnd, endBytePosExclusive );
+            final int rangeLengthBytes = rangeEndBytes - rangeStartBytes;
+            // NOTE(review): zero-length ranges pass this check - confirm intended
+            if ( rangeStartBytes > rangeEndBytes )
+                continue;
+            // two bytes per character in Unicode pieces, one otherwise
+            final int encodingMultiplier = textPiece.isUnicode() ? 2 : 1;
+            // convert byte offsets inside the piece to character positions
+            final int rangeStartCp = textPiece.getStart()
+                    + ( rangeStartBytes - tpStart ) / encodingMultiplier;
+            final int rangeEndCp = rangeStartCp + rangeLengthBytes
+                    / encodingMultiplier;
+
+            result.add( new int[] { rangeStartCp, rangeEndCp } );
+        }
+
+        return result.toArray( new int[result.size()][] );
+    }
+
    public int getCpMin()
    {
        return _cpMin;
@ -377,24 +418,42 @@ public class TextPieceTable implements CharIndexTranslator

    public int lookIndexForward( final int startBytePos )
    {
-        int bytePos = startBytePos;
-        for ( TextPiece tp : _textPiecesFCOrder )
-        {
-            int pieceStart = tp.getPieceDescriptor().getFilePosition();
+        if ( _textPiecesFCOrder.isEmpty() )
+            throw new IllegalStateException( "Text pieces table is empty" );

-            if ( bytePos >= pieceStart + tp.bytesLength() )
-            {
-                continue;
-            }
+        if ( _textPiecesFCOrder.get( 0 ).getPieceDescriptor().getFilePosition() > startBytePos )
+            return _textPiecesFCOrder.get( 0 ).getPieceDescriptor().getFilePosition();

-            if ( pieceStart > bytePos )
-            {
-                bytePos = pieceStart;
-            }
+        if ( _textPiecesFCOrder.get( _textPiecesFCOrder.size() - 1 )
+                .getPieceDescriptor().getFilePosition() <= startBytePos )
+            return startBytePos;

-            break;
+        int low = 0;
+        int high = _textPiecesFCOrder.size() - 1;
+
+        while ( low <= high )
+        {
+            int mid = ( low + high ) >>> 1;
+            final TextPiece textPiece = _textPiecesFCOrder.get( mid );
+            int midVal = textPiece.getPieceDescriptor().getFilePosition();
+
+            if ( midVal < startBytePos )
+                low = mid + 1;
+            else if ( midVal > startBytePos )
+                high = mid - 1;
+            else
+                // found piece with exact start
+                return textPiece.getPieceDescriptor().getFilePosition();
        }
-        return bytePos;
+        // No exact match: the loop exits with high == low - 1, so piece
+        // [high] starts before startBytePos and piece [low] starts after it
+        assert low == high + 1;
+        assert _textPiecesFCOrder.get( high ).getPieceDescriptor()
+                .getFilePosition() < startBytePos;
+        assert _textPiecesFCOrder.get( low ).getPieceDescriptor()
+                .getFilePosition() > startBytePos;
+        // shifting to next piece start
+        return _textPiecesFCOrder.get( low ).getPieceDescriptor().getFilePosition();
    }

    public byte[] writeTo( HWPFOutputStream docStream ) throws IOException
--- a/src/scratchpad/testcases/org/apache/poi/hwpf/usermodel/TestBugs.java
+++ b/src/scratchpad/testcases/org/apache/poi/hwpf/usermodel/TestBugs.java
@ -226,6 +226,36 @@ public class TestBugs extends TestCase
        assertEquals( extractor1.getText(), extractor2.getText() );
    }

+    /**
+     * Bug 44431 - HWPFDocument.write destroys fields
+     */
+    public void test44431_2()
+    {
+        HWPFDocument doc1 = HWPFTestDataSamples.openSampleFile( "Bug44431.doc" );
+        WordExtractor extractor1 = new WordExtractor( doc1 );
+
+        assertEquals( "File name=FieldsTest.doc\n" + 
+        		"\n" + 
+        		"\n" + 
+        		"STYLEREF test\n" + 
+        		"\n" + 
+        		"\n" + 
+        		"\n" + 
+        		"TEST TABLE OF CONTENTS\n" + 
+        		"\n" + 
+        		"Heading paragraph in next page\t2\n" + 
+        		"Another heading paragraph in further page\t3\n" + 
+        		"Another heading paragraph in further page\t3\n" + 
+        		"\n" + 
+        		"\n" + 
+        		"Heading paragraph in next page\n" + 
+        		"Another heading paragraph in further page\n" + 
+        		"\n" + 
+        		"\n" + 
+        		"\n" + 
+        		"Page 3 of 3", extractor1.getText() );
+    }
+
    /**
     * Bug 45473 - HWPF cannot read file after save
     */
@ -640,19 +670,20 @@ public class TestBugs extends TestCase
        hwpfDocument.write( new ByteArrayOutputStream() );
    }

-
-
    /**
-     * Bug 51678 - Extracting text from Bug51524.zip is slow
-     * Bug 51524 - PapBinTable constructor is slow
+     * Bug 51678 - Extracting text from Bug51524.zip is slow.
+     * Bug 51524 - PapBinTable constructor is slow.
     */
    public void test51678And51524()
    {
-        // YK: the test will run only if the poi.test.remote system property is set.
+        // YK: the test will run only if the poi.test.remote system property is
+        // set.
        // TODO: refactor into something nicer!
-        if(System.getProperty("poi.test.remote") != null) {
+        if ( System.getProperty( "poi.test.remote" ) != null )
+        {
            String href = "http://domex.nps.edu/corp/files/govdocs1/007/007488.doc";
-            HWPFDocument hwpfDocument = HWPFTestDataSamples.openRemoteFile( href );
+            HWPFDocument hwpfDocument = HWPFTestDataSamples
+                    .openRemoteFile( href );

            WordExtractor wordExtractor = new WordExtractor( hwpfDocument );
            wordExtractor.getText();