fix Bug 51772 - IllegalArgumentException Parsing MS Word 97 - 2003;
Replace byte->char translation with byte range -> char range_S_ translation for PAPX / CHPX tables git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@1166144 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
130a5fc04b
commit
d1aeec0d78
@ -34,6 +34,7 @@
|
|||||||
|
|
||||||
<changes>
|
<changes>
|
||||||
<release version="3.8-beta5" date="2011-??-??">
|
<release version="3.8-beta5" date="2011-??-??">
|
||||||
|
<action dev="poi-developers" type="fix">51772 - IllegalArgumentException Parsing MS Word 97 - 2003</action>
|
||||||
<action dev="poi-developers" type="add">XSLFPowerPointExtractor support for including comment authors with comment text</action>
|
<action dev="poi-developers" type="add">XSLFPowerPointExtractor support for including comment authors with comment text</action>
|
||||||
<action dev="poi-developers" type="fix">Converted XSLFPowerPointExtractor to use UserModel for all text extraction</action>
|
<action dev="poi-developers" type="fix">Converted XSLFPowerPointExtractor to use UserModel for all text extraction</action>
|
||||||
<action dev="poi-developers" type="add">XSLF initial UserModel support for Notes and Comments for Slides</action>
|
<action dev="poi-developers" type="add">XSLF initial UserModel support for Notes and Comments for Slides</action>
|
||||||
|
@ -101,11 +101,8 @@ public class CHPBinTable
|
|||||||
CHPFormattedDiskPage cfkp = new CHPFormattedDiskPage(documentStream,
|
CHPFormattedDiskPage cfkp = new CHPFormattedDiskPage(documentStream,
|
||||||
pageOffset, translator);
|
pageOffset, translator);
|
||||||
|
|
||||||
int fkpSize = cfkp.size();
|
for ( CHPX chpx : cfkp.getCHPXs() )
|
||||||
|
|
||||||
for (int y = 0; y < fkpSize; y++)
|
|
||||||
{
|
{
|
||||||
final CHPX chpx = cfkp.getCHPX(y);
|
|
||||||
if ( chpx != null )
|
if ( chpx != null )
|
||||||
_textRuns.add( chpx );
|
_textRuns.add( chpx );
|
||||||
}
|
}
|
||||||
@ -113,6 +110,12 @@ public class CHPBinTable
|
|||||||
logger.log( POILogger.DEBUG, "CHPX FKPs loaded in ",
|
logger.log( POILogger.DEBUG, "CHPX FKPs loaded in ",
|
||||||
Long.valueOf( System.currentTimeMillis() - start ), " ms (",
|
Long.valueOf( System.currentTimeMillis() - start ), " ms (",
|
||||||
Integer.valueOf( _textRuns.size() ), " elements)" );
|
Integer.valueOf( _textRuns.size() ), " elements)" );
|
||||||
|
|
||||||
|
if ( _textRuns.isEmpty() )
|
||||||
|
{
|
||||||
|
logger.log( POILogger.WARN, "CHPX FKPs are empty" );
|
||||||
|
_textRuns.add( new CHPX( 0, 0, new SprmBuffer( 0 ) ) );
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
public void rebuild( ComplexFileTable complexFileTable )
|
public void rebuild( ComplexFileTable complexFileTable )
|
||||||
|
@ -18,6 +18,7 @@
|
|||||||
package org.apache.poi.hwpf.model;
|
package org.apache.poi.hwpf.model;
|
||||||
|
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
|
import java.util.Collections;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
|
||||||
import org.apache.poi.hwpf.sprm.SprmBuffer;
|
import org.apache.poi.hwpf.sprm.SprmBuffer;
|
||||||
@ -82,23 +83,30 @@ public final class CHPFormattedDiskPage extends FormattedDiskPage
|
|||||||
int bytesStartAt = getStart( x );
|
int bytesStartAt = getStart( x );
|
||||||
int bytesEndAt = getEnd( x );
|
int bytesEndAt = getEnd( x );
|
||||||
|
|
||||||
int charStartAt = translator.getCharIndex( bytesStartAt );
|
// int charStartAt = translator.getCharIndex( bytesStartAt );
|
||||||
int charEndAt = translator.getCharIndex( bytesEndAt, charStartAt );
|
// int charEndAt = translator.getCharIndex( bytesEndAt, charStartAt
|
||||||
|
// );
|
||||||
|
|
||||||
// TODO: CHECK!
|
for ( int[] range : translator.getCharIndexRanges( bytesStartAt,
|
||||||
// CHPX chpx = new CHPX( bytesStartAt, bytesEndAt, tpt, getGrpprl( x
|
bytesEndAt ) )
|
||||||
// ) );
|
{
|
||||||
CHPX chpx = new CHPX( charStartAt, charEndAt, new SprmBuffer(
|
CHPX chpx = new CHPX( range[0], range[1], new SprmBuffer(
|
||||||
getGrpprl( x ), 0 ) );
|
getGrpprl( x ), 0 ) );
|
||||||
_chpxList.add( chpx );
|
_chpxList.add( chpx );
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
public CHPX getCHPX(int index)
|
public CHPX getCHPX(int index)
|
||||||
{
|
{
|
||||||
return _chpxList.get(index);
|
return _chpxList.get(index);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public List<CHPX> getCHPXs()
|
||||||
|
{
|
||||||
|
return Collections.unmodifiableList( _chpxList );
|
||||||
|
}
|
||||||
|
|
||||||
public void fill(List<CHPX> filler)
|
public void fill(List<CHPX> filler)
|
||||||
{
|
{
|
||||||
_chpxList.addAll(filler);
|
_chpxList.addAll(filler);
|
||||||
|
@ -31,12 +31,16 @@ public interface CharIndexTranslator {
|
|||||||
int getByteIndex( int charPos );
|
int getByteIndex( int charPos );
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Calculates the char index of the given byte index.
|
* Calculates the char index of the given byte index. Look forward if index
|
||||||
* Look forward if index is not in table
|
* is not in table
|
||||||
*
|
*
|
||||||
* @param bytePos The character offset to check
|
* @param bytePos
|
||||||
|
* The byte offset to check
|
||||||
* @return the char index
|
* @return the char index
|
||||||
|
* @deprecated This API was based on the incorrect assumption that a single byte
|
||||||
|
* offset corresponds to a single char offset
|
||||||
*/
|
*/
|
||||||
|
@Deprecated
|
||||||
int getCharIndex(int bytePos);
|
int getCharIndex(int bytePos);
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@ -46,16 +50,29 @@ public interface CharIndexTranslator {
|
|||||||
* @param bytePos The byte offset to check
|
* @param bytePos The byte offset to check
|
||||||
* @param startCP look from this character position
|
* @param startCP look from this character position
|
||||||
* @return the char index
|
* @return the char index
|
||||||
|
* @deprecated This API was based on the incorrect assumption that a single byte
|
||||||
|
* offset corresponds to a single char offset
|
||||||
*/
|
*/
|
||||||
|
@Deprecated
|
||||||
int getCharIndex(int bytePos, int startCP);
|
int getCharIndex(int bytePos, int startCP);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Finds the character ranges that correspond to the specified byte range.
|
||||||
|
*
|
||||||
|
* @param startBytePosInclusive
|
||||||
|
* start of the byte range (inclusive)
|
||||||
|
* @param endBytePosExclusive
|
||||||
|
* end of the byte range (exclusive)
|
||||||
|
*/
|
||||||
|
int[][] getCharIndexRanges( int startBytePosInclusive,
|
||||||
|
int endBytePosExclusive );
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Check if index is in table
|
* Check if index is in table
|
||||||
*
|
*
|
||||||
* @param bytePos
|
* @param bytePos
|
||||||
* @return true if index in table, false if not
|
* @return true if index in table, false if not
|
||||||
*/
|
*/
|
||||||
|
|
||||||
boolean isIndexInTable(int bytePos);
|
boolean isIndexInTable(int bytePos);
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -92,12 +92,8 @@ public class PAPBinTable
|
|||||||
documentStream, dataStream, pageOffset,
|
documentStream, dataStream, pageOffset,
|
||||||
charIndexTranslator );
|
charIndexTranslator );
|
||||||
|
|
||||||
int fkpSize = pfkp.size();
|
for ( PAPX papx : pfkp.getPAPXs() )
|
||||||
|
|
||||||
for ( int y = 0; y < fkpSize; y++ )
|
|
||||||
{
|
{
|
||||||
PAPX papx = pfkp.getPAPX( y );
|
|
||||||
|
|
||||||
if ( papx != null )
|
if ( papx != null )
|
||||||
_paragraphs.add( papx );
|
_paragraphs.add( papx );
|
||||||
}
|
}
|
||||||
@ -107,6 +103,12 @@ public class PAPBinTable
|
|||||||
logger.log( POILogger.DEBUG, "PAPX tables loaded in ",
|
logger.log( POILogger.DEBUG, "PAPX tables loaded in ",
|
||||||
Long.valueOf( System.currentTimeMillis() - start ), " ms (",
|
Long.valueOf( System.currentTimeMillis() - start ), " ms (",
|
||||||
Integer.valueOf( _paragraphs.size() ), " elements)" );
|
Integer.valueOf( _paragraphs.size() ), " elements)" );
|
||||||
|
|
||||||
|
if ( _paragraphs.isEmpty() )
|
||||||
|
{
|
||||||
|
logger.log( POILogger.WARN, "PAPX FKPs are empty" );
|
||||||
|
_paragraphs.add( new PAPX( 0, 0, new SprmBuffer( 2 ) ) );
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
public void rebuild( final StringBuilder docText,
|
public void rebuild( final StringBuilder docText,
|
||||||
|
@ -23,6 +23,8 @@ import java.util.Arrays;
|
|||||||
import java.util.Collections;
|
import java.util.Collections;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
|
||||||
|
import org.apache.poi.hwpf.sprm.SprmBuffer;
|
||||||
|
|
||||||
import org.apache.poi.hwpf.model.io.HWPFOutputStream;
|
import org.apache.poi.hwpf.model.io.HWPFOutputStream;
|
||||||
import org.apache.poi.util.Internal;
|
import org.apache.poi.util.Internal;
|
||||||
import org.apache.poi.util.LittleEndian;
|
import org.apache.poi.util.LittleEndian;
|
||||||
@ -88,13 +90,21 @@ public final class PAPFormattedDiskPage extends FormattedDiskPage {
|
|||||||
int bytesStartAt = getStart( x );
|
int bytesStartAt = getStart( x );
|
||||||
int bytesEndAt = getEnd( x );
|
int bytesEndAt = getEnd( x );
|
||||||
|
|
||||||
int charStartAt = translator.getCharIndex( bytesStartAt );
|
// int charStartAt = translator.getCharIndex( bytesStartAt );
|
||||||
int charEndAt = translator.getCharIndex( bytesEndAt, charStartAt );
|
// int charEndAt = translator.getCharIndex( bytesEndAt, charStartAt
|
||||||
|
// );
|
||||||
|
// PAPX papx = new PAPX( charStartAt, charEndAt, getGrpprl( x ),
|
||||||
|
// getParagraphHeight( x ), dataStream );
|
||||||
|
// _papxList.add( papx );
|
||||||
|
|
||||||
PAPX papx = new PAPX( charStartAt, charEndAt, getGrpprl( x ),
|
for ( int[] range : translator.getCharIndexRanges( bytesStartAt,
|
||||||
|
bytesEndAt ) )
|
||||||
|
{
|
||||||
|
PAPX papx = new PAPX( range[0], range[1], getGrpprl( x ),
|
||||||
getParagraphHeight( x ), dataStream );
|
getParagraphHeight( x ), dataStream );
|
||||||
_papxList.add( papx );
|
_papxList.add( papx );
|
||||||
}
|
}
|
||||||
|
}
|
||||||
_fkp = null;
|
_fkp = null;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -20,6 +20,7 @@ import java.io.IOException;
|
|||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
import java.util.Collections;
|
import java.util.Collections;
|
||||||
import java.util.Comparator;
|
import java.util.Comparator;
|
||||||
|
import java.util.LinkedList;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
|
||||||
import org.apache.poi.hwpf.model.io.HWPFOutputStream;
|
import org.apache.poi.hwpf.model.io.HWPFOutputStream;
|
||||||
@ -107,8 +108,10 @@ public class TextPieceTable implements CharIndexTranslator
|
|||||||
System.arraycopy( documentStream, start, buf, 0, textSizeBytes );
|
System.arraycopy( documentStream, start, buf, 0, textSizeBytes );
|
||||||
|
|
||||||
// And now build the piece
|
// And now build the piece
|
||||||
_textPieces.add( new TextPiece( nodeStartChars, nodeEndChars, buf,
|
final TextPiece newTextPiece = new TextPiece( nodeStartChars, nodeEndChars, buf,
|
||||||
pieces[x] ) );
|
pieces[x] );
|
||||||
|
|
||||||
|
_textPieces.add( newTextPiece );
|
||||||
}
|
}
|
||||||
|
|
||||||
// In the interest of our sanity, now sort the text pieces
|
// In the interest of our sanity, now sort the text pieces
|
||||||
@ -201,11 +204,13 @@ public class TextPieceTable implements CharIndexTranslator
|
|||||||
return byteCount;
|
return byteCount;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Deprecated
|
||||||
public int getCharIndex( int bytePos )
|
public int getCharIndex( int bytePos )
|
||||||
{
|
{
|
||||||
return getCharIndex( bytePos, 0 );
|
return getCharIndex( bytePos, 0 );
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Deprecated
|
||||||
public int getCharIndex( int startBytePos, int startCP )
|
public int getCharIndex( int startBytePos, int startCP )
|
||||||
{
|
{
|
||||||
int charCount = 0;
|
int charCount = 0;
|
||||||
@ -253,6 +258,42 @@ public class TextPieceTable implements CharIndexTranslator
|
|||||||
return charCount;
|
return charCount;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public int[][] getCharIndexRanges( int startBytePosInclusive,
|
||||||
|
int endBytePosExclusive )
|
||||||
|
{
|
||||||
|
List<int[]> result = new LinkedList<int[]>();
|
||||||
|
for ( TextPiece textPiece : _textPiecesFCOrder )
|
||||||
|
{
|
||||||
|
final int tpStart = textPiece.getPieceDescriptor()
|
||||||
|
.getFilePosition();
|
||||||
|
final int tpEnd = textPiece.getPieceDescriptor().getFilePosition()
|
||||||
|
+ textPiece.bytesLength();
|
||||||
|
if ( startBytePosInclusive > tpEnd )
|
||||||
|
continue;
|
||||||
|
if ( endBytePosExclusive < tpStart )
|
||||||
|
break;
|
||||||
|
|
||||||
|
final int rangeStartBytes = Math.max( tpStart,
|
||||||
|
startBytePosInclusive );
|
||||||
|
final int rangeEndBytes = Math.min( tpEnd, endBytePosExclusive );
|
||||||
|
final int rangeLengthBytes = rangeEndBytes - rangeStartBytes;
|
||||||
|
|
||||||
|
if ( rangeStartBytes > rangeEndBytes )
|
||||||
|
continue;
|
||||||
|
|
||||||
|
final int encodingMultiplier = textPiece.isUnicode() ? 2 : 1;
|
||||||
|
|
||||||
|
final int rangeStartCp = textPiece.getStart()
|
||||||
|
+ ( rangeStartBytes - tpStart ) / encodingMultiplier;
|
||||||
|
final int rangeEndCp = rangeStartCp + rangeLengthBytes
|
||||||
|
/ encodingMultiplier;
|
||||||
|
|
||||||
|
result.add( new int[] { rangeStartCp, rangeEndCp } );
|
||||||
|
}
|
||||||
|
|
||||||
|
return result.toArray( new int[result.size()][] );
|
||||||
|
}
|
||||||
|
|
||||||
public int getCpMin()
|
public int getCpMin()
|
||||||
{
|
{
|
||||||
return _cpMin;
|
return _cpMin;
|
||||||
@ -377,24 +418,42 @@ public class TextPieceTable implements CharIndexTranslator
|
|||||||
|
|
||||||
public int lookIndexForward( final int startBytePos )
|
public int lookIndexForward( final int startBytePos )
|
||||||
{
|
{
|
||||||
int bytePos = startBytePos;
|
if ( _textPiecesFCOrder.isEmpty() )
|
||||||
for ( TextPiece tp : _textPiecesFCOrder )
|
throw new IllegalStateException( "Text pieces table is empty" );
|
||||||
{
|
|
||||||
int pieceStart = tp.getPieceDescriptor().getFilePosition();
|
|
||||||
|
|
||||||
if ( bytePos >= pieceStart + tp.bytesLength() )
|
if ( _textPiecesFCOrder.get( 0 ).getPieceDescriptor().getFilePosition() > startBytePos )
|
||||||
{
|
return _textPiecesFCOrder.get( 0 ).getPieceDescriptor().getFilePosition();
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
if ( pieceStart > bytePos )
|
if ( _textPiecesFCOrder.get( _textPiecesFCOrder.size() - 1 )
|
||||||
{
|
.getPieceDescriptor().getFilePosition() <= startBytePos )
|
||||||
bytePos = pieceStart;
|
return startBytePos;
|
||||||
}
|
|
||||||
|
|
||||||
break;
|
int low = 0;
|
||||||
|
int high = _textPiecesFCOrder.size() - 1;
|
||||||
|
|
||||||
|
while ( low <= high )
|
||||||
|
{
|
||||||
|
int mid = ( low + high ) >>> 1;
|
||||||
|
final TextPiece textPiece = _textPiecesFCOrder.get( mid );
|
||||||
|
int midVal = textPiece.getPieceDescriptor().getFilePosition();
|
||||||
|
|
||||||
|
if ( midVal < startBytePos )
|
||||||
|
low = mid + 1;
|
||||||
|
else if ( midVal > startBytePos )
|
||||||
|
high = mid - 1;
|
||||||
|
else
|
||||||
|
// found piece with exact start
|
||||||
|
return textPiece.getPieceDescriptor().getFilePosition();
|
||||||
}
|
}
|
||||||
return bytePos;
|
assert low == high;
|
||||||
|
assert _textPiecesFCOrder.get( low ).getPieceDescriptor()
|
||||||
|
.getFilePosition() < startBytePos;
|
||||||
|
// last line can't be current, can it?
|
||||||
|
assert _textPiecesFCOrder.get( low + 1 ).getPieceDescriptor()
|
||||||
|
.getFilePosition() > startBytePos;
|
||||||
|
|
||||||
|
// shifting to next piece start
|
||||||
|
return _textPiecesFCOrder.get( low + 1 ).getPieceDescriptor().getFilePosition();
|
||||||
}
|
}
|
||||||
|
|
||||||
public byte[] writeTo( HWPFOutputStream docStream ) throws IOException
|
public byte[] writeTo( HWPFOutputStream docStream ) throws IOException
|
||||||
|
@ -226,6 +226,36 @@ public class TestBugs extends TestCase
|
|||||||
assertEquals( extractor1.getText(), extractor2.getText() );
|
assertEquals( extractor1.getText(), extractor2.getText() );
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Bug 44431 - HWPFDocument.write destroys fields
|
||||||
|
*/
|
||||||
|
public void test44431_2()
|
||||||
|
{
|
||||||
|
HWPFDocument doc1 = HWPFTestDataSamples.openSampleFile( "Bug44431.doc" );
|
||||||
|
WordExtractor extractor1 = new WordExtractor( doc1 );
|
||||||
|
|
||||||
|
assertEquals( "File name=FieldsTest.doc\n" +
|
||||||
|
"\n" +
|
||||||
|
"\n" +
|
||||||
|
"STYLEREF test\n" +
|
||||||
|
"\n" +
|
||||||
|
"\n" +
|
||||||
|
"\n" +
|
||||||
|
"TEST TABLE OF CONTENTS\n" +
|
||||||
|
"\n" +
|
||||||
|
"Heading paragraph in next page\t2\n" +
|
||||||
|
"Another heading paragraph in further page\t3\n" +
|
||||||
|
"Another heading paragraph in further page\t3\n" +
|
||||||
|
"\n" +
|
||||||
|
"\n" +
|
||||||
|
"Heading paragraph in next page\n" +
|
||||||
|
"Another heading paragraph in further page\n" +
|
||||||
|
"\n" +
|
||||||
|
"\n" +
|
||||||
|
"\n" +
|
||||||
|
"Page 3 of 3", extractor1.getText() );
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Bug 45473 - HWPF cannot read file after save
|
* Bug 45473 - HWPF cannot read file after save
|
||||||
*/
|
*/
|
||||||
@ -640,19 +670,20 @@ public class TestBugs extends TestCase
|
|||||||
hwpfDocument.write( new ByteArrayOutputStream() );
|
hwpfDocument.write( new ByteArrayOutputStream() );
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Bug 51678 - Extracting text from Bug51524.zip is slow
|
* Bug 51678 - Extracting text from Bug51524.zip is slow Bug 51524 -
|
||||||
* Bug 51524 - PapBinTable constructor is slow
|
* PapBinTable constructor is slow
|
||||||
*/
|
*/
|
||||||
public void test51678And51524()
|
public void test51678And51524()
|
||||||
{
|
{
|
||||||
// YK: the test will run only if the poi.test.remote system property is set.
|
// YK: the test will run only if the poi.test.remote system property is
|
||||||
|
// set.
|
||||||
// TODO: refactor into something nicer!
|
// TODO: refactor into something nicer!
|
||||||
if(System.getProperty("poi.test.remote") != null) {
|
if ( System.getProperty( "poi.test.remote" ) != null )
|
||||||
|
{
|
||||||
String href = "http://domex.nps.edu/corp/files/govdocs1/007/007488.doc";
|
String href = "http://domex.nps.edu/corp/files/govdocs1/007/007488.doc";
|
||||||
HWPFDocument hwpfDocument = HWPFTestDataSamples.openRemoteFile( href );
|
HWPFDocument hwpfDocument = HWPFTestDataSamples
|
||||||
|
.openRemoteFile( href );
|
||||||
|
|
||||||
WordExtractor wordExtractor = new WordExtractor( hwpfDocument );
|
WordExtractor wordExtractor = new WordExtractor( hwpfDocument );
|
||||||
wordExtractor.getText();
|
wordExtractor.getText();
|
||||||
|
Loading…
Reference in New Issue
Block a user