fix Bug 51772 - IllegalArgumentException Parsing MS Word 97 - 2003;
Replace byte->char translation with byte range -> char range_S_ translation for PAPX / CHPX tables git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@1166144 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
130a5fc04b
commit
d1aeec0d78
@ -34,6 +34,7 @@
|
||||
|
||||
<changes>
|
||||
<release version="3.8-beta5" date="2011-??-??">
|
||||
<action dev="poi-developers" type="fix">51772 - IllegalArgumentException Parsing MS Word 97 - 2003</action>
|
||||
<action dev="poi-developers" type="add">XSLFPowerPointExtractor support for including comment authors with comment text</action>
|
||||
<action dev="poi-developers" type="fix">Converted XSLFPowerPointExtractor to use UserModel for all text extraction</action>
|
||||
<action dev="poi-developers" type="add">XSLF initial UserModel support for Notes and Comments for Slides</action>
|
||||
|
@ -101,18 +101,21 @@ public class CHPBinTable
|
||||
CHPFormattedDiskPage cfkp = new CHPFormattedDiskPage(documentStream,
|
||||
pageOffset, translator);
|
||||
|
||||
int fkpSize = cfkp.size();
|
||||
|
||||
for (int y = 0; y < fkpSize; y++)
|
||||
for ( CHPX chpx : cfkp.getCHPXs() )
|
||||
{
|
||||
final CHPX chpx = cfkp.getCHPX(y);
|
||||
if (chpx != null)
|
||||
_textRuns.add(chpx);
|
||||
if ( chpx != null )
|
||||
_textRuns.add( chpx );
|
||||
}
|
||||
}
|
||||
logger.log( POILogger.DEBUG, "CHPX FKPs loaded in ",
|
||||
Long.valueOf( System.currentTimeMillis() - start ), " ms (",
|
||||
Integer.valueOf( _textRuns.size() ), " elements)" );
|
||||
|
||||
if ( _textRuns.isEmpty() )
|
||||
{
|
||||
logger.log( POILogger.WARN, "CHPX FKPs are empty" );
|
||||
_textRuns.add( new CHPX( 0, 0, new SprmBuffer( 0 ) ) );
|
||||
}
|
||||
}
|
||||
|
||||
public void rebuild( ComplexFileTable complexFileTable )
|
||||
|
@ -18,6 +18,7 @@
|
||||
package org.apache.poi.hwpf.model;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collections;
|
||||
import java.util.List;
|
||||
|
||||
import org.apache.poi.hwpf.sprm.SprmBuffer;
|
||||
@ -82,23 +83,30 @@ public final class CHPFormattedDiskPage extends FormattedDiskPage
|
||||
int bytesStartAt = getStart( x );
|
||||
int bytesEndAt = getEnd( x );
|
||||
|
||||
int charStartAt = translator.getCharIndex( bytesStartAt );
|
||||
int charEndAt = translator.getCharIndex( bytesEndAt, charStartAt );
|
||||
// int charStartAt = translator.getCharIndex( bytesStartAt );
|
||||
// int charEndAt = translator.getCharIndex( bytesEndAt, charStartAt
|
||||
// );
|
||||
|
||||
// TODO: CHECK!
|
||||
// CHPX chpx = new CHPX( bytesStartAt, bytesEndAt, tpt, getGrpprl( x
|
||||
// ) );
|
||||
CHPX chpx = new CHPX( charStartAt, charEndAt, new SprmBuffer(
|
||||
for ( int[] range : translator.getCharIndexRanges( bytesStartAt,
|
||||
bytesEndAt ) )
|
||||
{
|
||||
CHPX chpx = new CHPX( range[0], range[1], new SprmBuffer(
|
||||
getGrpprl( x ), 0 ) );
|
||||
_chpxList.add( chpx );
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
public CHPX getCHPX(int index)
|
||||
{
|
||||
return _chpxList.get(index);
|
||||
}
|
||||
|
||||
public List<CHPX> getCHPXs()
|
||||
{
|
||||
return Collections.unmodifiableList( _chpxList );
|
||||
}
|
||||
|
||||
public void fill(List<CHPX> filler)
|
||||
{
|
||||
_chpxList.addAll(filler);
|
||||
|
@ -31,12 +31,16 @@ public interface CharIndexTranslator {
|
||||
int getByteIndex( int charPos );
|
||||
|
||||
/**
|
||||
* Calculates the char index of the given byte index.
|
||||
* Look forward if index is not in table
|
||||
* Calculates the char index of the given byte index. Look forward if index
|
||||
* is not in table
|
||||
*
|
||||
* @param bytePos The character offset to check
|
||||
* @param bytePos
|
||||
* The byte offset to check
|
||||
* @return the char index
|
||||
* @deprecated This API was based on the incorrect assumption that a single byte
|
||||
* offset corresponds to single char offset
|
||||
*/
|
||||
@Deprecated
|
||||
int getCharIndex(int bytePos);
|
||||
|
||||
/**
|
||||
@ -46,16 +50,29 @@ public interface CharIndexTranslator {
|
||||
* @param bytePos The byte offset to check
|
||||
* @param startCP look from this character position
|
||||
* @return the char index
|
||||
* @deprecated This API was based on the incorrect assumption that a single byte
|
||||
* offset corresponds to single char offset
|
||||
*/
|
||||
@Deprecated
|
||||
int getCharIndex(int bytePos, int startCP);
|
||||
|
||||
/**
|
||||
* Finds character ranges that include the specified byte range.
|
||||
*
|
||||
* @param startBytePosInclusive
|
||||
* start byte range
|
||||
* @param endBytePosExclusive
|
||||
* end byte range
|
||||
*/
|
||||
int[][] getCharIndexRanges( int startBytePosInclusive,
|
||||
int endBytePosExclusive );
|
||||
|
||||
/**
|
||||
* Check if index is in table
|
||||
*
|
||||
* @param bytePos
|
||||
* @return true if index in table, false if not
|
||||
*/
|
||||
|
||||
boolean isIndexInTable(int bytePos);
|
||||
|
||||
/**
|
||||
|
@ -92,12 +92,8 @@ public class PAPBinTable
|
||||
documentStream, dataStream, pageOffset,
|
||||
charIndexTranslator );
|
||||
|
||||
int fkpSize = pfkp.size();
|
||||
|
||||
for ( int y = 0; y < fkpSize; y++ )
|
||||
for ( PAPX papx : pfkp.getPAPXs() )
|
||||
{
|
||||
PAPX papx = pfkp.getPAPX( y );
|
||||
|
||||
if ( papx != null )
|
||||
_paragraphs.add( papx );
|
||||
}
|
||||
@ -107,6 +103,12 @@ public class PAPBinTable
|
||||
logger.log( POILogger.DEBUG, "PAPX tables loaded in ",
|
||||
Long.valueOf( System.currentTimeMillis() - start ), " ms (",
|
||||
Integer.valueOf( _paragraphs.size() ), " elements)" );
|
||||
|
||||
if ( _paragraphs.isEmpty() )
|
||||
{
|
||||
logger.log( POILogger.WARN, "PAPX FKPs are empty" );
|
||||
_paragraphs.add( new PAPX( 0, 0, new SprmBuffer( 2 ) ) );
|
||||
}
|
||||
}
|
||||
|
||||
public void rebuild( final StringBuilder docText,
|
||||
|
@ -23,6 +23,8 @@ import java.util.Arrays;
|
||||
import java.util.Collections;
|
||||
import java.util.List;
|
||||
|
||||
import org.apache.poi.hwpf.sprm.SprmBuffer;
|
||||
|
||||
import org.apache.poi.hwpf.model.io.HWPFOutputStream;
|
||||
import org.apache.poi.util.Internal;
|
||||
import org.apache.poi.util.LittleEndian;
|
||||
@ -88,13 +90,21 @@ public final class PAPFormattedDiskPage extends FormattedDiskPage {
|
||||
int bytesStartAt = getStart( x );
|
||||
int bytesEndAt = getEnd( x );
|
||||
|
||||
int charStartAt = translator.getCharIndex( bytesStartAt );
|
||||
int charEndAt = translator.getCharIndex( bytesEndAt, charStartAt );
|
||||
// int charStartAt = translator.getCharIndex( bytesStartAt );
|
||||
// int charEndAt = translator.getCharIndex( bytesEndAt, charStartAt
|
||||
// );
|
||||
// PAPX papx = new PAPX( charStartAt, charEndAt, getGrpprl( x ),
|
||||
// getParagraphHeight( x ), dataStream );
|
||||
// _papxList.add( papx );
|
||||
|
||||
PAPX papx = new PAPX( charStartAt, charEndAt, getGrpprl( x ),
|
||||
for ( int[] range : translator.getCharIndexRanges( bytesStartAt,
|
||||
bytesEndAt ) )
|
||||
{
|
||||
PAPX papx = new PAPX( range[0], range[1], getGrpprl( x ),
|
||||
getParagraphHeight( x ), dataStream );
|
||||
_papxList.add( papx );
|
||||
}
|
||||
}
|
||||
_fkp = null;
|
||||
}
|
||||
|
||||
|
@ -20,6 +20,7 @@ import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collections;
|
||||
import java.util.Comparator;
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
|
||||
import org.apache.poi.hwpf.model.io.HWPFOutputStream;
|
||||
@ -107,8 +108,10 @@ public class TextPieceTable implements CharIndexTranslator
|
||||
System.arraycopy( documentStream, start, buf, 0, textSizeBytes );
|
||||
|
||||
// And now build the piece
|
||||
_textPieces.add( new TextPiece( nodeStartChars, nodeEndChars, buf,
|
||||
pieces[x] ) );
|
||||
final TextPiece newTextPiece = new TextPiece( nodeStartChars, nodeEndChars, buf,
|
||||
pieces[x] );
|
||||
|
||||
_textPieces.add( newTextPiece );
|
||||
}
|
||||
|
||||
// In the interest of our sanity, now sort the text pieces
|
||||
@ -201,11 +204,13 @@ public class TextPieceTable implements CharIndexTranslator
|
||||
return byteCount;
|
||||
}
|
||||
|
||||
@Deprecated
|
||||
public int getCharIndex( int bytePos )
|
||||
{
|
||||
return getCharIndex( bytePos, 0 );
|
||||
}
|
||||
|
||||
@Deprecated
|
||||
public int getCharIndex( int startBytePos, int startCP )
|
||||
{
|
||||
int charCount = 0;
|
||||
@ -253,6 +258,42 @@ public class TextPieceTable implements CharIndexTranslator
|
||||
return charCount;
|
||||
}
|
||||
|
||||
public int[][] getCharIndexRanges( int startBytePosInclusive,
|
||||
int endBytePosExclusive )
|
||||
{
|
||||
List<int[]> result = new LinkedList<int[]>();
|
||||
for ( TextPiece textPiece : _textPiecesFCOrder )
|
||||
{
|
||||
final int tpStart = textPiece.getPieceDescriptor()
|
||||
.getFilePosition();
|
||||
final int tpEnd = textPiece.getPieceDescriptor().getFilePosition()
|
||||
+ textPiece.bytesLength();
|
||||
if ( startBytePosInclusive > tpEnd )
|
||||
continue;
|
||||
if ( endBytePosExclusive < tpStart )
|
||||
break;
|
||||
|
||||
final int rangeStartBytes = Math.max( tpStart,
|
||||
startBytePosInclusive );
|
||||
final int rangeEndBytes = Math.min( tpEnd, endBytePosExclusive );
|
||||
final int rangeLengthBytes = rangeEndBytes - rangeStartBytes;
|
||||
|
||||
if ( rangeStartBytes > rangeEndBytes )
|
||||
continue;
|
||||
|
||||
final int encodingMultiplier = textPiece.isUnicode() ? 2 : 1;
|
||||
|
||||
final int rangeStartCp = textPiece.getStart()
|
||||
+ ( rangeStartBytes - tpStart ) / encodingMultiplier;
|
||||
final int rangeEndCp = rangeStartCp + rangeLengthBytes
|
||||
/ encodingMultiplier;
|
||||
|
||||
result.add( new int[] { rangeStartCp, rangeEndCp } );
|
||||
}
|
||||
|
||||
return result.toArray( new int[result.size()][] );
|
||||
}
|
||||
|
||||
public int getCpMin()
|
||||
{
|
||||
return _cpMin;
|
||||
@ -377,24 +418,42 @@ public class TextPieceTable implements CharIndexTranslator
|
||||
|
||||
public int lookIndexForward( final int startBytePos )
|
||||
{
|
||||
int bytePos = startBytePos;
|
||||
for ( TextPiece tp : _textPiecesFCOrder )
|
||||
{
|
||||
int pieceStart = tp.getPieceDescriptor().getFilePosition();
|
||||
if ( _textPiecesFCOrder.isEmpty() )
|
||||
throw new IllegalStateException( "Text pieces table is empty" );
|
||||
|
||||
if ( bytePos >= pieceStart + tp.bytesLength() )
|
||||
{
|
||||
continue;
|
||||
}
|
||||
if ( _textPiecesFCOrder.get( 0 ).getPieceDescriptor().getFilePosition() > startBytePos )
|
||||
return _textPiecesFCOrder.get( 0 ).getPieceDescriptor().getFilePosition();
|
||||
|
||||
if ( pieceStart > bytePos )
|
||||
{
|
||||
bytePos = pieceStart;
|
||||
}
|
||||
if ( _textPiecesFCOrder.get( _textPiecesFCOrder.size() - 1 )
|
||||
.getPieceDescriptor().getFilePosition() <= startBytePos )
|
||||
return startBytePos;
|
||||
|
||||
break;
|
||||
int low = 0;
|
||||
int high = _textPiecesFCOrder.size() - 1;
|
||||
|
||||
while ( low <= high )
|
||||
{
|
||||
int mid = ( low + high ) >>> 1;
|
||||
final TextPiece textPiece = _textPiecesFCOrder.get( mid );
|
||||
int midVal = textPiece.getPieceDescriptor().getFilePosition();
|
||||
|
||||
if ( midVal < startBytePos )
|
||||
low = mid + 1;
|
||||
else if ( midVal > startBytePos )
|
||||
high = mid - 1;
|
||||
else
|
||||
// found piece with exact start
|
||||
return textPiece.getPieceDescriptor().getFilePosition();
|
||||
}
|
||||
return bytePos;
|
||||
assert low == high;
|
||||
assert _textPiecesFCOrder.get( low ).getPieceDescriptor()
|
||||
.getFilePosition() < startBytePos;
|
||||
// last line can't be current, can it?
|
||||
assert _textPiecesFCOrder.get( low + 1 ).getPieceDescriptor()
|
||||
.getFilePosition() > startBytePos;
|
||||
|
||||
// shifting to next piece start
|
||||
return _textPiecesFCOrder.get( low + 1 ).getPieceDescriptor().getFilePosition();
|
||||
}
|
||||
|
||||
public byte[] writeTo( HWPFOutputStream docStream ) throws IOException
|
||||
|
@ -226,6 +226,36 @@ public class TestBugs extends TestCase
|
||||
assertEquals( extractor1.getText(), extractor2.getText() );
|
||||
}
|
||||
|
||||
/**
|
||||
* Bug 44431 - HWPFDocument.write destroys fields
|
||||
*/
|
||||
public void test44431_2()
|
||||
{
|
||||
HWPFDocument doc1 = HWPFTestDataSamples.openSampleFile( "Bug44431.doc" );
|
||||
WordExtractor extractor1 = new WordExtractor( doc1 );
|
||||
|
||||
assertEquals( "File name=FieldsTest.doc\n" +
|
||||
"\n" +
|
||||
"\n" +
|
||||
"STYLEREF test\n" +
|
||||
"\n" +
|
||||
"\n" +
|
||||
"\n" +
|
||||
"TEST TABLE OF CONTENTS\n" +
|
||||
"\n" +
|
||||
"Heading paragraph in next page\t2\n" +
|
||||
"Another heading paragraph in further page\t3\n" +
|
||||
"Another heading paragraph in further page\t3\n" +
|
||||
"\n" +
|
||||
"\n" +
|
||||
"Heading paragraph in next page\n" +
|
||||
"Another heading paragraph in further page\n" +
|
||||
"\n" +
|
||||
"\n" +
|
||||
"\n" +
|
||||
"Page 3 of 3", extractor1.getText() );
|
||||
}
|
||||
|
||||
/**
|
||||
* Bug 45473 - HWPF cannot read file after save
|
||||
*/
|
||||
@ -640,19 +670,20 @@ public class TestBugs extends TestCase
|
||||
hwpfDocument.write( new ByteArrayOutputStream() );
|
||||
}
|
||||
|
||||
|
||||
|
||||
/**
|
||||
* Bug 51678 - Extracting text from Bug51524.zip is slow
|
||||
* Bug 51524 - PapBinTable constructor is slow
|
||||
* Bug 51678 - Extracting text from Bug51524.zip is slow Bug 51524 -
|
||||
* PapBinTable constructor is slow
|
||||
*/
|
||||
public void test51678And51524()
|
||||
{
|
||||
// YK: the test will run only if the poi.test.remote system property is set.
|
||||
// YK: the test will run only if the poi.test.remote system property is
|
||||
// set.
|
||||
// TODO: refactor into something nicer!
|
||||
if(System.getProperty("poi.test.remote") != null) {
|
||||
if ( System.getProperty( "poi.test.remote" ) != null )
|
||||
{
|
||||
String href = "http://domex.nps.edu/corp/files/govdocs1/007/007488.doc";
|
||||
HWPFDocument hwpfDocument = HWPFTestDataSamples.openRemoteFile( href );
|
||||
HWPFDocument hwpfDocument = HWPFTestDataSamples
|
||||
.openRemoteFile( href );
|
||||
|
||||
WordExtractor wordExtractor = new WordExtractor( hwpfDocument );
|
||||
wordExtractor.getText();
|
||||
|
Loading…
Reference in New Issue
Block a user