Push boundary checks down, remove deprecation warnings, and remove the (unused) cpMin (Word XP) hack

git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@1145075 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Sergey Vladimirov 2011-07-11 08:57:42 +00:00
parent 55c6850928
commit 26c1fa750d
10 changed files with 82 additions and 57 deletions

View File

@ -214,15 +214,10 @@ public final class HWPFDocument extends HWPFDocumentCore
_cft = new ComplexFileTable(_mainStream, _tableStream, _fib.getFcClx(), fcMin); _cft = new ComplexFileTable(_mainStream, _tableStream, _fib.getFcClx(), fcMin);
_tpt = _cft.getTextPieceTable(); _tpt = _cft.getTextPieceTable();
// Word XP and later all put in a zero filled buffer in
// front of the text. This screws up the system for offsets,
// which assume we always start at zero. This is an adjustment.
int cpMin = _tpt.getCpMin();
// Now load the rest of the properties, which need to be adjusted // Now load the rest of the properties, which need to be adjusted
// for where text really begins // for where text really begins
_cbt = new CHPBinTable(_mainStream, _tableStream, _fib.getFcPlcfbteChpx(), _fib.getLcbPlcfbteChpx(), _tpt, true); _cbt = new CHPBinTable(_mainStream, _tableStream, _fib.getFcPlcfbteChpx(), _fib.getLcbPlcfbteChpx(), _tpt, true);
_pbt = new PAPBinTable(_mainStream, _tableStream, _dataStream, _fib.getFcPlcfbtePapx(), _fib.getLcbPlcfbtePapx(), cpMin, _tpt, true); _pbt = new PAPBinTable(_mainStream, _tableStream, _dataStream, _fib.getFcPlcfbtePapx(), _fib.getLcbPlcfbtePapx(), _tpt, true);
// Read FSPA and Escher information // Read FSPA and Escher information
_fspa = new FSPATable(_tableStream, _fib.getFcPlcspaMom(), _fib.getLcbPlcspaMom(), getTextTable().getTextPieces()); _fspa = new FSPATable(_tableStream, _fib.getFcPlcspaMom(), _fib.getLcbPlcspaMom(), getTextTable().getTextPieces());

View File

@ -43,6 +43,8 @@ public class HtmlDocumentFacade
html.appendChild( head ); html.appendChild( head );
html.appendChild( body ); html.appendChild( body );
body.setAttribute( "style", "white-space-collapsing: preserve; " );
} }
public void addAuthor( String value ) public void addAuthor( String value )

View File

@ -172,7 +172,8 @@ public class WordToHtmlUtils extends AbstractWordUtils
style.append( "break-before: page; " ); style.append( "break-before: page; " );
} }
style.append( "hyphenate: " + paragraph.isAutoHyphenated() + "; " ); style.append( "hyphenate: "
+ ( paragraph.isAutoHyphenated() ? "auto" : "none" ) + "; " );
if ( paragraph.keepOnPage() ) if ( paragraph.keepOnPage() )
{ {
@ -183,9 +184,6 @@ public class WordToHtmlUtils extends AbstractWordUtils
{ {
style.append( "keep-with-next.within-page: always; " ); style.append( "keep-with-next.within-page: always; " );
} }
style.append( "linefeed-treatment: preserve; " );
style.append( "white-space-collapse: false; " );
} }
public static void addTableCellProperties( TableRow tableRow, public static void addTableCellProperties( TableRow tableRow,

View File

@ -21,6 +21,8 @@ import java.util.ArrayList;
import java.util.List; import java.util.List;
import org.apache.poi.util.LittleEndian; import org.apache.poi.util.LittleEndian;
import org.apache.poi.util.POILogFactory;
import org.apache.poi.util.POILogger;
/** /**
* Represents a CHP fkp. The style properties for paragraph and character runs * Represents a CHP fkp. The style properties for paragraph and character runs
@ -40,6 +42,9 @@ import org.apache.poi.util.LittleEndian;
*/ */
public final class CHPFormattedDiskPage extends FormattedDiskPage public final class CHPFormattedDiskPage extends FormattedDiskPage
{ {
private static final POILogger logger = POILogFactory
.getLogger( CHPFormattedDiskPage.class );
private static final int FC_SIZE = 4; private static final int FC_SIZE = 4;
private ArrayList<CHPX> _chpxList = new ArrayList<CHPX>(); private ArrayList<CHPX> _chpxList = new ArrayList<CHPX>();
@ -79,11 +84,20 @@ public final class CHPFormattedDiskPage extends FormattedDiskPage
int startAt = getStart(x); int startAt = getStart(x);
int endAt = getEnd(x); int endAt = getEnd(x);
if (ignoreChpxWithoutTextPieces && !tpt.isIndexInTable( startAt, endAt ) ) { if (!ignoreChpxWithoutTextPieces || tpt.isIndexInTable( startAt, endAt ) )
_chpxList.add(null); {
} else {
_chpxList.add(new CHPX(startAt, endAt, tpt, getGrpprl(x))); _chpxList.add(new CHPX(startAt, endAt, tpt, getGrpprl(x)));
} }
else
{
logger.log( POILogger.WARN, "CHPX [",
Integer.valueOf( startAt ), "; ",
Integer.valueOf( endAt ),
") (bytes) doesn't have corresponding text pieces "
+ "and will be skipped" );
_chpxList.add(null);
}
} }
} }

View File

@ -21,8 +21,6 @@ import java.util.Collections;
import org.apache.poi.poifs.common.POIFSConstants; import org.apache.poi.poifs.common.POIFSConstants;
import org.apache.poi.util.LittleEndian; import org.apache.poi.util.LittleEndian;
import org.apache.poi.util.POILogFactory;
import org.apache.poi.util.POILogger;
/** /**
* This class holds all of the character formatting * This class holds all of the character formatting
@ -34,9 +32,6 @@ import org.apache.poi.util.POILogger;
*/ */
public final class OldCHPBinTable extends CHPBinTable public final class OldCHPBinTable extends CHPBinTable
{ {
private static final POILogger logger = POILogFactory
.getLogger( OldCHPBinTable.class );
/** /**
* Constructor used to read an old-style binTable * Constructor used to read an old-style binTable
* in from a Word document. * in from a Word document.
@ -67,15 +62,8 @@ public final class OldCHPBinTable extends CHPBinTable
for (int y = 0; y < fkpSize; y++) for (int y = 0; y < fkpSize; y++)
{ {
CHPX chpx = cfkp.getCHPX(y); CHPX chpx = cfkp.getCHPX(y);
if (chpx != null && tpt.isIndexInTable( chpx.getStartBytes(), chpx.getEndBytes() )) {
_textRuns.add(chpx);
} else {
if (chpx != null) if (chpx != null)
logger.log( POILogger.WARN, "CHPX [", _textRuns.add(chpx);
chpx.getStartBytes(), "; ", chpx.getEndBytes(),
") (bytes) doesn't have corresponding text pieces "
+ "and will be skipped" );
}
} }
} }
Collections.sort( _textRuns, PropertyNode.StartComparator.instance ); Collections.sort( _textRuns, PropertyNode.StartComparator.instance );

View File

@ -21,8 +21,6 @@ import java.util.Collections;
import org.apache.poi.poifs.common.POIFSConstants; import org.apache.poi.poifs.common.POIFSConstants;
import org.apache.poi.util.LittleEndian; import org.apache.poi.util.LittleEndian;
import org.apache.poi.util.POILogFactory;
import org.apache.poi.util.POILogger;
/** /**
* This class holds all of the paragraph formatting * This class holds all of the paragraph formatting
@ -34,8 +32,6 @@ import org.apache.poi.util.POILogger;
*/ */
public final class OldPAPBinTable extends PAPBinTable public final class OldPAPBinTable extends PAPBinTable
{ {
private static final POILogger logger = POILogFactory
.getLogger( OldPAPBinTable.class );
public OldPAPBinTable(byte[] documentStream, int offset, public OldPAPBinTable(byte[] documentStream, int offset,
int size, int fcMin, TextPieceTable tpt) int size, int fcMin, TextPieceTable tpt)
@ -51,21 +47,15 @@ public final class OldPAPBinTable extends PAPBinTable
int pageOffset = POIFSConstants.SMALLER_BIG_BLOCK_SIZE * pageNum; int pageOffset = POIFSConstants.SMALLER_BIG_BLOCK_SIZE * pageNum;
PAPFormattedDiskPage pfkp = new PAPFormattedDiskPage(documentStream, PAPFormattedDiskPage pfkp = new PAPFormattedDiskPage(documentStream,
documentStream, pageOffset, fcMin, tpt); documentStream, pageOffset, tpt, true);
int fkpSize = pfkp.size(); int fkpSize = pfkp.size();
for (int y = 0; y < fkpSize; y++) for (int y = 0; y < fkpSize; y++)
{ {
PAPX papx = pfkp.getPAPX(y); PAPX papx = pfkp.getPAPX(y);
if (papx != null && tpt.isIndexInTable( papx.getStartBytes(), papx.getEndBytes() )) { if (papx != null) {
_paragraphs.add(papx); _paragraphs.add(papx);
} else {
if ( papx != null )
logger.log( POILogger.WARN, "PAPX [",
papx.getStartBytes(), "; ", papx.getEndBytes(),
") (bytes) doesn't have corresponding text pieces "
+ "and will be skipped" );
} }
} }
} }

View File

@ -56,13 +56,12 @@ public class PAPBinTable
byte[] dataStream, int offset, int size, int fcMin, byte[] dataStream, int offset, int size, int fcMin,
TextPieceTable tpt ) TextPieceTable tpt )
{ {
this( documentStream, tableStream, dataStream, offset, size, fcMin, this( documentStream, tableStream, dataStream, offset, size, tpt, true );
tpt, true );
} }
public PAPBinTable( byte[] documentStream, byte[] tableStream, public PAPBinTable( byte[] documentStream, byte[] tableStream,
byte[] dataStream, int offset, int size, int fcMin, byte[] dataStream, int offset, int size, TextPieceTable tpt,
TextPieceTable tpt, boolean ignorePapxWithoutTextPieces ) boolean ignorePapxWithoutTextPieces )
{ {
PlexOfCps binTable = new PlexOfCps(tableStream, offset, size, 4); PlexOfCps binTable = new PlexOfCps(tableStream, offset, size, 4);
this.tpt = tpt; this.tpt = tpt;
@ -76,7 +75,7 @@ public class PAPBinTable
int pageOffset = POIFSConstants.SMALLER_BIG_BLOCK_SIZE * pageNum; int pageOffset = POIFSConstants.SMALLER_BIG_BLOCK_SIZE * pageNum;
PAPFormattedDiskPage pfkp = new PAPFormattedDiskPage(documentStream, PAPFormattedDiskPage pfkp = new PAPFormattedDiskPage(documentStream,
dataStream, pageOffset, fcMin, tpt); dataStream, pageOffset, tpt, ignorePapxWithoutTextPieces);
int fkpSize = pfkp.size(); int fkpSize = pfkp.size();
@ -84,8 +83,7 @@ public class PAPBinTable
{ {
PAPX papx = pfkp.getPAPX(y); PAPX papx = pfkp.getPAPX(y);
//we don't need PAPX if they are references nowhere if (papx != null)
if (!ignorePapxWithoutTextPieces || tpt.isIndexInTable( papx.getStartBytes(), papx.getEndBytes() ))
_paragraphs.add(papx); _paragraphs.add(papx);
} }
} }

View File

@ -22,6 +22,8 @@ import java.util.Arrays;
import java.util.List; import java.util.List;
import org.apache.poi.util.LittleEndian; import org.apache.poi.util.LittleEndian;
import org.apache.poi.util.POILogFactory;
import org.apache.poi.util.POILogger;
/** /**
* Represents a PAP FKP. The style properties for paragraph and character runs * Represents a PAP FKP. The style properties for paragraph and character runs
@ -40,6 +42,8 @@ import org.apache.poi.util.LittleEndian;
* @author Ryan Ackley * @author Ryan Ackley
*/ */
public final class PAPFormattedDiskPage extends FormattedDiskPage { public final class PAPFormattedDiskPage extends FormattedDiskPage {
private static final POILogger logger = POILogFactory
.getLogger( PAPFormattedDiskPage.class );
private static final int BX_SIZE = 13; private static final int BX_SIZE = 13;
private static final int FC_SIZE = 4; private static final int FC_SIZE = 4;
@ -56,14 +60,42 @@ public final class PAPFormattedDiskPage extends FormattedDiskPage {
/** /**
* Creates a PAPFormattedDiskPage from a 512 byte array * Creates a PAPFormattedDiskPage from a 512 byte array
*
* @deprecated Use
* {@link #PAPFormattedDiskPage(byte[],byte[],int,TextPieceTable,boolean)}
* instead
*/ */
public PAPFormattedDiskPage(byte[] documentStream, byte[] dataStream, int offset, int fcMin, TextPieceTable tpt) public PAPFormattedDiskPage( byte[] documentStream, byte[] dataStream,
int offset, int fcMin, TextPieceTable tpt )
{
this( documentStream, dataStream, offset, tpt, true );
}
/**
* Creates a PAPFormattedDiskPage from a 512 byte array
*/
public PAPFormattedDiskPage( byte[] documentStream, byte[] dataStream,
int offset, TextPieceTable tpt, boolean ignorePapxWithoutTextPieces )
{ {
super( documentStream, offset ); super( documentStream, offset );
for (int x = 0; x < _crun; x++) { for ( int x = 0; x < _crun; x++ )
{
int startAt = getStart( x ); int startAt = getStart( x );
int endAt = getEnd( x ); int endAt = getEnd( x );
_papxList.add(new PAPX(startAt, endAt, tpt, getGrpprl(x), getParagraphHeight(x), dataStream)); if ( !ignorePapxWithoutTextPieces
|| tpt.isIndexInTable( startAt, endAt ) )
_papxList.add( new PAPX( startAt, endAt, tpt, getGrpprl( x ),
getParagraphHeight( x ), dataStream ) );
else
{
logger.log( POILogger.WARN, "PAPX [",
Integer.valueOf( startAt ), "; ",
Integer.valueOf( endAt ),
") (bytes) doesn't have corresponding text pieces "
+ "and will be skipped" );
_papxList.add( null );
}
} }
_fkp = null; _fkp = null;
_dataStream = dataStream; _dataStream = dataStream;

View File

@ -26,6 +26,7 @@ import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult; import javax.xml.transform.stream.StreamResult;
import junit.framework.TestCase; import junit.framework.TestCase;
import org.apache.poi.POIDataSamples; import org.apache.poi.POIDataSamples;
import org.apache.poi.hwpf.HWPFDocument; import org.apache.poi.hwpf.HWPFDocument;
@ -68,6 +69,13 @@ public class TestWordToHtmlConverter extends TestCase
assertTrue( result.substring( 0, 2000 ).contains( "<table>" ) ); assertTrue( result.substring( 0, 2000 ).contains( "<table>" ) );
} }
/**
 * Regression check for bug 33519: loads {@code Bug33519.doc} through
 * {@code getHtmlText} and verifies that both expected Cyrillic phrases
 * are present in the returned text.
 */
public void testBug33519() throws Exception
{
    final String html = getHtmlText( "Bug33519.doc" );
    final String[] expectedPhrases = { "Планински турове", "Явор Асенов" };
    for ( String phrase : expectedPhrases )
    {
        assertTrue( html.contains( phrase ) );
    }
}
public void testBug46610_2() throws Exception public void testBug46610_2() throws Exception
{ {
String result = getHtmlText( "Bug46610_2.doc" ); String result = getHtmlText( "Bug46610_2.doc" );

View File

@ -21,6 +21,7 @@ import java.io.ByteArrayOutputStream;
import java.util.ArrayList; import java.util.ArrayList;
import junit.framework.TestCase; import junit.framework.TestCase;
import org.apache.poi.hwpf.HWPFDocFixture; import org.apache.poi.hwpf.HWPFDocFixture;
import org.apache.poi.hwpf.model.io.HWPFFileSystem; import org.apache.poi.hwpf.model.io.HWPFFileSystem;
@ -38,9 +39,8 @@ public final class TestPAPBinTable
FileInformationBlock fib = _hWPFDocFixture._fib; FileInformationBlock fib = _hWPFDocFixture._fib;
byte[] mainStream = _hWPFDocFixture._mainStream; byte[] mainStream = _hWPFDocFixture._mainStream;
byte[] tableStream = _hWPFDocFixture._tableStream; byte[] tableStream = _hWPFDocFixture._tableStream;
int fcMin = fib.getFcMin();
_pAPBinTable = new PAPBinTable(mainStream, tableStream, null, fib.getFcPlcfbtePapx(), fib.getLcbPlcfbtePapx(), fcMin, fakeTPT, false); _pAPBinTable = new PAPBinTable(mainStream, tableStream, null, fib.getFcPlcfbtePapx(), fib.getLcbPlcfbtePapx(), fakeTPT, false);
HWPFFileSystem fileSys = new HWPFFileSystem(); HWPFFileSystem fileSys = new HWPFFileSystem();
@ -51,7 +51,7 @@ public final class TestPAPBinTable
byte[] newTableStream = tableOut.toByteArray(); byte[] newTableStream = tableOut.toByteArray();
byte[] newMainStream = mainOut.toByteArray(); byte[] newMainStream = mainOut.toByteArray();
PAPBinTable newBinTable = new PAPBinTable(newMainStream, newTableStream, null,0, newTableStream.length, 0, fakeTPT, false); PAPBinTable newBinTable = new PAPBinTable(newMainStream, newTableStream, null,0, newTableStream.length, fakeTPT, false);
ArrayList oldTextRuns = _pAPBinTable.getParagraphs(); ArrayList oldTextRuns = _pAPBinTable.getParagraphs();
ArrayList newTextRuns = newBinTable.getParagraphs(); ArrayList newTextRuns = newBinTable.getParagraphs();