rewrite table bounds detection for Word 97, including inner table support

git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@1143070 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Sergey Vladimirov 2011-07-05 14:13:27 +00:00
parent fa46737e44
commit 8ac3172759
9 changed files with 240 additions and 175 deletions

View File

@ -241,6 +241,14 @@ public abstract class AbstractWordConverter
if ( paragraph.isInTable()
&& paragraph.getTableLevel() != currentTableLevel )
{
if ( paragraph.getTableLevel() < currentTableLevel )
throw new IllegalStateException(
"Trying to process table cell with higher level ("
+ paragraph.getTableLevel()
+ ") than current table level ("
+ currentTableLevel
+ ") as inner table part" );
Table table = range.getTable( paragraph );
processTable( wordDocument, flow, table );

View File

@ -84,9 +84,9 @@ public class Paragraph extends Range implements Cloneable {
protected ParagraphProperties _props;
protected SprmBuffer _papx;
protected Paragraph(int startIdx, int endIdx, Table parent)
protected Paragraph(int startIdxInclusive, int endIdxExclusive, Table parent)
{
super(startIdx, endIdx, Range.TYPE_PARAGRAPH, parent);
super(startIdxInclusive, endIdxExclusive, Range.TYPE_PARAGRAPH, parent);
PAPX papx = _paragraphs.get(_parEnd - 1);
_props = papx.getParagraphProperties(_doc.getStyleSheet());
_papx = papx.getSprmBuf();

View File

@ -90,10 +90,10 @@ public class Range { // TODO -instantiable superclass
/** All paragraphs that belong to the document this Range belongs to. */
protected List<PAPX> _paragraphs;
/** The start index in the paragraphs list for this Range */
/** The start index in the paragraphs list for this Range, inclusive */
protected int _parStart;
/** The end index in the paragraphs list for this Range. */
/** The end index in the paragraphs list for this Range, exclusive */
protected int _parEnd;
/** Have we loaded the characterRun indexes yet. */
@ -178,9 +178,9 @@ public class Range { // TODO -instantiable superclass
* lists.
*
* @param startIdx
* The starting index in the list.
* The starting index in the list, inclusive
* @param endIdx
* The ending index in the list.
* The ending index in the list, exclusive
* @param idxType
* The list type.
* @param parent
@ -199,27 +199,27 @@ public class Range { // TODO -instantiable superclass
_parStart = parent._parStart + startIdx;
_parEnd = parent._parStart + endIdx;
_start = _paragraphs.get(_parStart).getStart();
_end = _paragraphs.get(_parEnd).getEnd();
_end = _paragraphs.get(_parEnd - 1).getEnd();
_parRangeFound = true;
break;
case TYPE_CHARACTER:
_charStart = parent._charStart + startIdx;
_charEnd = parent._charStart + endIdx;
_start = _characters.get(_charStart).getStart();
_start = _characters.get(_charStart - 1).getStart();
_end = _characters.get(_charEnd).getEnd();
_charRangeFound = true;
break;
case TYPE_SECTION:
_sectionStart = parent._sectionStart + startIdx;
_sectionEnd = parent._sectionStart + endIdx;
_start = _sections.get(_sectionStart).getStart();
_start = _sections.get(_sectionStart - 1).getStart();
_end = _sections.get(_sectionEnd).getEnd();
_sectionRangeFound = true;
break;
case TYPE_TEXT:
_textStart = parent._textStart + startIdx;
_textEnd = parent._textStart + endIdx;
_start = _text.get(_textStart).getStart();
_start = _text.get(_textStart - 1).getStart();
_end = _text.get(_textEnd).getEnd();
_textRangeFound = true;
break;
@ -832,7 +832,13 @@ public class Range { // TODO -instantiable superclass
*/
public Paragraph getParagraph(int index) {
initParagraphs();
initParagraphs();
if ( index + _parStart >= _parEnd )
throw new IndexOutOfBoundsException( "Paragraph #" + index + " ("
+ (index + _parStart) + ") not in range [" + _parStart
+ "; " + _parEnd + ")" );
PAPX papx = _paragraphs.get(index + _parStart);
ParagraphProperties props = papx.getParagraphProperties(_doc.getStyleSheet());
@ -880,7 +886,7 @@ public class Range { // TODO -instantiable superclass
r.initAll();
int tableLevel = paragraph.getTableLevel();
int tableEnd = r._parEnd;
int tableEndInclusive = r._parEnd ;
if ( r._parStart != 0 )
{
@ -895,25 +901,31 @@ public class Range { // TODO -instantiable superclass
}
}
final Range overallrange = getDocument() instanceof HWPFDocument ? ((HWPFDocument) getDocument())
.getOverallRange() : getDocument().getRange();
int limit = _paragraphs.size();
for ( ; tableEnd < limit; tableEnd++ )
for ( ; tableEndInclusive < limit - 1; tableEndInclusive++ )
{
Paragraph next = new Paragraph( _paragraphs.get( tableEnd ), this );
Paragraph next = new Paragraph( _paragraphs.get( tableEndInclusive + 1 ),
overallrange );
if ( !next.isInTable() || next.getTableLevel() < tableLevel )
break;
}
initAll();
if (tableEnd > _parEnd) {
throw new ArrayIndexOutOfBoundsException(
"The table's bounds fall outside of this Range");
}
if (tableEnd < 0) {
throw new ArrayIndexOutOfBoundsException(
"The table's end is negative, which isn't allowed!");
}
return new Table(r._parStart, tableEnd, r._doc.getRange(), paragraph.getTableLevel());
}
initAll();
if ( tableEndInclusive + 1 > _parEnd )
{
throw new ArrayIndexOutOfBoundsException(
"The table's bounds fall outside of this Range" );
}
if ( tableEndInclusive < 0 )
{
throw new ArrayIndexOutOfBoundsException(
"The table's end is negative, which isn't allowed!" );
}
return new Table( r._parStart, tableEndInclusive + 1, r._doc.getRange(),
paragraph.getTableLevel() );
}
/**
* loads all of the list indexes.
@ -989,7 +1001,11 @@ public class Range { // TODO -instantiable superclass
*/
private int[] findRange(List<? extends PropertyNode> rpl, int min, int start, int end) {
int x = min;
PropertyNode node = rpl.get(x);
if ( rpl.size() == min )
return new int[] { min, min };
PropertyNode node = rpl.get( x );
while (node==null || (node.getEnd() <= start && x < rpl.size() - 1)) {
x++;

View File

@ -25,9 +25,9 @@ public final class Table extends Range
private int _tableLevel;
Table( int startIdx, int endIdx, Range parent, int levelNum )
Table( int startIdxInclusive, int endIdxExclusive, Range parent, int levelNum )
{
super( startIdx, endIdx, Range.TYPE_PARAGRAPH, parent );
super( startIdxInclusive, endIdxExclusive, Range.TYPE_PARAGRAPH, parent );
_rows = new ArrayList<TableRow>();
_tableLevel = levelNum;
@ -41,8 +41,8 @@ public final class Table extends Range
rowEnd++;
if ( p.isTableRowEnd() && p.getTableLevel() == levelNum )
{
_rows.add( new TableRow( rowStart, rowEnd, this, levelNum ) );
rowStart = rowEnd;
_rows.add( new TableRow( rowStart, rowEnd + 1, this, levelNum ) );
rowStart = rowEnd + 1;
}
}
}

View File

@ -25,14 +25,16 @@ public final class TableCell
private int _leftEdge;
private int _width;
public TableCell(int startIdx, int endIdx, TableRow parent, int levelNum, TableCellDescriptor tcd, int leftEdge, int width)
{
super(startIdx, endIdx, Range.TYPE_PARAGRAPH, parent);
_tcd = tcd;
_leftEdge = leftEdge;
_width = width;
_levelNum = levelNum;
}
public TableCell( int startIdxInclusive, int endIdxExclusive,
TableRow parent, int levelNum, TableCellDescriptor tcd,
int leftEdge, int width )
{
super( startIdxInclusive, endIdxExclusive, Range.TYPE_PARAGRAPH, parent );
_tcd = tcd;
_leftEdge = leftEdge;
_width = width;
_levelNum = levelNum;
}
public boolean isFirstMerged()
{

View File

@ -17,155 +17,198 @@
package org.apache.poi.hwpf.usermodel;
import java.util.ArrayList;
import java.util.List;
import org.apache.poi.hwpf.sprm.TableSprmUncompressor;
import org.apache.poi.util.POILogFactory;
import org.apache.poi.util.POILogger;
public final class TableRow
extends Paragraph
public final class TableRow extends Paragraph
{
private final static char TABLE_CELL_MARK = '\u0007';
private final static char TABLE_CELL_MARK = '\u0007';
private final static short SPRM_TJC = 0x5400;
private final static short SPRM_DXAGAPHALF = (short)0x9602;
private final static short SPRM_FCANTSPLIT = 0x3403;
private final static short SPRM_FTABLEHEADER = 0x3404;
private final static short SPRM_DYAROWHEIGHT = (short)0x9407;
private final static short SPRM_TJC = 0x5400;
private final static short SPRM_DXAGAPHALF = (short) 0x9602;
private final static short SPRM_FCANTSPLIT = 0x3403;
private final static short SPRM_FTABLEHEADER = 0x3404;
private final static short SPRM_DYAROWHEIGHT = (short) 0x9407;
int _levelNum;
private TableProperties _tprops;
private TableCell[] _cells;
private static final POILogger logger = POILogFactory
.getLogger( TableRow.class );
public TableRow(int startIdx, int endIdx, Table parent, int levelNum)
{
super(startIdx, endIdx, parent);
int _levelNum;
private TableProperties _tprops;
private TableCell[] _cells;
_tprops = TableSprmUncompressor.uncompressTAP(_papx.toByteArray(), 2);
_levelNum = levelNum;
_cells = new TableCell[_tprops.getItcMac()];
int start = 0;
int end = 0;
for (int cellIndex = 0; cellIndex < _cells.length; cellIndex++)
public TableRow( int startIdxInclusive, int endIdxExclusive, Table parent,
int levelNum )
{
Paragraph p = getParagraph(start);
String s = p.text();
super( startIdxInclusive, endIdxExclusive, parent );
while (! ( s.length() > 0 && (s.charAt(s.length() - 1) == TABLE_CELL_MARK) ||
p.isEmbeddedCellMark() && p.getTableLevel() == levelNum) && end < endIdx)
{
end++;
p = getParagraph(end);
s = p.text();
}
_tprops = TableSprmUncompressor.uncompressTAP( _papx.toByteArray(), 2 );
_levelNum = levelNum;
final short expectedCellsCount = _tprops.getItcMac();
// Create it for the correct paragraph range
_cells[cellIndex] = new TableCell(start, end, this, levelNum,
_tprops.getRgtc()[cellIndex],
_tprops.getRgdxaCenter()[cellIndex],
_tprops.getRgdxaCenter()[cellIndex+1]-_tprops.getRgdxaCenter()[cellIndex]);
// Now we've decided where everything is, tweak the
// record of the paragraph end so that the
// paragraph level counts work
// This is a bit hacky, we really need a better fix...
_cells[cellIndex]._parEnd++;
int lastCellStart = 0;
List<TableCell> cells = new ArrayList<TableCell>( expectedCellsCount );
for ( int p = 0; p < (endIdxExclusive - startIdxInclusive); p++ )
{
Paragraph paragraph = getParagraph( p );
String s = paragraph.text();
// Next!
end++;
start = end;
if ( s.length() > 0
&& s.charAt( s.length() - 1 ) == TABLE_CELL_MARK
&& paragraph.getTableLevel() == levelNum )
{
TableCellDescriptor tableCellDescriptor = _tprops.getRgtc() != null
&& _tprops.getRgtc().length > cells.size() ? _tprops
.getRgtc()[cells.size()] : new TableCellDescriptor();
final short leftEdge = _tprops.getRgdxaCenter() != null
&& _tprops.getRgdxaCenter().length > cells.size() ? _tprops
.getRgdxaCenter()[cells.size()] : 0;
final short rightEdge = _tprops.getRgdxaCenter() != null
&& _tprops.getRgdxaCenter().length > cells.size() + 1 ? _tprops
.getRgdxaCenter()[cells.size() + 1] : 0;
TableCell tableCell = new TableCell( lastCellStart, p + 1,
this, levelNum, tableCellDescriptor, leftEdge,
rightEdge - leftEdge );
cells.add( tableCell );
lastCellStart = p + 1;
}
}
if ( lastCellStart < (endIdxExclusive - startIdxInclusive - 1) )
{
TableCellDescriptor tableCellDescriptor = _tprops.getRgtc() != null
&& _tprops.getRgtc().length > cells.size() ? _tprops
.getRgtc()[cells.size()] : new TableCellDescriptor();
final short leftEdge = _tprops.getRgdxaCenter() != null
&& _tprops.getRgdxaCenter().length > cells.size() ? _tprops
.getRgdxaCenter()[cells.size()] : 0;
final short rightEdge = _tprops.getRgdxaCenter() != null
&& _tprops.getRgdxaCenter().length > cells.size() + 1 ? _tprops
.getRgdxaCenter()[cells.size() + 1] : 0;
TableCell tableCell = new TableCell( lastCellStart,
(endIdxExclusive - startIdxInclusive - 1), this, levelNum,
tableCellDescriptor, leftEdge, rightEdge - leftEdge );
cells.add( tableCell );
}
if ( cells.size() != expectedCellsCount )
{
logger.log( POILogger.WARN,
"Number of found table cells (" + cells.size()
+ ") for table row [" + getStartOffset() + "c; "
+ getEndOffset()
+ "c] not equals to stored property value "
+ expectedCellsCount );
_tprops.setItcMac( (short) cells.size() );
}
_cells = cells.toArray( new TableCell[cells.size()] );
}
}
public int getRowJustification()
{
return _tprops.getJc();
}
public int getRowJustification()
{
return _tprops.getJc();
}
public void setRowJustification(int jc)
{
_tprops.setJc((short) jc);
_papx.updateSprm(SPRM_TJC, (short)jc);
}
public void setRowJustification( int jc )
{
_tprops.setJc( (short) jc );
_papx.updateSprm( SPRM_TJC, (short) jc );
}
public int getGapHalf()
{
return _tprops.getDxaGapHalf();
}
public int getGapHalf()
{
return _tprops.getDxaGapHalf();
}
public void setGapHalf(int dxaGapHalf)
{
_tprops.setDxaGapHalf(dxaGapHalf);
_papx.updateSprm(SPRM_DXAGAPHALF, (short)dxaGapHalf);
}
public void setGapHalf( int dxaGapHalf )
{
_tprops.setDxaGapHalf( dxaGapHalf );
_papx.updateSprm( SPRM_DXAGAPHALF, (short) dxaGapHalf );
}
public int getRowHeight()
{
return _tprops.getDyaRowHeight();
}
public int getRowHeight()
{
return _tprops.getDyaRowHeight();
}
public void setRowHeight(int dyaRowHeight)
{
_tprops.setDyaRowHeight(dyaRowHeight);
_papx.updateSprm(SPRM_DYAROWHEIGHT, (short)dyaRowHeight);
}
public void setRowHeight( int dyaRowHeight )
{
_tprops.setDyaRowHeight( dyaRowHeight );
_papx.updateSprm( SPRM_DYAROWHEIGHT, (short) dyaRowHeight );
}
public boolean cantSplit()
{
return _tprops.getFCantSplit();
}
public boolean cantSplit()
{
return _tprops.getFCantSplit();
}
public void setCantSplit(boolean cantSplit)
{
_tprops.setFCantSplit(cantSplit);
_papx.updateSprm(SPRM_FCANTSPLIT, (byte)(cantSplit ? 1 : 0));
}
public void setCantSplit( boolean cantSplit )
{
_tprops.setFCantSplit( cantSplit );
_papx.updateSprm( SPRM_FCANTSPLIT, (byte) (cantSplit ? 1 : 0) );
}
public boolean isTableHeader()
{
return _tprops.getFTableHeader();
}
public boolean isTableHeader()
{
return _tprops.getFTableHeader();
}
public void setTableHeader(boolean tableHeader)
{
_tprops.setFTableHeader(tableHeader);
_papx.updateSprm(SPRM_FTABLEHEADER, (byte)(tableHeader ? 1 : 0));
}
public void setTableHeader( boolean tableHeader )
{
_tprops.setFTableHeader( tableHeader );
_papx.updateSprm( SPRM_FTABLEHEADER, (byte) (tableHeader ? 1 : 0) );
}
public int numCells()
{
return _cells.length;
}
public int numCells()
{
return _cells.length;
}
public TableCell getCell(int index)
{
return _cells[index];
}
public TableCell getCell( int index )
{
return _cells[index];
}
public BorderCode getTopBorder() {
return _tprops.getBrcBottom();
}
public BorderCode getTopBorder()
{
return _tprops.getBrcBottom();
}
public BorderCode getBottomBorder() {
return _tprops.getBrcBottom();
}
public BorderCode getBottomBorder()
{
return _tprops.getBrcBottom();
}
public BorderCode getLeftBorder() {
return _tprops.getBrcLeft();
}
public BorderCode getLeftBorder()
{
return _tprops.getBrcLeft();
}
public BorderCode getRightBorder() {
return _tprops.getBrcRight();
}
public BorderCode getRightBorder()
{
return _tprops.getBrcRight();
}
public BorderCode getHorizontalBorder() {
return _tprops.getBrcHorizontal();
}
public BorderCode getHorizontalBorder()
{
return _tprops.getBrcHorizontal();
}
public BorderCode getVerticalBorder() {
return _tprops.getBrcVertical();
}
public BorderCode getVerticalBorder()
{
return _tprops.getBrcVertical();
}
public BorderCode getBarBorder() {
throw new UnsupportedOperationException("not applicable for TableRow");
}
public BorderCode getBarBorder()
{
throw new UnsupportedOperationException( "not applicable for TableRow" );
}
}

View File

@ -40,7 +40,8 @@ public class TestWordToConverterSuite
/**
* YK: a quick hack to exclude failing documents from the suite.
*/
private static List<String> failingFiles = Arrays.asList();
private static List<String> failingFiles = Arrays
.asList( "ProblemExtracting.doc" );
public static Test suite()
{

View File

@ -68,11 +68,6 @@ public class TestWordToHtmlConverter extends TestCase
assertTrue( result.substring( 0, 2000 ).contains( "<table>" ) );
}
public void testBug33519() throws Exception
{
getHtmlText( "Bug33519.doc" );
}
public void testBug46610_2() throws Exception
{
String result = getHtmlText( "Bug46610_2.doc" );

View File

@ -79,14 +79,14 @@ public final class TestRange extends TestCase
assertEquals( range.getStartOffset(), 0 );
assertEquals( range.getEndOffset(), 766 );
Paragraph lastInMainRange = range.getParagraph( range.numParagraphs() );
Paragraph lastInMainRange = range.getParagraph( range.numParagraphs() - 1);
assertTrue( lastInMainRange.getEndOffset() <= 766 );
Section section = range.getSection( 0 );
assertTrue( section.getEndOffset() <= 766 );
Paragraph lastInMainSection = section.getParagraph( section
.numParagraphs() );
.numParagraphs() - 1);
assertTrue( lastInMainSection.getEndOffset() <= 766 );
}
}