add TOC support

git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@1189145 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Sergey Vladimirov 2011-10-26 11:52:59 +00:00
parent 5badcee102
commit 5cba03af8e
1 changed files with 127 additions and 31 deletions

View File

@ -78,6 +78,13 @@ public abstract class AbstractWordConverter
this.structure = field;
}
Structure( int start, int end )
{
this.start = start;
this.end = end;
this.structure = null;
}
public int compareTo( Structure o )
{
return start < o.start ? -1 : start == o.start ? 0 : 1;
@ -102,6 +109,15 @@ public abstract class AbstractWordConverter
private static final POILogger logger = POILogFactory
.getLogger( AbstractWordConverter.class );
private static final Pattern PATTERN_HYPERLINK_EXTERNAL = Pattern
.compile( "^[ \\t\\r\\n]*HYPERLINK \"(.*)\".*$" );
private static final Pattern PATTERN_HYPERLINK_LOCAL = Pattern
.compile( "^[ \\t\\r\\n]*HYPERLINK \\\\l \"(.*)\"[ ](.*)$" );
private static final Pattern PATTERN_PAGEREF = Pattern
.compile( "^[ \\t\\r\\n]*PAGEREF ([^ ]*)[ \\t\\r\\n]*\\\\h.*$" );
private static final byte SPECCHAR_AUTONUMBERED_FOOTNOTE_REFERENCE = 2;
private static final byte SPECCHAR_DRAWN_OBJECT = 8;
@ -291,17 +307,38 @@ public abstract class AbstractWordConverter
}
// TODO: dead fields?
int skipUntil = -1;
for ( int c = 0; c < range.numCharacterRuns(); c++ )
{
CharacterRun characterRun = range.getCharacterRun( c );
if ( characterRun == null )
throw new AssertionError();
if ( characterRun.getStartOffset() < skipUntil )
continue;
String text = characterRun.text();
if ( text == null || text.length() == 0
|| text.charAt( 0 ) != FIELD_BEGIN_MARK )
continue;
Field aliveField = ( (HWPFDocument) wordDocument ).getFields()
.getFieldByStartOffset( FieldsDocumentPart.MAIN,
characterRun.getStartOffset() );
if ( aliveField != null )
{
addToStructures( structures, new Structure( aliveField ) );
skipUntil = aliveField.getFieldEndOffset() + 1;
}
else
{
int[] separatorEnd = tryDeadField_lookupFieldSeparatorEnd(
wordDocument, range, c );
if ( separatorEnd != null )
{
addToStructures( structures, new Structure(
characterRun.getStartOffset(),
separatorEnd[1] + 1 ) );
c = separatorEnd[1];
}
}
}
}
@ -562,6 +599,39 @@ public abstract class AbstractWordConverter
Element currentBlock, Range range, int currentTableLevel,
int beginMark, int separatorMark, int endMark )
{
if ( beginMark + 1 < separatorMark && separatorMark + 1 < endMark )
{
Range formulaRange = new Range( range.getCharacterRun(
beginMark + 1 ).getStartOffset(), range.getCharacterRun(
separatorMark - 1 ).getEndOffset(), range )
{
@Override
public String toString()
{
return "Dead field formula subrange: " + super.toString();
}
};
Range valueRange = new Range( range.getCharacterRun(
separatorMark + 1 ).getStartOffset(), range
.getCharacterRun( endMark - 1 ).getEndOffset(), range )
{
@Override
public String toString()
{
return "Dead field value subrange: " + super.toString();
}
};
String formula = formulaRange.text();
final Matcher matcher = PATTERN_HYPERLINK_LOCAL.matcher( formula );
if ( matcher.matches() )
{
String localref = matcher.group( 1 );
processPageref( wordDocument, currentBlock, valueRange,
currentTableLevel, localref );
return;
}
}
StringBuilder debug = new StringBuilder( "Unsupported field type: \n" );
for ( int i = beginMark; i <= endMark; i++ )
{
@ -706,9 +776,7 @@ public abstract class AbstractWordConverter
if ( firstSubrange != null )
{
String formula = firstSubrange.text();
Pattern pagerefPattern = Pattern
.compile( "[ \\t\\r\\n]*PAGEREF ([^ ]*)[ \\t\\r\\n]*\\\\h[ \\t\\r\\n]*" );
Matcher matcher = pagerefPattern.matcher( formula );
Matcher matcher = PATTERN_PAGEREF.matcher( formula );
if ( matcher.find() )
{
String pageref = matcher.group( 1 );
@ -756,10 +824,8 @@ public abstract class AbstractWordConverter
if ( firstSubrange != null )
{
String formula = firstSubrange.text();
Pattern hyperlinkPattern = Pattern
.compile( "[ \\t\\r\\n]*HYPERLINK \"(.*)\"[ \\t\\r\\n]*" );
Matcher matcher = hyperlinkPattern.matcher( formula );
if ( matcher.find() )
Matcher matcher = PATTERN_HYPERLINK_EXTERNAL.matcher( formula );
if ( matcher.matches() )
{
String hyperlink = matcher.group( 1 );
processHyperlink( wordDocument, currentBlock,
@ -767,6 +833,30 @@ public abstract class AbstractWordConverter
currentTableLevel, hyperlink );
return;
}
matcher.usePattern( PATTERN_HYPERLINK_LOCAL );
if ( matcher.matches() )
{
String hyperlink = matcher.group( 1 );
Range textRange = null;
String text = matcher.group( 2 );
if ( AbstractWordUtils.isNotEmpty( text ) )
{
textRange = new Range( firstSubrange.getStartOffset()
+ matcher.start( 2 ),
firstSubrange.getStartOffset()
+ matcher.end( 2 ), firstSubrange )
{
@Override
public String toString()
{
return "Local hyperlink text";
}
};
}
processPageref( wordDocument, currentBlock, textRange,
currentTableLevel, hyperlink );
return;
}
}
break;
}
@ -817,13 +907,13 @@ public abstract class AbstractWordConverter
}
protected abstract void processImage( Element currentBlock,
boolean inlined, Picture picture, String url );
@Internal
protected abstract void processImageWithoutPicturesManager(
Element currentBlock, boolean inlined, Picture picture );
protected abstract void processImage( Element currentBlock,
boolean inlined, Picture picture, String url );
protected abstract void processLineBreak( Element block,
CharacterRun characterRun );
@ -1021,6 +1111,20 @@ public abstract class AbstractWordConverter
protected int tryDeadField( HWPFDocumentCore wordDocument, Range range,
int currentTableLevel, int beginMark, Element currentBlock )
{
int[] separatorEnd = tryDeadField_lookupFieldSeparatorEnd(
wordDocument, range, beginMark );
if ( separatorEnd == null )
return beginMark;
processDeadField( wordDocument, currentBlock, range, currentTableLevel,
beginMark, separatorEnd[0], separatorEnd[1] );
return separatorEnd[1];
}
private int[] tryDeadField_lookupFieldSeparatorEnd(
HWPFDocumentCore wordDocument, Range range, int beginMark )
{
int separatorMark = -1;
int endMark = -1;
@ -1032,28 +1136,24 @@ public abstract class AbstractWordConverter
if ( text.getBytes().length == 0 )
continue;
if ( text.getBytes()[0] == FIELD_BEGIN_MARK )
final byte firstByte = text.getBytes()[0];
if ( firstByte == FIELD_BEGIN_MARK )
{
// nested?
Field possibleField = processDeadField( wordDocument, range,
currentTableLevel, characterRun.getStartOffset(),
currentBlock );
if ( possibleField != null )
int[] nested = tryDeadField_lookupFieldSeparatorEnd(
wordDocument, range, c );
if ( nested != null )
{
c = possibleField.getFieldEndOffset();
}
else
{
continue;
c = nested[1];
}
continue;
}
if ( text.getBytes()[0] == FIELD_SEPARATOR_MARK )
if ( firstByte == FIELD_SEPARATOR_MARK )
{
if ( separatorMark != -1 )
{
// double;
return beginMark;
// double; incorrect format
return null;
}
separatorMark = c;
@ -1065,22 +1165,18 @@ public abstract class AbstractWordConverter
if ( endMark != -1 )
{
// double;
return beginMark;
return null;
}
endMark = c;
break;
}
}
if ( separatorMark == -1 || endMark == -1 )
return beginMark;
return null;
processDeadField( wordDocument, currentBlock, range, currentTableLevel,
beginMark, separatorMark, endMark );
return endMark;
return new int[] { separatorMark, endMark };
}
}