From 5cba03af8ed58747a361aa1cab7c8e9549041649 Mon Sep 17 00:00:00 2001 From: Sergey Vladimirov Date: Wed, 26 Oct 2011 11:52:59 +0000 Subject: [PATCH] add TOC support git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@1189145 13f79535-47bb-0310-9956-ffa450edef68 --- .../hwpf/converter/AbstractWordConverter.java | 158 ++++++++++++++---- 1 file changed, 127 insertions(+), 31 deletions(-) diff --git a/src/scratchpad/src/org/apache/poi/hwpf/converter/AbstractWordConverter.java b/src/scratchpad/src/org/apache/poi/hwpf/converter/AbstractWordConverter.java index 5d35b2024..af4c0be1f 100644 --- a/src/scratchpad/src/org/apache/poi/hwpf/converter/AbstractWordConverter.java +++ b/src/scratchpad/src/org/apache/poi/hwpf/converter/AbstractWordConverter.java @@ -78,6 +78,13 @@ public abstract class AbstractWordConverter this.structure = field; } + Structure( int start, int end ) + { + this.start = start; + this.end = end; + this.structure = null; + } + public int compareTo( Structure o ) { return start < o.start ? -1 : start == o.start ? 0 : 1; @@ -102,6 +109,15 @@ public abstract class AbstractWordConverter private static final POILogger logger = POILogFactory .getLogger( AbstractWordConverter.class ); + private static final Pattern PATTERN_HYPERLINK_EXTERNAL = Pattern + .compile( "^[ \\t\\r\\n]*HYPERLINK \"(.*)\".*$" ); + + private static final Pattern PATTERN_HYPERLINK_LOCAL = Pattern + .compile( "^[ \\t\\r\\n]*HYPERLINK \\\\l \"(.*)\"[ ](.*)$" ); + + private static final Pattern PATTERN_PAGEREF = Pattern + .compile( "^[ \\t\\r\\n]*PAGEREF ([^ ]*)[ \\t\\r\\n]*\\\\h.*$" ); + private static final byte SPECCHAR_AUTONUMBERED_FOOTNOTE_REFERENCE = 2; private static final byte SPECCHAR_DRAWN_OBJECT = 8; @@ -291,17 +307,38 @@ public abstract class AbstractWordConverter } // TODO: dead fields? + int skipUntil = -1; for ( int c = 0; c < range.numCharacterRuns(); c++ ) { CharacterRun characterRun = range.getCharacterRun( c ); if ( characterRun == null ) throw new AssertionError(); + if ( characterRun.getStartOffset() < skipUntil ) + continue; + String text = characterRun.text(); + if ( text == null || text.length() == 0 + || text.charAt( 0 ) != FIELD_BEGIN_MARK ) + continue; + Field aliveField = ( (HWPFDocument) wordDocument ).getFields() .getFieldByStartOffset( FieldsDocumentPart.MAIN, characterRun.getStartOffset() ); if ( aliveField != null ) { addToStructures( structures, new Structure( aliveField ) ); + skipUntil = aliveField.getFieldEndOffset() + 1; + } + else + { + int[] separatorEnd = tryDeadField_lookupFieldSeparatorEnd( + wordDocument, range, c ); + if ( separatorEnd != null ) + { + addToStructures( structures, new Structure( + characterRun.getStartOffset(), + separatorEnd[1] + 1 ) ); + c = separatorEnd[1]; + } } } } @@ -562,6 +599,39 @@ public abstract class AbstractWordConverter Element currentBlock, Range range, int currentTableLevel, int beginMark, int separatorMark, int endMark ) { + if ( beginMark + 1 < separatorMark && separatorMark + 1 < endMark ) + { + Range formulaRange = new Range( range.getCharacterRun( + beginMark + 1 ).getStartOffset(), range.getCharacterRun( + separatorMark - 1 ).getEndOffset(), range ) + { + @Override + public String toString() + { + return "Dead field formula subrange: " + super.toString(); + } + }; + Range valueRange = new Range( range.getCharacterRun( + separatorMark + 1 ).getStartOffset(), range + .getCharacterRun( endMark - 1 ).getEndOffset(), range ) + { + @Override + public String toString() + { + return "Dead field value subrange: " + super.toString(); + } + }; + String formula = formulaRange.text(); + final Matcher matcher = PATTERN_HYPERLINK_LOCAL.matcher( formula ); + if ( matcher.matches() ) + { + String localref = matcher.group( 1 ); + processPageref( wordDocument, currentBlock, valueRange, + currentTableLevel, localref ); + return; + } + } + StringBuilder debug = new StringBuilder( "Unsupported field type: \n" ); for ( int i = beginMark; i <= endMark; i++ ) { @@ -706,9 +776,7 @@ public abstract class AbstractWordConverter if ( firstSubrange != null ) { String formula = firstSubrange.text(); - Pattern pagerefPattern = Pattern - .compile( "[ \\t\\r\\n]*PAGEREF ([^ ]*)[ \\t\\r\\n]*\\\\h[ \\t\\r\\n]*" ); - Matcher matcher = pagerefPattern.matcher( formula ); + Matcher matcher = PATTERN_PAGEREF.matcher( formula ); if ( matcher.find() ) { String pageref = matcher.group( 1 ); @@ -756,10 +824,8 @@ public abstract class AbstractWordConverter if ( firstSubrange != null ) { String formula = firstSubrange.text(); - Pattern hyperlinkPattern = Pattern - .compile( "[ \\t\\r\\n]*HYPERLINK \"(.*)\"[ \\t\\r\\n]*" ); - Matcher matcher = hyperlinkPattern.matcher( formula ); - if ( matcher.find() ) + Matcher matcher = PATTERN_HYPERLINK_EXTERNAL.matcher( formula ); + if ( matcher.matches() ) { String hyperlink = matcher.group( 1 ); processHyperlink( wordDocument, currentBlock, @@ -767,6 +833,30 @@ public abstract class AbstractWordConverter currentTableLevel, hyperlink ); return; } + matcher.usePattern( PATTERN_HYPERLINK_LOCAL ); + if ( matcher.matches() ) + { + String hyperlink = matcher.group( 1 ); + Range textRange = null; + String text = matcher.group( 2 ); + if ( AbstractWordUtils.isNotEmpty( text ) ) + { + textRange = new Range( firstSubrange.getStartOffset() + + matcher.start( 2 ), + firstSubrange.getStartOffset() + + matcher.end( 2 ), firstSubrange ) + { + @Override + public String toString() + { + return "Local hyperlink text"; + } + }; + } + processPageref( wordDocument, currentBlock, textRange, + currentTableLevel, hyperlink ); + return; + } } break; } @@ -817,13 +907,13 @@ public abstract class AbstractWordConverter } + protected abstract void processImage( Element currentBlock, + boolean inlined, Picture picture, String url ); + @Internal protected abstract void processImageWithoutPicturesManager( Element currentBlock, boolean inlined, Picture picture ); - protected abstract void processImage( Element currentBlock, - boolean inlined, Picture picture, String url ); - protected abstract void processLineBreak( Element block, CharacterRun characterRun ); @@ -1021,6 +1111,20 @@ public abstract class AbstractWordConverter protected int tryDeadField( HWPFDocumentCore wordDocument, Range range, int currentTableLevel, int beginMark, Element currentBlock ) + { + int[] separatorEnd = tryDeadField_lookupFieldSeparatorEnd( + wordDocument, range, beginMark ); + if ( separatorEnd == null ) + return beginMark; + + processDeadField( wordDocument, currentBlock, range, currentTableLevel, + beginMark, separatorEnd[0], separatorEnd[1] ); + + return separatorEnd[1]; + } + + private int[] tryDeadField_lookupFieldSeparatorEnd( + HWPFDocumentCore wordDocument, Range range, int beginMark ) { int separatorMark = -1; int endMark = -1; @@ -1032,28 +1136,24 @@ public abstract class AbstractWordConverter if ( text.getBytes().length == 0 ) continue; - if ( text.getBytes()[0] == FIELD_BEGIN_MARK ) + final byte firstByte = text.getBytes()[0]; + if ( firstByte == FIELD_BEGIN_MARK ) { - // nested? - Field possibleField = processDeadField( wordDocument, range, - currentTableLevel, characterRun.getStartOffset(), - currentBlock ); - if ( possibleField != null ) + int[] nested = tryDeadField_lookupFieldSeparatorEnd( + wordDocument, range, c ); + if ( nested != null ) { - c = possibleField.getFieldEndOffset(); - } - else - { - continue; + c = nested[1]; } + continue; } - if ( text.getBytes()[0] == FIELD_SEPARATOR_MARK ) + if ( firstByte == FIELD_SEPARATOR_MARK ) { if ( separatorMark != -1 ) { - // double; - return beginMark; + // double; incorrect format + return null; } separatorMark = c; @@ -1065,22 +1165,18 @@ public abstract class AbstractWordConverter if ( endMark != -1 ) { // double; - return beginMark; + return null; } endMark = c; break; } - } if ( separatorMark == -1 || endMark == -1 ) - return beginMark; + return null; - processDeadField( wordDocument, currentBlock, range, currentTableLevel, - beginMark, separatorMark, endMark ); - - return endMark; + return new int[] { separatorMark, endMark }; } }