add Word-to-HTML extractor

git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@1142765 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Sergey Vladimirov 2011-07-04 19:08:06 +00:00
parent 2b97a034fe
commit 4fbd693851
11 changed files with 2291 additions and 837 deletions

View File

@ -0,0 +1,365 @@
/* ====================================================================
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==================================================================== */
package org.apache.poi.hwpf.extractor;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.HWPFDocumentCore;
import org.apache.poi.hwpf.model.ListFormatOverride;
import org.apache.poi.hwpf.model.ListTables;
import org.apache.poi.hwpf.usermodel.CharacterRun;
import org.apache.poi.hwpf.usermodel.Paragraph;
import org.apache.poi.hwpf.usermodel.Picture;
import org.apache.poi.hwpf.usermodel.Range;
import org.apache.poi.hwpf.usermodel.Section;
import org.apache.poi.hwpf.usermodel.Table;
import org.apache.poi.hwpf.usermodel.TableIterator;
import org.apache.poi.util.POILogFactory;
import org.apache.poi.util.POILogger;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
public abstract class AbstractWordExtractor
{
/**
 * Character code 7. {@code processCharacters} strips it from the end of a
 * run's text when the current table level is not zero.
 */
private static final byte BEL_MARK = 7;
/** Character code 19; a run starting with it is treated as the start of a field. */
private static final byte FIELD_BEGIN_MARK = 19;
/** Character code 21; a run starting with it is treated as the end of a field. */
private static final byte FIELD_END_MARK = 21;
/**
 * Character code 20; a run starting with it separates the part of a field
 * that is matched against the known field patterns from the part that is
 * output as the field value.
 */
private static final byte FIELD_SEPARATOR_MARK = 20;
/** Logger used for warnings about unsupported or inconsistent document content. */
private static final POILogger logger = POILogFactory
.getLogger( AbstractWordExtractor.class );
/**
 * Returns a DOM document owned by the concrete extractor.
 * NOTE(review): presumably the document that receives the generated
 * output - confirm against the subclasses.
 */
public abstract Document getDocument();
/**
 * Emits the text of a single character run into the given block element.
 * Called by {@code processCharacters} for every run that is neither a
 * picture, a field mark, nor a special/OLE run.
 *
 * @param block element that receives the output
 * @param characterRun run the text belongs to
 * @param text text of the run, already stripped of one trailing "\r" (or of
 * a trailing BEL character when inside a table)
 */
protected abstract void outputCharacters( Element block,
CharacterRun characterRun, String text );
/**
 * Walks the character runs {@code start} (inclusive) to {@code end}
 * (exclusive) of {@code characterRuns} and dispatches each of them:
 * pictures go to {@code processImage}, fields are handed to
 * {@code tryField}, stray field marks and special/OLE runs are skipped and
 * everything else is passed to {@code outputCharacters}.
 *
 * @param hwpfDocument document being converted; pictures are looked up only
 *        when it is an {@link HWPFDocument}
 * @param currentTableLevel table nesting level; when not zero a trailing
 *        BEL character is removed from the run text
 * @param paragraph paragraph the runs belong to (forwarded to
 *        {@code tryField})
 * @param block element that receives the output
 * @param characterRuns runs to process, indexed by {@code start}/{@code end}
 * @param start index of the first run to process
 * @param end index after the last run to process
 * @return {@code true} if at least one run passed to
 *         {@code outputCharacters} contained non-whitespace text
 * @throws AssertionError if the list contains a {@code null} run in the
 *         processed interval
 */
protected boolean processCharacters( HWPFDocumentCore hwpfDocument,
        int currentTableLevel, Paragraph paragraph, final Element block,
        List<CharacterRun> characterRuns, final int start, final int end )
{
    boolean haveAnyText = false;

    for ( int c = start; c < end; c++ )
    {
        CharacterRun characterRun = characterRuns.get( c );
        if ( characterRun == null )
            throw new AssertionError();

        String text = characterRun.text();

        if ( hwpfDocument instanceof HWPFDocument
                && ( (HWPFDocument) hwpfDocument ).getPicturesTable()
                        .hasPicture( characterRun ) )
        {
            HWPFDocument newFormat = (HWPFDocument) hwpfDocument;
            Picture picture = newFormat.getPicturesTable().extractPicture(
                    characterRun, true );

            // guard against an empty run so charAt(0) cannot throw
            boolean inlined = text.length() != 0 && text.charAt( 0 ) == 0x01;
            processImage( block, inlined, picture );
            continue;
        }

        if ( text.length() == 0 )
            continue;

        // Compare characters, not bytes: getBytes() depends on the platform
        // default charset and allocates a new array on every call.
        final char first = text.charAt( 0 );

        if ( first == FIELD_BEGIN_MARK )
        {
            // tryField returns the index of the field end mark on success
            // and the index of the begin mark otherwise; in both cases the
            // run at the returned index has been dealt with.
            c = tryField( hwpfDocument, paragraph, currentTableLevel,
                    characterRuns, c, block );
            continue;
        }

        if ( first == FIELD_SEPARATOR_MARK || first == FIELD_END_MARK )
        {
            // shall not appear without FIELD_BEGIN_MARK
            continue;
        }

        if ( characterRun.isSpecialCharacter() || characterRun.isObj()
                || characterRun.isOle2() )
        {
            continue;
        }

        if ( text.endsWith( "\r" )
                || ( text.charAt( text.length() - 1 ) == BEL_MARK && currentTableLevel != 0 ) )
            text = text.substring( 0, text.length() - 1 );

        outputCharacters( block, characterRun, text );

        haveAnyText |= text.trim().length() != 0;
    }

    return haveAnyText;
}
/**
 * Converts the whole document by passing each section of its overall range,
 * together with the section's zero-based index, to {@code processSection}.
 *
 * @param wordDocument document to convert
 */
public void processDocument( HWPFDocumentCore wordDocument )
{
    final Range wholeDocument = wordDocument.getRange();

    int sectionIndex = 0;
    while ( sectionIndex < wholeDocument.numSections() )
    {
        processSection( wordDocument,
                wholeDocument.getSection( sectionIndex ), sectionIndex );
        sectionIndex++;
    }
}
/** Matches the instruction text of a HYPERLINK field; group 1 is the quoted target. */
private static final Pattern HYPERLINK_FIELD_PATTERN = Pattern
        .compile( "[ \\t\\r\\n]*HYPERLINK \"(.*)\"[ \\t\\r\\n]*" );

/** Matches the instruction text of a PAGEREF field with the \h switch; group 1 is the reference name. */
private static final Pattern PAGEREF_FIELD_PATTERN = Pattern
        .compile( "[ \\t\\r\\n]*PAGEREF ([^ ]*)[ \\t\\r\\n]*\\\\h[ \\t\\r\\n]*" );

/**
 * Handles one complete field whose begin, separator and end marks have
 * already been located. The first non-null run between the begin and the
 * separator mark is matched against the HYPERLINK and PAGEREF patterns and
 * dispatched to {@code processHyperlink} or {@code processPageref}. Any
 * other field is logged as unsupported and its value (the runs between the
 * separator and the end mark) is output as plain characters.
 *
 * @param wordDocument document being converted
 * @param currentBlock element that receives the output
 * @param paragraph paragraph containing the field; run indexes refer to it
 * @param currentTableLevel current table nesting level
 * @param characterRuns runs forwarded to the handlers
 * @param beginMark index of the run holding the field begin mark
 * @param separatorMark index of the run holding the field separator mark
 * @param endMark index of the run holding the field end mark
 */
protected void processField( HWPFDocumentCore wordDocument,
        Element currentBlock, Paragraph paragraph, int currentTableLevel,
        List<CharacterRun> characterRuns, int beginMark, int separatorMark,
        int endMark )
{
    // locate the first usable run between the begin and separator marks
    CharacterRun firstAfterBegin = null;
    for ( int index = beginMark + 1; index < separatorMark
            && firstAfterBegin == null; index++ )
    {
        firstAfterBegin = paragraph.getCharacterRun( index );
        if ( firstAfterBegin == null )
        {
            logger.log( POILogger.WARN,
                    "Paragraph " + paragraph.getStartOffset() + "--"
                            + paragraph.getEndOffset()
                            + " contains null CharacterRun #" + index );
        }
    }

    if ( firstAfterBegin != null )
    {
        final String instruction = firstAfterBegin.text();

        final Matcher hyperlinkMatcher = HYPERLINK_FIELD_PATTERN
                .matcher( instruction );
        if ( hyperlinkMatcher.matches() )
        {
            String hyperlink = hyperlinkMatcher.group( 1 );
            processHyperlink( wordDocument, currentBlock, paragraph,
                    characterRuns, currentTableLevel, hyperlink,
                    separatorMark + 1, endMark );
            return;
        }

        final Matcher pagerefMatcher = PAGEREF_FIELD_PATTERN
                .matcher( instruction );
        if ( pagerefMatcher.matches() )
        {
            String pageref = pagerefMatcher.group( 1 );
            processPageref( wordDocument, currentBlock, paragraph,
                    characterRuns, currentTableLevel, pageref,
                    separatorMark + 1, endMark );
            return;
        }
    }

    StringBuilder debug = new StringBuilder( "Unsupported field type: \n" );
    for ( int i = beginMark; i <= endMark; i++ )
    {
        debug.append( "\t" );
        debug.append( paragraph.getCharacterRun( i ) );
        debug.append( "\n" );
    }
    logger.log( POILogger.WARN, debug );

    // just output field value
    if ( separatorMark + 1 < endMark )
        processCharacters( wordDocument, currentTableLevel, paragraph,
                currentBlock, characterRuns, separatorMark + 1, endMark );
}
/**
 * Outputs a HYPERLINK field. Called by {@code processField} when the field
 * instruction matches the hyperlink pattern.
 *
 * @param wordDocument document being converted
 * @param currentBlock element that receives the output
 * @param paragraph paragraph containing the field
 * @param characterRuns character runs passed through from the caller
 * @param currentTableLevel current table nesting level
 * @param hyperlink link target captured from between the quotes of the
 * field instruction
 * @param i index of the run following the field separator mark
 * @param endMark index of the run holding the field end mark
 */
protected abstract void processHyperlink( HWPFDocumentCore wordDocument,
Element currentBlock, Paragraph paragraph,
List<CharacterRun> characterRuns, int currentTableLevel,
String hyperlink, int i, int endMark );
/**
 * Outputs a picture. Called by {@code processCharacters} for every run the
 * pictures table reports a picture for.
 *
 * @param currentBlock element that receives the output
 * @param inlined {@code true} when the first character of the run's text is
 * 0x01
 * @param picture picture extracted from the pictures table
 */
protected abstract void processImage( Element currentBlock,
boolean inlined, Picture picture );
/**
 * Outputs a PAGEREF field. Called by {@code processField} when the field
 * instruction matches the page reference pattern.
 *
 * @param wordDocument document being converted
 * @param currentBlock element that receives the output
 * @param paragraph paragraph containing the field
 * @param characterRuns character runs passed through from the caller
 * @param currentTableLevel current table nesting level
 * @param pageref reference name captured from the field instruction
 * @param beginTextInclusive index of the run following the field separator
 * mark
 * @param endTextExclusive index of the run holding the field end mark
 */
protected abstract void processPageref( HWPFDocumentCore wordDocument,
Element currentBlock, Paragraph paragraph,
List<CharacterRun> characterRuns, int currentTableLevel,
String pageref, int beginTextInclusive, int endTextExclusive );
/**
 * Outputs a single paragraph. Called by {@code processSectionParagraphes}
 * for every paragraph that belongs to the current table level and does not
 * start a nested table.
 *
 * @param wordDocument document being converted
 * @param parentFopElement element that receives the output
 * @param currentTableLevel current table nesting level
 * @param paragraph paragraph to output
 * @param bulletText list label of the paragraph, or an empty string when
 * the paragraph is not a list item or no list tables are available
 */
protected abstract void processParagraph( HWPFDocumentCore wordDocument,
Element parentFopElement, int currentTableLevel,
Paragraph paragraph, String bulletText );
/**
 * Outputs a single section. Called by {@code processDocument} for every
 * section of the document and by {@code processSingleSection}.
 *
 * @param wordDocument document being converted
 * @param section section to output
 * @param s zero-based index of the section ({@code 0} when called from
 * {@code processSingleSection})
 */
protected abstract void processSection( HWPFDocumentCore wordDocument,
Section section, int s );
/**
 * Outputs all paragraphs of {@code range} that belong to the given table
 * level. Tables nested one level deeper are located first; when a paragraph
 * starts at the offset of such a table the table is handed to
 * {@code processTable} instead. Remaining paragraphs that sit in a table of
 * another level are skipped. List paragraphs get their bullet text
 * resolved through the document's list tables.
 *
 * @param wordDocument document being converted
 * @param flow element that receives the output
 * @param range range whose paragraphs are processed
 * @param currentTableLevel table nesting level of the paragraphs to output
 */
protected void processSectionParagraphes( HWPFDocumentCore wordDocument,
        Element flow, Range range, int currentTableLevel )
{
    final int nestedLevel = currentTableLevel + 1;

    // index the tables of the next nesting level by their start offset
    final Map<Integer, Table> tablesByStart = new HashMap<Integer, Table>();
    final TableIterator nestedTables = AbstractWordUtils.newTableIterator(
            range, nestedLevel );
    while ( nestedTables.hasNext() )
    {
        final Table nested = nestedTables.next();
        tablesByStart.put( Integer.valueOf( nested.getStartOffset() ),
                nested );
    }

    final ListTables listTables = wordDocument.getListTables();

    final int total = range.numParagraphs();
    for ( int p = 0; p < total; p++ )
    {
        final Paragraph paragraph = range.getParagraph( p );

        final Table startingHere = tablesByStart.get( Integer
                .valueOf( paragraph.getStartOffset() ) );
        if ( startingHere != null )
        {
            processTable( wordDocument, flow, startingHere, nestedLevel );
            continue;
        }

        final boolean otherLevel = paragraph.isInTable()
                && paragraph.getTableLevel() != currentTableLevel;
        if ( otherLevel )
            continue;

        final int currentListInfo = paragraph.getIlfo();
        String label = AbstractWordUtils.EMPTY;

        if ( currentListInfo != 0 )
        {
            if ( listTables == null )
            {
                logger.log( POILogger.WARN,
                        "Paragraph #" + paragraph.getStartOffset() + "-"
                                + paragraph.getEndOffset()
                                + " has reference to list structure #"
                                + currentListInfo
                                + ", but listTables not defined in file" );
            }
            else
            {
                final ListFormatOverride formatOverride = listTables
                        .getOverride( currentListInfo );
                label = AbstractWordUtils.getBulletText( listTables,
                        paragraph, formatOverride.getLsid() );
            }
        }

        processParagraph( wordDocument, flow, currentTableLevel, paragraph,
                label );
    }
}
/**
 * Outputs one section on its own by delegating to {@code processSection}
 * with section index {@code 0}.
 *
 * @param wordDocument document being converted
 * @param section section to output
 */
protected void processSingleSection( HWPFDocumentCore wordDocument,
        Section section )
{
    final int onlySectionIndex = 0;
    processSection( wordDocument, section, onlySectionIndex );
}
/**
 * Outputs a table. Called by {@code processSectionParagraphes} when a
 * paragraph starts at the start offset of a table found one nesting level
 * below the current one.
 *
 * @param wordDocument document being converted
 * @param flow element that receives the output
 * @param table table to output
 * @param newTableLevel nesting level of the table (current level plus one)
 */
protected abstract void processTable( HWPFDocumentCore wordDocument,
Element flow, Table table, int newTableLevel );
/**
 * Tries to process the field that starts at run {@code beginMark}. The
 * paragraph's runs after the begin mark are scanned for one separator mark
 * followed by an end mark; when both are found the field is passed to
 * {@code processField}.
 *
 * @param wordDocument document being converted
 * @param paragraph paragraph whose runs are scanned
 * @param currentTableLevel current table nesting level
 * @param characterRuns character runs forwarded to {@code processField}
 * @param beginMark index of the run holding the field begin mark
 * @param currentBlock element that receives the output
 * @return index of the run holding the field end mark when the field was
 *         processed, otherwise {@code beginMark} (no separator, no end
 *         mark, or a second separator before the end mark)
 */
protected int tryField( HWPFDocumentCore wordDocument, Paragraph paragraph,
        int currentTableLevel, List<CharacterRun> characterRuns,
        int beginMark, Element currentBlock )
{
    int separatorMark = -1;
    int endMark = -1;

    // NOTE(review): runs are read from the paragraph here while
    // processCharacters indexes the characterRuns list - confirm both use
    // the same index space.
    for ( int c = beginMark + 1; c < paragraph.numCharacterRuns(); c++ )
    {
        CharacterRun characterRun = paragraph.getCharacterRun( c );
        if ( characterRun == null )
        {
            // processField already tolerates null runs returned by the
            // paragraph; skip them here as well instead of failing
            continue;
        }

        String text = characterRun.text();
        if ( text.length() == 0 )
            continue;

        // Compare characters, not bytes: getBytes() depends on the platform
        // default charset.
        final char first = text.charAt( 0 );

        if ( first == FIELD_SEPARATOR_MARK )
        {
            if ( separatorMark != -1 )
            {
                // double;
                return beginMark;
            }

            separatorMark = c;
            continue;
        }

        if ( first == FIELD_END_MARK )
        {
            // the scan stops at the first end mark, so it cannot be a
            // duplicate
            endMark = c;
            break;
        }
    }

    if ( separatorMark == -1 || endMark == -1 )
        return beginMark;

    processField( wordDocument, currentBlock, paragraph, currentTableLevel,
            characterRuns, beginMark, separatorMark, endMark );

    return endMark;
}
}

View File

@ -0,0 +1,404 @@
/* ====================================================================
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==================================================================== */
package org.apache.poi.hwpf.extractor;
import java.io.Closeable;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.lang.reflect.Constructor;
import java.lang.reflect.Field;
import java.lang.reflect.Method;
import java.util.ArrayList;
import java.util.List;
import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.HWPFDocumentCore;
import org.apache.poi.hwpf.HWPFOldDocument;
import org.apache.poi.hwpf.OldWordFileFormatException;
import org.apache.poi.hwpf.model.CHPX;
import org.apache.poi.hwpf.model.ListLevel;
import org.apache.poi.hwpf.model.ListTables;
import org.apache.poi.hwpf.usermodel.BorderCode;
import org.apache.poi.hwpf.usermodel.CharacterRun;
import org.apache.poi.hwpf.usermodel.Paragraph;
import org.apache.poi.hwpf.usermodel.Range;
import org.apache.poi.hwpf.usermodel.Section;
import org.apache.poi.hwpf.usermodel.SectionProperties;
import org.apache.poi.hwpf.usermodel.TableIterator;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
import org.apache.poi.util.POILogFactory;
import org.apache.poi.util.POILogger;
public class AbstractWordUtils
{
/** Empty string, returned or passed where no bullet text is available. */
static final String EMPTY = "";
/** Logger of this utility class. */
private static final POILogger logger = POILogFactory
.getLogger( AbstractWordUtils.class );
/** Number of twips in one inch. */
public static final float TWIPS_PER_INCH = 1440.0f;
/** Number of twips in one point. */
public static final int TWIPS_PER_PT = 20;
/**
 * Closes the given resource without propagating any exception. A failure to
 * close is logged at ERROR level; a {@code null} argument is ignored.
 *
 * @param closeable resource to close, may be {@code null}
 */
static void closeQuietly( final Closeable closeable )
{
    if ( closeable == null )
    {
        // nothing to close; do not report a NullPointerException as a
        // failed close
        return;
    }

    try
    {
        closeable.close();
    }
    catch ( Exception exc )
    {
        logger.log( POILogger.ERROR, "Unable to close resource: " + exc,
                exc );
    }
}
/**
 * Null-safe string comparison.
 *
 * @param str1 first string, may be {@code null}
 * @param str2 second string, may be {@code null}
 * @return {@code true} when both are {@code null} or when {@code str1}
 *         equals {@code str2}
 */
static boolean equals( String str1, String str2 )
{
    if ( str1 == null )
    {
        return str2 == null;
    }
    return str1.equals( str2 );
}
// XXX incorporate into Range
/**
 * Collects the character runs of {@code range}: every non-null CHPX of the
 * range whose interval intersects (or touches) the range's own offsets is
 * converted into a {@link CharacterRun}; conversions that yield
 * {@code null} are dropped.
 *
 * @param range range to inspect
 * @return the character runs found, in the order of the underlying CHPX list
 */
static List<CharacterRun> findCharacterRuns( Range range )
{
    final int rangeStart = range.getStartOffset();
    final int rangeEnd = range.getEndOffset();

    final List<CharacterRun> runs = new ArrayList<CharacterRun>();
    for ( CHPX chpx : getCharacters( range ) )
    {
        if ( chpx == null )
            continue;

        final int overlapStart = Math.max( rangeStart, chpx.getStart() );
        final int overlapEnd = Math.min( rangeEnd, chpx.getEnd() );
        if ( overlapStart > overlapEnd )
            continue;

        final CharacterRun run = getCharacterRun( range, chpx );
        if ( run != null )
            runs.add( run );
    }
    return runs;
}
/**
 * Maps the border type code of a Word border to a border-style keyword.
 * Unknown codes fall back to {@code "solid"}.
 *
 * @param borderCode border description, must not be {@code null}
 * @return one of {@code "solid"}, {@code "double"}, {@code "dotted"},
 *         {@code "dashed"}, {@code "ridge"} or {@code "groove"}
 * @throws IllegalArgumentException if {@code borderCode} is {@code null}
 */
public static String getBorderType( BorderCode borderCode )
{
    if ( borderCode == null )
        throw new IllegalArgumentException( "borderCode is null" );

    switch ( borderCode.getBorderType() )
    {
    case 1:
    case 2:
        return "solid";
    case 3:
        return "double";
    case 5:
        return "solid";
    case 6:
        return "dotted";
    case 7:
    case 8:
        return "dashed";
    case 9:
        return "dotted";
    case 10:
    case 11:
    case 12:
    case 13:
    case 14:
    case 15:
    case 16:
    case 17:
    case 18:
    case 19:
        return "double";
    case 20:
        return "solid";
    case 21:
        return "double";
    case 22:
        return "dashed";
    case 23:
        return "dashed";
    case 24:
        return "ridge";
    case 25:
        // the border-style keyword is "groove"; "grooved" is not a valid
        // value
        return "groove";
    default:
        return "solid";
    }
}
/**
 * Formats the line width of a border as a length in points. The line width
 * is split into whole eighths: the quotient becomes the integer part, the
 * remainder multiplied by 125 becomes the fractional part.
 *
 * @param borderCode border description
 * @return the width followed by the unit suffix {@code "pt"}
 */
public static String getBorderWidth( BorderCode borderCode )
{
    final int eighths = borderCode.getLineWidth();
    final int wholePoints = eighths / 8;
    final int remainder = eighths - wholePoints * 8;

    return new StringBuilder().append( wholePoints ).append( "." )
            .append( 1000 / 8 * remainder ).append( "pt" ).toString();
}
/**
 * Builds the list label of a paragraph from the number text of its list
 * level. Characters below 9 in the number text are placeholders: each is
 * replaced by the formatted start number of the list level with that index.
 * When the placeholder refers to the paragraph's own level, that level's
 * start number is incremented afterwards. Depending on the level's "follow"
 * setting a tab (0) or a space (1) is appended.
 *
 * @param listTables list tables of the document
 * @param paragraph paragraph whose list level ({@code getIlvl()}) is used
 * @param listId identifier of the list
 * @return the label, or an empty string when the level has no number text
 */
public static String getBulletText( ListTables listTables,
        Paragraph paragraph, int listId )
{
    final ListLevel ownLevel = listTables.getLevel( listId,
            paragraph.getIlvl() );

    final String template = ownLevel.getNumberText();
    if ( template == null )
        return EMPTY;

    final StringBuilder label = new StringBuilder();
    for ( int i = 0; i < template.length(); i++ )
    {
        final char symbol = template.charAt( i );
        if ( symbol >= 9 )
        {
            // ordinary character, copied verbatim
            label.append( symbol );
            continue;
        }

        // placeholder: the character value is the index of a list level
        final ListLevel referenced = listTables.getLevel( listId, symbol );
        label.append( NumberFormatter.getNumber( referenced.getStartAt(),
                ownLevel.getNumberFormat() ) );

        if ( referenced == ownLevel )
            referenced.setStartAt( referenced.getStartAt() + 1 );
    }

    final byte follow = getIxchFollow( ownLevel );
    if ( follow == 0 )
    {
        label.append( "\t" );
    }
    else if ( follow == 1 )
    {
        label.append( " " );
    }

    return label.toString();
}
/**
 * Invokes the non-public {@code Range.getCharacterRun(CHPX)} method through
 * reflection.
 *
 * @param range range to invoke the method on
 * @param chpx argument passed to the method
 * @return the value returned by the method
 * @throws Error wrapping any exception raised during lookup or invocation
 */
private static CharacterRun getCharacterRun( Range range, CHPX chpx )
{
    try
    {
        final Method accessor = Range.class.getDeclaredMethod(
                "getCharacterRun", CHPX.class );
        accessor.setAccessible( true );
        final Object run = accessor.invoke( range, chpx );
        return (CharacterRun) run;
    }
    catch ( Exception failure )
    {
        throw new Error( failure );
    }
}
/**
 * Reads the non-public {@code _characters} field of a {@link Range} through
 * reflection.
 *
 * @param range range to read the field from
 * @return the field value, cast to a list of CHPX
 * @throws Error wrapping any exception raised during lookup or access
 */
private static List<CHPX> getCharacters( Range range )
{
    try
    {
        final Field charactersField = Range.class
                .getDeclaredField( "_characters" );
        charactersField.setAccessible( true );
        final Object value = charactersField.get( range );
        return (List<CHPX>) value;
    }
    catch ( Exception failure )
    {
        throw new Error( failure );
    }
}
/**
 * Translates a colour index into a colour name.
 *
 * @param ico colour index; values 1 to 16 have a dedicated name
 * @return the colour name, {@code "black"} for any other index
 */
public static String getColor( int ico )
{
    // position in the table equals the colour index; slot 0 is unused
    final String[] names = { null, "black", "blue", "cyan", "green",
            "magenta", "red", "yellow", "white", "darkblue", "darkcyan",
            "darkgreen", "darkmagenta", "darkred", "darkyellow",
            "darkgray", "lightgray" };

    if ( ico < 1 || ico >= names.length )
    {
        return "black";
    }
    return names[ico];
}
/**
 * Reads the non-public {@code _ixchFollow} field of a {@link ListLevel}
 * through reflection.
 *
 * @param listLevel list level to read the field from
 * @return the field value as a primitive byte
 * @throws Error wrapping any exception raised during lookup, access or
 *         conversion
 */
public static byte getIxchFollow( ListLevel listLevel )
{
    try
    {
        final Field followField = ListLevel.class
                .getDeclaredField( "_ixchFollow" );
        followField.setAccessible( true );
        final Byte boxed = (Byte) followField.get( listLevel );
        return boxed.byteValue();
    }
    catch ( Exception failure )
    {
        throw new Error( failure );
    }
}
/**
 * Translates a justification code into a text alignment keyword.
 *
 * @param js justification code
 * @return the alignment keyword, or an empty string for codes outside 0-9
 */
public static String getJustification( int js )
{
    switch ( js )
    {
    case 0:
    case 7:
        return "start";
    case 1:
    case 5:
        return "center";
    case 2:
    case 8:
        return "end";
    case 3:
    case 4:
    case 9:
        return "justify";
    case 6:
        return "left";
    default:
        return "";
    }
}
/**
 * Produces the label of a numbered list item. Only format {@code 0} is
 * implemented; for any other format a "not yet implemented" note is written
 * to the standard error stream and the plain decimal number is returned
 * anyway.
 *
 * @param number item number
 * @param format number format code
 * @return the decimal representation of {@code number}
 */
public static String getListItemNumberLabel( int number, int format )
{
    final boolean supported = format == 0;
    if ( !supported )
    {
        System.err.println( "NYI: toListItemNumberLabel(): " + format );
    }
    return Integer.toString( number );
}
/**
 * Reads the non-public {@code _props} field of a {@link Section} through
 * reflection.
 *
 * @param section section to read the field from
 * @return the field value, cast to {@link SectionProperties}
 * @throws Error wrapping any exception raised during lookup or access
 */
public static SectionProperties getSectionProperties( Section section )
{
    try
    {
        final Field propsField = Section.class.getDeclaredField( "_props" );
        propsField.setAccessible( true );
        final Object props = propsField.get( section );
        return (SectionProperties) props;
    }
    catch ( Exception failure )
    {
        throw new Error( failure );
    }
}
/**
 * Tells whether a string carries no characters.
 *
 * @param str string to check, may be {@code null}
 * @return {@code true} for {@code null} and for the zero-length string
 */
static boolean isEmpty( String str )
{
    if ( str == null )
    {
        return true;
    }
    return str.length() == 0;
}
/**
 * Tells whether a string carries at least one character.
 *
 * @param str string to check, may be {@code null}
 * @return {@code false} for {@code null} and for the zero-length string,
 *         {@code true} otherwise
 */
static boolean isNotEmpty( String str )
{
    return str != null && str.length() != 0;
}
/**
 * Loads a Word document from a file. The file stream is closed (quietly)
 * before the method returns, whether loading succeeded or not.
 *
 * @param docFile file to read
 * @return the loaded document
 * @throws IOException if the file cannot be opened or read
 */
public static HWPFDocumentCore loadDoc( File docFile ) throws IOException
{
    final InputStream source = new FileInputStream( docFile );
    final HWPFDocumentCore loaded;
    try
    {
        loaded = loadDoc( source );
    }
    finally
    {
        closeQuietly( source );
    }
    return loaded;
}
/**
 * Loads a Word document from a stream. The stream is first turned into a
 * POIFS file system; the document is then opened as {@link HWPFDocument}
 * and, if that is rejected with an {@link OldWordFileFormatException}, as
 * {@link HWPFOldDocument}. The stream is not closed by this method.
 *
 * @param inputStream stream to read
 * @return the loaded document
 * @throws IOException if the stream cannot be read
 */
public static HWPFDocumentCore loadDoc( InputStream inputStream )
        throws IOException
{
    final POIFSFileSystem fileSystem = HWPFDocumentCore
            .verifyAndBuildPOIFS( inputStream );

    HWPFDocumentCore loaded;
    try
    {
        loaded = new HWPFDocument( fileSystem );
    }
    catch ( OldWordFileFormatException oldFormat )
    {
        loaded = new HWPFOldDocument( fileSystem );
    }
    return loaded;
}
/**
 * Creates a {@link TableIterator} through its non-public
 * {@code (Range, int)} constructor using reflection.
 *
 * @param range range passed to the constructor
 * @param level level passed to the constructor
 * @return the new iterator
 * @throws Error wrapping any exception raised during lookup or instantiation
 */
public static TableIterator newTableIterator( Range range, int level )
{
    try
    {
        final Constructor<TableIterator> hidden = TableIterator.class
                .getDeclaredConstructor( Range.class, int.class );
        hidden.setAccessible( true );
        final Integer boxedLevel = Integer.valueOf( level );
        return hidden.newInstance( range, boxedLevel );
    }
    catch ( Exception failure )
    {
        throw new Error( failure );
    }
}
}

View File

@ -1,37 +1,34 @@
/*
* ====================================================================
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
* ====================================================================
*/
/* ====================================================================
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==================================================================== */
package org.apache.poi.hwpf.extractor;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.Text;
public abstract class AbstractToFoExtractor
public class FoDocumentFacade
{
private static final String NS_XSLFO = "http://www.w3.org/1999/XSL/Format";
protected final Document document;
protected final Element layoutMasterSet;
protected final Element root;
public AbstractToFoExtractor( Document document )
public FoDocumentFacade( Document document )
{
this.document = document;
@ -43,7 +40,7 @@ public abstract class AbstractToFoExtractor
root.appendChild( layoutMasterSet );
}
protected Element addFlowToPageSequence( final Element pageSequence,
public Element addFlowToPageSequence( final Element pageSequence,
String flowName )
{
final Element flow = document.createElementNS( NS_XSLFO, "fo:flow" );
@ -53,28 +50,28 @@ public abstract class AbstractToFoExtractor
return flow;
}
protected Element addListItem( Element listBlock )
public Element addListItem( Element listBlock )
{
Element result = createListItem();
listBlock.appendChild( result );
return result;
}
protected Element addListItemBody( Element listItem )
public Element addListItemBody( Element listItem )
{
Element result = createListItemBody();
listItem.appendChild( result );
return result;
}
protected Element addListItemLabel( Element listItem, String text )
public Element addListItemLabel( Element listItem, String text )
{
Element result = createListItemLabel( text );
listItem.appendChild( result );
return result;
}
protected Element addPageSequence( String pageMaster )
public Element addPageSequence( String pageMaster )
{
final Element pageSequence = document.createElementNS( NS_XSLFO,
"fo:page-sequence" );
@ -83,7 +80,7 @@ public abstract class AbstractToFoExtractor
return pageSequence;
}
protected Element addRegionBody( Element pageMaster )
public Element addRegionBody( Element pageMaster )
{
final Element regionBody = document.createElementNS( NS_XSLFO,
"fo:region-body" );
@ -92,7 +89,7 @@ public abstract class AbstractToFoExtractor
return regionBody;
}
protected Element addSimplePageMaster( String masterName )
public Element addSimplePageMaster( String masterName )
{
final Element simplePageMaster = document.createElementNS( NS_XSLFO,
"fo:simple-page-master" );
@ -110,7 +107,7 @@ public abstract class AbstractToFoExtractor
return basicLink;
}
protected Element createBasicLinkInternal( String internalDestination )
public Element createBasicLinkInternal( String internalDestination )
{
final Element basicLink = document.createElementNS( NS_XSLFO,
"fo:basic-link" );
@ -118,12 +115,12 @@ public abstract class AbstractToFoExtractor
return basicLink;
}
protected Element createBlock()
public Element createBlock()
{
return document.createElementNS( NS_XSLFO, "fo:block" );
}
protected Element createExternalGraphic( String source )
public Element createExternalGraphic( String source )
{
Element result = document.createElementNS( NS_XSLFO,
"fo:external-graphic" );
@ -131,32 +128,32 @@ public abstract class AbstractToFoExtractor
return result;
}
protected Element createInline()
public Element createInline()
{
return document.createElementNS( NS_XSLFO, "fo:inline" );
}
protected Element createLeader()
public Element createLeader()
{
return document.createElementNS( NS_XSLFO, "fo:leader" );
}
protected Element createListBlock()
public Element createListBlock()
{
return document.createElementNS( NS_XSLFO, "fo:list-block" );
}
protected Element createListItem()
public Element createListItem()
{
return document.createElementNS( NS_XSLFO, "fo:list-item" );
}
protected Element createListItemBody()
public Element createListItemBody()
{
return document.createElementNS( NS_XSLFO, "fo:list-item-body" );
}
protected Element createListItemLabel( String text )
public Element createListItemLabel( String text )
{
Element result = document.createElementNS( NS_XSLFO,
"fo:list-item-label" );

View File

@ -0,0 +1,107 @@
/* ====================================================================
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==================================================================== */
package org.apache.poi.hwpf.extractor;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.Text;
/**
 * Thin helper around a DOM {@link Document} for building HTML output. The
 * constructor creates the {@code html}, {@code head} and {@code body}
 * skeleton; the {@code create...} methods return new, unattached nodes of
 * the corresponding HTML kind.
 */
public class HtmlDocumentFacade
{
    protected final Element body;
    protected final Document document;
    protected final Element head;
    protected final Element html;

    /**
     * Appends an {@code html} root element with a {@code head} and a
     * {@code body} child (in that order) to the given document.
     *
     * @param document document to build the HTML structure in
     */
    public HtmlDocumentFacade( Document document )
    {
        this.document = document;

        this.html = element( "html" );
        document.appendChild( this.html );

        this.body = element( "body" );
        this.head = element( "head" );

        this.html.appendChild( this.head );
        this.html.appendChild( this.body );
    }

    /** Creates an unattached element with the given tag name. */
    private Element element( String tagName )
    {
        return document.createElement( tagName );
    }

    /**
     * Creates an {@code a} element.
     *
     * @param internalDestination value of the {@code href} attribute
     * @return the new, unattached link element
     */
    public Element createHyperlink( String internalDestination )
    {
        final Element anchor = element( "a" );
        anchor.setAttribute( "href", internalDestination );
        return anchor;
    }

    /** @return a new, unattached {@code li} element */
    public Element createListItem()
    {
        return element( "li" );
    }

    /** @return a new, unattached {@code p} element */
    public Element createParagraph()
    {
        return element( "p" );
    }

    /** @return a new, unattached {@code table} element */
    public Element createTable()
    {
        return element( "table" );
    }

    /** @return a new, unattached {@code tbody} element */
    public Element createTableBody()
    {
        return element( "tbody" );
    }

    /** @return a new, unattached {@code td} element */
    public Element createTableCell()
    {
        return element( "td" );
    }

    /** @return a new, unattached {@code thead} element */
    public Element createTableHeader()
    {
        return element( "thead" );
    }

    /** @return a new, unattached {@code th} element */
    public Element createTableHeaderCell()
    {
        return element( "th" );
    }

    /** @return a new, unattached {@code tr} element */
    public Element createTableRow()
    {
        return element( "tr" );
    }

    /**
     * Creates a text node.
     *
     * @param data content of the node
     * @return the new, unattached text node
     */
    public Text createText( String data )
    {
        return document.createTextNode( data );
    }

    /** @return a new, unattached {@code ul} element */
    public Element createUnorderedList()
    {
        return element( "ul" );
    }

    /** @return the document passed to the constructor */
    public Document getDocument()
    {
        return document;
    }
}

View File

@ -1,32 +1,27 @@
/*
* ====================================================================
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
* ====================================================================
*/
/* ====================================================================
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==================================================================== */
package org.apache.poi.hwpf.extractor;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Stack;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.transform.OutputKeys;
@ -36,8 +31,10 @@ import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;
import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.HWPFDocumentCore;
import org.apache.poi.hwpf.model.ListFormatOverride;
import org.apache.poi.hwpf.model.ListTables;
import org.apache.poi.hwpf.usermodel.BorderCode;
import org.apache.poi.hwpf.usermodel.CharacterRun;
import org.apache.poi.hwpf.usermodel.Paragraph;
import org.apache.poi.hwpf.usermodel.Picture;
@ -54,12 +51,10 @@ import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.Text;
import static org.apache.poi.hwpf.extractor.WordToFoUtils.TWIPS_PER_INCH;
/**
* @author Sergey Vladimirov (vlsergey {at} gmail {dot} com)
*/
public class WordToFoExtractor extends AbstractToFoExtractor
public class WordToFoExtractor extends AbstractWordExtractor
{
/**
@ -84,35 +79,55 @@ public class WordToFoExtractor extends AbstractToFoExtractor
}
}
private static final byte BEL_MARK = 7;
private static final byte FIELD_BEGIN_MARK = 19;
private static final byte FIELD_END_MARK = 21;
private static final byte FIELD_SEPARATOR_MARK = 20;
private static final POILogger logger = POILogFactory
.getLogger( WordToFoExtractor.class );
private static HWPFDocument loadDoc( File docFile ) throws IOException
public static String getBorderType( BorderCode borderCode )
{
final FileInputStream istream = new FileInputStream( docFile );
try
if ( borderCode == null )
throw new IllegalArgumentException( "borderCode is null" );
switch ( borderCode.getBorderType() )
{
return new HWPFDocument( istream );
}
finally
{
try
{
istream.close();
}
catch ( Exception exc )
{
logger.log( POILogger.ERROR,
"Unable to close FileInputStream: " + exc, exc );
}
case 1:
case 2:
return "solid";
case 3:
return "double";
case 5:
return "solid";
case 6:
return "dotted";
case 7:
case 8:
return "dashed";
case 9:
return "dotted";
case 10:
case 11:
case 12:
case 13:
case 14:
case 15:
case 16:
case 17:
case 18:
case 19:
return "double";
case 20:
return "solid";
case 21:
return "double";
case 22:
return "dashed";
case 23:
return "dashed";
case 24:
return "ridge";
case 25:
return "grooved";
default:
return "solid";
}
}
@ -160,7 +175,7 @@ public class WordToFoExtractor extends AbstractToFoExtractor
static Document process( File docFile ) throws Exception
{
final HWPFDocument hwpfDocument = loadDoc( docFile );
final HWPFDocumentCore hwpfDocument = WordToFoUtils.loadDoc( docFile );
WordToFoExtractor wordToFoExtractor = new WordToFoExtractor(
DocumentBuilderFactory.newInstance().newDocumentBuilder()
.newDocument() );
@ -170,6 +185,8 @@ public class WordToFoExtractor extends AbstractToFoExtractor
private final Stack<BlockProperies> blocksProperies = new Stack<BlockProperies>();
protected final FoDocumentFacade foDocumentFacade;
/**
* Creates new instance of {@link WordToFoExtractor}. Can be used for output
* several {@link HWPFDocument}s into single FO document.
@ -180,27 +197,28 @@ public class WordToFoExtractor extends AbstractToFoExtractor
*/
public WordToFoExtractor( Document document )
{
super( document );
this.foDocumentFacade = new FoDocumentFacade( document );
}
protected String createPageMaster( SectionProperties sep, String type,
int section )
{
float height = sep.getYaPage() / TWIPS_PER_INCH;
float width = sep.getXaPage() / TWIPS_PER_INCH;
float leftMargin = sep.getDxaLeft() / TWIPS_PER_INCH;
float rightMargin = sep.getDxaRight() / TWIPS_PER_INCH;
float topMargin = sep.getDyaTop() / TWIPS_PER_INCH;
float bottomMargin = sep.getDyaBottom() / TWIPS_PER_INCH;
float height = sep.getYaPage() / WordToFoUtils.TWIPS_PER_INCH;
float width = sep.getXaPage() / WordToFoUtils.TWIPS_PER_INCH;
float leftMargin = sep.getDxaLeft() / WordToFoUtils.TWIPS_PER_INCH;
float rightMargin = sep.getDxaRight() / WordToFoUtils.TWIPS_PER_INCH;
float topMargin = sep.getDyaTop() / WordToFoUtils.TWIPS_PER_INCH;
float bottomMargin = sep.getDyaBottom() / WordToFoUtils.TWIPS_PER_INCH;
// add these to the header
String pageMasterName = type + "-page" + section;
Element pageMaster = addSimplePageMaster( pageMasterName );
Element pageMaster = foDocumentFacade
.addSimplePageMaster( pageMasterName );
pageMaster.setAttribute( "page-height", height + "in" );
pageMaster.setAttribute( "page-width", width + "in" );
Element regionBody = addRegionBody( pageMaster );
Element regionBody = foDocumentFacade.addRegionBody( pageMaster );
regionBody.setAttribute( "margin", topMargin + "in " + rightMargin
+ "in " + bottomMargin + "in " + leftMargin + "in" );
@ -216,12 +234,13 @@ public class WordToFoExtractor extends AbstractToFoExtractor
if ( sep.getCcolM1() > 0 )
{
regionBody
.setAttribute( "column-count", "" + (sep.getCcolM1() + 1) );
regionBody.setAttribute( "column-count", ""
+ ( sep.getCcolM1() + 1 ) );
if ( sep.getFEvenlySpaced() )
{
regionBody.setAttribute( "column-gap",
(sep.getDxaColumns() / TWIPS_PER_INCH) + "in" );
( sep.getDxaColumns() / WordToFoUtils.TWIPS_PER_INCH )
+ "in" );
}
else
{
@ -232,171 +251,55 @@ public class WordToFoExtractor extends AbstractToFoExtractor
return pageMasterName;
}
protected boolean processCharacters( HWPFDocument hwpfDocument,
int currentTableLevel, Paragraph paragraph, final Element block,
final int start, final int end )
public Document getDocument()
{
boolean haveAnyText = false;
for ( int c = start; c < end; c++ )
{
CharacterRun characterRun = paragraph.getCharacterRun( c );
if ( hwpfDocument.getPicturesTable().hasPicture( characterRun ) )
{
Picture picture = hwpfDocument.getPicturesTable()
.extractPicture( characterRun, true );
processImage( block, characterRun.text().charAt( 0 ) == 0x01,
picture );
continue;
}
String text = characterRun.text();
if ( text.getBytes().length == 0 )
continue;
if ( text.getBytes()[0] == FIELD_BEGIN_MARK )
{
int skipTo = tryField( hwpfDocument, paragraph,
currentTableLevel, c, block );
if ( skipTo != c )
{
c = skipTo;
continue;
}
continue;
}
if ( text.getBytes()[0] == FIELD_SEPARATOR_MARK )
{
// shall not appear without FIELD_BEGIN_MARK
continue;
}
if ( text.getBytes()[0] == FIELD_END_MARK )
{
// shall not appear without FIELD_BEGIN_MARK
continue;
}
if ( characterRun.isSpecialCharacter() || characterRun.isObj()
|| characterRun.isOle2() )
{
continue;
}
BlockProperies blockProperies = this.blocksProperies.peek();
Element inline = createInline();
if ( characterRun.isBold() != blockProperies.pBold )
{
WordToFoUtils.setBold( inline, characterRun.isBold() );
}
if ( characterRun.isItalic() != blockProperies.pItalic )
{
WordToFoUtils.setItalic( inline, characterRun.isItalic() );
}
if ( !WordToFoUtils.equals( characterRun.getFontName(),
blockProperies.pFontName ) )
{
WordToFoUtils
.setFontFamily( inline, characterRun.getFontName() );
}
if ( characterRun.getFontSize() / 2 != blockProperies.pFontSize )
{
WordToFoUtils.setFontSize( inline,
characterRun.getFontSize() / 2 );
}
WordToFoUtils.setCharactersProperties( characterRun, inline );
block.appendChild( inline );
if ( text.endsWith( "\r" )
|| (text.charAt( text.length() - 1 ) == BEL_MARK && currentTableLevel != 0) )
text = text.substring( 0, text.length() - 1 );
Text textNode = createText( text );
inline.appendChild( textNode );
haveAnyText |= text.trim().length() != 0;
}
return haveAnyText;
return foDocumentFacade.getDocument();
}
public void processDocument( HWPFDocument hwpfDocument )
@Override
protected void outputCharacters( Element block, CharacterRun characterRun,
String text )
{
final Range range = hwpfDocument.getRange();
for ( int s = 0; s < range.numSections(); s++ )
BlockProperies blockProperies = this.blocksProperies.peek();
Element inline = foDocumentFacade.createInline();
if ( characterRun.isBold() != blockProperies.pBold )
{
processSection( hwpfDocument, range.getSection( s ), s );
WordToFoUtils.setBold( inline, characterRun.isBold() );
}
if ( characterRun.isItalic() != blockProperies.pItalic )
{
WordToFoUtils.setItalic( inline, characterRun.isItalic() );
}
if ( characterRun.getFontName() != null
&& !AbstractWordUtils.equals( characterRun.getFontName(),
blockProperies.pFontName ) )
{
WordToFoUtils.setFontFamily( inline, characterRun.getFontName() );
}
if ( characterRun.getFontSize() / 2 != blockProperies.pFontSize )
{
WordToFoUtils.setFontSize( inline, characterRun.getFontSize() / 2 );
}
WordToFoUtils.setCharactersProperties( characterRun, inline );
block.appendChild( inline );
Text textNode = foDocumentFacade.createText( text );
inline.appendChild( textNode );
}
/**
 * Matches the instruction text of a HYPERLINK field; group 1 captures the
 * quoted target.
 */
private static final Pattern HYPERLINK_FIELD_PATTERN = Pattern
        .compile( "[ \\t\\r\\n]*HYPERLINK \"(.*)\"[ \\t\\r\\n]*" );

/**
 * Matches the instruction text of a PAGEREF field that carries the
 * backslash-h switch; group 1 captures the referenced name.
 */
private static final Pattern PAGEREF_FIELD_PATTERN = Pattern
        .compile( "[ \\t\\r\\n]*PAGEREF ([^ ]*)[ \\t\\r\\n]*\\\\h[ \\t\\r\\n]*" );

/**
 * Renders a single field whose begin, separator and end marks have already
 * been located among the character runs of <tt>paragraph</tt>.
 * <p>
 * The character run that directly follows the begin mark is treated as the
 * field instruction. HYPERLINK and PAGEREF instructions are dispatched to
 * {@link #processHyperlink} and {@link #processPageref}; any other field is
 * logged at WARN level and only the runs between the separator and the end
 * mark are emitted through <tt>processCharacters</tt>.
 *
 * @param hwpfDocument
 *            document being converted
 * @param currentBlock
 *            element that receives the generated output
 * @param paragraph
 *            paragraph that owns the character runs of the field
 * @param currentTableLevel
 *            table nesting level, passed through to the delegates
 * @param beginMark
 *            index of the character run holding the field begin mark
 * @param separatorMark
 *            index of the character run holding the field separator mark
 * @param endMark
 *            index of the character run holding the field end mark
 */
protected void processField( HWPFDocument hwpfDocument,
        Element currentBlock, Paragraph paragraph, int currentTableLevel,
        int beginMark, int separatorMark, int endMark )
{
    // an instruction exists only if at least one run sits between the
    // begin mark and the separator mark
    if ( separatorMark - beginMark > 1 )
    {
        final String instruction = paragraph.getCharacterRun(
                beginMark + 1 ).text();

        final Matcher hyperlinkMatcher = HYPERLINK_FIELD_PATTERN
                .matcher( instruction );
        if ( hyperlinkMatcher.matches() )
        {
            String hyperlink = hyperlinkMatcher.group( 1 );
            processHyperlink( hwpfDocument, currentBlock, paragraph,
                    currentTableLevel, hyperlink, separatorMark + 1,
                    endMark );
            return;
        }

        final Matcher pagerefMatcher = PAGEREF_FIELD_PATTERN
                .matcher( instruction );
        if ( pagerefMatcher.matches() )
        {
            String pageref = pagerefMatcher.group( 1 );
            processPageref( hwpfDocument, currentBlock, paragraph,
                    currentTableLevel, pageref, separatorMark + 1, endMark );
            return;
        }
    }

    // unknown field: dump every run from begin mark to end mark into the log
    StringBuilder debug = new StringBuilder( "Unsupported field type: \n" );
    for ( int i = beginMark; i <= endMark; i++ )
    {
        debug.append( "\t" );
        debug.append( paragraph.getCharacterRun( i ) );
        debug.append( "\n" );
    }
    logger.log( POILogger.WARN, debug );

    // just output field value
    if ( separatorMark + 1 < endMark )
        processCharacters( hwpfDocument, currentTableLevel, paragraph,
                currentBlock, separatorMark + 1, endMark );
}
protected void processHyperlink( HWPFDocument hwpfDocument,
Element currentBlock, Paragraph paragraph, int currentTableLevel,
protected void processHyperlink( HWPFDocumentCore hwpfDocument,
Element currentBlock, Paragraph paragraph,
List<CharacterRun> characterRuns, int currentTableLevel,
String hyperlink, int beginTextInclusive, int endTextExclusive )
{
Element basicLink = createBasicLinkExternal( hyperlink );
Element basicLink = foDocumentFacade
.createBasicLinkExternal( hyperlink );
currentBlock.appendChild( basicLink );
if ( beginTextInclusive < endTextExclusive )
processCharacters( hwpfDocument, currentTableLevel, paragraph,
basicLink, beginTextInclusive, endTextExclusive );
basicLink, characterRuns, beginTextInclusive,
endTextExclusive );
}
/**
@ -422,27 +325,30 @@ public class WordToFoExtractor extends AbstractToFoExtractor
Picture picture )
{
// no default implementation -- skip
currentBlock.appendChild( document.createComment( "Image link to '"
+ picture.suggestFullFileName() + "' can be here" ) );
currentBlock.appendChild( foDocumentFacade.getDocument().createComment(
"Image link to '" + picture.suggestFullFileName()
+ "' can be here" ) );
}
protected void processPageref( HWPFDocument hwpfDocument,
Element currentBlock, Paragraph paragraph, int currentTableLevel,
protected void processPageref( HWPFDocumentCore hwpfDocument,
Element currentBlock, Paragraph paragraph,
List<CharacterRun> characterRuns, int currentTableLevel,
String pageref, int beginTextInclusive, int endTextExclusive )
{
Element basicLink = createBasicLinkInternal( pageref );
Element basicLink = foDocumentFacade.createBasicLinkInternal( pageref );
currentBlock.appendChild( basicLink );
if ( beginTextInclusive < endTextExclusive )
processCharacters( hwpfDocument, currentTableLevel, paragraph,
basicLink, beginTextInclusive, endTextExclusive );
basicLink, characterRuns, beginTextInclusive,
endTextExclusive );
}
protected void processParagraph( HWPFDocument hwpfDocument,
protected void processParagraph( HWPFDocumentCore hwpfDocument,
Element parentFopElement, int currentTableLevel,
Paragraph paragraph, String bulletText )
{
final Element block = createBlock();
final Element block = foDocumentFacade.createBlock();
parentFopElement.appendChild( block );
WordToFoUtils.setParagraphProperties( paragraph, block );
@ -480,21 +386,23 @@ public class WordToFoExtractor extends AbstractToFoExtractor
if ( WordToFoUtils.isNotEmpty( bulletText ) )
{
Element inline = createInline();
Element inline = foDocumentFacade.createInline();
block.appendChild( inline );
Text textNode = createText( bulletText );
Text textNode = foDocumentFacade.createText( bulletText );
inline.appendChild( textNode );
haveAnyText |= bulletText.trim().length() != 0;
}
List<CharacterRun> characterRuns = WordToFoUtils
.findCharacterRuns( paragraph );
haveAnyText = processCharacters( hwpfDocument, currentTableLevel,
paragraph, block, 0, charRuns );
paragraph, block, characterRuns, 0, characterRuns.size() );
if ( !haveAnyText )
{
Element leader = createLeader();
Element leader = foDocumentFacade.createLeader();
block.appendChild( leader );
}
}
@ -506,20 +414,21 @@ public class WordToFoExtractor extends AbstractToFoExtractor
return;
}
protected void processSection( HWPFDocument hwpfDocument, Section section,
int sectionCounter )
protected void processSection( HWPFDocumentCore wordDocument,
Section section, int sectionCounter )
{
String regularPage = createPageMaster(
WordToFoUtils.getSectionProperties( section ), "page",
sectionCounter );
Element pageSequence = addPageSequence( regularPage );
Element flow = addFlowToPageSequence( pageSequence, "xsl-region-body" );
Element pageSequence = foDocumentFacade.addPageSequence( regularPage );
Element flow = foDocumentFacade.addFlowToPageSequence( pageSequence,
"xsl-region-body" );
processSectionParagraphes( hwpfDocument, flow, section, 0 );
processSectionParagraphes( wordDocument, flow, section, 0 );
}
protected void processSectionParagraphes( HWPFDocument hwpfDocument,
protected void processSectionParagraphes( HWPFDocument wordDocument,
Element flow, Range range, int currentTableLevel )
{
final Map<Integer, Table> allTables = new HashMap<Integer, Table>();
@ -530,7 +439,7 @@ public class WordToFoExtractor extends AbstractToFoExtractor
allTables.put( Integer.valueOf( next.getStartOffset() ), next );
}
final ListTables listTables = hwpfDocument.getListTables();
final ListTables listTables = wordDocument.getListTables();
int currentListInfo = 0;
final int paragraphs = range.numParagraphs();
@ -543,7 +452,7 @@ public class WordToFoExtractor extends AbstractToFoExtractor
{
Table table = allTables.get( Integer.valueOf( paragraph
.getStartOffset() ) );
processTable( hwpfDocument, flow, table, currentTableLevel + 1 );
processTable( wordDocument, flow, table, currentTableLevel + 1 );
continue;
}
@ -568,7 +477,7 @@ public class WordToFoExtractor extends AbstractToFoExtractor
String label = WordToFoUtils.getBulletText( listTables,
paragraph, listFormatOverride.getLsid() );
processParagraph( hwpfDocument, flow, currentTableLevel,
processParagraph( wordDocument, flow, currentTableLevel,
paragraph, label );
}
else
@ -580,24 +489,24 @@ public class WordToFoExtractor extends AbstractToFoExtractor
+ currentListInfo
+ ", but listTables not defined in file" );
processParagraph( hwpfDocument, flow, currentTableLevel,
processParagraph( wordDocument, flow, currentTableLevel,
paragraph, WordToFoUtils.EMPTY );
}
}
else
{
processParagraph( hwpfDocument, flow, currentTableLevel,
processParagraph( wordDocument, flow, currentTableLevel,
paragraph, WordToFoUtils.EMPTY );
}
}
}
protected void processTable( HWPFDocument hwpfDocument, Element flow,
protected void processTable( HWPFDocumentCore wordDocument, Element flow,
Table table, int thisTableLevel )
{
Element tableHeader = createTableHeader();
Element tableBody = createTableBody();
Element tableHeader = foDocumentFacade.createTableHeader();
Element tableBody = foDocumentFacade.createTableBody();
final int tableRows = table.numRows();
@ -611,7 +520,7 @@ public class WordToFoExtractor extends AbstractToFoExtractor
{
TableRow tableRow = table.getRow( r );
Element tableRowElement = createTableRow();
Element tableRowElement = foDocumentFacade.createTableRow();
WordToFoUtils.setTableRowProperties( tableRow, tableRowElement );
final int rowCells = tableRow.numCells();
@ -626,7 +535,7 @@ public class WordToFoExtractor extends AbstractToFoExtractor
&& !tableCell.isFirstVerticallyMerged() )
continue;
Element tableCellElement = createTableCell();
Element tableCellElement = foDocumentFacade.createTableCell();
WordToFoUtils.setTableCellProperties( tableRow, tableCell,
tableCellElement, r == 0, r == tableRows - 1, c == 0,
c == rowCells - 1 );
@ -649,9 +558,9 @@ public class WordToFoExtractor extends AbstractToFoExtractor
{
if ( c == rowCells - 1 && c != maxColumns - 1 )
{
tableCellElement
.setAttribute( "number-columns-spanned", ""
+ (maxColumns - c) );
tableCellElement.setAttribute(
"number-columns-spanned", ""
+ ( maxColumns - c ) );
}
}
@ -673,12 +582,13 @@ public class WordToFoExtractor extends AbstractToFoExtractor
+ count );
}
processSectionParagraphes( hwpfDocument, tableCellElement,
processSectionParagraphes( wordDocument, tableCellElement,
tableCell, thisTableLevel );
if ( !tableCellElement.hasChildNodes() )
{
tableCellElement.appendChild( createBlock() );
tableCellElement.appendChild( foDocumentFacade
.createBlock() );
}
tableRowElement.appendChild( tableCellElement );
@ -694,7 +604,7 @@ public class WordToFoExtractor extends AbstractToFoExtractor
}
}
final Element tableElement = createTable();
final Element tableElement = foDocumentFacade.createTable();
if ( tableHeader.hasChildNodes() )
{
tableElement.appendChild( tableHeader );
@ -714,51 +624,4 @@ public class WordToFoExtractor extends AbstractToFoExtractor
}
}
/**
 * Attempts to treat the character run at <tt>beginMark</tt> as the start of
 * a field. Scans the following runs of the paragraph for one separator mark
 * and the first end mark; when both are found the field is handed to
 * {@link #processField}.
 * <p>
 * The first character of each run is compared directly with the mark
 * constants instead of going through <tt>String.getBytes()</tt>, so the
 * result does not depend on the platform default charset.
 *
 * @param hwpfDocument
 *            document being converted
 * @param paragraph
 *            paragraph whose character runs are scanned
 * @param currentTableLevel
 *            table nesting level, passed through to <tt>processField</tt>
 * @param beginMark
 *            index of the character run holding the field begin mark
 * @param currentBlock
 *            element that receives the generated output
 * @return index of the run holding the end mark if the field was processed,
 *         or <tt>beginMark</tt> if a second separator was met or either the
 *         separator or the end mark is missing
 */
protected int tryField( HWPFDocument hwpfDocument, Paragraph paragraph,
        int currentTableLevel, int beginMark, Element currentBlock )
{
    int separatorMark = -1;
    int endMark = -1;

    for ( int c = beginMark + 1; c < paragraph.numCharacterRuns(); c++ )
    {
        final String text = paragraph.getCharacterRun( c ).text();
        if ( text.length() == 0 )
            continue;

        final char first = text.charAt( 0 );

        if ( first == FIELD_SEPARATOR_MARK )
        {
            if ( separatorMark != -1 )
            {
                // second separator inside one field -- not a valid field
                return beginMark;
            }

            separatorMark = c;
            continue;
        }

        if ( first == FIELD_END_MARK )
        {
            // scanning stops at the first end mark
            endMark = c;
            break;
        }
    }

    if ( separatorMark == -1 || endMark == -1 )
        return beginMark;

    processField( hwpfDocument, currentBlock, paragraph, currentTableLevel,
            beginMark, separatorMark, endMark );

    return endMark;
}
}

View File

@ -1,489 +1,323 @@
/* ====================================================================
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==================================================================== */
package org.apache.poi.hwpf.extractor;
import java.lang.reflect.Constructor;
import java.lang.reflect.Field;
import org.apache.poi.hwpf.model.ListLevel;
import org.apache.poi.hwpf.model.ListTables;
import org.apache.poi.hwpf.usermodel.BorderCode;
import org.apache.poi.hwpf.usermodel.CharacterProperties;
import org.apache.poi.hwpf.usermodel.CharacterRun;
import org.apache.poi.hwpf.usermodel.Paragraph;
import org.apache.poi.hwpf.usermodel.Picture;
import org.apache.poi.hwpf.usermodel.Range;
import org.apache.poi.hwpf.usermodel.Section;
import org.apache.poi.hwpf.usermodel.SectionProperties;
import org.apache.poi.hwpf.usermodel.TableCell;
import org.apache.poi.hwpf.usermodel.TableIterator;
import org.apache.poi.hwpf.usermodel.TableRow;
import org.w3c.dom.Element;
public class WordToFoUtils {
static final String EMPTY = "";
public static final float TWIPS_PER_INCH = 1440.0f;
public static final int TWIPS_PER_PT = 20;
static boolean equals(String str1, String str2) {
return str1 == null ? str2 == null : str1.equals(str2);
public class WordToFoUtils extends AbstractWordUtils
{
public static void setBold( final Element element, final boolean bold )
{
element.setAttribute( "font-weight", bold ? "bold" : "normal" );
}
public static String getBorderType(BorderCode borderCode) {
if (borderCode == null)
throw new IllegalArgumentException("borderCode is null");
public static void setBorder( Element element, BorderCode borderCode,
String where )
{
if ( element == null )
throw new IllegalArgumentException( "element is null" );
switch (borderCode.getBorderType()) {
case 1:
case 2:
return "solid";
case 3:
return "double";
case 5:
return "solid";
case 6:
return "dotted";
case 7:
case 8:
return "dashed";
case 9:
return "dotted";
case 10:
case 11:
case 12:
case 13:
case 14:
case 15:
case 16:
case 17:
case 18:
case 19:
return "double";
case 20:
return "solid";
case 21:
return "double";
case 22:
return "dashed";
case 23:
return "dashed";
case 24:
return "ridge";
case 25:
return "grooved";
default:
return "solid";
}
}
if ( borderCode == null || borderCode.getBorderType() == 0 )
return;
/**
 * Formats the line width of a border as a value in points.
 * <p>
 * The raw line width is split into a whole part (width / 8) and a remainder
 * (width modulo 8); the remainder is written after the dot as a multiple of
 * 125, followed by the "pt" suffix.
 *
 * @param borderCode
 *            border whose line width is formatted; must not be null
 * @return width string such as "1.0pt" or "0.500pt"
 */
public static String getBorderWidth( BorderCode borderCode )
{
    final int rawWidth = borderCode.getLineWidth();
    final int wholePoints = rawWidth / 8;
    final int remainder = rawWidth - wholePoints * 8;

    return wholePoints + "." + ( 1000 / 8 * remainder ) + "pt";
}
/**
 * Builds the label (bullet or number) for a list paragraph.
 * <p>
 * The number text of the paragraph's list level is copied character by
 * character. A character with a code below 9 is taken as a reference to the
 * list level with that index and is replaced by that level's current
 * number, formatted with the number format of the paragraph's own level.
 * When the referenced level is the paragraph's own level its start value is
 * incremented, so the next call yields the next number. Finally a tab or a
 * space is appended depending on the value returned by
 * {@link #getIxchFollow(ListLevel)} (0 = tab, 1 = space, anything else =
 * nothing).
 *
 * @param listTables
 *            list definitions of the document
 * @param paragraph
 *            list paragraph; its level index selects the list level
 * @param listId
 *            identifier of the list the paragraph belongs to
 * @return label text, or an empty string if the level has no number text
 */
public static String getBulletText( ListTables listTables,
        Paragraph paragraph, int listId )
{
    final ListLevel listLevel = listTables.getLevel( listId,
            paragraph.getIlvl() );

    final String numberText = listLevel.getNumberText();
    if ( numberText == null )
        return EMPTY;

    // local, single-threaded buffer -- no need for a synchronized StringBuffer
    final StringBuilder bullet = new StringBuilder();

    for ( int i = 0; i < numberText.length(); i++ )
    {
        final char element = numberText.charAt( i );
        if ( element < 9 )
        {
            // placeholder for the current number of the referenced level
            ListLevel numLevel = listTables.getLevel( listId, element );

            int num = numLevel.getStartAt();
            bullet.append( NumberFormatter.getNumber( num,
                    listLevel.getNumberFormat() ) );

            if ( numLevel == listLevel )
            {
                // advance the counter of the paragraph's own level
                numLevel.setStartAt( numLevel.getStartAt() + 1 );
            }
        }
        else
        {
            bullet.append( element );
        }
    }

    final byte follow = getIxchFollow( listLevel );
    if ( follow == 0 )
    {
        bullet.append( "\t" );
    }
    else if ( follow == 1 )
    {
        bullet.append( " " );
    }

    return bullet.toString();
}
/**
 * Translates a color index into a color name.
 *
 * @param ico
 *            color index; values 1 to 16 have dedicated names
 * @return the color name for the index, or "black" for any index outside
 *         the range 1..16
 */
public static String getColor( int ico )
{
    // position i holds the name for index i + 1
    final String[] names = { "black", "blue", "cyan", "green", "magenta",
            "red", "yellow", "white", "darkblue", "darkcyan", "darkgreen",
            "darkmagenta", "darkred", "darkyellow", "darkgray", "lightgray" };

    if ( ico >= 1 && ico <= names.length )
        return names[ico - 1];

    return "black";
}
/**
 * Reads the non-public <tt>_ixchFollow</tt> field of a list level through
 * reflection.
 *
 * @param listLevel
 *            list level to inspect
 * @return value of the <tt>_ixchFollow</tt> field
 * @throws Error
 *             wrapping any exception raised while looking up, opening or
 *             reading the field
 */
public static byte getIxchFollow( ListLevel listLevel )
{
    try
    {
        final Field ixchFollow = ListLevel.class
                .getDeclaredField( "_ixchFollow" );
        ixchFollow.setAccessible( true );

        final Byte boxed = (Byte) ixchFollow.get( listLevel );
        return boxed.byteValue();
    }
    catch ( Exception exc )
    {
        throw new Error( exc );
    }
}
public static String getJustification(int js) {
switch (js) {
case 0:
return "start";
case 1:
return "center";
case 2:
return "end";
case 3:
case 4:
return "justify";
case 5:
return "center";
case 6:
return "left";
case 7:
return "start";
case 8:
return "end";
case 9:
return "justify";
if ( isEmpty( where ) )
{
element.setAttribute( "border-style", getBorderType( borderCode ) );
element.setAttribute( "border-color",
getColor( borderCode.getColor() ) );
element.setAttribute( "border-width", getBorderWidth( borderCode ) );
}
else
{
element.setAttribute( "border-" + where + "-style",
getBorderType( borderCode ) );
element.setAttribute( "border-" + where + "-color",
getColor( borderCode.getColor() ) );
element.setAttribute( "border-" + where + "-width",
getBorderWidth( borderCode ) );
}
return "";
}
/**
 * Returns the label for a numbered list item.
 * <p>
 * Only format 0 is handled; for any other format a "not yet implemented"
 * notice is printed to <tt>System.err</tt> and the number is still returned
 * in plain decimal form.
 *
 * @param number
 *            item number
 * @param format
 *            number format code
 * @return decimal representation of <tt>number</tt>
 */
public static String getListItemNumberLabel( int number, int format )
{
    final boolean supported = format == 0;
    if ( !supported )
    {
        System.err.println( "NYI: toListItemNumberLabel(): " + format );
    }

    return Integer.toString( number );
}
/**
 * Reads the non-public <tt>_props</tt> field of a section through
 * reflection.
 *
 * @param section
 *            section to inspect
 * @return value of the <tt>_props</tt> field
 * @throws Error
 *             wrapping any exception raised while looking up, opening or
 *             reading the field
 */
public static SectionProperties getSectionProperties( Section section )
{
    try
    {
        final Field props = Section.class.getDeclaredField( "_props" );
        props.setAccessible( true );

        final Object value = props.get( section );
        return (SectionProperties) value;
    }
    catch ( Exception exc )
    {
        throw new Error( exc );
    }
}
/**
 * Tells whether a string carries no characters.
 *
 * @param str
 *            string to check, may be null
 * @return <tt>true</tt> if <tt>str</tt> is null or has zero length
 */
static boolean isEmpty( String str )
{
    if ( str == null )
        return true;

    return str.length() == 0;
}
/**
 * Tells whether a string carries at least one character.
 *
 * @param str
 *            string to check, may be null
 * @return <tt>true</tt> if <tt>str</tt> is neither null nor of zero length
 */
static boolean isNotEmpty( String str )
{
    return str != null && str.length() != 0;
}
/**
 * Creates a {@link TableIterator} by invoking its non-public
 * <tt>(Range, int)</tt> constructor through reflection.
 *
 * @param range
 *            range passed to the constructor
 * @param level
 *            level passed to the constructor
 * @return the newly created iterator
 * @throws Error
 *             wrapping any exception raised while looking up, opening or
 *             invoking the constructor
 */
public static TableIterator newTableIterator( Range range, int level )
{
    try
    {
        final Constructor<TableIterator> hidden = TableIterator.class
                .getDeclaredConstructor( Range.class, int.class );
        hidden.setAccessible( true );

        final Integer boxedLevel = Integer.valueOf( level );
        return hidden.newInstance( range, boxedLevel );
    }
    catch ( Exception exc )
    {
        throw new Error( exc );
    }
}
/**
 * Sets the <tt>font-weight</tt> attribute of an element.
 *
 * @param element
 *            element to update
 * @param bold
 *            <tt>true</tt> writes "bold", <tt>false</tt> writes "normal"
 */
public static void setBold( final Element element, final boolean bold )
{
    final String weight;
    if ( bold )
    {
        weight = "bold";
    }
    else
    {
        weight = "normal";
    }

    element.setAttribute( "font-weight", weight );
}
/**
 * Writes the style, color and width attributes of one border onto an
 * element.
 * <p>
 * With an empty <tt>where</tt> the attributes are named
 * <tt>border-style</tt>, <tt>border-color</tt> and <tt>border-width</tt>;
 * otherwise the side is inserted into the name, e.g.
 * <tt>border-top-style</tt>.
 *
 * @param element
 *            element to update
 * @param borderCode
 *            border description; when null nothing is written
 * @param where
 *            side of the border ("top", "left", ...), or null/empty for
 *            all sides
 * @throws IllegalArgumentException
 *             if <tt>element</tt> is null
 */
public static void setBorder( Element element, BorderCode borderCode,
        String where )
{
    if ( element == null )
        throw new IllegalArgumentException( "element is null" );

    if ( borderCode == null )
        return;

    // common attribute-name prefix, with or without the side
    final String prefix = isEmpty( where ) ? "border-" : "border-" + where
            + "-";

    element.setAttribute( prefix + "style", getBorderType( borderCode ) );
    element.setAttribute( prefix + "color",
            getColor( borderCode.getColor() ) );
    element.setAttribute( prefix + "width", getBorderWidth( borderCode ) );
}
public static void setCharactersProperties(final CharacterRun characterRun,
final Element inline) {
public static void setCharactersProperties(
final CharacterRun characterRun, final Element inline )
{
final CharacterProperties clonedProperties = characterRun
.cloneProperties();
StringBuilder textDecorations = new StringBuilder();
setBorder(inline, clonedProperties.getBrc(), EMPTY);
setBorder( inline, clonedProperties.getBrc(), EMPTY );
if (characterRun.isCapitalized()) {
inline.setAttribute("text-transform", "uppercase");
if ( characterRun.isCapitalized() )
{
inline.setAttribute( "text-transform", "uppercase" );
}
if (characterRun.isHighlighted()) {
inline.setAttribute("background-color",
getColor(clonedProperties.getIcoHighlight()));
if ( characterRun.isHighlighted() )
{
inline.setAttribute( "background-color",
getColor( clonedProperties.getIcoHighlight() ) );
}
if (characterRun.isStrikeThrough()) {
if (textDecorations.length() > 0)
textDecorations.append(" ");
textDecorations.append("line-through");
if ( characterRun.isStrikeThrough() )
{
if ( textDecorations.length() > 0 )
textDecorations.append( " " );
textDecorations.append( "line-through" );
}
if (characterRun.isShadowed()) {
inline.setAttribute("text-shadow", characterRun.getFontSize() / 24
+ "pt");
if ( characterRun.isShadowed() )
{
inline.setAttribute( "text-shadow", characterRun.getFontSize() / 24
+ "pt" );
}
if (characterRun.isSmallCaps()) {
inline.setAttribute("font-variant", "small-caps");
if ( characterRun.isSmallCaps() )
{
inline.setAttribute( "font-variant", "small-caps" );
}
if (characterRun.getSubSuperScriptIndex() == 1) {
inline.setAttribute("baseline-shift", "super");
inline.setAttribute("font-size", "smaller");
if ( characterRun.getSubSuperScriptIndex() == 1 )
{
inline.setAttribute( "baseline-shift", "super" );
inline.setAttribute( "font-size", "smaller" );
}
if (characterRun.getSubSuperScriptIndex() == 2) {
inline.setAttribute("baseline-shift", "sub");
inline.setAttribute("font-size", "smaller");
if ( characterRun.getSubSuperScriptIndex() == 2 )
{
inline.setAttribute( "baseline-shift", "sub" );
inline.setAttribute( "font-size", "smaller" );
}
if (characterRun.getUnderlineCode() > 0) {
if (textDecorations.length() > 0)
textDecorations.append(" ");
textDecorations.append("underline");
if ( characterRun.getUnderlineCode() > 0 )
{
if ( textDecorations.length() > 0 )
textDecorations.append( " " );
textDecorations.append( "underline" );
}
if (characterRun.isVanished()) {
inline.setAttribute("visibility", "hidden");
if ( characterRun.isVanished() )
{
inline.setAttribute( "visibility", "hidden" );
}
if (textDecorations.length() > 0) {
inline.setAttribute("text-decoration", textDecorations.toString());
if ( textDecorations.length() > 0 )
{
inline.setAttribute( "text-decoration", textDecorations.toString() );
}
}
public static void setFontFamily(final Element element,
final String fontFamily) {
element.setAttribute("font-family", fontFamily);
public static void setFontFamily( final Element element,
final String fontFamily )
{
if ( isEmpty( fontFamily ) )
return;
element.setAttribute( "font-family", fontFamily );
}
public static void setFontSize(final Element element, final int fontSize) {
element.setAttribute("font-size", String.valueOf(fontSize));
public static void setFontSize( final Element element, final int fontSize )
{
element.setAttribute( "font-size", String.valueOf( fontSize ) );
}
public static void setIndent(Paragraph paragraph, Element block) {
if (paragraph.getFirstLineIndent() != 0) {
block.setAttribute(
"text-indent",
String.valueOf(paragraph.getFirstLineIndent()
/ TWIPS_PER_PT)
+ "pt");
}
if (paragraph.getIndentFromLeft() != 0) {
block.setAttribute(
"start-indent",
String.valueOf(paragraph.getIndentFromLeft() / TWIPS_PER_PT)
+ "pt");
}
if (paragraph.getIndentFromRight() != 0) {
block.setAttribute(
"end-indent",
String.valueOf(paragraph.getIndentFromRight()
/ TWIPS_PER_PT)
+ "pt");
}
if (paragraph.getSpacingBefore() != 0) {
block.setAttribute("space-before",
String.valueOf(paragraph.getSpacingBefore() / TWIPS_PER_PT)
+ "pt");
}
if (paragraph.getSpacingAfter() != 0) {
block.setAttribute("space-after",
String.valueOf(paragraph.getSpacingAfter() / TWIPS_PER_PT)
+ "pt");
}
public static void setIndent( Paragraph paragraph, Element block )
{
if ( paragraph.getFirstLineIndent() != 0 )
{
block.setAttribute(
"text-indent",
String.valueOf( paragraph.getFirstLineIndent()
/ TWIPS_PER_PT )
+ "pt" );
}
if ( paragraph.getIndentFromLeft() != 0 )
{
block.setAttribute(
"start-indent",
String.valueOf( paragraph.getIndentFromLeft()
/ TWIPS_PER_PT )
+ "pt" );
}
if ( paragraph.getIndentFromRight() != 0 )
{
block.setAttribute(
"end-indent",
String.valueOf( paragraph.getIndentFromRight()
/ TWIPS_PER_PT )
+ "pt" );
}
if ( paragraph.getSpacingBefore() != 0 )
{
block.setAttribute(
"space-before",
String.valueOf( paragraph.getSpacingBefore() / TWIPS_PER_PT )
+ "pt" );
}
if ( paragraph.getSpacingAfter() != 0 )
{
block.setAttribute( "space-after",
String.valueOf( paragraph.getSpacingAfter() / TWIPS_PER_PT )
+ "pt" );
}
}
public static void setItalic(final Element element, final boolean italic) {
element.setAttribute("font-style", italic ? "italic" : "normal");
public static void setItalic( final Element element, final boolean italic )
{
element.setAttribute( "font-style", italic ? "italic" : "normal" );
}
public static void setJustification(Paragraph paragraph,
final Element element) {
String justification = getJustification(paragraph.getJustification());
if (isNotEmpty(justification))
element.setAttribute("text-align", justification);
public static void setJustification( Paragraph paragraph,
final Element element )
{
String justification = getJustification( paragraph.getJustification() );
if ( isNotEmpty( justification ) )
element.setAttribute( "text-align", justification );
}
public static void setParagraphProperties(Paragraph paragraph, Element block) {
setIndent(paragraph, block);
setJustification(paragraph, block);
public static void setParagraphProperties( Paragraph paragraph,
Element block )
{
setIndent( paragraph, block );
setJustification( paragraph, block );
setBorder(block, paragraph.getBottomBorder(), "bottom");
setBorder(block, paragraph.getLeftBorder(), "left");
setBorder(block, paragraph.getRightBorder(), "right");
setBorder(block, paragraph.getTopBorder(), "top");
setBorder( block, paragraph.getBottomBorder(), "bottom" );
setBorder( block, paragraph.getLeftBorder(), "left" );
setBorder( block, paragraph.getRightBorder(), "right" );
setBorder( block, paragraph.getTopBorder(), "top" );
if (paragraph.pageBreakBefore()) {
block.setAttribute("break-before", "page");
}
if ( paragraph.pageBreakBefore() )
{
block.setAttribute( "break-before", "page" );
}
block.setAttribute("hyphenate",
String.valueOf(paragraph.isAutoHyphenated()));
block.setAttribute( "hyphenate",
String.valueOf( paragraph.isAutoHyphenated() ) );
if (paragraph.keepOnPage()) {
block.setAttribute("keep-together.within-page", "always");
}
if ( paragraph.keepOnPage() )
{
block.setAttribute( "keep-together.within-page", "always" );
}
if (paragraph.keepWithNext()) {
block.setAttribute("keep-with-next.within-page", "always");
}
if ( paragraph.keepWithNext() )
{
block.setAttribute( "keep-with-next.within-page", "always" );
}
block.setAttribute("linefeed-treatment", "preserve");
block.setAttribute("white-space-collapse", "false");
block.setAttribute( "linefeed-treatment", "preserve" );
block.setAttribute( "white-space-collapse", "false" );
}
public static void setPictureProperties(Picture picture,
Element graphicElement) {
public static void setPictureProperties( Picture picture,
Element graphicElement )
{
final int aspectRatioX = picture.getAspectRatioX();
final int aspectRatioY = picture.getAspectRatioY();
if (aspectRatioX > 0) {
graphicElement.setAttribute("content-width", ((picture.getDxaGoal()
* aspectRatioX / 100) / WordToFoUtils.TWIPS_PER_PT)
+ "pt");
} else
graphicElement.setAttribute("content-width",
(picture.getDxaGoal() / WordToFoUtils.TWIPS_PER_PT) + "pt");
if (aspectRatioY > 0)
if ( aspectRatioX > 0 )
{
graphicElement
.setAttribute("content-height", ((picture.getDyaGoal()
* aspectRatioY / 100) / WordToFoUtils.TWIPS_PER_PT)
+ "pt");
.setAttribute( "content-width", ( ( picture.getDxaGoal()
* aspectRatioX / 100 ) / TWIPS_PER_PT )
+ "pt" );
}
else
graphicElement.setAttribute("content-height",
(picture.getDyaGoal() / WordToFoUtils.TWIPS_PER_PT) + "pt");
graphicElement.setAttribute( "content-width",
( picture.getDxaGoal() / TWIPS_PER_PT ) + "pt" );
if (aspectRatioX <= 0 || aspectRatioY <= 0) {
graphicElement.setAttribute("scaling", "uniform");
} else {
graphicElement.setAttribute("scaling", "non-uniform");
if ( aspectRatioY > 0 )
graphicElement
.setAttribute( "content-height", ( ( picture.getDyaGoal()
* aspectRatioY / 100 ) / TWIPS_PER_PT )
+ "pt" );
else
graphicElement.setAttribute( "content-height",
( picture.getDyaGoal() / TWIPS_PER_PT ) + "pt" );
if ( aspectRatioX <= 0 || aspectRatioY <= 0 )
{
graphicElement.setAttribute( "scaling", "uniform" );
}
else
{
graphicElement.setAttribute( "scaling", "non-uniform" );
}
graphicElement.setAttribute("vertical-align", "text-bottom");
graphicElement.setAttribute( "vertical-align", "text-bottom" );
if (picture.getDyaCropTop() != 0 || picture.getDxaCropRight() != 0
if ( picture.getDyaCropTop() != 0 || picture.getDxaCropRight() != 0
|| picture.getDyaCropBottom() != 0
|| picture.getDxaCropLeft() != 0) {
int rectTop = picture.getDyaCropTop() / WordToFoUtils.TWIPS_PER_PT;
int rectRight = picture.getDxaCropRight()
/ WordToFoUtils.TWIPS_PER_PT;
int rectBottom = picture.getDyaCropBottom()
/ WordToFoUtils.TWIPS_PER_PT;
int rectLeft = picture.getDxaCropLeft()
/ WordToFoUtils.TWIPS_PER_PT;
graphicElement.setAttribute("clip", "rect(" + rectTop + "pt, "
|| picture.getDxaCropLeft() != 0 )
{
int rectTop = picture.getDyaCropTop() / TWIPS_PER_PT;
int rectRight = picture.getDxaCropRight() / TWIPS_PER_PT;
int rectBottom = picture.getDyaCropBottom() / TWIPS_PER_PT;
int rectLeft = picture.getDxaCropLeft() / TWIPS_PER_PT;
graphicElement.setAttribute( "clip", "rect(" + rectTop + "pt, "
+ rectRight + "pt, " + rectBottom + "pt, " + rectLeft
+ "pt)");
graphicElement.setAttribute("oveerflow", "hidden");
+ "pt)" );
graphicElement.setAttribute( "oveerflow", "hidden" );
}
}
public static void setTableCellProperties(TableRow tableRow,
TableCell tableCell, Element element, boolean toppest,
boolean bottomest, boolean leftest, boolean rightest) {
element.setAttribute("width", (tableCell.getWidth() / TWIPS_PER_INCH)
+ "in");
element.setAttribute("padding-start",
(tableRow.getGapHalf() / TWIPS_PER_INCH) + "in");
element.setAttribute("padding-end",
(tableRow.getGapHalf() / TWIPS_PER_INCH) + "in");
public static void setTableCellProperties( TableRow tableRow,
TableCell tableCell, Element element, boolean toppest,
boolean bottomest, boolean leftest, boolean rightest )
{
element.setAttribute( "width", ( tableCell.getWidth() / TWIPS_PER_INCH )
+ "in" );
element.setAttribute( "padding-start",
( tableRow.getGapHalf() / TWIPS_PER_INCH ) + "in" );
element.setAttribute( "padding-end",
( tableRow.getGapHalf() / TWIPS_PER_INCH ) + "in" );
BorderCode top = tableCell.getBrcTop() != null ? tableCell.getBrcTop()
: toppest ? tableRow.getTopBorder() : tableRow
.getHorizontalBorder();
BorderCode bottom = tableCell.getBrcBottom() != null ? tableCell
.getBrcBottom() : bottomest ? tableRow.getBottomBorder()
: tableRow.getHorizontalBorder();
BorderCode top = tableCell.getBrcTop() != null
&& tableCell.getBrcTop().getBorderType() != 0 ? tableCell
.getBrcTop() : toppest ? tableRow.getTopBorder() : tableRow
.getHorizontalBorder();
BorderCode bottom = tableCell.getBrcBottom() != null
&& tableCell.getBrcBottom().getBorderType() != 0 ? tableCell
.getBrcBottom() : bottomest ? tableRow.getBottomBorder()
: tableRow.getHorizontalBorder();
BorderCode left = tableCell.getBrcLeft() != null ? tableCell
.getBrcLeft() : leftest ? tableRow.getLeftBorder() : tableRow
.getVerticalBorder();
BorderCode right = tableCell.getBrcRight() != null ? tableCell
.getBrcRight() : rightest ? tableRow.getRightBorder()
: tableRow.getVerticalBorder();
BorderCode left = tableCell.getBrcLeft() != null
&& tableCell.getBrcLeft().getBorderType() != 0 ? tableCell
.getBrcLeft() : leftest ? tableRow.getLeftBorder() : tableRow
.getVerticalBorder();
BorderCode right = tableCell.getBrcRight() != null
&& tableCell.getBrcRight().getBorderType() != 0 ? tableCell
.getBrcRight() : rightest ? tableRow.getRightBorder()
: tableRow.getVerticalBorder();
setBorder(element, bottom, "bottom");
setBorder(element, left, "left");
setBorder(element, right, "right");
setBorder(element, top, "top");
setBorder( element, bottom, "bottom" );
setBorder( element, left, "left" );
setBorder( element, right, "right" );
setBorder( element, top, "top" );
}
public static void setTableRowProperties(TableRow tableRow,
Element tableRowElement) {
if (tableRow.getRowHeight() > 0) {
tableRowElement.setAttribute("height",
(tableRow.getRowHeight() / TWIPS_PER_INCH) + "in");
}
if (!tableRow.cantSplit()) {
tableRowElement.setAttribute("keep-together", "always");
}
public static void setTableRowProperties( TableRow tableRow,
Element tableRowElement )
{
if ( tableRow.getRowHeight() > 0 )
{
tableRowElement.setAttribute( "height",
( tableRow.getRowHeight() / TWIPS_PER_INCH ) + "in" );
}
if ( !tableRow.cantSplit() )
{
tableRowElement.setAttribute( "keep-together", "always" );
}
}
}

View File

@ -0,0 +1,475 @@
/* ====================================================================
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==================================================================== */
package org.apache.poi.hwpf.extractor;
import java.io.File;
import java.io.FileWriter;
import java.util.List;
import java.util.Stack;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.transform.OutputKeys;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;
import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.HWPFDocumentCore;
import org.apache.poi.hwpf.usermodel.CharacterRun;
import org.apache.poi.hwpf.usermodel.Paragraph;
import org.apache.poi.hwpf.usermodel.Picture;
import org.apache.poi.hwpf.usermodel.Section;
import org.apache.poi.hwpf.usermodel.SectionProperties;
import org.apache.poi.hwpf.usermodel.Table;
import org.apache.poi.hwpf.usermodel.TableCell;
import org.apache.poi.hwpf.usermodel.TableRow;
import org.apache.poi.util.POILogFactory;
import org.apache.poi.util.POILogger;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.Text;
import static org.apache.poi.hwpf.extractor.AbstractWordUtils.TWIPS_PER_INCH;
/**
* @author Sergey Vladimirov (vlsergey {at} gmail {dot} com)
*/
public class WordToHtmlExtractor extends AbstractWordExtractor
{
/**
 * Font settings that were already written on the enclosing <tt>p</tt>
 * element. Child <tt>span</tt> elements compare against these values so that
 * the same declaration is not repeated on them.
 */
private static class BlockProperies
{
    /** Font name applied to the paragraph element. */
    final String pFontName;

    /** Font size applied to the paragraph element. */
    final int pFontSize;

    public BlockProperies( String fontName, int fontSize )
    {
        pFontName = fontName;
        pFontSize = fontSize;
    }
}
/** Logger of this class; used to warn about tables that yield no body rows. */
private static final POILogger logger = POILogFactory
.getLogger( WordToHtmlExtractor.class );
/**
 * Builds the inline CSS of a section: the four margins and, when the section
 * has more than one column, the column count and the column gap. Lengths are
 * written in inches.
 *
 * @param section
 *            section whose properties are read
 * @return CSS declarations, each one terminated by "; "
 */
private static String getSectionStyle( Section section )
{
    final SectionProperties properties = WordToHtmlUtils
            .getSectionProperties( section );

    final float left = properties.getDxaLeft() / TWIPS_PER_INCH;
    final float right = properties.getDxaRight() / TWIPS_PER_INCH;
    final float top = properties.getDyaTop() / TWIPS_PER_INCH;
    final float bottom = properties.getDyaBottom() / TWIPS_PER_INCH;

    final StringBuilder css = new StringBuilder();
    css.append( "margin: " + top + "in " + right + "in " + bottom + "in "
            + left + "in; " );

    // getCcolM1() is the column count minus one; zero means single column
    if ( properties.getCcolM1() <= 0 )
        return css.toString();

    css.append( "column-count: " + ( properties.getCcolM1() + 1 ) + "; " );
    if ( properties.getFEvenlySpaced() )
    {
        css.append( "column-gap: "
                + ( properties.getDxaColumns() / TWIPS_PER_INCH ) + "in; " );
    }
    else
    {
        css.append( "column-gap: 0.25in; " );
    }
    return css.toString();
}
/**
 * Java main() interface to interact with WordToHtmlExtractor
 *
 * <p>
 * Usage: WordToHtmlExtractor infile outfile
 * </p>
 * Where infile is an input .doc file ( Word 95-2007) which will be rendered
 * as HTML into outfile. The output is written as UTF-8. Any failure is
 * printed to the standard error stream.
 *
 * @param args
 *            input file name followed by output file name
 */
public static void main( String[] args )
{
    if ( args.length < 2 )
    {
        System.err
                .println( "Usage: WordToHtmlExtractor <inputFile.doc> <saveTo.html>" );
        return;
    }

    System.out.println( "Converting " + args[0] );
    System.out.println( "Saving output to " + args[1] );
    try
    {
        Document doc = WordToHtmlExtractor.process( new File( args[0] ) );

        // Write through a byte stream so that the transformer itself encodes
        // the output; a FileWriter would use the platform charset and could
        // contradict the declared UTF-8 encoding.
        java.io.OutputStream out = new java.io.FileOutputStream( args[1] );
        try
        {
            DOMSource domSource = new DOMSource( doc );
            StreamResult streamResult = new StreamResult( out );

            TransformerFactory tf = TransformerFactory.newInstance();
            Transformer serializer = tf.newTransformer();
            // TODO set encoding from a command argument
            serializer.setOutputProperty( OutputKeys.ENCODING, "UTF-8" );
            serializer.setOutputProperty( OutputKeys.INDENT, "yes" );
            serializer.setOutputProperty( OutputKeys.METHOD, "html" );
            serializer.transform( domSource, streamResult );
        }
        finally
        {
            // release the file handle even if serialization fails
            out.close();
        }
    }
    catch ( Exception e )
    {
        e.printStackTrace();
    }
}
/**
 * Loads a Word file and converts it into a newly created DOM document.
 *
 * @param docFile
 *            Word file to read
 * @return DOM document holding the generated HTML
 * @throws Exception
 *             if loading the file or creating the DOM document fails
 */
static Document process( File docFile ) throws Exception
{
    final HWPFDocumentCore source = WordToHtmlUtils.loadDoc( docFile );

    final Document target = DocumentBuilderFactory.newInstance()
            .newDocumentBuilder().newDocument();
    final WordToHtmlExtractor extractor = new WordToHtmlExtractor( target );
    extractor.processDocument( source );

    return extractor.getDocument();
}
/**
 * Font properties of the paragraphs currently being rendered; pushed and
 * popped by processParagraph and read by outputCharacters.
 */
private final Stack<BlockProperies> blocksProperies = new Stack<BlockProperies>();
/** Facade over the target DOM document, used to create the HTML nodes. */
private final HtmlDocumentFacade htmlDocumentFacade;
/**
 * Creates an extractor that renders into the given DOM document. One
 * instance may be fed several {@link HWPFDocument}s, which then all end up
 * in the same HTML document.
 *
 * @param document
 *            XML DOM Document used as HTML document
 */
public WordToHtmlExtractor( Document document )
{
    htmlDocumentFacade = new HtmlDocumentFacade( document );
}
/**
 * @return the DOM document held by the HTML facade of this extractor
 */
public Document getDocument()
{
    return this.htmlDocumentFacade.getDocument();
}
/**
 * Appends the text of a character run to the given element, wrapped in a
 * <tt>span</tt>. Font name and size are only written on the span when they
 * differ from the values stored for the current paragraph.
 *
 * @param pElement
 *            element that receives the new <tt>span</tt>
 * @param characterRun
 *            run that supplies the formatting
 * @param text
 *            text to output
 */
@Override
protected void outputCharacters( Element pElement,
        CharacterRun characterRun, String text )
{
    Element span = htmlDocumentFacade.document.createElement( "span" );
    pElement.appendChild( span );

    StringBuilder style = new StringBuilder();
    BlockProperies blockProperies = this.blocksProperies.peek();

    final String fontName = characterRun.getFontName();
    if ( fontName != null
            && !WordToHtmlUtils.equals( fontName, blockProperies.pFontName ) )
    {
        style.append( "font-family: " + fontName + "; " );
    }

    // the run's size is halved (presumably stored in half-points) and
    // written with an explicit unit: a bare number is not a valid CSS length
    final int fontSize = characterRun.getFontSize() / 2;
    if ( fontSize != blockProperies.pFontSize )
    {
        style.append( "font-size: " + fontSize + "pt; " );
    }

    WordToHtmlUtils.addCharactersProperties( characterRun, style );
    if ( style.length() != 0 )
        span.setAttribute( "style", style.toString() );

    Text textNode = htmlDocumentFacade.createText( text );
    span.appendChild( textNode );
}
/**
 * Creates a hyperlink element for the given target, appends it to the
 * current block and renders the character runs of the given range inside
 * it. An empty range produces a link without content.
 */
protected void processHyperlink( HWPFDocumentCore wordDocument,
        Element currentBlock, Paragraph paragraph,
        List<CharacterRun> characterRuns, int currentTableLevel,
        String hyperlink, int beginTextInclusive, int endTextExclusive )
{
    final Element anchor = htmlDocumentFacade.createHyperlink( hyperlink );
    currentBlock.appendChild( anchor );

    if ( beginTextInclusive >= endTextExclusive )
        return;

    processCharacters( wordDocument, currentTableLevel, paragraph, anchor,
            characterRuns, beginTextInclusive, endTextExclusive );
}
/**
 * Hook for storing a picture in an external file and linking to it.
 * Implementations shall convert the image if necessary and store it as PNG,
 * because other formats may be not supported by the user's browser.
 * <p>
 * This default implementation stores nothing; it only leaves an HTML comment
 * that names the suggested file.
 * <p>
 * Please note the
 * {@link WordToHtmlUtils#setPictureProperties(Picture, Element)} method.
 *
 * @param currentBlock
 *            currently processed HTML element, like <tt>p</tt>. Shall be
 *            used as parent of newly created <tt>img</tt>
 * @param inlined
 *            if image is inlined
 * @param picture
 *            HWPF object, contained picture data and properties
 */
protected void processImage( Element currentBlock, boolean inlined,
        Picture picture )
{
    // no default implementation -- leave a marker only
    final String note = "Image link to '" + picture.suggestFullFileName()
            + "' can be here";
    currentBlock.appendChild( htmlDocumentFacade.document
            .createComment( note ) );
}
/**
 * Renders a page reference as a link to the anchor of the same name inside
 * the document ("#" followed by the reference) and outputs the character
 * runs of the given range inside it. An empty range produces a link without
 * content.
 */
protected void processPageref( HWPFDocumentCore hwpfDocument,
        Element currentBlock, Paragraph paragraph,
        List<CharacterRun> characterRuns, int currentTableLevel,
        String pageref, int beginTextInclusive, int endTextExclusive )
{
    final String target = "#" + pageref;
    final Element anchor = htmlDocumentFacade.createHyperlink( target );
    currentBlock.appendChild( anchor );

    if ( beginTextInclusive >= endTextExclusive )
        return;

    processCharacters( hwpfDocument, currentTableLevel, paragraph, anchor,
            characterRuns, beginTextInclusive, endTextExclusive );
}
/**
 * Renders one paragraph as an HTML paragraph element below the given
 * parent.
 * <p>
 * Font name and size of the first character run are written on the
 * paragraph element and remembered while its runs are processed. A
 * paragraph without character runs is appended as an element without a
 * style attribute.
 *
 * @param hwpfDocument
 *            document being converted
 * @param parentFopElement
 *            element that receives the paragraph element
 * @param currentTableLevel
 *            table level passed on to the character processing
 * @param paragraph
 *            paragraph to render
 * @param bulletText
 *            text placed in front of the runs; skipped when empty
 */
protected void processParagraph( HWPFDocumentCore hwpfDocument,
        Element parentFopElement, int currentTableLevel,
        Paragraph paragraph, String bulletText )
{
    final Element paragraphElement = htmlDocumentFacade.createParagraph();
    parentFopElement.appendChild( paragraphElement );

    final StringBuilder css = new StringBuilder();
    WordToHtmlUtils.addParagraphProperties( paragraph, css );

    if ( paragraph.numCharacterRuns() == 0 )
        return;

    final CharacterRun firstRun = paragraph.getCharacterRun( 0 );
    final BlockProperies blockFont;
    if ( firstRun == null )
    {
        blockFont = new BlockProperies( WordToHtmlUtils.EMPTY, -1 );
    }
    else
    {
        final int size = firstRun.getFontSize() / 2;
        final String name = firstRun.getFontName();
        WordToHtmlUtils.addFontFamily( name, css );
        WordToHtmlUtils.addFontSize( size, css );
        blockFont = new BlockProperies( name, size );
    }

    blocksProperies.push( blockFont );
    try
    {
        if ( WordToHtmlUtils.isNotEmpty( bulletText ) )
        {
            paragraphElement.appendChild( htmlDocumentFacade
                    .createText( bulletText ) );
        }

        final List<CharacterRun> runs = WordToHtmlUtils
                .findCharacterRuns( paragraph );
        processCharacters( hwpfDocument, currentTableLevel, paragraph,
                paragraphElement, runs, 0, runs.size() );
    }
    finally
    {
        // always undo the push, even when a run fails to render
        blocksProperies.pop();
    }

    if ( css.length() > 0 )
    {
        paragraphElement.setAttribute( "style", css.toString() );
    }
}
/**
 * Renders a section into its own <tt>div</tt>, styled with the section's
 * margins and columns, and appends that <tt>div</tt> to the HTML body.
 */
protected void processSection( HWPFDocumentCore wordDocument,
        Section section, int sectionCounter )
{
    final Element sectionElement = htmlDocumentFacade.document
            .createElement( "div" );
    final String css = getSectionStyle( section );
    sectionElement.setAttribute( "style", css );
    htmlDocumentFacade.body.appendChild( sectionElement );

    processSectionParagraphes( wordDocument, sectionElement, section, 0 );
}
/**
 * Renders the only section of a document directly into the HTML body; the
 * section style is put on the body element itself instead of a wrapping
 * <tt>div</tt>.
 */
@Override
protected void processSingleSection( HWPFDocumentCore wordDocument,
        Section section )
{
    final Element body = htmlDocumentFacade.body;
    final String css = getSectionStyle( section );
    body.setAttribute( "style", css );

    processSectionParagraphes( wordDocument, htmlDocumentFacade.body,
            section, 0 );
}
/**
 * Renders a Word table as an HTML table and appends it to {@code flow}.
 * <p>
 * Rows flagged as table header go into the table header element, all other
 * rows into the table body. Continuation cells of a merged range are
 * skipped; the first cell of the range carries the matching
 * <tt>colspan</tt> / <tt>rowspan</tt>. The last cell of a row that is
 * shorter than the widest row is stretched with <tt>colspan</tt>. A table
 * without body rows is not appended; a warning is logged instead.
 *
 * @param hwpfDocument
 *            document being converted
 * @param flow
 *            element that receives the table element
 * @param table
 *            table to render
 * @param thisTableLevel
 *            table level passed on when the cell content is processed
 */
protected void processTable( HWPFDocumentCore hwpfDocument, Element flow,
        Table table, int thisTableLevel )
{
    Element tableHeader = htmlDocumentFacade.createTableHeader();
    Element tableBody = htmlDocumentFacade.createTableBody();

    final int tableRows = table.numRows();

    int maxColumns = Integer.MIN_VALUE;
    for ( int r = 0; r < tableRows; r++ )
    {
        maxColumns = Math.max( maxColumns, table.getRow( r ).numCells() );
    }

    for ( int r = 0; r < tableRows; r++ )
    {
        TableRow tableRow = table.getRow( r );

        Element tableRowElement = htmlDocumentFacade.createTableRow();
        StringBuilder tableRowStyle = new StringBuilder();
        WordToHtmlUtils.addTableRowProperties( tableRow, tableRowStyle );

        final int rowCells = tableRow.numCells();
        for ( int c = 0; c < rowCells; c++ )
        {
            TableCell tableCell = tableRow.getCell( c );

            // continuation cells are covered by the span of the first cell
            if ( tableCell.isMerged() && !tableCell.isFirstMerged() )
                continue;

            if ( tableCell.isVerticallyMerged()
                    && !tableCell.isFirstVerticallyMerged() )
                continue;

            Element tableCellElement;
            if ( tableRow.isTableHeader() )
            {
                tableCellElement = htmlDocumentFacade
                        .createTableHeaderCell();
            }
            else
            {
                tableCellElement = htmlDocumentFacade.createTableCell();
            }
            StringBuilder tableCellStyle = new StringBuilder();
            WordToHtmlUtils.addTableCellProperties( tableRow, tableCell,
                    r == 0, r == tableRows - 1, c == 0, c == rowCells - 1,
                    tableCellStyle );

            if ( tableCell.isFirstMerged() )
            {
                tableCellElement.setAttribute( "colspan", ""
                        + countHorizontallyMergedCells( tableRow, c ) );
            }
            else
            {
                // stretch the last cell of a row that is shorter than the
                // widest row of the table
                if ( c == rowCells - 1 && c != maxColumns - 1 )
                {
                    tableCellElement.setAttribute( "colspan", ""
                            + ( maxColumns - c ) );
                }
            }

            if ( tableCell.isFirstVerticallyMerged() )
            {
                tableCellElement.setAttribute( "rowspan", ""
                        + countVerticallyMergedCells( table, r, c ) );
            }

            processSectionParagraphes( hwpfDocument, tableCellElement,
                    tableCell, thisTableLevel );

            // a cell without content still gets an (empty) paragraph
            if ( !tableCellElement.hasChildNodes() )
            {
                tableCellElement.appendChild( htmlDocumentFacade
                        .createParagraph() );
            }
            if ( tableCellStyle.length() > 0 )
                tableCellElement.setAttribute( "style",
                        tableCellStyle.toString() );

            tableRowElement.appendChild( tableCellElement );
        }

        if ( tableRowStyle.length() > 0 )
            tableRowElement
                    .setAttribute( "style", tableRowStyle.toString() );

        if ( tableRow.isTableHeader() )
        {
            tableHeader.appendChild( tableRowElement );
        }
        else
        {
            tableBody.appendChild( tableRowElement );
        }
    }

    final Element tableElement = htmlDocumentFacade.createTable();
    if ( tableHeader.hasChildNodes() )
    {
        tableElement.appendChild( tableHeader );
    }
    if ( tableBody.hasChildNodes() )
    {
        tableElement.appendChild( tableBody );
        flow.appendChild( tableElement );
    }
    else
    {
        logger.log(
                POILogger.WARN,
                "Table without body starting on offset "
                        + table.getStartOffset() + " -- "
                        + table.getEndOffset() );
    }
}

/**
 * Counts the consecutive merged cells of a row, starting at the given
 * column and stopping at the first cell that is not merged.
 */
private static int countHorizontallyMergedCells( TableRow tableRow,
        int firstColumn )
{
    final int rowCells = tableRow.numCells();
    int count = 0;
    for ( int c = firstColumn; c < rowCells; c++ )
    {
        if ( !tableRow.getCell( c ).isMerged() )
            break;
        count++;
    }
    return count;
}

/**
 * Counts the consecutive vertically merged cells of a column, starting at
 * the given row. The scan stops at the first row that has no cell in that
 * column or whose cell is not vertically merged.
 */
private static int countVerticallyMergedCells( Table table, int firstRow,
        int column )
{
    final int tableRows = table.numRows();
    int count = 0;
    for ( int r = firstRow; r < tableRows; r++ )
    {
        TableRow row = table.getRow( r );
        // valid cell indices are 0 .. numCells() - 1, so a row with exactly
        // "column" cells has no cell at this index either
        if ( row.numCells() <= column )
            break;
        if ( !row.getCell( column ).isVerticallyMerged() )
            break;
        count++;
    }
    return count;
}
}

View File

@ -0,0 +1,292 @@
/* ====================================================================
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==================================================================== */
package org.apache.poi.hwpf.extractor;
import org.apache.poi.hwpf.usermodel.BorderCode;
import org.apache.poi.hwpf.usermodel.CharacterProperties;
import org.apache.poi.hwpf.usermodel.CharacterRun;
import org.apache.poi.hwpf.usermodel.Paragraph;
import org.apache.poi.hwpf.usermodel.Picture;
import org.apache.poi.hwpf.usermodel.TableCell;
import org.apache.poi.hwpf.usermodel.TableRow;
import org.w3c.dom.Element;
public class WordToHtmlUtils extends AbstractWordUtils
{
/**
 * Appends a <tt>font-weight</tt> declaration to the style.
 *
 * @param bold
 *            {@code true} writes "bold", {@code false} writes "normal"
 * @param style
 *            buffer that receives the declaration
 */
public static void addBold( final boolean bold, StringBuilder style )
{
    final String weight;
    if ( bold )
    {
        weight = "bold";
    }
    else
    {
        weight = "normal";
    }
    style.append( "font-weight: " ).append( weight ).append( ";" );
}
/**
 * Appends style, color and width declarations for a border. Nothing is
 * written for a missing border or a border of type 0.
 *
 * @param borderCode
 *            border to describe, may be {@code null}
 * @param where
 *            side of the box ("top", "left", ...); when empty, the
 *            declarations apply to all sides
 * @param style
 *            buffer that receives the declarations
 */
public static void addBorder( BorderCode borderCode, String where,
        StringBuilder style )
{
    if ( borderCode == null || borderCode.getBorderType() == 0 )
        return;

    // "border-" for all sides, "border-<side>-" for a single side
    final String prefix = isEmpty( where ) ? "border-" : "border-" + where
            + "-";

    style.append( prefix + "style: " + getBorderType( borderCode ) + "; " );
    style.append( prefix + "color: " + getColor( borderCode.getColor() )
            + "; " );
    style.append( prefix + "width: " + getBorderWidth( borderCode ) + "; " );
}
/**
 * Appends the CSS declarations that describe the formatting of a character
 * run: weight, slant, border, capitalisation, highlight, strike-through,
 * shadow, small caps, sub-/superscript, underline and visibility.
 *
 * @param characterRun
 *            run whose formatting is read
 * @param style
 *            buffer that receives the declarations
 */
public static void addCharactersProperties(
        final CharacterRun characterRun, StringBuilder style )
{
    final CharacterProperties clonedProperties = characterRun
            .cloneProperties();

    if ( characterRun.isBold() )
    {
        style.append( "font-weight: bold; " );
    }
    if ( characterRun.isItalic() )
    {
        style.append( "font-style: italic; " );
    }

    addBorder( clonedProperties.getBrc(), EMPTY, style );

    if ( characterRun.isCapitalized() )
    {
        style.append( "text-transform: uppercase; " );
    }
    if ( characterRun.isHighlighted() )
    {
        style.append( "background-color: "
                + getColor( clonedProperties.getIcoHighlight() ) + "; " );
    }

    // A later text-decoration declaration replaces an earlier one, so a run
    // that is both struck through and underlined needs a single combined
    // declaration.
    final boolean strikeThrough = characterRun.isStrikeThrough();
    final boolean underlined = characterRun.getUnderlineCode() > 0;
    if ( strikeThrough )
    {
        style.append( underlined ? "text-decoration: line-through underline; "
                : "text-decoration: line-through; " );
    }
    if ( characterRun.isShadowed() )
    {
        style.append( "text-shadow: " + characterRun.getFontSize() / 24
                + "pt; " );
    }
    if ( characterRun.isSmallCaps() )
    {
        style.append( "font-variant: small-caps; " );
    }
    // "baseline-shift" is an XSL-FO property; CSS uses vertical-align
    if ( characterRun.getSubSuperScriptIndex() == 1 )
    {
        style.append( "vertical-align: super; " );
        style.append( "font-size: smaller; " );
    }
    if ( characterRun.getSubSuperScriptIndex() == 2 )
    {
        style.append( "vertical-align: sub; " );
        style.append( "font-size: smaller; " );
    }
    if ( underlined && !strikeThrough )
    {
        style.append( "text-decoration: underline; " );
    }
    if ( characterRun.isVanished() )
    {
        style.append( "visibility: hidden; " );
    }
}
/**
 * Appends a <tt>font-family</tt> declaration to the style. Nothing is
 * written for an empty font name.
 *
 * @param fontFamily
 *            font name
 * @param style
 *            buffer that receives the declaration
 */
public static void addFontFamily( final String fontFamily,
        StringBuilder style )
{
    if ( isEmpty( fontFamily ) )
        return;

    // terminate the declaration so that the next one is not glued to it
    style.append( "font-family: " + fontFamily + "; " );
}
/**
 * Appends a <tt>font-size</tt> declaration to the style.
 *
 * @param fontSize
 *            font size, written with the unit <tt>pt</tt>
 * @param style
 *            buffer that receives the declaration
 */
public static void addFontSize( final int fontSize, StringBuilder style )
{
    // a CSS length needs a unit, and the declaration needs a terminator so
    // that the next one is not glued to it
    style.append( "font-size: " + fontSize + "pt; " );
}
/**
 * Appends the indentation and spacing of a paragraph to the style. Values
 * of zero are left out.
 * <p>
 * Uses the CSS margin properties; the XSL-FO names (<tt>start-indent</tt>,
 * <tt>space-before</tt>, ...) are not CSS properties.
 *
 * @param paragraph
 *            paragraph whose indents and spacings are read
 * @param style
 *            buffer that receives the declarations
 */
public static void addIndent( Paragraph paragraph, StringBuilder style )
{
    addIndent( style, "text-indent", paragraph.getFirstLineIndent() );
    addIndent( style, "margin-left", paragraph.getIndentFromLeft() );
    addIndent( style, "margin-right", paragraph.getIndentFromRight() );
    addIndent( style, "margin-top", paragraph.getSpacingBefore() );
    addIndent( style, "margin-bottom", paragraph.getSpacingAfter() );
}
/**
 * Appends one length declaration, converted with {@code TWIPS_PER_PT} and
 * written in points. A value of zero writes nothing.
 *
 * @param style
 *            buffer that receives the declaration
 * @param cssName
 *            name of the property
 * @param twipsValue
 *            value to convert
 */
private static void addIndent( StringBuilder style, final String cssName,
        final int twipsValue )
{
    if ( twipsValue != 0 )
    {
        final String declaration = cssName + ": "
                + ( twipsValue / TWIPS_PER_PT ) + "pt; ";
        style.append( declaration );
    }
}
/**
 * Appends a <tt>text-align</tt> declaration for the justification of the
 * paragraph. Nothing is written when the justification code maps to an
 * empty value.
 *
 * @param paragraph
 *            paragraph whose justification is read
 * @param style
 *            buffer that receives the declaration
 */
public static void addJustification( Paragraph paragraph,
        final StringBuilder style )
{
    final String align = getJustification( paragraph.getJustification() );
    if ( !isNotEmpty( align ) )
        return;

    style.append( "text-align: " + align + "; " );
}
/**
 * Appends the CSS declarations that describe a paragraph: indents,
 * justification, borders, page-break behaviour, hyphenation and white-space
 * handling.
 * <p>
 * Only CSS property names are written. The XSL-FO names
 * (<tt>break-before</tt>, <tt>hyphenate</tt>, <tt>keep-together</tt>,
 * <tt>keep-with-next</tt>, <tt>linefeed-treatment</tt>,
 * <tt>white-space-collapse</tt>) are not CSS properties.
 *
 * @param paragraph
 *            paragraph whose properties are read
 * @param style
 *            buffer that receives the declarations
 */
public static void addParagraphProperties( Paragraph paragraph,
        StringBuilder style )
{
    addIndent( paragraph, style );
    addJustification( paragraph, style );

    addBorder( paragraph.getBottomBorder(), "bottom", style );
    addBorder( paragraph.getLeftBorder(), "left", style );
    addBorder( paragraph.getRightBorder(), "right", style );
    addBorder( paragraph.getTopBorder(), "top", style );

    if ( paragraph.pageBreakBefore() )
    {
        style.append( "page-break-before: always; " );
    }

    style.append( "hyphens: "
            + ( paragraph.isAutoHyphenated() ? "auto" : "manual" ) + "; " );

    if ( paragraph.keepOnPage() )
    {
        style.append( "page-break-inside: avoid; " );
    }

    if ( paragraph.keepWithNext() )
    {
        style.append( "page-break-after: avoid; " );
    }

    // CSS counterpart of "linefeed-treatment: preserve" combined with
    // "white-space-collapse: false"
    style.append( "white-space: pre-wrap; " );
}
/**
 * Appends width, horizontal padding and the four borders of a table cell
 * to the style.
 * <p>
 * A border defined on the cell itself (non-null and with a border type other
 * than 0) wins. Otherwise the row's outer border is used for cells on the
 * corresponding table edge and the row's inner (horizontal or vertical)
 * border for all other cells.
 *
 * @param tableRow
 *            row that owns the cell; supplies gap and fallback borders
 * @param tableCell
 *            cell whose properties are read
 * @param toppest
 *            {@code true} if the cell is in the first row
 * @param bottomest
 *            {@code true} if the cell is in the last row
 * @param leftest
 *            {@code true} if the cell is the first of its row
 * @param rightest
 *            {@code true} if the cell is the last of its row
 * @param style
 *            buffer that receives the declarations
 */
public static void addTableCellProperties( TableRow tableRow,
        TableCell tableCell, boolean toppest, boolean bottomest,
        boolean leftest, boolean rightest, StringBuilder style )
{
    style.append( "width: " + ( tableCell.getWidth() / TWIPS_PER_INCH )
            + "in; " );
    // "padding-start" / "padding-end" are XSL-FO names; CSS uses left/right
    style.append( "padding-left: "
            + ( tableRow.getGapHalf() / TWIPS_PER_INCH ) + "in; " );
    style.append( "padding-right: "
            + ( tableRow.getGapHalf() / TWIPS_PER_INCH ) + "in; " );

    BorderCode top = tableCell.getBrcTop() != null
            && tableCell.getBrcTop().getBorderType() != 0 ? tableCell
            .getBrcTop() : toppest ? tableRow.getTopBorder() : tableRow
            .getHorizontalBorder();
    BorderCode bottom = tableCell.getBrcBottom() != null
            && tableCell.getBrcBottom().getBorderType() != 0 ? tableCell
            .getBrcBottom() : bottomest ? tableRow.getBottomBorder()
            : tableRow.getHorizontalBorder();

    BorderCode left = tableCell.getBrcLeft() != null
            && tableCell.getBrcLeft().getBorderType() != 0 ? tableCell
            .getBrcLeft() : leftest ? tableRow.getLeftBorder() : tableRow
            .getVerticalBorder();
    BorderCode right = tableCell.getBrcRight() != null
            && tableCell.getBrcRight().getBorderType() != 0 ? tableCell
            .getBrcRight() : rightest ? tableRow.getRightBorder()
            : tableRow.getVerticalBorder();

    addBorder( bottom, "bottom", style );
    addBorder( left, "left", style );
    addBorder( right, "right", style );
    addBorder( top, "top", style );
}
/**
 * Appends the height and the page-break behaviour of a table row to the
 * style.
 *
 * @param tableRow
 *            row whose properties are read
 * @param style
 *            buffer that receives the declarations
 */
public static void addTableRowProperties( TableRow tableRow,
        StringBuilder style )
{
    // a non-positive height is not written at all
    if ( tableRow.getRowHeight() > 0 )
    {
        style.append( "height: "
                + ( tableRow.getRowHeight() / TWIPS_PER_INCH ) + "in; " );
    }

    // "keep-together" is an XSL-FO property; the CSS counterpart is
    // page-break-inside.
    // NOTE(review): the declaration is written when cantSplit() is false,
    // which reads inverted -- confirm against TableRow.cantSplit()
    if ( !tableRow.cantSplit() )
    {
        style.append( "page-break-inside: avoid; " );
    }
}
/**
 * Copies the size, scaling, alignment and cropping settings of a picture
 * onto the element that renders it.
 * <p>
 * Goal sizes and crop values are divided by {@code TWIPS_PER_PT} and written
 * with a {@code pt} suffix. A positive aspect ratio is applied as a
 * percentage of the corresponding goal size.
 * <p>
 * NOTE(review): the attribute names (<tt>content-width</tt>,
 * <tt>scaling</tt>, ...) look like XSL-FO properties rather than HTML
 * attributes -- confirm what callers expect on an <tt>img</tt> element.
 *
 * @param picture
 *            picture whose properties are read
 * @param graphicElement
 *            element that receives the attributes
 */
public static void setPictureProperties( Picture picture,
        Element graphicElement )
{
    final int aspectRatioX = picture.getAspectRatioX();
    final int aspectRatioY = picture.getAspectRatioY();

    if ( aspectRatioX > 0 )
    {
        graphicElement
                .setAttribute( "content-width", ( ( picture.getDxaGoal()
                        * aspectRatioX / 100 ) / TWIPS_PER_PT )
                        + "pt" );
    }
    else
    {
        graphicElement.setAttribute( "content-width",
                ( picture.getDxaGoal() / TWIPS_PER_PT ) + "pt" );
    }

    if ( aspectRatioY > 0 )
    {
        graphicElement
                .setAttribute( "content-height", ( ( picture.getDyaGoal()
                        * aspectRatioY / 100 ) / TWIPS_PER_PT )
                        + "pt" );
    }
    else
    {
        graphicElement.setAttribute( "content-height",
                ( picture.getDyaGoal() / TWIPS_PER_PT ) + "pt" );
    }

    // uniform scaling is requested as soon as one of the ratios is missing
    if ( aspectRatioX <= 0 || aspectRatioY <= 0 )
    {
        graphicElement.setAttribute( "scaling", "uniform" );
    }
    else
    {
        graphicElement.setAttribute( "scaling", "non-uniform" );
    }

    graphicElement.setAttribute( "vertical-align", "text-bottom" );

    if ( picture.getDyaCropTop() != 0 || picture.getDxaCropRight() != 0
            || picture.getDyaCropBottom() != 0
            || picture.getDxaCropLeft() != 0 )
    {
        int rectTop = picture.getDyaCropTop() / TWIPS_PER_PT;
        int rectRight = picture.getDxaCropRight() / TWIPS_PER_PT;
        int rectBottom = picture.getDyaCropBottom() / TWIPS_PER_PT;
        int rectLeft = picture.getDxaCropLeft() / TWIPS_PER_PT;
        graphicElement.setAttribute( "clip", "rect(" + rectTop + "pt, "
                + rectRight + "pt, " + rectBottom + "pt, " + rectLeft
                + "pt)" );
        // the clip rectangle only has an effect together with "overflow"
        graphicElement.setAttribute( "overflow", "hidden" );
    }
}
}

View File

@ -0,0 +1,114 @@
/* ====================================================================
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==================================================================== */
package org.apache.poi.hwpf.extractor;
import java.io.File;
import java.io.FilenameFilter;
import java.io.StringWriter;
import java.util.Arrays;
import java.util.List;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.transform.OutputKeys;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;
import junit.framework.Test;
import junit.framework.TestCase;
import junit.framework.TestSuite;
import org.apache.poi.POIDataSamples;
import org.apache.poi.hwpf.HWPFDocumentCore;
/**
 * Builds a JUnit suite that converts every sample <tt>.doc</tt> file twice,
 * once with {@link WordToFoExtractor} and once with
 * {@link WordToHtmlExtractor}. A test passes when the conversion and the
 * serialization of the result finish without an exception.
 */
public class TestWordToExtractorSuite
{
    /**
     * YK: a quick hack to exclude failing documents from the suite.
     */
    private static List<String> failingFiles = Arrays.asList();

    /**
     * @return suite with one "[FO]" and one "[HTML]" test per sample file
     *         that is not listed in {@link #failingFiles}
     */
    public static Test suite()
    {
        TestSuite suite = new TestSuite();

        File directory = POIDataSamples.getDocumentInstance().getFile(
                "../document" );
        for ( final File child : directory.listFiles( new FilenameFilter()
        {
            public boolean accept( File dir, String name )
            {
                return name.endsWith( ".doc" ) && !failingFiles.contains( name );
            }
        } ) )
        {
            final String name = child.getName();
            suite.addTest( new TestCase( name + " [FO]" )
            {
                public void runTest() throws Exception
                {
                    test( child, false );
                }
            } );
            suite.addTest( new TestCase( name + " [HTML]" )
            {
                public void runTest() throws Exception
                {
                    test( child, true );
                }
            } );
        }

        return suite;
    }

    /**
     * Converts one file and serializes the result into memory.
     *
     * @param child
     *            Word file to convert
     * @param html
     *            {@code true} to run the HTML extractor and the "html"
     *            output method, {@code false} to run the XSL-FO extractor
     * @throws Exception
     *             if the conversion or the serialization fails
     */
    protected static void test( File child, boolean html ) throws Exception
    {
        HWPFDocumentCore hwpfDocument;
        try
        {
            hwpfDocument = AbstractWordUtils.loadDoc( child );
        }
        catch ( Exception exc )
        {
            // unable to parse file -- not an extractor fault
            return;
        }

        // run the extractor that the test case is actually named after
        final DOMSource source;
        if ( html )
        {
            WordToHtmlExtractor wordToHtmlExtractor = new WordToHtmlExtractor(
                    DocumentBuilderFactory.newInstance().newDocumentBuilder()
                            .newDocument() );
            wordToHtmlExtractor.processDocument( hwpfDocument );
            source = new DOMSource( wordToHtmlExtractor.getDocument() );
        }
        else
        {
            WordToFoExtractor wordToFoExtractor = new WordToFoExtractor(
                    DocumentBuilderFactory.newInstance().newDocumentBuilder()
                            .newDocument() );
            wordToFoExtractor.processDocument( hwpfDocument );
            source = new DOMSource( wordToFoExtractor.getDocument() );
        }

        StringWriter stringWriter = new StringWriter();

        Transformer transformer = TransformerFactory.newInstance()
                .newTransformer();
        transformer.setOutputProperty( OutputKeys.ENCODING, "utf-8" );
        transformer.setOutputProperty( OutputKeys.INDENT, "yes" );
        // output properties only take effect when set before transform()
        if ( html )
            transformer.setOutputProperty( OutputKeys.METHOD, "html" );
        transformer.transform( source, new StreamResult( stringWriter ) );

        // no exceptions
    }
}

View File

@ -1,92 +0,0 @@
package org.apache.poi.hwpf.extractor;
import java.io.File;
import java.io.FileInputStream;
import java.io.FilenameFilter;
import java.io.StringWriter;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import java.util.Set;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.transform.OutputKeys;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;
import org.apache.poi.EncryptedDocumentException;
import org.apache.poi.hwpf.OldWordFileFormatException;
import junit.framework.Test;
import junit.framework.TestCase;
import junit.framework.TestSuite;
import org.apache.poi.POIDataSamples;
import org.apache.poi.hwpf.HWPFDocument;
/**
 * Builds a JUnit suite that runs every sample <tt>.doc</tt> file through
 * {@link WordToFoExtractor}. A test passes when the conversion and the
 * serialization of the result finish without an exception.
 */
public class TestWordToFoExtractorSuite
{
    /**
     * YK: a quick hack to exclude failing documents from the suite.
     *
     * WordToFoExtractor stumbles on Bug33519.doc with a NPE
     */
    private static List<String> failingFiles = Arrays.asList("Bug33519.doc");

    /**
     * @return suite with one test per sample file that is not listed in
     *         {@link #failingFiles}
     */
    public static Test suite()
    {
        final TestSuite result = new TestSuite();

        final File sampleDirectory = POIDataSamples.getDocumentInstance()
                .getFile( "../document" );
        final FilenameFilter wordFilesOnly = new FilenameFilter()
        {
            public boolean accept( File dir, String name )
            {
                if ( !name.endsWith( ".doc" ) )
                    return false;
                return !failingFiles.contains( name );
            }
        };

        for ( final File sample : sampleDirectory.listFiles( wordFilesOnly ) )
        {
            result.addTest( new TestCase( sample.getName() )
            {
                public void runTest() throws Exception
                {
                    test( sample );
                }
            } );
        }

        return result;
    }

    /**
     * Converts one file to XSL-FO and serializes the result into memory.
     * Files that cannot be parsed are silently skipped.
     *
     * @param child
     *            Word file to convert
     * @throws Exception
     *             if the conversion or the serialization fails
     */
    protected static void test( File child ) throws Exception
    {
        HWPFDocument document;
        FileInputStream input = new FileInputStream( child );
        try
        {
            document = new HWPFDocument( input );
        }
        catch ( Exception exc )
        {
            // unable to parse file -- not WordToFoExtractor fault
            return;
        }
        finally
        {
            input.close();
        }

        WordToFoExtractor extractor = new WordToFoExtractor(
                DocumentBuilderFactory.newInstance().newDocumentBuilder()
                        .newDocument() );
        extractor.processDocument( document );

        StringWriter serialized = new StringWriter();
        Transformer transformer = TransformerFactory.newInstance()
                .newTransformer();
        transformer.setOutputProperty( OutputKeys.INDENT, "yes" );

        DOMSource source = new DOMSource( extractor.getDocument() );
        StreamResult target = new StreamResult( serialized );
        transformer.transform( source, target );
        // no exceptions
    }
}

View File

@ -0,0 +1,95 @@
/* ====================================================================
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==================================================================== */
package org.apache.poi.hwpf.extractor;
import java.io.InputStream;
import java.io.StringWriter;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.transform.OutputKeys;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;
import junit.framework.TestCase;
import org.apache.poi.POIDataSamples;
import org.apache.poi.hwpf.HWPFDocument;
/**
* Test cases for {@link WordToHtmlExtractor}
*
* @author Sergey Vladimirov (vlsergey {at} gmail {dot} com)
*/
public class TestWordToHtmlExtractor extends TestCase
{
    /**
     * Asserts that the generated markup contains the given fragment. On
     * failure the message quotes both the fragment and the full markup, so the
     * cause can be read straight from the test report.
     *
     * @param html
     *            the serialized conversion result
     * @param fragment
     *            the exact text expected somewhere inside {@code html}
     */
    private static void assertContains( final String html,
            final String fragment )
    {
        assertTrue( "Expected to find '" + fragment
                + "' in generated HTML:\n" + html, html.contains( fragment ) );
    }

    /**
     * Converts a sample Word document with {@link WordToHtmlExtractor} and
     * serializes the resulting DOM as indented, UTF-8 encoded HTML.
     *
     * @param sampleFileName
     *            name of the sample, resolved through
     *            {@code POIDataSamples.getDocumentInstance()}
     * @return the serialized HTML text
     * @throws Exception
     *             if the sample cannot be read, converted or serialized
     */
    private static String getHtmlText( final String sampleFileName )
            throws Exception
    {
        HWPFDocument hwpfDocument;
        InputStream sampleStream = POIDataSamples.getDocumentInstance()
                .openResourceAsStream( sampleFileName );
        try
        {
            hwpfDocument = new HWPFDocument( sampleStream );
        }
        finally
        {
            // Release the sample explicitly instead of depending on the
            // document constructor to do it; closing an already closed
            // stream has no effect.
            sampleStream.close();
        }

        WordToHtmlExtractor wordToHtmlExtractor = new WordToHtmlExtractor(
                DocumentBuilderFactory.newInstance().newDocumentBuilder()
                        .newDocument() );
        wordToHtmlExtractor.processDocument( hwpfDocument );

        StringWriter stringWriter = new StringWriter();
        Transformer transformer = TransformerFactory.newInstance()
                .newTransformer();
        transformer.setOutputProperty( OutputKeys.INDENT, "yes" );
        transformer.setOutputProperty( OutputKeys.ENCODING, "utf-8" );
        transformer.setOutputProperty( OutputKeys.METHOD, "html" );
        transformer.transform(
                new DOMSource( wordToHtmlExtractor.getDocument() ),
                new StreamResult( stringWriter ) );
        return stringWriter.toString();
    }

    /**
     * Checks that the long digit sequence from {@code Bug46610_2.doc} appears
     * in the generated HTML.
     */
    public void testBug46610_2() throws Exception
    {
        assertContains(
                getHtmlText( "Bug46610_2.doc" ),
                "012345678911234567892123456789312345678941234567890123456789112345678921234567893123456789412345678" );
    }

    /**
     * Checks that converting {@code equation.doc} emits the placeholder
     * comment naming the image {@code 0.emf}.
     */
    public void testEquation() throws Exception
    {
        assertContains( getHtmlText( "equation.doc" ),
                "<!--Image link to '0.emf' can be here-->" );
    }

    /**
     * Checks that {@code hyperlink.doc} yields an anchor pointing at
     * {@code http://testuri.org/} and that the link text is present.
     */
    public void testHyperlink() throws Exception
    {
        String result = getHtmlText( "hyperlink.doc" );
        assertContains( result, "<a href=\"http://testuri.org/\">" );
        assertContains( result, "Hyperlink text" );
    }

    /**
     * Checks that {@code pageref.doc} yields an anchor pointing at the
     * in-document target {@code #userref}.
     */
    public void testPageref() throws Exception
    {
        String result = getHtmlText( "pageref.doc" );
        assertContains( result, "<a href=\"#userref\">" );
        // NOTE(review): this only verifies that the character "1" occurs
        // somewhere in the output; presumably it is meant to cover the page
        // number rendered for the reference -- confirm and tighten.
        assertContains( result, "1" );
    }
}