diff --git a/src/scratchpad/src/org/apache/poi/hwpf/HWPFDocument.java b/src/scratchpad/src/org/apache/poi/hwpf/HWPFDocument.java index a7090d419..b8a9892f9 100644 --- a/src/scratchpad/src/org/apache/poi/hwpf/HWPFDocument.java +++ b/src/scratchpad/src/org/apache/poi/hwpf/HWPFDocument.java @@ -214,15 +214,10 @@ public final class HWPFDocument extends HWPFDocumentCore _cft = new ComplexFileTable(_mainStream, _tableStream, _fib.getFcClx(), fcMin); _tpt = _cft.getTextPieceTable(); - // Word XP and later all put in a zero filled buffer in - // front of the text. This screws up the system for offsets, - // which assume we always start at zero. This is an adjustment. - int cpMin = _tpt.getCpMin(); - // Now load the rest of the properties, which need to be adjusted // for where text really begin _cbt = new CHPBinTable(_mainStream, _tableStream, _fib.getFcPlcfbteChpx(), _fib.getLcbPlcfbteChpx(), _tpt, true); - _pbt = new PAPBinTable(_mainStream, _tableStream, _dataStream, _fib.getFcPlcfbtePapx(), _fib.getLcbPlcfbtePapx(), cpMin, _tpt, true); + _pbt = new PAPBinTable(_mainStream, _tableStream, _dataStream, _fib.getFcPlcfbtePapx(), _fib.getLcbPlcfbtePapx(), _tpt, true); // Read FSPA and Escher information _fspa = new FSPATable(_tableStream, _fib.getFcPlcspaMom(), _fib.getLcbPlcspaMom(), getTextTable().getTextPieces()); diff --git a/src/scratchpad/src/org/apache/poi/hwpf/converter/HtmlDocumentFacade.java b/src/scratchpad/src/org/apache/poi/hwpf/converter/HtmlDocumentFacade.java index 6a57704a9..186f037a5 100644 --- a/src/scratchpad/src/org/apache/poi/hwpf/converter/HtmlDocumentFacade.java +++ b/src/scratchpad/src/org/apache/poi/hwpf/converter/HtmlDocumentFacade.java @@ -43,6 +43,8 @@ public class HtmlDocumentFacade html.appendChild( head ); html.appendChild( body ); + + body.setAttribute( "style", "white-space-collapsing: preserve; " ); } public void addAuthor( String value ) diff --git a/src/scratchpad/src/org/apache/poi/hwpf/converter/WordToHtmlUtils.java 
b/src/scratchpad/src/org/apache/poi/hwpf/converter/WordToHtmlUtils.java index a1297015a..80e13a453 100644 --- a/src/scratchpad/src/org/apache/poi/hwpf/converter/WordToHtmlUtils.java +++ b/src/scratchpad/src/org/apache/poi/hwpf/converter/WordToHtmlUtils.java @@ -172,7 +172,8 @@ public class WordToHtmlUtils extends AbstractWordUtils style.append( "break-before: page; " ); } - style.append( "hyphenate: " + paragraph.isAutoHyphenated() + "; " ); + style.append( "hyphenate: " + + ( paragraph.isAutoHyphenated() ? "auto" : "none" ) + "; " ); if ( paragraph.keepOnPage() ) { @@ -183,9 +184,6 @@ public class WordToHtmlUtils extends AbstractWordUtils { style.append( "keep-with-next.within-page: always; " ); } - - style.append( "linefeed-treatment: preserve; " ); - style.append( "white-space-collapse: false; " ); } public static void addTableCellProperties( TableRow tableRow, diff --git a/src/scratchpad/src/org/apache/poi/hwpf/model/CHPFormattedDiskPage.java b/src/scratchpad/src/org/apache/poi/hwpf/model/CHPFormattedDiskPage.java index ed5dae220..714f0e1c1 100644 --- a/src/scratchpad/src/org/apache/poi/hwpf/model/CHPFormattedDiskPage.java +++ b/src/scratchpad/src/org/apache/poi/hwpf/model/CHPFormattedDiskPage.java @@ -21,6 +21,8 @@ import java.util.ArrayList; import java.util.List; import org.apache.poi.util.LittleEndian; +import org.apache.poi.util.POILogFactory; +import org.apache.poi.util.POILogger; /** * Represents a CHP fkp. 
The style properties for paragraph and character runs @@ -40,6 +42,9 @@ import org.apache.poi.util.LittleEndian; */ public final class CHPFormattedDiskPage extends FormattedDiskPage { + private static final POILogger logger = POILogFactory + .getLogger( CHPFormattedDiskPage.class ); + private static final int FC_SIZE = 4; private ArrayList _chpxList = new ArrayList(); @@ -79,11 +84,20 @@ public final class CHPFormattedDiskPage extends FormattedDiskPage int startAt = getStart(x); int endAt = getEnd(x); - if (ignoreChpxWithoutTextPieces && !tpt.isIndexInTable( startAt, endAt ) ) { - _chpxList.add(null); - } else { + if (!ignoreChpxWithoutTextPieces || tpt.isIndexInTable( startAt, endAt ) ) + { _chpxList.add(new CHPX(startAt, endAt, tpt, getGrpprl(x))); } + else + { + logger.log( POILogger.WARN, "CHPX [", + Integer.valueOf( startAt ), "; ", + Integer.valueOf( endAt ), + ") (bytes) doesn't have corresponding text pieces " + + "and will be skipped" ); + + _chpxList.add(null); + } } } diff --git a/src/scratchpad/src/org/apache/poi/hwpf/model/OldCHPBinTable.java b/src/scratchpad/src/org/apache/poi/hwpf/model/OldCHPBinTable.java index 3f43c2f1b..479bb3c95 100644 --- a/src/scratchpad/src/org/apache/poi/hwpf/model/OldCHPBinTable.java +++ b/src/scratchpad/src/org/apache/poi/hwpf/model/OldCHPBinTable.java @@ -21,8 +21,6 @@ import java.util.Collections; import org.apache.poi.poifs.common.POIFSConstants; import org.apache.poi.util.LittleEndian; -import org.apache.poi.util.POILogFactory; -import org.apache.poi.util.POILogger; /** * This class holds all of the character formatting @@ -34,9 +32,6 @@ import org.apache.poi.util.POILogger; */ public final class OldCHPBinTable extends CHPBinTable { - private static final POILogger logger = POILogFactory - .getLogger( OldCHPBinTable.class ); - /** * Constructor used to read an old-style binTable * in from a Word document. 
@@ -67,15 +62,8 @@ public final class OldCHPBinTable extends CHPBinTable for (int y = 0; y < fkpSize; y++) { CHPX chpx = cfkp.getCHPX(y); - if (chpx != null && tpt.isIndexInTable( chpx.getStartBytes(), chpx.getEndBytes() )) { + if (chpx != null) _textRuns.add(chpx); - } else { - if ( chpx != null ) - logger.log( POILogger.WARN, "CHPX [", - chpx.getStartBytes(), "; ", chpx.getEndBytes(), - ") (bytes) doesn't have corresponding text pieces " - + "and will be skipped" ); - } } } Collections.sort( _textRuns, PropertyNode.StartComparator.instance ); diff --git a/src/scratchpad/src/org/apache/poi/hwpf/model/OldPAPBinTable.java b/src/scratchpad/src/org/apache/poi/hwpf/model/OldPAPBinTable.java index 3cc096455..9f5a43fe6 100644 --- a/src/scratchpad/src/org/apache/poi/hwpf/model/OldPAPBinTable.java +++ b/src/scratchpad/src/org/apache/poi/hwpf/model/OldPAPBinTable.java @@ -21,8 +21,6 @@ import java.util.Collections; import org.apache.poi.poifs.common.POIFSConstants; import org.apache.poi.util.LittleEndian; -import org.apache.poi.util.POILogFactory; -import org.apache.poi.util.POILogger; /** * This class holds all of the paragraph formatting @@ -34,8 +32,6 @@ import org.apache.poi.util.POILogger; */ public final class OldPAPBinTable extends PAPBinTable { - private static final POILogger logger = POILogFactory - .getLogger( OldPAPBinTable.class ); public OldPAPBinTable(byte[] documentStream, int offset, int size, int fcMin, TextPieceTable tpt) @@ -51,21 +47,15 @@ public final class OldPAPBinTable extends PAPBinTable int pageOffset = POIFSConstants.SMALLER_BIG_BLOCK_SIZE * pageNum; PAPFormattedDiskPage pfkp = new PAPFormattedDiskPage(documentStream, - documentStream, pageOffset, fcMin, tpt); + documentStream, pageOffset, tpt, true); int fkpSize = pfkp.size(); for (int y = 0; y < fkpSize; y++) { PAPX papx = pfkp.getPAPX(y); - if (papx != null && tpt.isIndexInTable( papx.getStartBytes(), papx.getEndBytes() )) { + if (papx != null) { _paragraphs.add(papx); - } else { - if ( papx 
!= null ) - logger.log( POILogger.WARN, "PAPX [", - papx.getStartBytes(), "; ", papx.getEndBytes(), - ") (bytes) doesn't have corresponding text pieces " - + "and will be skipped" ); } } } diff --git a/src/scratchpad/src/org/apache/poi/hwpf/model/PAPBinTable.java b/src/scratchpad/src/org/apache/poi/hwpf/model/PAPBinTable.java index 8dc28a45a..db55fea41 100644 --- a/src/scratchpad/src/org/apache/poi/hwpf/model/PAPBinTable.java +++ b/src/scratchpad/src/org/apache/poi/hwpf/model/PAPBinTable.java @@ -56,13 +56,12 @@ public class PAPBinTable byte[] dataStream, int offset, int size, int fcMin, TextPieceTable tpt ) { - this( documentStream, tableStream, dataStream, offset, size, fcMin, - tpt, true ); + this( documentStream, tableStream, dataStream, offset, size, tpt, true ); } public PAPBinTable( byte[] documentStream, byte[] tableStream, - byte[] dataStream, int offset, int size, int fcMin, - TextPieceTable tpt, boolean ignorePapxWithoutTextPieces ) + byte[] dataStream, int offset, int size, TextPieceTable tpt, + boolean ignorePapxWithoutTextPieces ) { PlexOfCps binTable = new PlexOfCps(tableStream, offset, size, 4); this.tpt = tpt; @@ -76,7 +75,7 @@ public class PAPBinTable int pageOffset = POIFSConstants.SMALLER_BIG_BLOCK_SIZE * pageNum; PAPFormattedDiskPage pfkp = new PAPFormattedDiskPage(documentStream, - dataStream, pageOffset, fcMin, tpt); + dataStream, pageOffset, tpt, ignorePapxWithoutTextPieces); int fkpSize = pfkp.size(); @@ -84,8 +83,7 @@ public class PAPBinTable { PAPX papx = pfkp.getPAPX(y); - //we don't need PAPX if they are references nowhere - if (!ignorePapxWithoutTextPieces || tpt.isIndexInTable( papx.getStartBytes(), papx.getEndBytes() )) + if (papx != null) _paragraphs.add(papx); } } diff --git a/src/scratchpad/src/org/apache/poi/hwpf/model/PAPFormattedDiskPage.java b/src/scratchpad/src/org/apache/poi/hwpf/model/PAPFormattedDiskPage.java index 14c677db4..1035698b8 100644 --- a/src/scratchpad/src/org/apache/poi/hwpf/model/PAPFormattedDiskPage.java +++ 
b/src/scratchpad/src/org/apache/poi/hwpf/model/PAPFormattedDiskPage.java @@ -22,6 +22,8 @@ import java.util.Arrays; import java.util.List; import org.apache.poi.util.LittleEndian; +import org.apache.poi.util.POILogFactory; +import org.apache.poi.util.POILogger; /** * Represents a PAP FKP. The style properties for paragraph and character runs @@ -40,6 +42,8 @@ import org.apache.poi.util.LittleEndian; * @author Ryan Ackley */ public final class PAPFormattedDiskPage extends FormattedDiskPage { + private static final POILogger logger = POILogFactory + .getLogger( PAPFormattedDiskPage.class ); private static final int BX_SIZE = 13; private static final int FC_SIZE = 4; @@ -56,17 +60,45 @@ public final class PAPFormattedDiskPage extends FormattedDiskPage { /** * Creates a PAPFormattedDiskPage from a 512 byte array + * + * @deprecated Use + * {@link #PAPFormattedDiskPage(byte[],byte[],int,TextPieceTable,boolean)} + * instead */ - public PAPFormattedDiskPage(byte[] documentStream, byte[] dataStream, int offset, int fcMin, TextPieceTable tpt) + public PAPFormattedDiskPage( byte[] documentStream, byte[] dataStream, + int offset, int fcMin, TextPieceTable tpt ) { - super(documentStream, offset); - for (int x = 0; x < _crun; x++) { - int startAt = getStart(x); - int endAt = getEnd(x); - _papxList.add(new PAPX(startAt, endAt, tpt, getGrpprl(x), getParagraphHeight(x), dataStream)); - } - _fkp = null; - _dataStream = dataStream; + this( documentStream, dataStream, offset, tpt, true ); + } + + /** + * Creates a PAPFormattedDiskPage from a 512 byte array + */ + public PAPFormattedDiskPage( byte[] documentStream, byte[] dataStream, + int offset, TextPieceTable tpt, boolean ignorePapxWithoutTextPieces ) + { + super( documentStream, offset ); + for ( int x = 0; x < _crun; x++ ) + { + int startAt = getStart( x ); + int endAt = getEnd( x ); + if ( !ignorePapxWithoutTextPieces + || tpt.isIndexInTable( startAt, endAt ) ) + _papxList.add( new PAPX( startAt, endAt, tpt, getGrpprl( x ), 
+ getParagraphHeight( x ), dataStream ) ); + else + { + logger.log( POILogger.WARN, "PAPX [", + Integer.valueOf( startAt ), "; ", + Integer.valueOf( endAt ), + ") (bytes) doesn't have corresponding text pieces " + + "and will be skipped" ); + + _papxList.add( null ); + } + } + _fkp = null; + _dataStream = dataStream; } /** diff --git a/src/scratchpad/testcases/org/apache/poi/hwpf/converter/TestWordToHtmlConverter.java b/src/scratchpad/testcases/org/apache/poi/hwpf/converter/TestWordToHtmlConverter.java index 6c8661431..ab9daa51f 100644 --- a/src/scratchpad/testcases/org/apache/poi/hwpf/converter/TestWordToHtmlConverter.java +++ b/src/scratchpad/testcases/org/apache/poi/hwpf/converter/TestWordToHtmlConverter.java @@ -26,6 +26,7 @@ import javax.xml.transform.dom.DOMSource; import javax.xml.transform.stream.StreamResult; import junit.framework.TestCase; + import org.apache.poi.POIDataSamples; import org.apache.poi.hwpf.HWPFDocument; @@ -68,6 +69,13 @@ public class TestWordToHtmlConverter extends TestCase assertTrue( result.substring( 0, 2000 ).contains( "" ) ); } + public void testBug33519() throws Exception + { + String result = getHtmlText( "Bug33519.doc" ); + assertTrue( result.contains( "Планински турове" ) ); + assertTrue( result.contains( "Явор Асенов" ) ); + } + public void testBug46610_2() throws Exception { String result = getHtmlText( "Bug46610_2.doc" ); diff --git a/src/scratchpad/testcases/org/apache/poi/hwpf/model/TestPAPBinTable.java b/src/scratchpad/testcases/org/apache/poi/hwpf/model/TestPAPBinTable.java index 20b268033..8bd102a16 100644 --- a/src/scratchpad/testcases/org/apache/poi/hwpf/model/TestPAPBinTable.java +++ b/src/scratchpad/testcases/org/apache/poi/hwpf/model/TestPAPBinTable.java @@ -21,6 +21,7 @@ import java.io.ByteArrayOutputStream; import java.util.ArrayList; import junit.framework.TestCase; + import org.apache.poi.hwpf.HWPFDocFixture; import org.apache.poi.hwpf.model.io.HWPFFileSystem; @@ -38,9 +39,8 @@ public final class TestPAPBinTable 
FileInformationBlock fib = _hWPFDocFixture._fib; byte[] mainStream = _hWPFDocFixture._mainStream; byte[] tableStream = _hWPFDocFixture._tableStream; - int fcMin = fib.getFcMin(); - _pAPBinTable = new PAPBinTable(mainStream, tableStream, null, fib.getFcPlcfbtePapx(), fib.getLcbPlcfbtePapx(), fcMin, fakeTPT, false); + _pAPBinTable = new PAPBinTable(mainStream, tableStream, null, fib.getFcPlcfbtePapx(), fib.getLcbPlcfbtePapx(), fakeTPT, false); HWPFFileSystem fileSys = new HWPFFileSystem(); @@ -51,7 +51,7 @@ public final class TestPAPBinTable byte[] newTableStream = tableOut.toByteArray(); byte[] newMainStream = mainOut.toByteArray(); - PAPBinTable newBinTable = new PAPBinTable(newMainStream, newTableStream, null,0, newTableStream.length, 0, fakeTPT, false); + PAPBinTable newBinTable = new PAPBinTable(newMainStream, newTableStream, null,0, newTableStream.length, fakeTPT, false); ArrayList oldTextRuns = _pAPBinTable.getParagraphs(); ArrayList newTextRuns = newBinTable.getParagraphs();