improved HWPF to better handle unicode, patch provided by Benjamin Engele and Maxim Valyanskiy, see Bugzilla #46610

git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@786505 13f79535-47bb-0310-9956-ffa450edef68
2009-06-19 13:45:55 +00:00 · 2009-06-19 13:45:55 +00:00 · 6ce1ef7d06
commit 6ce1ef7d06
parent 03b8a9686a
16 changed files with 187 additions and 87 deletions
--- a/src/documentation/content/xdocs/status.xml
+++ b/src/documentation/content/xdocs/status.xml
@ -33,6 +33,7 @@
    <changes>
        <release version="3.5-beta7" date="2009-??-??">
           <action dev="POI-DEVELOPERS" type="fix">46610 - Improved HWPF to better handle unicode</action>
           <action dev="POI-DEVELOPERS" type="fix">47261 - Fixed SlideShow#removeSlide to remove references to Notes</action>
           <action dev="POI-DEVELOPERS" type="fix">47375 - Fixed HSSFHyperlink to correctly set inter-sheet and file links</action>
           <action dev="POI-DEVELOPERS" type="fix">47384 - Fixed ExternalNameRecord to handle unicode names</action>
--- a/src/scratchpad/src/org/apache/poi/hwpf/model/BytePropertyNode.java
+++ b/src/scratchpad/src/org/apache/poi/hwpf/model/BytePropertyNode.java
@ -25,37 +25,28 @@ package org.apache.poi.hwpf.model;
 *  and characters.
 */
 public abstract class BytePropertyNode extends PropertyNode {
-	private boolean isUnicode;
+        private final int startBytes;
        private final int endBytes;
 	/**
 	 * @param fcStart The start of the text for this property, in _bytes_
 	 * @param fcEnd The end of the text for this property, in _bytes_
 	 */
-	public BytePropertyNode(int fcStart, int fcEnd, Object buf, boolean isUnicode) {
+	public BytePropertyNode(int fcStart, int fcEnd, CharIndexTranslator translator, Object buf) {
 		super(
-				generateCp(fcStart, isUnicode),
+				translator.getCharIndex(fcStart),
-				generateCp(fcEnd, isUnicode),
+				translator.getCharIndex(fcEnd),
 				buf
 		);
-		this.isUnicode = isUnicode;
+                this.startBytes = fcStart;
-	}
+                this.endBytes = fcEnd;
 	private static int generateCp(int val, boolean isUnicode) {
 		if(isUnicode)
 			return val/2;
 		return val;
 	}
 	public boolean isUnicode() {
 		return isUnicode;
 	}
 	public int getStartBytes() {
-		if(isUnicode)
+                return startBytes;
 			return getStart()*2;
 		return getStart();
 	}
 	public int getEndBytes() {
-		if(isUnicode)
+                return endBytes;
 			return getEnd()*2;
 		return getEnd();
 	}
 }
--- a/src/scratchpad/src/org/apache/poi/hwpf/model/CHPBinTable.java
+++ b/src/scratchpad/src/org/apache/poi/hwpf/model/CHPBinTable.java
@ -119,9 +119,8 @@ public final class CHPBinTable
  public void insert(int listIndex, int cpStart, SprmBuffer buf)
  {
 	boolean needsToBeUnicode = tpt.isUnicodeAtCharOffset(cpStart);
-    CHPX insertChpx = new CHPX(0, 0, buf, needsToBeUnicode);
+    CHPX insertChpx = new CHPX(0, 0, tpt,buf);
    // Ensure character offsets are really characters
    insertChpx.setStart(cpStart);
@ -141,7 +140,7 @@ public final class CHPBinTable
    	//  Original, until insert at point
    	//  New one
    	//  Clone of original, on to the old end
-        CHPX clone = new CHPX(0, 0, chpx.getSprmBuf(), needsToBeUnicode);
+        CHPX clone = new CHPX(0, 0, tpt,chpx.getSprmBuf());
        // Again ensure contains character based offsets no matter what
        clone.setStart(cpStart);
        clone.setEnd(chpx.getEnd());
--- a/src/scratchpad/src/org/apache/poi/hwpf/model/CHPFormattedDiskPage.java
+++ b/src/scratchpad/src/org/apache/poi/hwpf/model/CHPFormattedDiskPage.java
@ -60,8 +60,9 @@ public final class CHPFormattedDiskPage extends FormattedDiskPage
      for (int x = 0; x < _crun; x++)
      {
-    	boolean isUnicode = tpt.isUnicodeAtByteOffset( getStart(x) );
+    	int startAt = getStart(x);
-        _chpxList.add(new CHPX(getStart(x) - fcMin, getEnd(x) - fcMin, getGrpprl(x), isUnicode));
+		int endAt = getEnd(x);
 		_chpxList.add(new CHPX(startAt, endAt, tpt, getGrpprl(x)));
      }
    }
--- a/src/scratchpad/src/org/apache/poi/hwpf/model/CHPX.java
+++ b/src/scratchpad/src/org/apache/poi/hwpf/model/CHPX.java
@ -34,14 +34,14 @@ import org.apache.poi.hwpf.sprm.CharacterSprmUncompressor;
 public final class CHPX extends BytePropertyNode
 {
-  public CHPX(int fcStart, int fcEnd, byte[] grpprl, boolean isUnicode)
+  public CHPX(int fcStart, int fcEnd, CharIndexTranslator translator, byte[] grpprl)
  {
-    super(fcStart, fcEnd, new SprmBuffer(grpprl), isUnicode);
+    super(fcStart, fcEnd, translator, new SprmBuffer(grpprl));
  }
-  public CHPX(int fcStart, int fcEnd, SprmBuffer buf, boolean isUnicode)
+  public CHPX(int fcStart, int fcEnd, CharIndexTranslator translator, SprmBuffer buf)
  {
-    super(fcStart, fcEnd, buf, isUnicode);
+    super(fcStart, fcEnd, translator ,buf);
  }
--- a/src/scratchpad/src/org/apache/poi/hwpf/model/CharIndexTranslator.java
+++ b/src/scratchpad/src/org/apache/poi/hwpf/model/CharIndexTranslator.java
@ -0,0 +1,40 @@
 /* ====================================================================
   Licensed to the Apache Software Foundation (ASF) under one or more
   contributor license agreements.  See the NOTICE file distributed with
   this work for additional information regarding copyright ownership.
   The ASF licenses this file to You under the Apache License, Version 2.0
   (the "License"); you may not use this file except in compliance with
   the License.  You may obtain a copy of the License at
       http://www.apache.org/licenses/LICENSE-2.0
   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
 ==================================================================== */
 package org.apache.poi.hwpf.model;
 /**
  * Contract for objects that can translate a byte position into a character
  * index, and that can tell whether the text stored at a byte offset is
  * unicode.
  */
 public interface CharIndexTranslator {
    /**
     * Calculates the char index of the given byte index.
     *
     * @param bytePos The byte position to translate
     * @return the char index
     */
    int getCharIndex(int bytePos);
    /**
     * Is the text at the given byte offset unicode, or plain old ascii? In a
     * very evil fashion, you have to actually know this to make sense of
     * character and paragraph properties :(
     *
     * @param bytePos The byte offset to check about
     * @return true if the text at the given byte offset is unicode
     */
    boolean isUnicodeAtByteOffset(int bytePos);
 }
--- a/src/scratchpad/src/org/apache/poi/hwpf/model/PAPBinTable.java
+++ b/src/scratchpad/src/org/apache/poi/hwpf/model/PAPBinTable.java
@ -76,9 +76,8 @@ public final class PAPBinTable
  public void insert(int listIndex, int cpStart, SprmBuffer buf)
  {
    boolean needsToBeUnicode = tpt.isUnicodeAtCharOffset(cpStart);
-    PAPX forInsert = new PAPX(0, 0, buf, _dataStream, needsToBeUnicode);
+    PAPX forInsert = new PAPX(0, 0, tpt, buf, _dataStream);
    // Ensure character offsets are really characters
    forInsert.setStart(cpStart);
@ -108,7 +107,7 @@ public final class PAPBinTable
    	//  Original, until insert at point
    	//  New one
    	//  Clone of original, on to the old end
-        PAPX clone = new PAPX(0, 0, clonedBuf, _dataStream, needsToBeUnicode);
+        PAPX clone = new PAPX(0, 0, tpt, clonedBuf, _dataStream);
        // Again ensure contains character based offsets no matter what
        clone.setStart(cpStart);
        clone.setEnd(currentPap.getEnd());
--- a/src/scratchpad/src/org/apache/poi/hwpf/model/PAPFormattedDiskPage.java
+++ b/src/scratchpad/src/org/apache/poi/hwpf/model/PAPFormattedDiskPage.java
@ -62,14 +62,10 @@ public final class PAPFormattedDiskPage extends FormattedDiskPage
    public PAPFormattedDiskPage(byte[] documentStream, byte[] dataStream, int offset, int fcMin, TextPieceTable tpt)
    {
      super(documentStream, offset);
      for (int x = 0; x < _crun; x++) {
-         int startAt = getStart(x) - fcMin;
+         int startAt = getStart(x);
-         int endAt = getEnd(x) - fcMin;
+         int endAt = getEnd(x);
-    	 boolean isUnicode = tpt.isUnicodeAtByteOffset(startAt);
+         _papxList.add(new PAPX(startAt, endAt, tpt, getGrpprl(x), getParagraphHeight(x), dataStream));
         //System.err.println(startAt + " -> " + endAt + " = " + isUnicode);
         _papxList.add(new PAPX(startAt, endAt, getGrpprl(x), getParagraphHeight(x), dataStream, isUnicode));
      }
      _fkp = null;
      _dataStream = dataStream;
--- a/src/scratchpad/src/org/apache/poi/hwpf/model/PAPX.java
+++ b/src/scratchpad/src/org/apache/poi/hwpf/model/PAPX.java
@ -40,18 +40,18 @@ public final class PAPX extends BytePropertyNode {
  private ParagraphHeight _phe;
  private int _hugeGrpprlOffset = -1;
-  public PAPX(int fcStart, int fcEnd, byte[] papx, ParagraphHeight phe, byte[] dataStream, boolean isUnicode)
+  public PAPX(int fcStart, int fcEnd, CharIndexTranslator translator, byte[] papx, ParagraphHeight phe, byte[] dataStream)
  {
-    super(fcStart, fcEnd, new SprmBuffer(papx), isUnicode);
+    super(fcStart, fcEnd, translator, new SprmBuffer(papx));
    _phe = phe;
    SprmBuffer buf = findHuge(new SprmBuffer(papx), dataStream);
    if(buf != null)
      _buf = buf;
  }
-  public PAPX(int fcStart, int fcEnd, SprmBuffer buf, byte[] dataStream, boolean isUnicode)
+  public PAPX(int fcStart, int fcEnd, CharIndexTranslator translator, SprmBuffer buf, byte[] dataStream)
  {
-    super(fcStart, fcEnd, buf, isUnicode);
+    super(fcStart, fcEnd, translator, buf);
    _phe = new ParagraphHeight();
    buf = findHuge(buf, dataStream);
    if(buf != null)
--- a/src/scratchpad/src/org/apache/poi/hwpf/model/SEPX.java
+++ b/src/scratchpad/src/org/apache/poi/hwpf/model/SEPX.java
@ -28,9 +28,9 @@ public final class SEPX extends BytePropertyNode
  SectionDescriptor _sed;
-  public SEPX(SectionDescriptor sed, int start, int end, byte[] grpprl, boolean isUnicode)
+  public SEPX(SectionDescriptor sed, int start, int end, CharIndexTranslator translator, byte[] grpprl)
  {
-    super(start, end, SectionSprmUncompressor.uncompressSEP(grpprl, 0), isUnicode);
+    super(start, end, translator, SectionSprmUncompressor.uncompressSEP(grpprl, 0));
    _sed = sed;
  }
--- a/src/scratchpad/src/org/apache/poi/hwpf/model/SectionTable.java
+++ b/src/scratchpad/src/org/apache/poi/hwpf/model/SectionTable.java
@ -61,13 +61,10 @@ public final class SectionTable
      int startAt = CPtoFC(node.getStart());
      int endAt = CPtoFC(node.getEnd());
      boolean isUnicodeAtStart = tpt.isUnicodeAtByteOffset( startAt );
 //      System.err.println(startAt + " -> " + endAt + " = " + isUnicodeAtStart);
      // check for the optimization
      if (fileOffset == 0xffffffff)
      {
-        _sections.add(new SEPX(sed, startAt, endAt, new byte[0], isUnicodeAtStart));
+        _sections.add(new SEPX(sed, startAt, endAt, tpt, new byte[0]));
      }
      else
      {
@ -76,7 +73,7 @@ public final class SectionTable
        byte[] buf = new byte[sepxSize];
        fileOffset += LittleEndian.SHORT_SIZE;
        System.arraycopy(documentStream, fileOffset, buf, 0, buf.length);
-        _sections.add(new SEPX(sed, startAt, endAt, buf, isUnicodeAtStart));
+        _sections.add(new SEPX(sed, startAt, endAt, tpt, buf));
      }
    }
@ -138,33 +135,13 @@ public final class SectionTable
      }
      int FC = TP.getPieceDescriptor().getFilePosition();
      int offset = CP - TP.getCP();
-      FC = FC+offset-((TextPiece)_text.get(0)).getPieceDescriptor().getFilePosition();
+      if (TP.isUnicode()) {
        offset = offset*2;
      }
      FC = FC+offset;
      return FC;
    }
    // Ryans code
    private int FCtoCP(int fc)
   {
     int size = _text.size();
     int cp = 0;
     for (int x = 0; x < size; x++)
     {
       TextPiece piece = (TextPiece)_text.get(x);
       if (fc <= piece.getEnd())
       {
         cp += (fc - piece.getStart());
         break;
       }
       else
       {
         cp += (piece.getEnd() - piece.getStart());
       }
     }
     return cp;
   }
  public ArrayList getSections()
  {
    return _sections;
@ -205,7 +182,7 @@ public final class SectionTable
      // Line using Ryan's FCtoCP() conversion method -
      // unable to observe any effect on our testcases when using this code - piers
-      GenericPropertyNode property = new GenericPropertyNode(FCtoCP(sepx.getStartBytes()), FCtoCP(sepx.getEndBytes()), sed.toByteArray());
+      GenericPropertyNode property = new GenericPropertyNode(tpt.getCharIndex(sepx.getStartBytes()), tpt.getCharIndex(sepx.getEndBytes()), sed.toByteArray());
      plex.addProperty(property);
--- a/src/scratchpad/src/org/apache/poi/hwpf/model/TextPieceTable.java
+++ b/src/scratchpad/src/org/apache/poi/hwpf/model/TextPieceTable.java
@ -37,7 +37,7 @@ import java.util.List;
 *  conversion.
 * @author Ryan Ackley
 */
-public final class TextPieceTable
+public final class TextPieceTable implements CharIndexTranslator
 {
  protected ArrayList _textPieces = new ArrayList();
  //int _multiple;
@ -150,31 +150,25 @@ public final class TextPieceTable
 	  // If they ask off the end, just go with the last one...
 	  return lastWas;
  }
-  /**
+
   * Is the text at the given byte offset
   *  unicode, or plain old ascii?
   * In a very evil fashion, you have to actually
   *  know this to make sense of character and
   *  paragraph properties :(
   * @param bytePos The byte offset to check about
   */
  public boolean isUnicodeAtByteOffset(int bytePos) {
 	  boolean lastWas = false;
-	  int curByte = 0;
+	 
 	  Iterator it = _textPieces.iterator();
 	  while(it.hasNext()) {
 		  TextPiece tp = (TextPiece)it.next();
-		  int nextByte = curByte + tp.bytesLength();
+		  int curByte = tp.getPieceDescriptor().getFilePosition();
 		  int pieceEnd = curByte + tp.bytesLength();
 		  // If the text piece covers the character, all good
-		  if(curByte <= bytePos && nextByte >= bytePos) {
+		  if(curByte <= bytePos && pieceEnd > bytePos) {
 			  return tp.isUnicode();
 		  }
 		  // Otherwise keep track for the last one
 		  lastWas = tp.isUnicode();
 		  // Move along
-		  curByte = nextByte;
+		  curByte = pieceEnd;
 	  }
 	  // If they ask off the end, just go with the last one...
@ -268,4 +262,34 @@ public final class TextPieceTable
    }
    return false;
  }
  	/**
 	 * Calculates the char index of the given byte index by walking the text
 	 * pieces in list order.
 	 *
 	 * For every piece whose file position lies before <code>bytePos</code>,
 	 * the number of its bytes located before <code>bytePos</code> is added
 	 * to the result: as-is for non-unicode pieces, halved (integer division)
 	 * for unicode pieces. The walk stops at the first piece whose file
 	 * position is at or after <code>bytePos</code>.
 	 *
 	 * @param bytePos the byte position to translate
 	 * @return the accumulated character count
 	 * @see org.apache.poi.hwpf.model.CharIndexTranslator#getCharIndex(int)
 	 */
 	public int getCharIndex(int bytePos) {
 		int chars = 0;
 		for (Iterator pieces = _textPieces.iterator(); pieces.hasNext();) {
 			TextPiece piece = (TextPiece) pieces.next();
 			int start = piece.getPieceDescriptor().getFilePosition();
 			if (start >= bytePos) {
 				// This piece begins at or after the requested position.
 				break;
 			}
 			int length = piece.bytesLength();
 			int end = start + length;
 			// Bytes of this piece that precede bytePos: the whole piece when
 			// bytePos is past its end, otherwise only up to bytePos.
 			int covered = Math.min(bytePos, end) - start;
 			chars += piece.isUnicode() ? covered / 2 : covered;
 		}
 		return chars;
 	}
 }
--- a/src/scratchpad/testcases/org/apache/poi/hwpf/data/Bug46610_1.doc
+++ b/src/scratchpad/testcases/org/apache/poi/hwpf/data/Bug46610_1.doc
--- a/src/scratchpad/testcases/org/apache/poi/hwpf/data/Bug46610_2.doc
+++ b/src/scratchpad/testcases/org/apache/poi/hwpf/data/Bug46610_2.doc
--- a/src/scratchpad/testcases/org/apache/poi/hwpf/data/Bug46610_3.doc
+++ b/src/scratchpad/testcases/org/apache/poi/hwpf/data/Bug46610_3.doc
--- a/src/scratchpad/testcases/org/apache/poi/hwpf/usermodel/TestBug46610.java
+++ b/src/scratchpad/testcases/org/apache/poi/hwpf/usermodel/TestBug46610.java
@ -0,0 +1,72 @@
 /* ====================================================================
   Licensed to the Apache Software Foundation (ASF) under one or more
   contributor license agreements.  See the NOTICE file distributed with
   this work for additional information regarding copyright ownership.
   The ASF licenses this file to You under the Apache License, Version 2.0
   (the "License"); you may not use this file except in compliance with
   the License.  You may obtain a copy of the License at
       http://www.apache.org/licenses/LICENSE-2.0
   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
 ==================================================================== */
 package org.apache.poi.hwpf.usermodel;
 import junit.framework.TestCase;
 import java.io.FileInputStream;
 import org.apache.poi.hwpf.usermodel.CharacterRun;
 import org.apache.poi.hwpf.usermodel.Paragraph;
 import org.apache.poi.hwpf.usermodel.Range;
 import org.apache.poi.hwpf.HWPFDocument;
 /**
  * Tests for Bugzilla #46610: opens the Bug46610 sample documents and walks
  * every paragraph and character run of each one.
  */
 public class TestBug46610 extends TestCase {

   /** Directory of the sample documents, taken from a system property in setUp(). */
   private String dirname;

   protected void setUp() throws Exception {
     dirname = System.getProperty("HWPF.testdata.path");
   }

   /** Extracts the text of the first sample document. */
   public void testUtf() throws Exception {
     runExtract(loadDocument("/Bug46610_1.doc"));
   }

   /** Extracts the text of the second sample document. */
   public void testUtf2() throws Exception {
     runExtract(loadDocument("/Bug46610_2.doc"));
   }

   /** Extracts the third sample document and checks that an expected word is present. */
   public void testExtraction() throws Exception {
     String text = runExtract(loadDocument("/Bug46610_3.doc"));
     assertTrue(text.contains("\u0421\u0412\u041e\u042e"));
   }

   /**
    * Opens the document stored under the test data directory.
    *
    * @param relativePath file name, including its leading slash, appended to the directory
    * @return the loaded document
    */
   private HWPFDocument loadDocument(String relativePath) throws Exception {
     FileInputStream in = new FileInputStream(dirname + relativePath);
     return new HWPFDocument(in);
   }

   /**
    * Concatenates the text of every paragraph in the document, one line per
    * paragraph, and also requests the text of each character run.
    *
    * @param doc the document to read
    * @return the paragraph texts, each followed by a newline
    */
   private String runExtract(HWPFDocument doc) {
     StringBuffer out = new StringBuffer();
     Range globalRange = doc.getRange();
     int i = 0;
     while (i < globalRange.numParagraphs()) {
       Paragraph p = globalRange.getParagraph(i);
       out.append(p.text()).append("\n");
       int j = 0;
       while (j < p.numCharacterRuns()) {
         CharacterRun characterRun = p.getCharacterRun(j);
         // Return value is deliberately unused; presumably the call is only
         // made to confirm that reading the run's text does not throw.
         characterRun.text();
         j++;
       }
       i++;
     }
     return out.toString();
   }
 }