improved HWPF to better handle unicode, patch provided by Benjamin Engele and Maxim Valyanskiy, see Bugzilla #46610

git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@786505 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Yegor Kozlov 2009-06-19 13:45:55 +00:00
parent 03b8a9686a
commit 6ce1ef7d06
16 changed files with 187 additions and 87 deletions

View File

@ -33,6 +33,7 @@
<changes>
<release version="3.5-beta7" date="2009-??-??">
<action dev="POI-DEVELOPERS" type="fix">46610 - Improved HWPF to better handle unicode</action>
<action dev="POI-DEVELOPERS" type="fix">47261 - Fixed SlideShow#removeSlide to remove references to Notes</action>
<action dev="POI-DEVELOPERS" type="fix">47375 - Fixed HSSFHyperlink to correctly set inter-sheet and file links</action>
<action dev="POI-DEVELOPERS" type="fix">47384 - Fixed ExternalNameRecord to handle unicode names</action>

View File

@ -25,37 +25,28 @@ package org.apache.poi.hwpf.model;
* and characters.
*/
public abstract class BytePropertyNode extends PropertyNode {
private boolean isUnicode;
private final int startBytes;
private final int endBytes;
/**
* @param fcStart The start of the text for this property, in _bytes_
* @param fcEnd The end of the text for this property, in _bytes_
*/
public BytePropertyNode(int fcStart, int fcEnd, Object buf, boolean isUnicode) {
public BytePropertyNode(int fcStart, int fcEnd, CharIndexTranslator translator, Object buf) {
super(
generateCp(fcStart, isUnicode),
generateCp(fcEnd, isUnicode),
translator.getCharIndex(fcStart),
translator.getCharIndex(fcEnd),
buf
);
this.isUnicode = isUnicode;
}
private static int generateCp(int val, boolean isUnicode) {
if(isUnicode)
return val/2;
return val;
this.startBytes = fcStart;
this.endBytes = fcEnd;
}
public boolean isUnicode() {
return isUnicode;
}
public int getStartBytes() {
if(isUnicode)
return getStart()*2;
return getStart();
return startBytes;
}
public int getEndBytes() {
if(isUnicode)
return getEnd()*2;
return getEnd();
return endBytes;
}
}

View File

@ -119,9 +119,8 @@ public final class CHPBinTable
public void insert(int listIndex, int cpStart, SprmBuffer buf)
{
boolean needsToBeUnicode = tpt.isUnicodeAtCharOffset(cpStart);
CHPX insertChpx = new CHPX(0, 0, buf, needsToBeUnicode);
CHPX insertChpx = new CHPX(0, 0, tpt,buf);
// Ensure character offsets are really characters
insertChpx.setStart(cpStart);
@ -141,7 +140,7 @@ public final class CHPBinTable
// Original, until insert at point
// New one
// Clone of original, on to the old end
CHPX clone = new CHPX(0, 0, chpx.getSprmBuf(), needsToBeUnicode);
CHPX clone = new CHPX(0, 0, tpt,chpx.getSprmBuf());
// Again ensure contains character based offsets no matter what
clone.setStart(cpStart);
clone.setEnd(chpx.getEnd());

View File

@ -60,8 +60,9 @@ public final class CHPFormattedDiskPage extends FormattedDiskPage
for (int x = 0; x < _crun; x++)
{
boolean isUnicode = tpt.isUnicodeAtByteOffset( getStart(x) );
_chpxList.add(new CHPX(getStart(x) - fcMin, getEnd(x) - fcMin, getGrpprl(x), isUnicode));
int startAt = getStart(x);
int endAt = getEnd(x);
_chpxList.add(new CHPX(startAt, endAt, tpt, getGrpprl(x)));
}
}

View File

@ -34,14 +34,14 @@ import org.apache.poi.hwpf.sprm.CharacterSprmUncompressor;
public final class CHPX extends BytePropertyNode
{
public CHPX(int fcStart, int fcEnd, byte[] grpprl, boolean isUnicode)
public CHPX(int fcStart, int fcEnd, CharIndexTranslator translator, byte[] grpprl)
{
super(fcStart, fcEnd, new SprmBuffer(grpprl), isUnicode);
super(fcStart, fcEnd, translator, new SprmBuffer(grpprl));
}
public CHPX(int fcStart, int fcEnd, SprmBuffer buf, boolean isUnicode)
public CHPX(int fcStart, int fcEnd, CharIndexTranslator translator, SprmBuffer buf)
{
super(fcStart, fcEnd, buf, isUnicode);
super(fcStart, fcEnd, translator ,buf);
}

View File

@ -0,0 +1,40 @@
/* ====================================================================
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==================================================================== */
package org.apache.poi.hwpf.model;
/**
* Translates byte offsets into character indexes, and reports whether the
* text found at a byte offset is unicode. Property nodes are handed a
* translator so that they can turn their byte-based start and end positions
* into character positions.
*/
public interface CharIndexTranslator {
/**
* Calculates the char index of the given byte index.
*
* @param bytePos The byte offset to translate
* @return the char index
*/
int getCharIndex(int bytePos);
/**
* Is the text at the given byte offset unicode, or plain old ascii? In a
* very evil fashion, you have to actually know this to make sense of
* character and paragraph properties :(
*
* @param bytePos The byte offset to check about
* @return true if the text at the given byte offset is unicode
*/
boolean isUnicodeAtByteOffset(int bytePos);
}

View File

@ -76,9 +76,8 @@ public final class PAPBinTable
public void insert(int listIndex, int cpStart, SprmBuffer buf)
{
boolean needsToBeUnicode = tpt.isUnicodeAtCharOffset(cpStart);
PAPX forInsert = new PAPX(0, 0, buf, _dataStream, needsToBeUnicode);
PAPX forInsert = new PAPX(0, 0, tpt, buf, _dataStream);
// Ensure character offsets are really characters
forInsert.setStart(cpStart);
@ -108,7 +107,7 @@ public final class PAPBinTable
// Original, until insert at point
// New one
// Clone of original, on to the old end
PAPX clone = new PAPX(0, 0, clonedBuf, _dataStream, needsToBeUnicode);
PAPX clone = new PAPX(0, 0, tpt, clonedBuf, _dataStream);
// Again ensure contains character based offsets no matter what
clone.setStart(cpStart);
clone.setEnd(currentPap.getEnd());

View File

@ -62,14 +62,10 @@ public final class PAPFormattedDiskPage extends FormattedDiskPage
public PAPFormattedDiskPage(byte[] documentStream, byte[] dataStream, int offset, int fcMin, TextPieceTable tpt)
{
super(documentStream, offset);
for (int x = 0; x < _crun; x++) {
int startAt = getStart(x) - fcMin;
int endAt = getEnd(x) - fcMin;
boolean isUnicode = tpt.isUnicodeAtByteOffset(startAt);
//System.err.println(startAt + " -> " + endAt + " = " + isUnicode);
_papxList.add(new PAPX(startAt, endAt, getGrpprl(x), getParagraphHeight(x), dataStream, isUnicode));
int startAt = getStart(x);
int endAt = getEnd(x);
_papxList.add(new PAPX(startAt, endAt, tpt, getGrpprl(x), getParagraphHeight(x), dataStream));
}
_fkp = null;
_dataStream = dataStream;

View File

@ -40,18 +40,18 @@ public final class PAPX extends BytePropertyNode {
private ParagraphHeight _phe;
private int _hugeGrpprlOffset = -1;
public PAPX(int fcStart, int fcEnd, byte[] papx, ParagraphHeight phe, byte[] dataStream, boolean isUnicode)
public PAPX(int fcStart, int fcEnd, CharIndexTranslator translator, byte[] papx, ParagraphHeight phe, byte[] dataStream)
{
super(fcStart, fcEnd, new SprmBuffer(papx), isUnicode);
super(fcStart, fcEnd, translator, new SprmBuffer(papx));
_phe = phe;
SprmBuffer buf = findHuge(new SprmBuffer(papx), dataStream);
if(buf != null)
_buf = buf;
}
public PAPX(int fcStart, int fcEnd, SprmBuffer buf, byte[] dataStream, boolean isUnicode)
public PAPX(int fcStart, int fcEnd, CharIndexTranslator translator, SprmBuffer buf, byte[] dataStream)
{
super(fcStart, fcEnd, buf, isUnicode);
super(fcStart, fcEnd, translator, buf);
_phe = new ParagraphHeight();
buf = findHuge(buf, dataStream);
if(buf != null)

View File

@ -28,9 +28,9 @@ public final class SEPX extends BytePropertyNode
SectionDescriptor _sed;
public SEPX(SectionDescriptor sed, int start, int end, byte[] grpprl, boolean isUnicode)
public SEPX(SectionDescriptor sed, int start, int end, CharIndexTranslator translator, byte[] grpprl)
{
super(start, end, SectionSprmUncompressor.uncompressSEP(grpprl, 0), isUnicode);
super(start, end, translator, SectionSprmUncompressor.uncompressSEP(grpprl, 0));
_sed = sed;
}

View File

@ -61,13 +61,10 @@ public final class SectionTable
int startAt = CPtoFC(node.getStart());
int endAt = CPtoFC(node.getEnd());
boolean isUnicodeAtStart = tpt.isUnicodeAtByteOffset( startAt );
// System.err.println(startAt + " -> " + endAt + " = " + isUnicodeAtStart);
// check for the optimization
if (fileOffset == 0xffffffff)
{
_sections.add(new SEPX(sed, startAt, endAt, new byte[0], isUnicodeAtStart));
_sections.add(new SEPX(sed, startAt, endAt, tpt, new byte[0]));
}
else
{
@ -76,7 +73,7 @@ public final class SectionTable
byte[] buf = new byte[sepxSize];
fileOffset += LittleEndian.SHORT_SIZE;
System.arraycopy(documentStream, fileOffset, buf, 0, buf.length);
_sections.add(new SEPX(sed, startAt, endAt, buf, isUnicodeAtStart));
_sections.add(new SEPX(sed, startAt, endAt, tpt, buf));
}
}
@ -138,33 +135,13 @@ public final class SectionTable
}
int FC = TP.getPieceDescriptor().getFilePosition();
int offset = CP - TP.getCP();
FC = FC+offset-((TextPiece)_text.get(0)).getPieceDescriptor().getFilePosition();
if (TP.isUnicode()) {
offset = offset*2;
}
FC = FC+offset;
return FC;
}
// Ryans code
private int FCtoCP(int fc)
{
int size = _text.size();
int cp = 0;
for (int x = 0; x < size; x++)
{
TextPiece piece = (TextPiece)_text.get(x);
if (fc <= piece.getEnd())
{
cp += (fc - piece.getStart());
break;
}
else
{
cp += (piece.getEnd() - piece.getStart());
}
}
return cp;
}
public ArrayList getSections()
{
return _sections;
@ -205,7 +182,7 @@ public final class SectionTable
// Line using Ryan's FCtoCP() conversion method -
// unable to observe any effect on our testcases when using this code - piers
GenericPropertyNode property = new GenericPropertyNode(FCtoCP(sepx.getStartBytes()), FCtoCP(sepx.getEndBytes()), sed.toByteArray());
GenericPropertyNode property = new GenericPropertyNode(tpt.getCharIndex(sepx.getStartBytes()), tpt.getCharIndex(sepx.getEndBytes()), sed.toByteArray());
plex.addProperty(property);

View File

@ -37,7 +37,7 @@ import java.util.List;
* conversion.
* @author Ryan Ackley
*/
public final class TextPieceTable
public final class TextPieceTable implements CharIndexTranslator
{
protected ArrayList _textPieces = new ArrayList();
//int _multiple;
@ -150,31 +150,25 @@ public final class TextPieceTable
// If they ask off the end, just go with the last one...
return lastWas;
}
/**
* Is the text at the given byte offset
* unicode, or plain old ascii?
* In a very evil fashion, you have to actually
* know this to make sense of character and
* paragraph properties :(
* @param bytePos The character offset to check about
*/
public boolean isUnicodeAtByteOffset(int bytePos) {
boolean lastWas = false;
int curByte = 0;
Iterator it = _textPieces.iterator();
while(it.hasNext()) {
TextPiece tp = (TextPiece)it.next();
int nextByte = curByte + tp.bytesLength();
int curByte = tp.getPieceDescriptor().getFilePosition();
int pieceEnd = curByte + tp.bytesLength();
// If the text piece covers the character, all good
if(curByte <= bytePos && nextByte >= bytePos) {
if(curByte <= bytePos && pieceEnd > bytePos) {
return tp.isUnicode();
}
// Otherwise keep track for the last one
lastWas = tp.isUnicode();
// Move along
curByte = nextByte;
curByte = pieceEnd;
}
// If they ask off the end, just go with the last one...
@ -268,4 +262,34 @@ public final class TextPieceTable
}
return false;
}
/**
 * Translates a byte offset into a character index by walking the text
 * pieces in list order and adding up the characters that lie in front of
 * <code>bytePos</code>. Bytes in a unicode piece count as one character
 * per two bytes; bytes in any other piece count one-to-one.
 *
 * @see org.apache.poi.hwpf.model.CharIndexTranslator#getCharIndex(int)
 * @param bytePos the byte offset, compared against each piece's file position
 * @return the number of characters counted before <code>bytePos</code>
 */
public int getCharIndex(int bytePos) {
    int chars = 0;
    for (Iterator pieces = _textPieces.iterator(); pieces.hasNext();) {
        TextPiece piece = (TextPiece) pieces.next();
        int startByte = piece.getPieceDescriptor().getFilePosition();
        if (bytePos <= startByte) {
            // Stop at the first piece that begins at or after the offset.
            // NOTE(review): this early exit presumes the pieces are held in
            // ascending file-position order - confirm against how
            // _textPieces is populated.
            break;
        }

        int pieceBytes = piece.bytesLength();
        int endByte = startByte + pieceBytes;

        // Whole piece when the offset lies past its end, otherwise only the
        // bytes between the piece start and the offset.
        int coveredBytes;
        if (bytePos > endByte) {
            coveredBytes = pieceBytes;
        } else {
            coveredBytes = bytePos - startByte;
        }

        chars += piece.isUnicode() ? coveredBytes / 2 : coveredBytes;
    }
    return chars;
}
}

View File

@ -0,0 +1,72 @@
/* ====================================================================
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==================================================================== */
package org.apache.poi.hwpf.usermodel;
import junit.framework.TestCase;
import java.io.FileInputStream;
import org.apache.poi.hwpf.usermodel.CharacterRun;
import org.apache.poi.hwpf.usermodel.Paragraph;
import org.apache.poi.hwpf.usermodel.Range;
import org.apache.poi.hwpf.HWPFDocument;
/**
 * Regression tests for Bugzilla #46610: the sample documents must load and
 * have their paragraph and character-run text extracted without an
 * exception, and the third sample must yield the expected Cyrillic text.
 */
public class TestBug46610 extends TestCase {
    /** Directory holding the HWPF sample documents, read from a system property. */
    private String dirname;

    protected void setUp() throws Exception {
        dirname = System.getProperty("HWPF.testdata.path");
    }

    public void testUtf() throws Exception {
        runExtract(openDocument("Bug46610_1.doc"));
    }

    public void testUtf2() throws Exception {
        runExtract(openDocument("Bug46610_2.doc"));
    }

    public void testExtraction() throws Exception {
        String text = runExtract(openDocument("Bug46610_3.doc"));

        assertTrue(text.contains("\u0421\u0412\u041e\u042e"));
    }

    /**
     * Loads a sample document from the test data directory.
     *
     * @param fileName name of the file inside the test data directory
     * @return the parsed document
     * @throws Exception if the file cannot be opened or parsed
     */
    private HWPFDocument openDocument(String fileName) throws Exception {
        FileInputStream in = new FileInputStream(dirname + "/" + fileName);
        try {
            return new HWPFDocument(in);
        } finally {
            // Release the file handle whether or not parsing succeeded.
            in.close();
        }
    }

    /**
     * Walks every paragraph and character run of the document.
     *
     * @param doc the document to extract from
     * @return the text of all paragraphs, each followed by a newline
     */
    private String runExtract(HWPFDocument doc) {
        StringBuilder out = new StringBuilder();

        Range globalRange = doc.getRange();
        for (int i = 0; i < globalRange.numParagraphs(); i++) {
            Paragraph p = globalRange.getParagraph(i);
            out.append(p.text());
            out.append("\n");
            for (int j = 0; j < p.numCharacterRuns(); j++) {
                CharacterRun characterRun = p.getCharacterRun(j);
                // Result deliberately unused: the call only has to complete
                // without throwing.
                characterRun.text();
            }
        }

        return out.toString();
    }
}