improved HWPF to better handle unicode, patch provided by Benjamin Engele and Maxim Valyanskiy, see Bugzilla #46610
git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@786505 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
03b8a9686a
commit
6ce1ef7d06
@ -33,6 +33,7 @@
|
|||||||
|
|
||||||
<changes>
|
<changes>
|
||||||
<release version="3.5-beta7" date="2009-??-??">
|
<release version="3.5-beta7" date="2009-??-??">
|
||||||
|
<action dev="POI-DEVELOPERS" type="fix">46610 - Improved HWPF to better handle unicode</action>
|
||||||
<action dev="POI-DEVELOPERS" type="fix">47261 - Fixed SlideShow#removeSlide to remove references to Notes</action>
|
<action dev="POI-DEVELOPERS" type="fix">47261 - Fixed SlideShow#removeSlide to remove references to Notes</action>
|
||||||
<action dev="POI-DEVELOPERS" type="fix">47375 - Fixed HSSFHyperlink to correctly set inter-sheet and file links</action>
|
<action dev="POI-DEVELOPERS" type="fix">47375 - Fixed HSSFHyperlink to correctly set inter-sheet and file links</action>
|
||||||
<action dev="POI-DEVELOPERS" type="fix">47384 - Fixed ExternalNameRecord to handle unicode names</action>
|
<action dev="POI-DEVELOPERS" type="fix">47384 - Fixed ExternalNameRecord to handle unicode names</action>
|
||||||
|
@ -25,37 +25,28 @@ package org.apache.poi.hwpf.model;
|
|||||||
* and characters.
|
* and characters.
|
||||||
*/
|
*/
|
||||||
public abstract class BytePropertyNode extends PropertyNode {
|
public abstract class BytePropertyNode extends PropertyNode {
|
||||||
private boolean isUnicode;
|
private final int startBytes;
|
||||||
|
private final int endBytes;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @param fcStart The start of the text for this property, in _bytes_
|
* @param fcStart The start of the text for this property, in _bytes_
|
||||||
* @param fcEnd The end of the text for this property, in _bytes_
|
* @param fcEnd The end of the text for this property, in _bytes_
|
||||||
*/
|
*/
|
||||||
public BytePropertyNode(int fcStart, int fcEnd, Object buf, boolean isUnicode) {
|
public BytePropertyNode(int fcStart, int fcEnd, CharIndexTranslator translator, Object buf) {
|
||||||
super(
|
super(
|
||||||
generateCp(fcStart, isUnicode),
|
translator.getCharIndex(fcStart),
|
||||||
generateCp(fcEnd, isUnicode),
|
translator.getCharIndex(fcEnd),
|
||||||
buf
|
buf
|
||||||
);
|
);
|
||||||
this.isUnicode = isUnicode;
|
this.startBytes = fcStart;
|
||||||
}
|
this.endBytes = fcEnd;
|
||||||
private static int generateCp(int val, boolean isUnicode) {
|
|
||||||
if(isUnicode)
|
|
||||||
return val/2;
|
|
||||||
return val;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
public boolean isUnicode() {
|
|
||||||
return isUnicode;
|
|
||||||
}
|
|
||||||
public int getStartBytes() {
|
public int getStartBytes() {
|
||||||
if(isUnicode)
|
return startBytes;
|
||||||
return getStart()*2;
|
|
||||||
return getStart();
|
|
||||||
}
|
}
|
||||||
|
|
||||||
public int getEndBytes() {
|
public int getEndBytes() {
|
||||||
if(isUnicode)
|
return endBytes;
|
||||||
return getEnd()*2;
|
|
||||||
return getEnd();
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -119,9 +119,8 @@ public final class CHPBinTable
|
|||||||
|
|
||||||
public void insert(int listIndex, int cpStart, SprmBuffer buf)
|
public void insert(int listIndex, int cpStart, SprmBuffer buf)
|
||||||
{
|
{
|
||||||
boolean needsToBeUnicode = tpt.isUnicodeAtCharOffset(cpStart);
|
|
||||||
|
|
||||||
CHPX insertChpx = new CHPX(0, 0, buf, needsToBeUnicode);
|
CHPX insertChpx = new CHPX(0, 0, tpt,buf);
|
||||||
|
|
||||||
// Ensure character offsets are really characters
|
// Ensure character offsets are really characters
|
||||||
insertChpx.setStart(cpStart);
|
insertChpx.setStart(cpStart);
|
||||||
@ -141,7 +140,7 @@ public final class CHPBinTable
|
|||||||
// Original, until insert at point
|
// Original, until insert at point
|
||||||
// New one
|
// New one
|
||||||
// Clone of original, on to the old end
|
// Clone of original, on to the old end
|
||||||
CHPX clone = new CHPX(0, 0, chpx.getSprmBuf(), needsToBeUnicode);
|
CHPX clone = new CHPX(0, 0, tpt,chpx.getSprmBuf());
|
||||||
// Again ensure contains character based offsets no matter what
|
// Again ensure contains character based offsets no matter what
|
||||||
clone.setStart(cpStart);
|
clone.setStart(cpStart);
|
||||||
clone.setEnd(chpx.getEnd());
|
clone.setEnd(chpx.getEnd());
|
||||||
|
@ -60,8 +60,9 @@ public final class CHPFormattedDiskPage extends FormattedDiskPage
|
|||||||
|
|
||||||
for (int x = 0; x < _crun; x++)
|
for (int x = 0; x < _crun; x++)
|
||||||
{
|
{
|
||||||
boolean isUnicode = tpt.isUnicodeAtByteOffset( getStart(x) );
|
int startAt = getStart(x);
|
||||||
_chpxList.add(new CHPX(getStart(x) - fcMin, getEnd(x) - fcMin, getGrpprl(x), isUnicode));
|
int endAt = getEnd(x);
|
||||||
|
_chpxList.add(new CHPX(startAt, endAt, tpt, getGrpprl(x)));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -34,14 +34,14 @@ import org.apache.poi.hwpf.sprm.CharacterSprmUncompressor;
|
|||||||
public final class CHPX extends BytePropertyNode
|
public final class CHPX extends BytePropertyNode
|
||||||
{
|
{
|
||||||
|
|
||||||
public CHPX(int fcStart, int fcEnd, byte[] grpprl, boolean isUnicode)
|
public CHPX(int fcStart, int fcEnd, CharIndexTranslator translator, byte[] grpprl)
|
||||||
{
|
{
|
||||||
super(fcStart, fcEnd, new SprmBuffer(grpprl), isUnicode);
|
super(fcStart, fcEnd, translator, new SprmBuffer(grpprl));
|
||||||
}
|
}
|
||||||
|
|
||||||
public CHPX(int fcStart, int fcEnd, SprmBuffer buf, boolean isUnicode)
|
public CHPX(int fcStart, int fcEnd, CharIndexTranslator translator, SprmBuffer buf)
|
||||||
{
|
{
|
||||||
super(fcStart, fcEnd, buf, isUnicode);
|
super(fcStart, fcEnd, translator ,buf);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
40
src/scratchpad/src/org/apache/poi/hwpf/model/CharIndexTranslator.java
Executable file
40
src/scratchpad/src/org/apache/poi/hwpf/model/CharIndexTranslator.java
Executable file
@ -0,0 +1,40 @@
|
|||||||
|
/* ====================================================================
|
||||||
|
Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
contributor license agreements. See the NOTICE file distributed with
|
||||||
|
this work for additional information regarding copyright ownership.
|
||||||
|
The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
(the "License"); you may not use this file except in compliance with
|
||||||
|
the License. You may obtain a copy of the License at
|
||||||
|
|
||||||
|
http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
|
||||||
|
Unless required by applicable law or agreed to in writing, software
|
||||||
|
distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
See the License for the specific language governing permissions and
|
||||||
|
limitations under the License.
|
||||||
|
==================================================================== */
|
||||||
|
|
||||||
|
package org.apache.poi.hwpf.model;
|
||||||
|
|
||||||
|
/**
 * Translates byte offsets into character indexes, and reports whether the
 * text found at a given byte offset is unicode.
 */
public interface CharIndexTranslator {

    /**
     * Calculates the char index of the given byte index.
     *
     * @param bytePos The byte offset to translate
     * @return the char index
     */
    int getCharIndex(int bytePos);

    /**
     * Is the text at the given byte offset unicode, or plain old ascii? In a
     * very evil fashion, you have to actually know this to make sense of
     * character and paragraph properties :(
     *
     * @param bytePos The byte offset to check about
     * @return true if the text at the given byte offset is unicode
     */
    boolean isUnicodeAtByteOffset(int bytePos);

}
|
@ -76,9 +76,8 @@ public final class PAPBinTable
|
|||||||
|
|
||||||
public void insert(int listIndex, int cpStart, SprmBuffer buf)
|
public void insert(int listIndex, int cpStart, SprmBuffer buf)
|
||||||
{
|
{
|
||||||
boolean needsToBeUnicode = tpt.isUnicodeAtCharOffset(cpStart);
|
|
||||||
|
|
||||||
PAPX forInsert = new PAPX(0, 0, buf, _dataStream, needsToBeUnicode);
|
PAPX forInsert = new PAPX(0, 0, tpt, buf, _dataStream);
|
||||||
|
|
||||||
// Ensure character offsets are really characters
|
// Ensure character offsets are really characters
|
||||||
forInsert.setStart(cpStart);
|
forInsert.setStart(cpStart);
|
||||||
@ -108,7 +107,7 @@ public final class PAPBinTable
|
|||||||
// Original, until insert at point
|
// Original, until insert at point
|
||||||
// New one
|
// New one
|
||||||
// Clone of original, on to the old end
|
// Clone of original, on to the old end
|
||||||
PAPX clone = new PAPX(0, 0, clonedBuf, _dataStream, needsToBeUnicode);
|
PAPX clone = new PAPX(0, 0, tpt, clonedBuf, _dataStream);
|
||||||
// Again ensure contains character based offsets no matter what
|
// Again ensure contains character based offsets no matter what
|
||||||
clone.setStart(cpStart);
|
clone.setStart(cpStart);
|
||||||
clone.setEnd(currentPap.getEnd());
|
clone.setEnd(currentPap.getEnd());
|
||||||
|
@ -62,14 +62,10 @@ public final class PAPFormattedDiskPage extends FormattedDiskPage
|
|||||||
public PAPFormattedDiskPage(byte[] documentStream, byte[] dataStream, int offset, int fcMin, TextPieceTable tpt)
|
public PAPFormattedDiskPage(byte[] documentStream, byte[] dataStream, int offset, int fcMin, TextPieceTable tpt)
|
||||||
{
|
{
|
||||||
super(documentStream, offset);
|
super(documentStream, offset);
|
||||||
|
|
||||||
for (int x = 0; x < _crun; x++) {
|
for (int x = 0; x < _crun; x++) {
|
||||||
int startAt = getStart(x) - fcMin;
|
int startAt = getStart(x);
|
||||||
int endAt = getEnd(x) - fcMin;
|
int endAt = getEnd(x);
|
||||||
boolean isUnicode = tpt.isUnicodeAtByteOffset(startAt);
|
_papxList.add(new PAPX(startAt, endAt, tpt, getGrpprl(x), getParagraphHeight(x), dataStream));
|
||||||
//System.err.println(startAt + " -> " + endAt + " = " + isUnicode);
|
|
||||||
|
|
||||||
_papxList.add(new PAPX(startAt, endAt, getGrpprl(x), getParagraphHeight(x), dataStream, isUnicode));
|
|
||||||
}
|
}
|
||||||
_fkp = null;
|
_fkp = null;
|
||||||
_dataStream = dataStream;
|
_dataStream = dataStream;
|
||||||
|
@ -40,18 +40,18 @@ public final class PAPX extends BytePropertyNode {
|
|||||||
private ParagraphHeight _phe;
|
private ParagraphHeight _phe;
|
||||||
private int _hugeGrpprlOffset = -1;
|
private int _hugeGrpprlOffset = -1;
|
||||||
|
|
||||||
public PAPX(int fcStart, int fcEnd, byte[] papx, ParagraphHeight phe, byte[] dataStream, boolean isUnicode)
|
public PAPX(int fcStart, int fcEnd, CharIndexTranslator translator, byte[] papx, ParagraphHeight phe, byte[] dataStream)
|
||||||
{
|
{
|
||||||
super(fcStart, fcEnd, new SprmBuffer(papx), isUnicode);
|
super(fcStart, fcEnd, translator, new SprmBuffer(papx));
|
||||||
_phe = phe;
|
_phe = phe;
|
||||||
SprmBuffer buf = findHuge(new SprmBuffer(papx), dataStream);
|
SprmBuffer buf = findHuge(new SprmBuffer(papx), dataStream);
|
||||||
if(buf != null)
|
if(buf != null)
|
||||||
_buf = buf;
|
_buf = buf;
|
||||||
}
|
}
|
||||||
|
|
||||||
public PAPX(int fcStart, int fcEnd, SprmBuffer buf, byte[] dataStream, boolean isUnicode)
|
public PAPX(int fcStart, int fcEnd, CharIndexTranslator translator, SprmBuffer buf, byte[] dataStream)
|
||||||
{
|
{
|
||||||
super(fcStart, fcEnd, buf, isUnicode);
|
super(fcStart, fcEnd, translator, buf);
|
||||||
_phe = new ParagraphHeight();
|
_phe = new ParagraphHeight();
|
||||||
buf = findHuge(buf, dataStream);
|
buf = findHuge(buf, dataStream);
|
||||||
if(buf != null)
|
if(buf != null)
|
||||||
|
@ -28,9 +28,9 @@ public final class SEPX extends BytePropertyNode
|
|||||||
|
|
||||||
SectionDescriptor _sed;
|
SectionDescriptor _sed;
|
||||||
|
|
||||||
public SEPX(SectionDescriptor sed, int start, int end, byte[] grpprl, boolean isUnicode)
|
public SEPX(SectionDescriptor sed, int start, int end, CharIndexTranslator translator, byte[] grpprl)
|
||||||
{
|
{
|
||||||
super(start, end, SectionSprmUncompressor.uncompressSEP(grpprl, 0), isUnicode);
|
super(start, end, translator, SectionSprmUncompressor.uncompressSEP(grpprl, 0));
|
||||||
_sed = sed;
|
_sed = sed;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -61,13 +61,10 @@ public final class SectionTable
|
|||||||
int startAt = CPtoFC(node.getStart());
|
int startAt = CPtoFC(node.getStart());
|
||||||
int endAt = CPtoFC(node.getEnd());
|
int endAt = CPtoFC(node.getEnd());
|
||||||
|
|
||||||
boolean isUnicodeAtStart = tpt.isUnicodeAtByteOffset( startAt );
|
|
||||||
// System.err.println(startAt + " -> " + endAt + " = " + isUnicodeAtStart);
|
|
||||||
|
|
||||||
// check for the optimization
|
// check for the optimization
|
||||||
if (fileOffset == 0xffffffff)
|
if (fileOffset == 0xffffffff)
|
||||||
{
|
{
|
||||||
_sections.add(new SEPX(sed, startAt, endAt, new byte[0], isUnicodeAtStart));
|
_sections.add(new SEPX(sed, startAt, endAt, tpt, new byte[0]));
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
@ -76,7 +73,7 @@ public final class SectionTable
|
|||||||
byte[] buf = new byte[sepxSize];
|
byte[] buf = new byte[sepxSize];
|
||||||
fileOffset += LittleEndian.SHORT_SIZE;
|
fileOffset += LittleEndian.SHORT_SIZE;
|
||||||
System.arraycopy(documentStream, fileOffset, buf, 0, buf.length);
|
System.arraycopy(documentStream, fileOffset, buf, 0, buf.length);
|
||||||
_sections.add(new SEPX(sed, startAt, endAt, buf, isUnicodeAtStart));
|
_sections.add(new SEPX(sed, startAt, endAt, tpt, buf));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -138,33 +135,13 @@ public final class SectionTable
|
|||||||
}
|
}
|
||||||
int FC = TP.getPieceDescriptor().getFilePosition();
|
int FC = TP.getPieceDescriptor().getFilePosition();
|
||||||
int offset = CP - TP.getCP();
|
int offset = CP - TP.getCP();
|
||||||
FC = FC+offset-((TextPiece)_text.get(0)).getPieceDescriptor().getFilePosition();
|
if (TP.isUnicode()) {
|
||||||
|
offset = offset*2;
|
||||||
|
}
|
||||||
|
FC = FC+offset;
|
||||||
return FC;
|
return FC;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Ryans code
|
|
||||||
private int FCtoCP(int fc)
|
|
||||||
{
|
|
||||||
int size = _text.size();
|
|
||||||
int cp = 0;
|
|
||||||
for (int x = 0; x < size; x++)
|
|
||||||
{
|
|
||||||
TextPiece piece = (TextPiece)_text.get(x);
|
|
||||||
|
|
||||||
if (fc <= piece.getEnd())
|
|
||||||
{
|
|
||||||
cp += (fc - piece.getStart());
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
cp += (piece.getEnd() - piece.getStart());
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return cp;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
public ArrayList getSections()
|
public ArrayList getSections()
|
||||||
{
|
{
|
||||||
return _sections;
|
return _sections;
|
||||||
@ -205,7 +182,7 @@ public final class SectionTable
|
|||||||
|
|
||||||
// Line using Ryan's FCtoCP() conversion method -
|
// Line using the text piece table's getCharIndex() conversion (formerly Ryan's FCtoCP() method) -
|
||||||
// unable to observe any effect on our testcases when using this code - piers
|
// unable to observe any effect on our testcases when using this code - piers
|
||||||
GenericPropertyNode property = new GenericPropertyNode(FCtoCP(sepx.getStartBytes()), FCtoCP(sepx.getEndBytes()), sed.toByteArray());
|
GenericPropertyNode property = new GenericPropertyNode(tpt.getCharIndex(sepx.getStartBytes()), tpt.getCharIndex(sepx.getEndBytes()), sed.toByteArray());
|
||||||
|
|
||||||
|
|
||||||
plex.addProperty(property);
|
plex.addProperty(property);
|
||||||
|
@ -37,7 +37,7 @@ import java.util.List;
|
|||||||
* conversion.
|
* conversion.
|
||||||
* @author Ryan Ackley
|
* @author Ryan Ackley
|
||||||
*/
|
*/
|
||||||
public final class TextPieceTable
|
public final class TextPieceTable implements CharIndexTranslator
|
||||||
{
|
{
|
||||||
protected ArrayList _textPieces = new ArrayList();
|
protected ArrayList _textPieces = new ArrayList();
|
||||||
//int _multiple;
|
//int _multiple;
|
||||||
@ -150,31 +150,25 @@ public final class TextPieceTable
|
|||||||
// If they ask off the end, just go with the last one...
|
// If they ask off the end, just go with the last one...
|
||||||
return lastWas;
|
return lastWas;
|
||||||
}
|
}
|
||||||
/**
|
|
||||||
* Is the text at the given byte offset
|
|
||||||
* unicode, or plain old ascii?
|
|
||||||
* In a very evil fashion, you have to actually
|
|
||||||
* know this to make sense of character and
|
|
||||||
* paragraph properties :(
|
|
||||||
* @param bytePos The character offset to check about
|
|
||||||
*/
|
|
||||||
public boolean isUnicodeAtByteOffset(int bytePos) {
|
public boolean isUnicodeAtByteOffset(int bytePos) {
|
||||||
boolean lastWas = false;
|
boolean lastWas = false;
|
||||||
int curByte = 0;
|
|
||||||
|
|
||||||
Iterator it = _textPieces.iterator();
|
Iterator it = _textPieces.iterator();
|
||||||
while(it.hasNext()) {
|
while(it.hasNext()) {
|
||||||
TextPiece tp = (TextPiece)it.next();
|
TextPiece tp = (TextPiece)it.next();
|
||||||
int nextByte = curByte + tp.bytesLength();
|
int curByte = tp.getPieceDescriptor().getFilePosition();
|
||||||
|
int pieceEnd = curByte + tp.bytesLength();
|
||||||
|
|
||||||
// If the text piece covers the character, all good
|
// If the text piece covers the character, all good
|
||||||
if(curByte <= bytePos && nextByte >= bytePos) {
|
if(curByte <= bytePos && pieceEnd > bytePos) {
|
||||||
return tp.isUnicode();
|
return tp.isUnicode();
|
||||||
}
|
}
|
||||||
// Otherwise keep track for the last one
|
// Otherwise keep track for the last one
|
||||||
lastWas = tp.isUnicode();
|
lastWas = tp.isUnicode();
|
||||||
// Move along
|
// Move along
|
||||||
curByte = nextByte;
|
curByte = pieceEnd;
|
||||||
}
|
}
|
||||||
|
|
||||||
// If they ask off the end, just go with the last one...
|
// If they ask off the end, just go with the last one...
|
||||||
@ -268,4 +262,34 @@ public final class TextPieceTable
|
|||||||
}
|
}
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
/* (non-Javadoc)
|
||||||
|
* @see org.apache.poi.hwpf.model.CharIndexTranslator#getLengthInChars(int)
|
||||||
|
*/
|
||||||
|
public int getCharIndex(int bytePos) {
|
||||||
|
int charCount = 0;
|
||||||
|
|
||||||
|
Iterator it = _textPieces.iterator();
|
||||||
|
while (it.hasNext()) {
|
||||||
|
TextPiece tp = (TextPiece) it.next();
|
||||||
|
int pieceStart = tp.getPieceDescriptor().getFilePosition();
|
||||||
|
if(pieceStart >= bytePos) {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
int bytesLength = tp.bytesLength();
|
||||||
|
int pieceEnd = pieceStart + bytesLength;
|
||||||
|
|
||||||
|
int toAdd = bytePos > pieceEnd ? bytesLength : bytesLength
|
||||||
|
- (pieceEnd - bytePos);
|
||||||
|
|
||||||
|
if (tp.isUnicode()) {
|
||||||
|
charCount += toAdd / 2;
|
||||||
|
} else {
|
||||||
|
charCount += toAdd;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return charCount;
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
BIN
src/scratchpad/testcases/org/apache/poi/hwpf/data/Bug46610_1.doc
Executable file
BIN
src/scratchpad/testcases/org/apache/poi/hwpf/data/Bug46610_1.doc
Executable file
Binary file not shown.
BIN
src/scratchpad/testcases/org/apache/poi/hwpf/data/Bug46610_2.doc
Executable file
BIN
src/scratchpad/testcases/org/apache/poi/hwpf/data/Bug46610_2.doc
Executable file
Binary file not shown.
BIN
src/scratchpad/testcases/org/apache/poi/hwpf/data/Bug46610_3.doc
Executable file
BIN
src/scratchpad/testcases/org/apache/poi/hwpf/data/Bug46610_3.doc
Executable file
Binary file not shown.
72
src/scratchpad/testcases/org/apache/poi/hwpf/usermodel/TestBug46610.java
Executable file
72
src/scratchpad/testcases/org/apache/poi/hwpf/usermodel/TestBug46610.java
Executable file
@ -0,0 +1,72 @@
|
|||||||
|
/* ====================================================================
|
||||||
|
Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
contributor license agreements. See the NOTICE file distributed with
|
||||||
|
this work for additional information regarding copyright ownership.
|
||||||
|
The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
(the "License"); you may not use this file except in compliance with
|
||||||
|
the License. You may obtain a copy of the License at
|
||||||
|
|
||||||
|
http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
|
||||||
|
Unless required by applicable law or agreed to in writing, software
|
||||||
|
distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
See the License for the specific language governing permissions and
|
||||||
|
limitations under the License.
|
||||||
|
==================================================================== */
|
||||||
|
|
||||||
|
package org.apache.poi.hwpf.usermodel;
|
||||||
|
|
||||||
|
import junit.framework.TestCase;
|
||||||
|
|
||||||
|
import java.io.FileInputStream;
|
||||||
|
|
||||||
|
import org.apache.poi.hwpf.usermodel.CharacterRun;
|
||||||
|
import org.apache.poi.hwpf.usermodel.Paragraph;
|
||||||
|
import org.apache.poi.hwpf.usermodel.Range;
|
||||||
|
import org.apache.poi.hwpf.HWPFDocument;
|
||||||
|
|
||||||
|
public class TestBug46610 extends TestCase {
|
||||||
|
private String dirname;
|
||||||
|
|
||||||
|
protected void setUp() throws Exception {
|
||||||
|
dirname = System.getProperty("HWPF.testdata.path");
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testUtf() throws Exception {
|
||||||
|
HWPFDocument doc = new HWPFDocument(new FileInputStream(dirname + "/Bug46610_1.doc"));
|
||||||
|
|
||||||
|
runExtract(doc);
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testUtf2() throws Exception {
|
||||||
|
HWPFDocument doc = new HWPFDocument(new FileInputStream(dirname + "/Bug46610_2.doc"));
|
||||||
|
|
||||||
|
runExtract(doc);
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testExtraction() throws Exception {
|
||||||
|
HWPFDocument doc = new HWPFDocument(new FileInputStream(dirname + "/Bug46610_3.doc"));
|
||||||
|
|
||||||
|
String text = runExtract(doc);
|
||||||
|
|
||||||
|
assertTrue(text.contains("\u0421\u0412\u041e\u042e"));
|
||||||
|
}
|
||||||
|
|
||||||
|
private String runExtract(HWPFDocument doc) {
|
||||||
|
StringBuffer out = new StringBuffer();
|
||||||
|
|
||||||
|
Range globalRange = doc.getRange();
|
||||||
|
for (int i = 0; i < globalRange.numParagraphs(); i++) {
|
||||||
|
Paragraph p = globalRange.getParagraph(i);
|
||||||
|
out.append(p.text());
|
||||||
|
out.append("\n");
|
||||||
|
for (int j = 0; j < p.numCharacterRuns(); j++) {
|
||||||
|
CharacterRun characterRun = p.getCharacterRun(j);
|
||||||
|
characterRun.text();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return out.toString();
|
||||||
|
}
|
||||||
|
}
|
Loading…
Reference in New Issue
Block a user