Bug 50955 and bug 60953: improve Big5 reader; ensure one character per byte pair.
git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@1790172 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
4b0e6dc048
commit
1bfd5f6585
@ -31,10 +31,12 @@ import java.util.Set;
|
||||
public class CodePageUtil
|
||||
{
|
||||
|
||||
public static final Set<Charset> VARIABLE_BYTE_CHARSETS = new HashSet<Charset>();
|
||||
public static final Set<Charset> DOUBLE_BYTE_CHARSETS
|
||||
= new HashSet<Charset>();
|
||||
|
||||
static {
|
||||
DOUBLE_BYTE_CHARSETS.add(StringUtil.BIG5);
|
||||
//others?
|
||||
VARIABLE_BYTE_CHARSETS.add(StringUtil.BIG5);
|
||||
}
|
||||
|
||||
/** <p>Codepage 037, a special case</p> */
|
||||
@ -450,4 +452,26 @@ public class CodePageUtil
|
||||
return "cp" + codepage;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* This tries to convert a LE byte array in cp950
|
||||
* (Microsoft's dialect of Big5) to a String.
|
||||
* We know MS zero-padded ascii, and we drop those.
|
||||
* There may be areas for improvement in this.
|
||||
*
|
||||
* @param data
|
||||
* @param offset
|
||||
* @param lengthInBytes
|
||||
* @return
|
||||
*/
|
||||
public static String cp950ToString(byte[] data, int offset, int lengthInBytes) {
|
||||
StringBuilder sb = new StringBuilder();
|
||||
LittleEndianCP950Reader reader = new LittleEndianCP950Reader(data, offset, lengthInBytes);
|
||||
int c = reader.read();
|
||||
while (c != -1) {
|
||||
sb.append((char)c);
|
||||
c = reader.read();
|
||||
}
|
||||
return sb.toString();
|
||||
}
|
||||
}
|
||||
|
@ -1,107 +0,0 @@
|
||||
/* ====================================================================
|
||||
Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
contributor license agreements. See the NOTICE file distributed with
|
||||
this work for additional information regarding copyright ownership.
|
||||
The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
(the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
==================================================================== */
|
||||
|
||||
package org.apache.poi.util;
|
||||
|
||||
import java.io.ByteArrayInputStream;
|
||||
|
||||
/**
|
||||
* Stream that converts MSOffice's way of storing Big5, with
|
||||
* zero-byte padding for ASCII and in LittleEndianOrder.
|
||||
*/
|
||||
@Internal
|
||||
public class LittleEndianBig5Stream extends ByteArrayInputStream {
|
||||
private static final int EOF = -1;
|
||||
private static final int INVALID_PAIR = -2;
|
||||
private static final int EMPTY_TRAILING = -3;
|
||||
|
||||
//the char that is logically trailing in Big5 encoding
|
||||
//however in LittleEndian order, this is the first encountered.
|
||||
int trailing = EMPTY_TRAILING;
|
||||
public LittleEndianBig5Stream(byte[] buf) {
|
||||
super(buf);
|
||||
}
|
||||
|
||||
public LittleEndianBig5Stream(byte[] buf, int offset, int length) {
|
||||
super(buf, offset, length);
|
||||
}
|
||||
|
||||
@Override
|
||||
public int read() {
|
||||
|
||||
if (trailing != EMPTY_TRAILING) {
|
||||
int tmp = trailing;
|
||||
trailing = EMPTY_TRAILING;
|
||||
return tmp;
|
||||
}
|
||||
int leading = readNext();
|
||||
while (leading == INVALID_PAIR) {
|
||||
leading = readNext();
|
||||
}
|
||||
|
||||
if (leading == EOF) {
|
||||
return EOF;
|
||||
}
|
||||
return leading;
|
||||
}
|
||||
|
||||
//returns leading, sets trailing appropriately
|
||||
//returns -1 if it hits the end of the stream
|
||||
//returns -2 for an invalid big5 code pair
|
||||
private final int readNext() {
|
||||
trailing = super.read();
|
||||
if (trailing == -1) {
|
||||
return EOF;
|
||||
}
|
||||
int leading = super.read();
|
||||
if (leading == EOF) {
|
||||
return EOF;
|
||||
}
|
||||
int lead = leading&0xff;
|
||||
if (lead > 0x80) {
|
||||
return leading;
|
||||
} else if (lead == 0) {
|
||||
int ret = trailing;
|
||||
trailing = EMPTY_TRAILING;
|
||||
return ret;
|
||||
} else {
|
||||
int ret = trailing;
|
||||
trailing = EMPTY_TRAILING;
|
||||
return ret;
|
||||
//return INVALID_PAIR;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@Override
|
||||
public int read(byte[] buff, int off, int len) {
|
||||
int bytesRead = 0;
|
||||
for (int i = off; i < off+len; i++) {
|
||||
int b = read();
|
||||
if (b == -1) {
|
||||
if (bytesRead == 0) {
|
||||
return -1;
|
||||
} else {
|
||||
return bytesRead;
|
||||
}
|
||||
}
|
||||
bytesRead++;
|
||||
buff[i] = (byte)b;
|
||||
}
|
||||
return bytesRead;
|
||||
}
|
||||
}
|
480
src/java/org/apache/poi/util/LittleEndianCP950Reader.java
Normal file
480
src/java/org/apache/poi/util/LittleEndianCP950Reader.java
Normal file
@ -0,0 +1,480 @@
|
||||
/* ====================================================================
|
||||
Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
contributor license agreements. See the NOTICE file distributed with
|
||||
this work for additional information regarding copyright ownership.
|
||||
The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
(the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
==================================================================== */
|
||||
|
||||
package org.apache.poi.util;
|
||||
import java.io.IOException;
|
||||
import java.io.Reader;
|
||||
import java.nio.ByteBuffer;
|
||||
import java.nio.CharBuffer;
|
||||
import java.nio.charset.CharsetDecoder;
|
||||
|
||||
/**
|
||||
* Stream that converts CP950 (MSOffice's dialect of Big5), with
|
||||
* zero-byte padding for ASCII and in LittleEndianOrder.
|
||||
*/
|
||||
@Internal
|
||||
public class LittleEndianCP950Reader extends Reader {
|
||||
|
||||
private static final POILogger LOGGER = POILogFactory.getLogger(LittleEndianCP950Reader.class);
|
||||
|
||||
|
||||
private static final char UNMAPPABLE = (char) '?';
|
||||
private final ByteBuffer doubleByteBuffer = ByteBuffer.allocate(2);
|
||||
private final CharBuffer charBuffer = CharBuffer.allocate(2);
|
||||
private final CharsetDecoder decoder = StringUtil.BIG5.newDecoder();
|
||||
|
||||
//https://en.wikipedia.org/wiki/Code_page_950
|
||||
//see private use area
|
||||
private final static char range1Low = '\u8140';
|
||||
private final static char range1High = '\u8DFE';
|
||||
private final static char range2Low = '\u8E40';
|
||||
private final static char range2High = '\uA0FE';
|
||||
private final static char range3Low = '\uC6A1';
|
||||
private final static char range3High = '\uC8FE';
|
||||
private final static char range4Low = '\uFA40';
|
||||
private final static char range4High = '\uFEFE';
|
||||
|
||||
private final byte[] data;
|
||||
private final int startOffset;
|
||||
private final int length;
|
||||
private int offset;
|
||||
private int trailing;
|
||||
private int leading;
|
||||
int cnt = 0;
|
||||
//the char that is logically trailing in Big5 encoding
|
||||
//however in LittleEndian order, this is the first encountered.
|
||||
public LittleEndianCP950Reader(byte[] data) {
|
||||
this(data, 0, data.length);
|
||||
}
|
||||
|
||||
public LittleEndianCP950Reader(byte[] data, int offset, int length) {
|
||||
this.data = data;
|
||||
this.startOffset = offset;
|
||||
this.offset = startOffset;
|
||||
this.length = length;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int read() {
|
||||
if (offset + 1 > data.length || offset - startOffset > length) {
|
||||
return -1;
|
||||
}
|
||||
trailing = data[offset++] & 0xff;
|
||||
leading = data[offset++] & 0xff;
|
||||
decoder.reset();
|
||||
if (leading < 0x81) {
|
||||
//return trailing alone
|
||||
//there may be some subtleties here
|
||||
return trailing;
|
||||
} else if (leading == 0xf9) {
|
||||
return handleF9(trailing);
|
||||
} else {
|
||||
int ch = (leading << 8) + trailing;
|
||||
if (ch >= range1Low && ch <= range1High) {
|
||||
return handleRange1(leading, trailing);
|
||||
} else if (ch >= range2Low && ch <= range2High) {
|
||||
return handleRange2(leading, trailing);
|
||||
} else if (ch >= range3Low && ch <= range3High) {
|
||||
return handleRange3(leading, trailing);
|
||||
} else if (ch >= range4Low && ch <= range4High) {
|
||||
return handleRange4(leading, trailing);
|
||||
}
|
||||
|
||||
charBuffer.clear();
|
||||
doubleByteBuffer.clear();
|
||||
doubleByteBuffer.put((byte) leading);
|
||||
doubleByteBuffer.put((byte) trailing);
|
||||
doubleByteBuffer.flip();
|
||||
decoder.decode(doubleByteBuffer, charBuffer, true);
|
||||
charBuffer.flip();
|
||||
|
||||
if (charBuffer.length() == 0) {
|
||||
LOGGER.log(POILogger.WARN, "couldn't create char for: "
|
||||
+ Integer.toString((leading & 0xff), 16)
|
||||
+ " " + Integer.toString((trailing & 0xff), 16));
|
||||
return UNMAPPABLE;
|
||||
} else {
|
||||
return Character.codePointAt(charBuffer, 0);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public int read(char[] cbuf, int off, int len) throws IOException {
|
||||
//there may be some efficiencies, but this should do for now.
|
||||
|
||||
for (int i = off; i < off + len; i++) {
|
||||
int c = read();
|
||||
if (c == -1) {
|
||||
return i - off;
|
||||
}
|
||||
cbuf[i] = (char) c;
|
||||
}
|
||||
return len;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void close() throws IOException {
|
||||
|
||||
}
|
||||
|
||||
private int handleRange1(int leading, int trailing) {
|
||||
return (0xeeb8 + (157 * (leading - 0x81))) +
|
||||
((trailing < 0x80) ? trailing - 0x40 : trailing - 0x62);
|
||||
}
|
||||
|
||||
private int handleRange2(int leading, int trailing) {
|
||||
return (0xe311 + (157 * (leading - 0x8e))) +
|
||||
((trailing < 0x80) ? trailing - 0x40 : trailing - 0x62);
|
||||
}
|
||||
|
||||
private int handleRange3(int leading, int trailing) {
|
||||
return (0xf672 + (157 * (leading - 0xc6))) +
|
||||
((trailing < 0x80) ? trailing - 0x40 : trailing - 0x62);
|
||||
}
|
||||
|
||||
private int handleRange4(int leading, int trailing) {
|
||||
return (0xe000 + (157 * (leading - 0xfa))) +
|
||||
((trailing < 0x80) ? trailing - 0x40 : trailing - 0x62);
|
||||
}
|
||||
|
||||
private int handleF9(int trailing) {
|
||||
switch (trailing) {
|
||||
case 0x40:
|
||||
return 0x7e98;
|
||||
case 0x41:
|
||||
return 0x7e9b;
|
||||
case 0x42:
|
||||
return 0x7e99;
|
||||
case 0x43:
|
||||
return 0x81e0;
|
||||
case 0x44:
|
||||
return 0x81e1;
|
||||
case 0x45:
|
||||
return 0x8646;
|
||||
case 0x46:
|
||||
return 0x8647;
|
||||
case 0x47:
|
||||
return 0x8648;
|
||||
case 0x48:
|
||||
return 0x8979;
|
||||
case 0x49:
|
||||
return 0x897a;
|
||||
case 0x4a:
|
||||
return 0x897c;
|
||||
case 0x4b:
|
||||
return 0x897b;
|
||||
case 0x4c:
|
||||
return 0x89ff;
|
||||
case 0x4d:
|
||||
return 0x8b98;
|
||||
case 0x4e:
|
||||
return 0x8b99;
|
||||
case 0x4f:
|
||||
return 0x8ea5;
|
||||
case 0x50:
|
||||
return 0x8ea4;
|
||||
case 0x51:
|
||||
return 0x8ea3;
|
||||
case 0x52:
|
||||
return 0x946e;
|
||||
case 0x53:
|
||||
return 0x946d;
|
||||
case 0x54:
|
||||
return 0x946f;
|
||||
case 0x55:
|
||||
return 0x9471;
|
||||
case 0x56:
|
||||
return 0x9473;
|
||||
case 0x57:
|
||||
return 0x9749;
|
||||
case 0x58:
|
||||
return 0x9872;
|
||||
case 0x59:
|
||||
return 0x995f;
|
||||
case 0x5a:
|
||||
return 0x9c68;
|
||||
case 0x5b:
|
||||
return 0x9c6e;
|
||||
case 0x5c:
|
||||
return 0x9c6d;
|
||||
case 0x5d:
|
||||
return 0x9e0b;
|
||||
case 0x5e:
|
||||
return 0x9e0d;
|
||||
case 0x5f:
|
||||
return 0x9e10;
|
||||
case 0x60:
|
||||
return 0x9e0f;
|
||||
case 0x61:
|
||||
return 0x9e12;
|
||||
case 0x62:
|
||||
return 0x9e11;
|
||||
case 0x63:
|
||||
return 0x9ea1;
|
||||
case 0x64:
|
||||
return 0x9ef5;
|
||||
case 0x65:
|
||||
return 0x9f09;
|
||||
case 0x66:
|
||||
return 0x9f47;
|
||||
case 0x67:
|
||||
return 0x9f78;
|
||||
case 0x68:
|
||||
return 0x9f7b;
|
||||
case 0x69:
|
||||
return 0x9f7a;
|
||||
case 0x6a:
|
||||
return 0x9f79;
|
||||
case 0x6b:
|
||||
return 0x571e;
|
||||
case 0x6c:
|
||||
return 0x7066;
|
||||
case 0x6d:
|
||||
return 0x7c6f;
|
||||
case 0x6e:
|
||||
return 0x883c;
|
||||
case 0x6f:
|
||||
return 0x8db2;
|
||||
case 0x70:
|
||||
return 0x8ea6;
|
||||
case 0x71:
|
||||
return 0x91c3;
|
||||
case 0x72:
|
||||
return 0x9474;
|
||||
case 0x73:
|
||||
return 0x9478;
|
||||
case 0x74:
|
||||
return 0x9476;
|
||||
case 0x75:
|
||||
return 0x9475;
|
||||
case 0x76:
|
||||
return 0x9a60;
|
||||
case 0x77:
|
||||
return 0x9c74;
|
||||
case 0x78:
|
||||
return 0x9c73;
|
||||
case 0x79:
|
||||
return 0x9c71;
|
||||
case 0x7a:
|
||||
return 0x9c75;
|
||||
case 0x7b:
|
||||
return 0x9e14;
|
||||
case 0x7c:
|
||||
return 0x9e13;
|
||||
case 0x7d:
|
||||
return 0x9ef6;
|
||||
case 0x7e:
|
||||
return 0x9f0a;
|
||||
case 0xa1:
|
||||
return 0x9fa4;
|
||||
case 0xa2:
|
||||
return 0x7068;
|
||||
case 0xa3:
|
||||
return 0x7065;
|
||||
case 0xa4:
|
||||
return 0x7cf7;
|
||||
case 0xa5:
|
||||
return 0x866a;
|
||||
case 0xa6:
|
||||
return 0x883e;
|
||||
case 0xa7:
|
||||
return 0x883d;
|
||||
case 0xa8:
|
||||
return 0x883f;
|
||||
case 0xa9:
|
||||
return 0x8b9e;
|
||||
case 0xaa:
|
||||
return 0x8c9c;
|
||||
case 0xab:
|
||||
return 0x8ea9;
|
||||
case 0xac:
|
||||
return 0x8ec9;
|
||||
case 0xad:
|
||||
return 0x974b;
|
||||
case 0xae:
|
||||
return 0x9873;
|
||||
case 0xaf:
|
||||
return 0x9874;
|
||||
case 0xb0:
|
||||
return 0x98cc;
|
||||
case 0xb1:
|
||||
return 0x9961;
|
||||
case 0xb2:
|
||||
return 0x99ab;
|
||||
case 0xb3:
|
||||
return 0x9a64;
|
||||
case 0xb4:
|
||||
return 0x9a66;
|
||||
case 0xb5:
|
||||
return 0x9a67;
|
||||
case 0xb6:
|
||||
return 0x9b24;
|
||||
case 0xb7:
|
||||
return 0x9e15;
|
||||
case 0xb8:
|
||||
return 0x9e17;
|
||||
case 0xb9:
|
||||
return 0x9f48;
|
||||
case 0xba:
|
||||
return 0x6207;
|
||||
case 0xbb:
|
||||
return 0x6b1e;
|
||||
case 0xbc:
|
||||
return 0x7227;
|
||||
case 0xbd:
|
||||
return 0x864c;
|
||||
case 0xbe:
|
||||
return 0x8ea8;
|
||||
case 0xbf:
|
||||
return 0x9482;
|
||||
case 0xc0:
|
||||
return 0x9480;
|
||||
case 0xc1:
|
||||
return 0x9481;
|
||||
case 0xc2:
|
||||
return 0x9a69;
|
||||
case 0xc3:
|
||||
return 0x9a68;
|
||||
case 0xc4:
|
||||
return 0x9b2e;
|
||||
case 0xc5:
|
||||
return 0x9e19;
|
||||
case 0xc6:
|
||||
return 0x7229;
|
||||
case 0xc7:
|
||||
return 0x864b;
|
||||
case 0xc8:
|
||||
return 0x8b9f;
|
||||
case 0xc9:
|
||||
return 0x9483;
|
||||
case 0xca:
|
||||
return 0x9c79;
|
||||
case 0xcb:
|
||||
return 0x9eb7;
|
||||
case 0xcc:
|
||||
return 0x7675;
|
||||
case 0xcd:
|
||||
return 0x9a6b;
|
||||
case 0xce:
|
||||
return 0x9c7a;
|
||||
case 0xcf:
|
||||
return 0x9e1d;
|
||||
case 0xd0:
|
||||
return 0x7069;
|
||||
case 0xd1:
|
||||
return 0x706a;
|
||||
case 0xd2:
|
||||
return 0x9ea4;
|
||||
case 0xd3:
|
||||
return 0x9f7e;
|
||||
case 0xd4:
|
||||
return 0x9f49;
|
||||
case 0xd5:
|
||||
return 0x9f98;
|
||||
case 0xd6:
|
||||
return 0x7881;
|
||||
case 0xd7:
|
||||
return 0x92b9;
|
||||
case 0xd8:
|
||||
return 0x88cf;
|
||||
case 0xd9:
|
||||
return 0x58bb;
|
||||
case 0xda:
|
||||
return 0x6052;
|
||||
case 0xdb:
|
||||
return 0x7ca7;
|
||||
case 0xdc:
|
||||
return 0x5afa;
|
||||
case 0xdd:
|
||||
return 0x2554;
|
||||
case 0xde:
|
||||
return 0x2566;
|
||||
case 0xdf:
|
||||
return 0x2557;
|
||||
case 0xe0:
|
||||
return 0x2560;
|
||||
case 0xe1:
|
||||
return 0x256c;
|
||||
case 0xe2:
|
||||
return 0x2563;
|
||||
case 0xe3:
|
||||
return 0x255a;
|
||||
case 0xe4:
|
||||
return 0x2569;
|
||||
case 0xe5:
|
||||
return 0x255d;
|
||||
case 0xe6:
|
||||
return 0x2552;
|
||||
case 0xe7:
|
||||
return 0x2564;
|
||||
case 0xe8:
|
||||
return 0x2555;
|
||||
case 0xe9:
|
||||
return 0x255e;
|
||||
case 0xea:
|
||||
return 0x256a;
|
||||
case 0xeb:
|
||||
return 0x2561;
|
||||
case 0xec:
|
||||
return 0x2558;
|
||||
case 0xed:
|
||||
return 0x2567;
|
||||
case 0xee:
|
||||
return 0x255b;
|
||||
case 0xef:
|
||||
return 0x2553;
|
||||
case 0xf0:
|
||||
return 0x2565;
|
||||
case 0xf1:
|
||||
return 0x2556;
|
||||
case 0xf2:
|
||||
return 0x255f;
|
||||
case 0xf3:
|
||||
return 0x256b;
|
||||
case 0xf4:
|
||||
return 0x2562;
|
||||
case 0xf5:
|
||||
return 0x2559;
|
||||
case 0xf6:
|
||||
return 0x2568;
|
||||
case 0xf7:
|
||||
return 0x255c;
|
||||
case 0xf8:
|
||||
return 0x2551;
|
||||
case 0xf9:
|
||||
return 0x2550;
|
||||
case 0xfa:
|
||||
return 0x256d;
|
||||
case 0xfb:
|
||||
return 0x256e;
|
||||
case 0xfc:
|
||||
return 0x2570;
|
||||
case 0xfd:
|
||||
return 0x256f;
|
||||
case 0xfe:
|
||||
return 0x2593;
|
||||
default:
|
||||
LOGGER.log(POILogger.WARN, "couldn't create char for: f9"
|
||||
+ " " + Integer.toString((trailing & 0xff), 16));
|
||||
return UNMAPPABLE;
|
||||
}
|
||||
}
|
||||
}
|
@ -17,8 +17,6 @@
|
||||
|
||||
package org.apache.poi.util;
|
||||
|
||||
import java.io.ByteArrayOutputStream;
|
||||
import java.io.IOException;
|
||||
import java.nio.charset.Charset;
|
||||
import java.util.HashMap;
|
||||
import java.util.Iterator;
|
||||
@ -581,26 +579,6 @@ public class StringUtil {
|
||||
' ', // 0xf0ff not defined
|
||||
};
|
||||
|
||||
/**
|
||||
* This tries to convert a LE byte array in Big5 to a String.
|
||||
* We know MS zero-padded ascii, and we drop those.
|
||||
* However, there may be areas for improvement in this.
|
||||
*
|
||||
* @param data
|
||||
* @param offset
|
||||
* @param lengthInBytes
|
||||
* @return
|
||||
*/
|
||||
public static String littleEndianBig5Stream(byte[] data, int offset, int lengthInBytes) {
|
||||
ByteArrayOutputStream os = new ByteArrayOutputStream();
|
||||
try {
|
||||
IOUtils.copy(new LittleEndianBig5Stream(data, offset, lengthInBytes), os);
|
||||
} catch (IOException e) {
|
||||
logger.log(POILogger.WARN,
|
||||
"IOException while copying a byte array stream to a byte array stream?!");
|
||||
}
|
||||
return new String(os.toByteArray(), BIG5);
|
||||
}
|
||||
|
||||
// Could be replaced with org.apache.commons.lang3.StringUtils#join
|
||||
@Internal
|
||||
|
@ -108,7 +108,7 @@ public class HWPFOldDocument extends HWPFDocumentCore {
|
||||
System.arraycopy(_mainStream, _fib.getFibBase().getFcMin(), textData, 0, textData.length);
|
||||
|
||||
int numChars = textData.length;
|
||||
if (CodePageUtil.VARIABLE_BYTE_CHARSETS.contains(guessedCharset)) {
|
||||
if (CodePageUtil.DOUBLE_BYTE_CHARSETS.contains(guessedCharset)) {
|
||||
numChars /= 2;
|
||||
}
|
||||
|
||||
|
@ -18,7 +18,6 @@
|
||||
package org.apache.poi.hwpf.model;
|
||||
|
||||
|
||||
import org.apache.poi.util.CodePageUtil;
|
||||
import org.apache.poi.util.Internal;
|
||||
import org.apache.poi.util.NotImplemented;
|
||||
|
||||
@ -43,17 +42,6 @@ public class OldTextPiece extends TextPiece {
|
||||
this.rawBytes = text;
|
||||
}
|
||||
|
||||
@Override
|
||||
protected void validateLengths(int start, int end, int length, PieceDescriptor pd) {
|
||||
//things are still wonky with Big5 char/byte length mapping
|
||||
//sometimes working w/ Java 8 but not w/ Java 7!
|
||||
//for now, if we're dealing w/ Big5 don't bother checking
|
||||
if (pd.getCharset() != null &&
|
||||
CodePageUtil.VARIABLE_BYTE_CHARSETS.contains(pd.getCharset())) {
|
||||
return;
|
||||
}
|
||||
super.validateLengths(start, end, length, pd);
|
||||
}
|
||||
/**
|
||||
* @return nothing, ever. Always throws an UnsupportedOperationException
|
||||
* @throws UnsupportedOperationException
|
||||
|
@ -76,7 +76,7 @@ public class OldTextPieceTable extends TextPieceTable {
|
||||
boolean unicode = pieces[x].isUnicode();
|
||||
int multiple = 1;
|
||||
if (unicode ||
|
||||
(charset != null && CodePageUtil.VARIABLE_BYTE_CHARSETS.contains(charset))) {
|
||||
(charset != null && CodePageUtil.DOUBLE_BYTE_CHARSETS.contains(charset))) {
|
||||
multiple = 2;
|
||||
}
|
||||
|
||||
@ -111,7 +111,7 @@ public class OldTextPieceTable extends TextPieceTable {
|
||||
@Override
|
||||
protected int getEncodingMultiplier(TextPiece textPiece) {
|
||||
Charset charset = textPiece.getPieceDescriptor().getCharset();
|
||||
if (charset != null && CodePageUtil.VARIABLE_BYTE_CHARSETS.contains(charset)) {
|
||||
if (charset != null && CodePageUtil.DOUBLE_BYTE_CHARSETS.contains(charset)) {
|
||||
return 2;
|
||||
}
|
||||
return 1;
|
||||
|
@ -20,6 +20,7 @@ package org.apache.poi.hwpf.model;
|
||||
|
||||
import java.nio.charset.Charset;
|
||||
|
||||
import org.apache.poi.util.CodePageUtil;
|
||||
import org.apache.poi.util.Internal;
|
||||
import org.apache.poi.util.StringUtil;
|
||||
|
||||
@ -60,25 +61,21 @@ public class TextPiece extends PropertyNode<TextPiece> {
|
||||
|
||||
// Validate
|
||||
int textLength = ((CharSequence) _buf).length();
|
||||
validateLengths(start, end, textLength, pd);
|
||||
if (end - start != textLength) {
|
||||
throw new IllegalStateException("Told we're for characters " + start + " -> " + end + ", but actually covers " + textLength + " characters!");
|
||||
}
|
||||
if (end < start) {
|
||||
throw new IllegalStateException("Told we're of negative size! start=" + start + " end=" + end);
|
||||
}
|
||||
}
|
||||
|
||||
protected void validateLengths(int start, int end, int textLength, PieceDescriptor pd) {
|
||||
if (end - start != textLength) {
|
||||
throw new IllegalStateException("Told we're for characters " + start + " -> " + end + ", but actually covers " + textLength + " characters!");
|
||||
}
|
||||
}
|
||||
/**
|
||||
* Create the StringBuilder from the text and unicode flag
|
||||
*/
|
||||
private static StringBuilder buildInitSB(byte[] text, PieceDescriptor pd) {
|
||||
byte[] textBuffer = text;
|
||||
if (StringUtil.BIG5.equals(pd.getCharset())) {
|
||||
String txt = new StringBuilder(StringUtil.littleEndianBig5Stream(text, 0, text.length)).toString();
|
||||
return new StringBuilder(txt);
|
||||
return new StringBuilder(CodePageUtil.cp950ToString(text, 0, text.length));
|
||||
}
|
||||
|
||||
String str = new String(textBuffer, 0, textBuffer.length, (pd.isUnicode()) ? StringUtil.UTF16LE : pd.getCharset());
|
||||
|
@ -49,7 +49,6 @@ import org.apache.poi.poifs.filesystem.POIFSFileSystem;
|
||||
import org.apache.poi.util.IOUtils;
|
||||
import org.apache.poi.util.POILogFactory;
|
||||
import org.apache.poi.util.POILogger;
|
||||
import org.junit.Ignore;
|
||||
import org.junit.Test;
|
||||
|
||||
/**
|
||||
@ -729,7 +728,6 @@ public class TestBugs{
|
||||
* Bug 51944 - PAPFormattedDiskPage.getPAPX - IndexOutOfBounds
|
||||
*/
|
||||
@Test
|
||||
@Ignore("Test now passes in Java 1.7 and 1.8, but not 1.6")
|
||||
public void testBug51944() throws Exception
|
||||
{
|
||||
HWPFOldDocument doc = HWPFTestDataSamples.openOldSampleFile("Bug51944.doc");
|
||||
|
@ -247,8 +247,8 @@ public final class TestHWPFOldDocument extends HWPFTestCase {
|
||||
*/
|
||||
assertContains(txt, "\n9-55 xxxxx block5");
|
||||
//TODO: figure out why these two aren't passing
|
||||
// assertContains(txt, "\u2019\u0078 block2");//make sure smart quote is extracted correctly
|
||||
// assertContains(txt, "We are able to");//not sure if we can get this easily?
|
||||
//assertContains(txt, "\u2019\u0078 block2");//make sure smart quote is extracted correctly
|
||||
//assertContains(txt, "We are able to");//not sure if we can get this easily?
|
||||
}
|
||||
|
||||
}
|
||||
|
@ -0,0 +1,77 @@
|
||||
/* ====================================================================
|
||||
Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
contributor license agreements. See the NOTICE file distributed with
|
||||
this work for additional information regarding copyright ownership.
|
||||
The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
(the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
==================================================================== */
|
||||
|
||||
package org.apache.poi.util;
|
||||
|
||||
|
||||
import static org.junit.Assert.assertEquals;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.Reader;
|
||||
|
||||
import org.junit.Test;
|
||||
|
||||
public class TestLittleEndianCP950Reader {
|
||||
|
||||
@Test
|
||||
public void testPersonalUseMappings() throws Exception {
|
||||
//ftp://ftp.unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WindowsBestFit/bestfit950.txt
|
||||
byte[] data = new byte[2];
|
||||
data[1] = (byte) 0xfe;
|
||||
data[0] = (byte) 0xd3;
|
||||
assertCharEquals('\uE2E5', data);
|
||||
|
||||
data[1] = (byte) 0x90;
|
||||
data[0] = (byte) 0xb6;
|
||||
assertCharEquals('\uE49F', data);
|
||||
|
||||
//actually found in document
|
||||
//but this disagrees with file above
|
||||
data[1] = (byte) 0x8E;
|
||||
data[0] = (byte) 0xA8;
|
||||
assertCharEquals('\uE357', data);
|
||||
|
||||
data[1] = (byte) 0x8E;
|
||||
data[0] = (byte) 0xE6;
|
||||
assertCharEquals('\uE395', data);
|
||||
|
||||
/*
|
||||
//TODO: figure out why this isn't working
|
||||
data[0] = (byte)0xF9;
|
||||
data[1] = (byte)0xD8;
|
||||
assertCharEquals('\u88CF', data);
|
||||
*/
|
||||
|
||||
}
|
||||
|
||||
@Test
|
||||
public void one() {
|
||||
byte b = (byte) 0xfe;
|
||||
byte c = (byte) 0xd3;
|
||||
|
||||
int i = ((b & 0xff) << 8) + (c & 0xff);
|
||||
System.out.println(i);
|
||||
}
|
||||
|
||||
private void assertCharEquals(char expected, byte[] data) throws IOException {
|
||||
Reader reader = new LittleEndianCP950Reader(data);
|
||||
int c = reader.read();
|
||||
assertEquals((int) expected, c);
|
||||
int eof = reader.read();
|
||||
assertEquals("should be end of stream", -1, eof);
|
||||
}
|
||||
}
|
Loading…
Reference in New Issue
Block a user