* auto-detect encoding if necessary

This commit is contained in:
Reinhard Pointner 2015-05-17 10:18:37 +00:00
parent 5b59ad3ad1
commit 545224396a
1 changed file with 19 additions and 13 deletions

View File

@ -5,11 +5,14 @@ import java.nio.CharBuffer;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import com.ibm.icu.text.CharsetDetector;
import com.ibm.icu.text.CharsetMatch;
public class UnicodeReader extends Reader {
private static final int BOM_SIZE = 4;
private InputStreamReader reader = null;
private final Reader reader;
public UnicodeReader(InputStream stream) throws IOException {
if (!stream.markSupported())
@ -19,31 +22,38 @@ public class UnicodeReader extends Reader {
byte bom[] = new byte[BOM_SIZE];
stream.read(bom, 0, bom.length);
Charset charset = StandardCharsets.UTF_8;
Charset bomEncoding = null;
int skip = 0;
if ((bom[0] == (byte) 0xEF) && (bom[1] == (byte) 0xBB) && (bom[2] == (byte) 0xBF)) {
charset = StandardCharsets.UTF_8;
bomEncoding = StandardCharsets.UTF_8;
skip = 3;
} else if ((bom[0] == (byte) 0xFE) && (bom[1] == (byte) 0xFF)) {
charset = StandardCharsets.UTF_16BE;
bomEncoding = StandardCharsets.UTF_16BE;
skip = 2;
} else if ((bom[0] == (byte) 0xFF) && (bom[1] == (byte) 0xFE)) {
charset = StandardCharsets.UTF_16LE;
bomEncoding = StandardCharsets.UTF_16LE;
skip = 2;
} else if ((bom[0] == (byte) 0x00) && (bom[1] == (byte) 0x00) && (bom[2] == (byte) 0xFE) && (bom[3] == (byte) 0xFF)) {
charset = Charset.forName("UTF-32BE");
bomEncoding = Charset.forName("UTF-32BE");
skip = 4;
} else if ((bom[0] == (byte) 0xFF) && (bom[1] == (byte) 0xFE) && (bom[2] == (byte) 0x00) && (bom[3] == (byte) 0x00)) {
charset = Charset.forName("UTF-32LE");
bomEncoding = Charset.forName("UTF-32LE");
skip = 4;
}
// rewind and skip BOM
stream.reset();
stream.skip(skip);
// initialize reader
reader = new InputStreamReader(stream, charset);
// guess character encoding if necessary
if (bomEncoding == null) {
// auto-detect encoding
reader = new CharsetDetector().getReader(stream, "UTF-8");
} else {
// initialize reader via BOM
reader = new InputStreamReader(stream, bomEncoding);
}
}
public int hashCode() {
@ -62,10 +72,6 @@ public class UnicodeReader extends Reader {
return reader.read(cbuf);
}
/**
 * Reports the encoding in use by delegating to the wrapped reader.
 *
 * NOTE(review): this relies on the {@code reader} field exposing
 * {@code getEncoding()}; confirm against the field's declared type.
 *
 * @return the string returned by the wrapped reader's {@code getEncoding()}
 */
public String getEncoding() {
return reader.getEncoding();
}
/**
 * Reads a single character by delegating to the wrapped reader.
 *
 * @return the value returned by the wrapped reader's {@code read()}
 * @throws IOException if the wrapped reader fails while reading
 */
public int read() throws IOException {
return reader.read();
}