mirror of
https://github.com/mitb-archive/filebot
synced 2024-12-25 09:18:51 -05:00
* auto-detect encoding if necessary
This commit is contained in:
parent
5b59ad3ad1
commit
545224396a
@ -5,11 +5,14 @@ import java.nio.CharBuffer;
|
|||||||
import java.nio.charset.Charset;
|
import java.nio.charset.Charset;
|
||||||
import java.nio.charset.StandardCharsets;
|
import java.nio.charset.StandardCharsets;
|
||||||
|
|
||||||
|
import com.ibm.icu.text.CharsetDetector;
|
||||||
|
import com.ibm.icu.text.CharsetMatch;
|
||||||
|
|
||||||
public class UnicodeReader extends Reader {
|
public class UnicodeReader extends Reader {
|
||||||
|
|
||||||
private static final int BOM_SIZE = 4;
|
private static final int BOM_SIZE = 4;
|
||||||
|
|
||||||
private InputStreamReader reader = null;
|
private final Reader reader;
|
||||||
|
|
||||||
public UnicodeReader(InputStream stream) throws IOException {
|
public UnicodeReader(InputStream stream) throws IOException {
|
||||||
if (!stream.markSupported())
|
if (!stream.markSupported())
|
||||||
@ -19,31 +22,38 @@ public class UnicodeReader extends Reader {
|
|||||||
byte bom[] = new byte[BOM_SIZE];
|
byte bom[] = new byte[BOM_SIZE];
|
||||||
stream.read(bom, 0, bom.length);
|
stream.read(bom, 0, bom.length);
|
||||||
|
|
||||||
Charset charset = StandardCharsets.UTF_8;
|
Charset bomEncoding = null;
|
||||||
int skip = 0;
|
int skip = 0;
|
||||||
|
|
||||||
if ((bom[0] == (byte) 0xEF) && (bom[1] == (byte) 0xBB) && (bom[2] == (byte) 0xBF)) {
|
if ((bom[0] == (byte) 0xEF) && (bom[1] == (byte) 0xBB) && (bom[2] == (byte) 0xBF)) {
|
||||||
charset = StandardCharsets.UTF_8;
|
bomEncoding = StandardCharsets.UTF_8;
|
||||||
skip = 3;
|
skip = 3;
|
||||||
} else if ((bom[0] == (byte) 0xFE) && (bom[1] == (byte) 0xFF)) {
|
} else if ((bom[0] == (byte) 0xFE) && (bom[1] == (byte) 0xFF)) {
|
||||||
charset = StandardCharsets.UTF_16BE;
|
bomEncoding = StandardCharsets.UTF_16BE;
|
||||||
skip = 2;
|
skip = 2;
|
||||||
} else if ((bom[0] == (byte) 0xFF) && (bom[1] == (byte) 0xFE)) {
|
} else if ((bom[0] == (byte) 0xFF) && (bom[1] == (byte) 0xFE)) {
|
||||||
charset = StandardCharsets.UTF_16LE;
|
bomEncoding = StandardCharsets.UTF_16LE;
|
||||||
skip = 2;
|
skip = 2;
|
||||||
} else if ((bom[0] == (byte) 0x00) && (bom[1] == (byte) 0x00) && (bom[2] == (byte) 0xFE) && (bom[3] == (byte) 0xFF)) {
|
} else if ((bom[0] == (byte) 0x00) && (bom[1] == (byte) 0x00) && (bom[2] == (byte) 0xFE) && (bom[3] == (byte) 0xFF)) {
|
||||||
charset = Charset.forName("UTF-32BE");
|
bomEncoding = Charset.forName("UTF-32BE");
|
||||||
skip = 4;
|
skip = 4;
|
||||||
} else if ((bom[0] == (byte) 0xFF) && (bom[1] == (byte) 0xFE) && (bom[2] == (byte) 0x00) && (bom[3] == (byte) 0x00)) {
|
} else if ((bom[0] == (byte) 0xFF) && (bom[1] == (byte) 0xFE) && (bom[2] == (byte) 0x00) && (bom[3] == (byte) 0x00)) {
|
||||||
charset = Charset.forName("UTF-32LE");
|
bomEncoding = Charset.forName("UTF-32LE");
|
||||||
skip = 4;
|
skip = 4;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// rewind and skip BOM
|
||||||
stream.reset();
|
stream.reset();
|
||||||
stream.skip(skip);
|
stream.skip(skip);
|
||||||
|
|
||||||
// initialize reader
|
// guess character encoding if necessary
|
||||||
reader = new InputStreamReader(stream, charset);
|
if (bomEncoding == null) {
|
||||||
|
// auto-detect encoding
|
||||||
|
reader = new CharsetDetector().getReader(stream, "UTF-8");
|
||||||
|
} else {
|
||||||
|
// initialize reader via BOM
|
||||||
|
reader = new InputStreamReader(stream, bomEncoding);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
public int hashCode() {
|
public int hashCode() {
|
||||||
@ -62,10 +72,6 @@ public class UnicodeReader extends Reader {
|
|||||||
return reader.read(cbuf);
|
return reader.read(cbuf);
|
||||||
}
|
}
|
||||||
|
|
||||||
public String getEncoding() {
|
|
||||||
return reader.getEncoding();
|
|
||||||
}
|
|
||||||
|
|
||||||
public int read() throws IOException {
|
public int read() throws IOException {
|
||||||
return reader.read();
|
return reader.read();
|
||||||
}
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user