From 5b59ad3ad16fa76558bfb8274abdbd72705af460 Mon Sep 17 00:00:00 2001 From: Reinhard Pointner Date: Sun, 17 May 2015 09:39:58 +0000 Subject: [PATCH] * fix Unicode BOM issues --- .../filebot/subtitle/SubtitleUtilities.java | 10 +- source/net/filebot/util/UnicodeReader.java | 101 ++++++++++++++++++ 2 files changed, 107 insertions(+), 4 deletions(-) create mode 100644 source/net/filebot/util/UnicodeReader.java diff --git a/source/net/filebot/subtitle/SubtitleUtilities.java b/source/net/filebot/subtitle/SubtitleUtilities.java index f0f34744..e22089cf 100644 --- a/source/net/filebot/subtitle/SubtitleUtilities.java +++ b/source/net/filebot/subtitle/SubtitleUtilities.java @@ -10,7 +10,7 @@ import static net.filebot.util.FileUtilities.*; import java.io.File; import java.io.IOException; -import java.io.StringReader; +import java.io.Reader; import java.nio.ByteBuffer; import java.nio.CharBuffer; import java.nio.charset.Charset; @@ -39,6 +39,8 @@ import net.filebot.similarity.MetricCascade; import net.filebot.similarity.NameSimilarityMetric; import net.filebot.similarity.SequenceMatchSimilarity; import net.filebot.similarity.SimilarityMetric; +import net.filebot.util.ByteBufferInputStream; +import net.filebot.util.UnicodeReader; import net.filebot.vfs.ArchiveType; import net.filebot.vfs.MemoryFile; import net.filebot.web.Movie; @@ -270,13 +272,13 @@ public final class SubtitleUtilities { likelyFormats.addLast(format); } - // decode bytes - String textfile = getText(file.getData()); + // decode bytes and beware of byte-order marks + Reader reader = new UnicodeReader(new ByteBufferInputStream(file.getData())); // decode subtitle file with the first reader that seems to work for (SubtitleFormat format : likelyFormats) { // reset reader to position 0 - SubtitleReader parser = format.newReader(new StringReader(textfile)); + SubtitleReader parser = format.newReader(reader); if (parser.hasNext()) { // correct format found diff --git a/source/net/filebot/util/UnicodeReader.java 
// Original location: source/net/filebot/util/UnicodeReader.java (package net.filebot.util)

import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.nio.CharBuffer;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;

/**
 * A {@link Reader} that inspects the first bytes of a stream for a Unicode byte-order mark (BOM),
 * selects the matching charset, and removes the BOM from the decoded text.
 *
 * <p>Recognized marks are UTF-8 ({@code EF BB BF}), UTF-16BE ({@code FE FF}), UTF-16LE
 * ({@code FF FE}), UTF-32BE ({@code 00 00 FE FF}) and UTF-32LE ({@code FF FE 00 00}). When no
 * mark is found the stream is decoded as UTF-8 from its first byte.
 *
 * <p>All reading operations are forwarded to an internal {@link InputStreamReader}. Equality and
 * hash code are the identity-based ones inherited from {@link Object}.
 */
public class UnicodeReader extends Reader {

    /** Length in bytes of the longest BOM that is recognized (UTF-32). */
    private static final int BOM_SIZE = 4;

    private final InputStreamReader reader;

    /**
     * Wraps {@code stream}, detecting and consuming a leading BOM if one is present.
     *
     * @param stream the byte source; must support {@link InputStream#mark(int)}
     * @throws IllegalArgumentException if {@code stream} does not support mark/reset
     * @throws IOException if reading or resetting the stream fails
     */
    public UnicodeReader(InputStream stream) throws IOException {
        if (!stream.markSupported()) {
            throw new IllegalArgumentException("stream must support mark");
        }

        stream.mark(BOM_SIZE);
        byte[] head = new byte[BOM_SIZE];
        int headLength = readHead(stream, head);

        Charset charset = StandardCharsets.UTF_8;
        int bomLength = 0;

        // The UTF-32LE mark begins with the UTF-16LE mark, so the 4-byte marks are tested first.
        if (hasSignature(head, headLength, 0x00, 0x00, 0xFE, 0xFF)) {
            charset = Charset.forName("UTF-32BE");
            bomLength = 4;
        } else if (hasSignature(head, headLength, 0xFF, 0xFE, 0x00, 0x00)) {
            charset = Charset.forName("UTF-32LE");
            bomLength = 4;
        } else if (hasSignature(head, headLength, 0xEF, 0xBB, 0xBF)) {
            charset = StandardCharsets.UTF_8;
            bomLength = 3;
        } else if (hasSignature(head, headLength, 0xFE, 0xFF)) {
            charset = StandardCharsets.UTF_16BE;
            bomLength = 2;
        } else if (hasSignature(head, headLength, 0xFF, 0xFE)) {
            charset = StandardCharsets.UTF_16LE;
            bomLength = 2;
        }

        // Rewind to the start and consume exactly the BOM bytes. Single-byte reads are used
        // because InputStream.skip is allowed to skip fewer bytes than requested.
        stream.reset();
        for (int i = 0; i < bomLength; i++) {
            if (stream.read() < 0) {
                break;
            }
        }

        reader = new InputStreamReader(stream, charset);
    }

    /**
     * Fills {@code buffer} from {@code in}, repeating the read until the buffer is full or the
     * stream stops delivering bytes.
     *
     * @return the number of bytes actually stored in {@code buffer}
     */
    private static int readHead(InputStream in, byte[] buffer) throws IOException {
        int total = 0;
        while (total < buffer.length) {
            int count = in.read(buffer, total, buffer.length - total);
            if (count <= 0) {
                break;
            }
            total += count;
        }
        return total;
    }

    /**
     * Tells whether the first {@code length} valid bytes of {@code head} begin with
     * {@code signature} (given as unsigned byte values).
     */
    private static boolean hasSignature(byte[] head, int length, int... signature) {
        if (length < signature.length) {
            return false;
        }
        for (int i = 0; i < signature.length; i++) {
            if ((head[i] & 0xFF) != signature[i]) {
                return false;
            }
        }
        return true;
    }

    /**
     * Returns the encoding name reported by the underlying {@link InputStreamReader}.
     *
     * @return the value of {@link InputStreamReader#getEncoding()}
     */
    public String getEncoding() {
        return reader.getEncoding();
    }

    @Override
    public int read(CharBuffer target) throws IOException {
        return reader.read(target);
    }

    @Override
    public int read(char[] cbuf) throws IOException {
        return reader.read(cbuf);
    }

    @Override
    public int read() throws IOException {
        return reader.read();
    }

    @Override
    public int read(char[] cbuf, int offset, int length) throws IOException {
        return reader.read(cbuf, offset, length);
    }

    @Override
    public long skip(long n) throws IOException {
        return reader.skip(n);
    }

    @Override
    public boolean ready() throws IOException {
        return reader.ready();
    }

    @Override
    public void close() throws IOException {
        reader.close();
    }

    @Override
    public boolean markSupported() {
        return reader.markSupported();
    }

    @Override
    public void mark(int readAheadLimit) throws IOException {
        reader.mark(readAheadLimit);
    }

    @Override
    public void reset() throws IOException {
        reader.reset();
    }

}