* fix Unicode BOM issues

Reinhard Pointner 2015-05-17 09:39:58 +00:00
parent 1c99bd44a2
commit 5b59ad3ad1
2 changed files with 107 additions and 4 deletions


@@ -10,7 +10,7 @@ import static net.filebot.util.FileUtilities.*;
 
 import java.io.File;
 import java.io.IOException;
-import java.io.StringReader;
+import java.io.Reader;
 import java.nio.ByteBuffer;
 import java.nio.CharBuffer;
 import java.nio.charset.Charset;
@@ -39,6 +39,8 @@ import net.filebot.similarity.MetricCascade;
 import net.filebot.similarity.NameSimilarityMetric;
 import net.filebot.similarity.SequenceMatchSimilarity;
 import net.filebot.similarity.SimilarityMetric;
+import net.filebot.util.ByteBufferInputStream;
+import net.filebot.util.UnicodeReader;
 import net.filebot.vfs.ArchiveType;
 import net.filebot.vfs.MemoryFile;
 import net.filebot.web.Movie;
@@ -270,13 +272,13 @@ public final class SubtitleUtilities
 				likelyFormats.addLast(format);
 			}
 
-			// decode bytes
-			String textfile = getText(file.getData());
+			// decode bytes and beware of byte-order marks
+			Reader reader = new UnicodeReader(new ByteBufferInputStream(file.getData()));
 
 			// decode subtitle file with the first reader that seems to work
 			for (SubtitleFormat format : likelyFormats) {
 				// reset reader to position 0
-				SubtitleReader parser = format.newReader(new StringReader(textfile));
+				SubtitleReader parser = format.newReader(reader);
 
 				if (parser.hasNext()) {
 					// correct format found
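
The practical effect of this change is that a BOM-prefixed subtitle file no longer reaches the format parsers with a stray U+FEFF as its first character. A minimal throwaway sketch of the difference, using a plain ByteArrayInputStream in place of FileBot's ByteBufferInputStream and a direct UTF-8 decode in place of getText (both are stand-ins for illustration only, not part of this commit):

import java.io.ByteArrayInputStream;
import java.io.Reader;
import java.io.StringReader;
import java.nio.charset.StandardCharsets;

import net.filebot.util.UnicodeReader;

public class BomDemo {

	public static void main(String[] args) throws Exception {
		// "1" preceded by a UTF-8 byte-order mark, as written by some subtitle editors
		byte[] data = { (byte) 0xEF, (byte) 0xBB, (byte) 0xBF, '1' };

		// decoding the raw bytes straight to a String keeps the BOM as a character
		String text = new String(data, StandardCharsets.UTF_8);
		System.out.println(new StringReader(text).read()); // prints 65279 (U+FEFF)

		// UnicodeReader detects the BOM, selects UTF-8 and skips those 3 bytes
		Reader reader = new UnicodeReader(new ByteArrayInputStream(data));
		System.out.println((char) reader.read()); // prints 1
	}
}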


@@ -0,0 +1,101 @@
+package net.filebot.util;
+
+import java.io.*;
+import java.nio.CharBuffer;
+import java.nio.charset.Charset;
+import java.nio.charset.StandardCharsets;
+
+public class UnicodeReader extends Reader {
+
+	private static final int BOM_SIZE = 4;
+
+	private InputStreamReader reader = null;
+
+	public UnicodeReader(InputStream stream) throws IOException {
+		if (!stream.markSupported())
+			throw new IllegalArgumentException("stream must support mark");
+
+		stream.mark(BOM_SIZE);
+		byte bom[] = new byte[BOM_SIZE];
+		stream.read(bom, 0, bom.length);
+
+		Charset charset = StandardCharsets.UTF_8;
+		int skip = 0;
+
+		if ((bom[0] == (byte) 0xEF) && (bom[1] == (byte) 0xBB) && (bom[2] == (byte) 0xBF)) {
+			charset = StandardCharsets.UTF_8;
+			skip = 3;
+		} else if ((bom[0] == (byte) 0xFE) && (bom[1] == (byte) 0xFF)) {
+			charset = StandardCharsets.UTF_16BE;
+			skip = 2;
+		} else if ((bom[0] == (byte) 0xFF) && (bom[1] == (byte) 0xFE)) {
+			charset = StandardCharsets.UTF_16LE;
+			skip = 2;
+		} else if ((bom[0] == (byte) 0x00) && (bom[1] == (byte) 0x00) && (bom[2] == (byte) 0xFE) && (bom[3] == (byte) 0xFF)) {
+			charset = Charset.forName("UTF-32BE");
+			skip = 4;
+		} else if ((bom[0] == (byte) 0xFF) && (bom[1] == (byte) 0xFE) && (bom[2] == (byte) 0x00) && (bom[3] == (byte) 0x00)) {
+			charset = Charset.forName("UTF-32LE");
+			skip = 4;
+		}
+
+		stream.reset();
+		stream.skip(skip);
+
+		// initialize reader
+		reader = new InputStreamReader(stream, charset);
+	}
+
+	public int hashCode() {
+		return reader.hashCode();
+	}
+
+	public int read(CharBuffer target) throws IOException {
+		return reader.read(target);
+	}
+
+	public boolean equals(Object obj) {
+		return reader.equals(obj);
+	}
+
+	public int read(char[] cbuf) throws IOException {
+		return reader.read(cbuf);
+	}
+
+	public String getEncoding() {
+		return reader.getEncoding();
+	}
+
+	public int read() throws IOException {
+		return reader.read();
+	}
+
+	public int read(char[] cbuf, int offset, int length) throws IOException {
+		return reader.read(cbuf, offset, length);
+	}
+
+	public long skip(long n) throws IOException {
+		return reader.skip(n);
+	}
+
+	public boolean ready() throws IOException {
+		return reader.ready();
+	}
+
+	public void close() throws IOException {
+		reader.close();
+	}
+
+	public boolean markSupported() {
+		return reader.markSupported();
+	}
+
+	public void mark(int readAheadLimit) throws IOException {
+		reader.mark(readAheadLimit);
+	}
+
+	public void reset() throws IOException {
+		reader.reset();
+	}
+}
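
For a quick sanity check of the BOM table in the constructor, the new reader can be exercised directly against hand-built byte arrays. The UnicodeReaderCheck class below is hypothetical test scaffolding, not part of the commit; ByteArrayInputStream is used because it supports mark/reset, which the constructor requires:

import java.io.ByteArrayInputStream;

import net.filebot.util.UnicodeReader;

public class UnicodeReaderCheck {

	static void check(byte[] bytes) throws Exception {
		UnicodeReader reader = new UnicodeReader(new ByteArrayInputStream(bytes));
		char[] buffer = new char[16];
		int n = reader.read(buffer, 0, buffer.length);
		// getEncoding() reports the charset selected from the BOM (historical name)
		System.out.println(reader.getEncoding() + " -> " + new String(buffer, 0, Math.max(n, 0)));
		reader.close();
	}

	public static void main(String[] args) throws Exception {
		check(new byte[] { (byte) 0xEF, (byte) 0xBB, (byte) 0xBF, 'H', 'i' }); // UTF-8 BOM, prints "Hi"
		check(new byte[] { (byte) 0xFF, (byte) 0xFE, 'H', 0, 'i', 0 });        // UTF-16LE BOM, prints "Hi"
		check(new byte[] { 'H', 'i' });                                        // no BOM, falls back to UTF-8
	}
}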