Refactor UnicodeReader and BOM detection

This commit is contained in:
Reinhard Pointner 2016-11-21 01:56:43 +08:00
parent 53376c3de6
commit 8932eb0b2a
8 changed files with 137 additions and 185 deletions

View File

@ -9,7 +9,7 @@
<classpathentry kind="lib" path="lib/jars/xmlrpc.jar"/>
<classpathentry kind="lib" path="lib/ivy/jar/ehcache.jar" sourcepath="lib/ivy/source/ehcache.jar"/>
<classpathentry kind="lib" path="lib/ivy/jar/glazedlists_java15.jar" sourcepath="lib/ivy/source/glazedlists_java15.jar"/>
<classpathentry kind="lib" path="lib/ivy/jar/icu4j.jar"/>
<classpathentry kind="lib" path="lib/ivy/jar/icu4j.jar" sourcepath="lib/ivy/source/icu4j.jar"/>
<classpathentry kind="lib" path="lib/ivy/jar/jna.jar" sourcepath="lib/ivy/source/jna.jar"/>
<classpathentry kind="lib" path="lib/ivy/jar/junit.jar"/>
<classpathentry kind="lib" path="lib/ivy/jar/miglayout-core.jar"/>

View File

@ -1219,7 +1219,7 @@ public class MediaDetection {
// parse ids from nfo files
for (File nfo : nfoFiles) {
try {
String text = new String(readFile(nfo), "UTF-8");
String text = readTextFile(nfo);
collection.addAll(grepImdbId(text));
} catch (Exception e) {
debug.warning("Failed to read nfo: " + e.getMessage());
@ -1246,7 +1246,7 @@ public class MediaDetection {
continue;
for (File nfo : getChildren(folder, NFO_FILES)) {
String text = new String(readFile(nfo), "UTF-8");
String text = readTextFile(nfo);
for (int imdbid : grepImdbId(text)) {
SearchResult series = WebServices.TheTVDB.lookupByIMDbID(imdbid, language);

View File

@ -1,5 +1,6 @@
package net.filebot.subtitle;
import static java.nio.charset.StandardCharsets.*;
import static java.util.Collections.*;
import static java.util.stream.Collectors.*;
import static net.filebot.Logging.*;
@ -10,11 +11,10 @@ import static net.filebot.util.FileUtilities.*;
import java.io.File;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.Reader;
import java.nio.ByteBuffer;
import java.nio.CharBuffer;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
@ -33,6 +33,8 @@ import java.util.function.Predicate;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import org.apache.commons.io.IOUtils;
import com.optimaize.langdetect.DetectedLanguage;
import com.optimaize.langdetect.LanguageDetector;
import com.optimaize.langdetect.LanguageDetectorBuilder;
@ -53,7 +55,7 @@ import net.filebot.similarity.SequenceMatchSimilarity;
import net.filebot.similarity.SimilarityComparator;
import net.filebot.similarity.SimilarityMetric;
import net.filebot.util.ByteBufferInputStream;
import net.filebot.util.UnicodeReader;
import net.filebot.util.ByteBufferOutputStream;
import net.filebot.vfs.ArchiveType;
import net.filebot.vfs.MemoryFile;
import net.filebot.web.Movie;
@ -325,7 +327,7 @@ public final class SubtitleUtilities {
// decode subtitle file with the first reader that seems to work
for (SubtitleFormat format : likelyFormats) {
// decode bytes and beware of byte-order marks
Reader reader = new UnicodeReader(new ByteBufferInputStream(file.getData()), true, StandardCharsets.UTF_8);
Reader reader = createTextReader(new ByteBufferInputStream(file.getData()), true, UTF_8);
// reset reader to position 0
SubtitleReader parser = format.newReader(reader);
@ -347,29 +349,31 @@ public final class SubtitleUtilities {
throw new IOException("Subtitle format not supported");
}
public static ByteBuffer exportSubtitles(MemoryFile data, SubtitleFormat outputFormat, long outputTimingOffset, Charset outputEncoding) throws IOException {
public static ByteBuffer exportSubtitles(MemoryFile file, SubtitleFormat outputFormat, long outputTimingOffset, Charset outputEncoding) throws IOException {
if (outputFormat != null && outputFormat != SubtitleFormat.SubRip) {
throw new IllegalArgumentException("Format not supported");
}
// convert to target format and target encoding
ByteBufferOutputStream buffer = new ByteBufferOutputStream(file.size());
OutputStreamWriter writer = new OutputStreamWriter(buffer, outputEncoding);
if (outputFormat == SubtitleFormat.SubRip) {
// output buffer
StringBuilder buffer = new StringBuilder(4 * 1024);
try (SubRipWriter out = new SubRipWriter(buffer)) {
for (SubtitleElement it : decodeSubtitles(data)) {
// convert to target format and target encoding
try (SubRipWriter out = new SubRipWriter(writer)) {
for (SubtitleElement it : decodeSubtitles(file)) {
if (outputTimingOffset != 0) {
it = new SubtitleElement(Math.max(0, it.getStart() + outputTimingOffset), Math.max(0, it.getEnd() + outputTimingOffset), it.getText());
}
out.write(it);
}
}
return outputEncoding.encode(CharBuffer.wrap(buffer));
} else {
// convert only text encoding
Reader reader = createTextReader(new ByteBufferInputStream(file.getData()), true, UTF_8);
IOUtils.copy(reader, writer);
}
// only change encoding
return outputEncoding.encode(getText(data.getData()));
return buffer.getByteBuffer();
}
public static SubtitleFormat getSubtitleFormat(File file) {

View File

@ -3,6 +3,7 @@ package net.filebot.ui.subtitle.upload;
import static java.util.Collections.*;
import static net.filebot.Logging.*;
import static net.filebot.media.MediaDetection.*;
import static net.filebot.util.FileUtilities.*;
import static net.filebot.util.ui.SwingUI.*;
import java.awt.Color;
@ -32,7 +33,6 @@ import net.filebot.Language;
import net.filebot.ResourceManager;
import net.filebot.WebServices;
import net.filebot.media.MediaDetection;
import net.filebot.util.FileUtilities;
import net.filebot.util.ui.EmptySelectionModel;
import net.filebot.web.Movie;
import net.filebot.web.OpenSubtitlesClient;
@ -151,7 +151,7 @@ public class SubtitleUploadDialog extends JDialog {
if (mapping.getLanguage() == null) {
mapping.setState(Status.Identifying);
try {
Locale locale = database.detectLanguage(FileUtilities.readFile(mapping.getSubtitle()));
Locale locale = database.detectLanguage(readFile(mapping.getSubtitle()));
mapping.setLanguage(Language.getLanguage(locale));
} catch (Exception e) {
debug.log(Level.WARNING, "Failed to auto-detect language: " + e.getMessage());

View File

@ -0,0 +1,73 @@
package net.filebot.util;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
public enum BOM {
UTF_8((byte) 0xEF, (byte) 0xBB, (byte) 0xBF),
UTF_16BE((byte) 0xFE, (byte) 0xFF),
UTF_16LE((byte) 0xFF, (byte) 0xFE),
UTF_32BE((byte) 0x00, (byte) 0x00, (byte) 0xFE, (byte) 0xFF),
UTF_32LE((byte) 0xFF, (byte) 0xFE, (byte) 0x00, (byte) 0x00),
GB_18030((byte) 0x84, (byte) 0x31, (byte) 0x95, (byte) 0x33);
public static final int SIZE = 4;
private byte[] bom;
BOM(byte... bom) {
this.bom = bom;
}
public int size() {
return bom.length;
}
public boolean matches(byte[] bytes) {
if (bytes.length < bom.length) {
return false;
}
for (int i = 0; i < bom.length; i++) {
if (bom[i] != bytes[i]) {
return false;
}
}
return true;
}
public Charset getCharset() {
switch (this) {
case UTF_8:
return StandardCharsets.UTF_8;
case UTF_16BE:
return StandardCharsets.UTF_16BE;
case UTF_16LE:
return StandardCharsets.UTF_16LE;
case UTF_32BE:
return Charset.forName("UTF-32BE");
case UTF_32LE:
return Charset.forName("UTF-32LE");
case GB_18030:
return Charset.forName("GB18030");
}
return null;
}
public static BOM detect(byte[] bytes) {
for (BOM bom : values()) {
if (bom.matches(bytes)) {
return bom;
}
}
return null;
}
}

View File

@ -4,23 +4,22 @@ import static java.nio.charset.StandardCharsets.*;
import static java.util.Arrays.*;
import static java.util.Collections.*;
import static java.util.Comparator.*;
import static java.util.stream.Collectors.*;
import static net.filebot.Logging.*;
import static net.filebot.util.RegularExpressions.*;
import java.io.BufferedInputStream;
import java.io.BufferedReader;
import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.FileFilter;
import java.io.FileInputStream;
import java.io.FilenameFilter;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.math.BigInteger;
import java.nio.ByteBuffer;
import java.nio.channels.FileChannel;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.nio.file.AtomicMoveNotSupportedException;
import java.nio.file.FileVisitOption;
@ -48,7 +47,6 @@ import java.util.TreeMap;
import java.util.TreeSet;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.stream.Collector;
import java.util.stream.Stream;
import org.apache.commons.io.FileUtils;
@ -196,18 +194,21 @@ public final class FileUtilities {
return Files.readAllBytes(file.toPath());
}
public static <R, A> R readLines(File file, Collector<? super String, A, R> collector) throws IOException {
try (BufferedReader reader = new BufferedReader(new UnicodeReader(new ByteArrayInputStream(readFile(file)), false, UTF_8))) {
return reader.lines().collect(collector);
/**
 * Read the entire file into a String.
 *
 * A leading byte-order mark, if present, selects the charset and is excluded from the result; files without
 * a byte-order mark are decoded as UTF-8.
 *
 * @param file the file to read
 * @return the decoded file content
 * @throws IOException if the file cannot be read
 */
public static String readTextFile(File file) throws IOException {
    byte[] content = readFile(file);

    BOM mark = BOM.detect(content);
    if (mark == null) {
        // no byte-order mark: decode everything as UTF-8
        return new String(content, UTF_8);
    }

    // decode only the bytes that follow the byte-order mark
    int offset = mark.size();
    return new String(content, offset, content.length - offset, mark.getCharset());
}
public static List<String> readLines(File file) throws IOException {
return readLines(file, toList());
}
public static String readTextFile(File file) throws IOException {
return readLines(file, joining(System.lineSeparator()));
return asList(NEWLINE.split(readTextFile(file)));
}
public static File writeFile(ByteBuffer data, File destination) throws IOException {
@ -217,35 +218,37 @@ public final class FileUtilities {
return destination;
}
public static Reader createTextReader(File file) throws IOException {
CharsetDetector detector = new CharsetDetector();
detector.setDeclaredEncoding("UTF-8"); // small boost for UTF-8 as default encoding
detector.setText(new BufferedInputStream(new FileInputStream(file)));
public static Reader createTextReader(InputStream in, boolean guess, Charset declaredEncoding) throws IOException {
byte head[] = new byte[BOM.SIZE];
in.mark(head.length);
in.read(head);
in.reset(); // rewind
CharsetMatch charset = detector.detect();
if (charset != null)
return charset.getReader();
// check BOM
BOM bom = BOM.detect(head);
// assume UTF-8 by default
return new InputStreamReader(new FileInputStream(file), StandardCharsets.UTF_8);
}
if (bom != null) {
in.skip(bom.size()); // skip BOM
return new InputStreamReader(in, bom.getCharset());
}
public static String getText(ByteBuffer data) throws IOException {
CharsetDetector detector = new CharsetDetector();
detector.setDeclaredEncoding("UTF-8"); // small boost for UTF-8 as default encoding
detector.setText(new ByteBufferInputStream(data));
CharsetMatch charset = detector.detect();
if (charset != null) {
try {
return charset.getString();
} catch (RuntimeException e) {
throw new IOException("Failed to read text", e);
// auto-detect character encoding
if (guess) {
CharsetDetector detector = new CharsetDetector();
detector.setDeclaredEncoding(declaredEncoding.name());
detector.setText(in);
CharsetMatch match = detector.detect();
if (match != null) {
return match.getReader();
}
}
// assume UTF-8 by default
return UTF_8.decode(data).toString();
// default to declared encoding
return new InputStreamReader(in, declaredEncoding);
}
/**
 * Open the given file as text, with charset guessing enabled and UTF-8 as the declared encoding.
 *
 * The caller owns the returned reader and must close it. If the reader cannot be created, the underlying
 * file stream is closed before the exception is propagated, so that no file handle is leaked.
 *
 * @param file the file to open
 * @return a reader for the file content
 * @throws IOException if the file cannot be opened or its head cannot be read
 */
public static Reader createTextReader(File file) throws IOException {
    InputStream in = new BufferedInputStream(new FileInputStream(file), BUFFER_SIZE);
    try {
        return createTextReader(in, true, UTF_8);
    } catch (IOException | RuntimeException e) {
        // nobody else holds a reference to the stream at this point, so release it here
        try {
            in.close();
        } catch (IOException closeFailure) {
            e.addSuppressed(closeFailure);
        }
        throw e;
    }
}
public static boolean equalsCaseSensitive(File a, File b) {

View File

@ -1,125 +0,0 @@
package net.filebot.util;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.nio.CharBuffer;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import com.ibm.icu.text.CharsetDetector;
/**
 * Reader that decodes a byte stream according to its leading byte-order mark (BOM).
 *
 * <p>
 * If the stream starts with a UTF-8, UTF-16 or UTF-32 mark, the mark is skipped and the corresponding charset
 * is used. Otherwise the charset is either auto-detected or the given default charset is used. All
 * {@link Reader} operations are delegated to the reader created in the constructor.
 */
public class UnicodeReader extends Reader {

    private static final int BOM_SIZE = 4;

    private final Reader reader;

    /**
     * @param stream the bytes to decode; must support {@link InputStream#mark(int)}
     * @param guessCharset whether to auto-detect the charset if the stream has no byte-order mark
     * @param defaultCharset charset to use if there is no byte-order mark and the charset is not (or could not
     *        be) auto-detected; also passed to the detector as the declared encoding
     * @throws IllegalArgumentException if the stream does not support mark
     * @throws IOException if the head of the stream cannot be read
     */
    public UnicodeReader(InputStream stream, boolean guessCharset, Charset defaultCharset) throws IOException {
        if (!stream.markSupported()) {
            throw new IllegalArgumentException("stream must support mark");
        }

        stream.mark(BOM_SIZE);

        // read up to BOM_SIZE bytes and remember how many are real data, so that the zero padding of a
        // short stream can never be mistaken for the trailing zero bytes of a UTF-32 mark
        byte[] head = new byte[BOM_SIZE];
        int available = 0;
        while (available < head.length) {
            int count = stream.read(head, available, head.length - available);
            if (count <= 0) {
                break;
            }
            available += count;
        }

        Charset bomEncoding = null;
        int skip = 0;

        // the UTF-32LE mark (FF FE 00 00) starts with the UTF-16LE mark (FF FE) and must be checked first
        if (available >= 3 && (head[0] == (byte) 0xEF) && (head[1] == (byte) 0xBB) && (head[2] == (byte) 0xBF)) {
            bomEncoding = StandardCharsets.UTF_8;
            skip = 3;
        } else if (available >= 4 && (head[0] == (byte) 0x00) && (head[1] == (byte) 0x00) && (head[2] == (byte) 0xFE) && (head[3] == (byte) 0xFF)) {
            bomEncoding = Charset.forName("UTF-32BE");
            skip = 4;
        } else if (available >= 4 && (head[0] == (byte) 0xFF) && (head[1] == (byte) 0xFE) && (head[2] == (byte) 0x00) && (head[3] == (byte) 0x00)) {
            bomEncoding = Charset.forName("UTF-32LE");
            skip = 4;
        } else if (available >= 2 && (head[0] == (byte) 0xFE) && (head[1] == (byte) 0xFF)) {
            bomEncoding = StandardCharsets.UTF_16BE;
            skip = 2;
        } else if (available >= 2 && (head[0] == (byte) 0xFF) && (head[1] == (byte) 0xFE)) {
            bomEncoding = StandardCharsets.UTF_16LE;
            skip = 2;
        }

        // rewind and skip BOM (skip may legally skip fewer bytes than requested, so keep going)
        stream.reset();
        long remaining = skip;
        while (remaining > 0) {
            long skipped = stream.skip(remaining);
            if (skipped <= 0) {
                break;
            }
            remaining -= skipped;
        }

        if (bomEncoding != null) {
            // initialize reader via BOM
            reader = new InputStreamReader(stream, bomEncoding);
        } else {
            // auto-detect encoding if requested
            // NOTE(review): CharsetDetector.getReader is understood to return null when no charset can be
            // detected - confirm against the ICU4J documentation; the default charset is used in that case
            Reader guessed = guessCharset ? new CharsetDetector().getReader(stream, defaultCharset.name()) : null;
            reader = (guessed != null) ? guessed : new InputStreamReader(stream, defaultCharset);
        }
    }

    // NOTE(review): hashCode and equals are forwarded to the wrapped reader, so an instance does not equal
    // itself unless the wrapped reader says so; kept as-is for compatibility with existing behavior

    @Override
    public int hashCode() {
        return reader.hashCode();
    }

    @Override
    public int read(CharBuffer target) throws IOException {
        return reader.read(target);
    }

    @Override
    public boolean equals(Object obj) {
        return reader.equals(obj);
    }

    @Override
    public int read(char[] cbuf) throws IOException {
        return reader.read(cbuf);
    }

    @Override
    public int read() throws IOException {
        return reader.read();
    }

    @Override
    public int read(char[] cbuf, int offset, int length) throws IOException {
        return reader.read(cbuf, offset, length);
    }

    @Override
    public long skip(long n) throws IOException {
        return reader.skip(n);
    }

    @Override
    public boolean ready() throws IOException {
        return reader.ready();
    }

    @Override
    public void close() throws IOException {
        reader.close();
    }

    @Override
    public boolean markSupported() {
        return reader.markSupported();
    }

    @Override
    public void mark(int readAheadLimit) throws IOException {
        reader.mark(readAheadLimit);
    }

    @Override
    public void reset() throws IOException {
        reader.reset();
    }

}

View File

@ -1,39 +1,36 @@
package net.filebot.vfs;
import java.nio.ByteBuffer;
public class MemoryFile {
private final String path;
private final ByteBuffer data;
/**
 * Create an in-memory file.
 *
 * @param path the file path; backslashes are replaced with forward slashes
 * @param data the file content
 */
public MemoryFile(String path, ByteBuffer data) {
    // always use '/' as folder separator
    String normalizedPath = path.replace('\\', '/');

    this.data = data;
    this.path = normalizedPath;
}
/**
 * @return the part of the path after the last {@code /}, or the whole path if it contains no {@code /}
 */
public String getName() {
    int lastSeparator = path.lastIndexOf("/");
    return path.substring(lastSeparator + 1);
}
/**
 * @return the path of this file, using {@code /} as folder separator
 */
public String getPath() {
    return this.path;
}
/**
 * @return the number of bytes remaining in the data buffer
 */
public int size() {
    return this.data.remaining();
}
/**
 * @return a duplicate of the data buffer, so that callers reading from it do not move the position of the
 *         buffer held by this object
 */
public ByteBuffer getData() {
    ByteBuffer view = data.duplicate();
    return view;
}
@Override
public String toString() {
return path;