From 9e4b38ea9a00d201df71dbf70eff7a24a60ddade Mon Sep 17 00:00:00 2001 From: Reinhard Pointner Date: Thu, 2 Jan 2014 14:56:10 +0000 Subject: [PATCH] * optimize subtitle collection --- .../filebot/cli/CmdlineOperations.java | 2 +- .../filebot/similarity/MetricCascade.java | 29 +++-- .../filebot/subtitle/SubtitleUtilities.java | 112 ++++++++---------- .../ui/subtitle/SubtitleAutoMatchDialog.java | 7 +- 4 files changed, 72 insertions(+), 78 deletions(-) diff --git a/source/net/sourceforge/filebot/cli/CmdlineOperations.java b/source/net/sourceforge/filebot/cli/CmdlineOperations.java index fd0a5c37..2e7402d1 100644 --- a/source/net/sourceforge/filebot/cli/CmdlineOperations.java +++ b/source/net/sourceforge/filebot/cli/CmdlineOperations.java @@ -828,7 +828,7 @@ public class CmdlineOperations implements CmdlineInterface { private Map lookupSubtitleByFileName(SubtitleProvider service, Collection querySet, Language language, Collection videoFiles, boolean strict) throws Exception { // search for subtitles - List subtitles = findSubtitles(service, querySet, language.getName()); + Set subtitles = findSubtitles(service, querySet, language.getName()); // match subtitle files to video files if (subtitles.size() > 0) { diff --git a/source/net/sourceforge/filebot/similarity/MetricCascade.java b/source/net/sourceforge/filebot/similarity/MetricCascade.java index 34aa8499..8df70885 100644 --- a/source/net/sourceforge/filebot/similarity/MetricCascade.java +++ b/source/net/sourceforge/filebot/similarity/MetricCascade.java @@ -1,19 +1,20 @@ - package net.sourceforge.filebot.similarity; - import static java.lang.Math.*; - public class MetricCascade implements SimilarityMetric { - + private final SimilarityMetric[] cascade; - + private final boolean shortCircuit; public MetricCascade(SimilarityMetric... cascade) { - this.cascade = cascade; + this(true, cascade); + } + + public MetricCascade(boolean shortCircuit, SimilarityMetric... cascade) { + this.cascade = cascade; + this.shortCircuit = shortCircuit; } - @Override public float getSimilarity(Object o1, Object o2) { @@ -22,16 +23,20 @@ public class MetricCascade implements SimilarityMetric { float similarity = metric.getSimilarity(o1, o2); if (abs(similarity) >= abs(f)) { // perfect match, ignore remaining metrics - if (similarity >= 1) { - return similarity; + if (shortCircuit) { + if (similarity >= 1) { + return similarity; + } else if (similarity <= -1) { + return similarity; + } } - + // possible match or perfect negative match f = similarity; } } - + return f; } - + } diff --git a/source/net/sourceforge/filebot/subtitle/SubtitleUtilities.java b/source/net/sourceforge/filebot/subtitle/SubtitleUtilities.java index 5f9d0131..14c8256a 100644 --- a/source/net/sourceforge/filebot/subtitle/SubtitleUtilities.java +++ b/source/net/sourceforge/filebot/subtitle/SubtitleUtilities.java @@ -1,7 +1,5 @@ - package net.sourceforge.filebot.subtitle; - import static java.lang.Math.*; import static net.sourceforge.filebot.MediaTypes.*; import static net.sourceforge.filebot.media.MediaDetection.*; @@ -26,6 +24,7 @@ import java.util.List; import java.util.Map; import java.util.Set; +import net.sourceforge.filebot.Language; import net.sourceforge.filebot.similarity.Match; import net.sourceforge.filebot.similarity.Matcher; import net.sourceforge.filebot.similarity.MetricAvg; @@ -33,25 +32,23 @@ import net.sourceforge.filebot.similarity.MetricCascade; import net.sourceforge.filebot.similarity.NameSimilarityMetric; import net.sourceforge.filebot.similarity.SequenceMatchSimilarity; import net.sourceforge.filebot.similarity.SimilarityMetric; -import net.sourceforge.filebot.Language; import net.sourceforge.filebot.vfs.ArchiveType; import net.sourceforge.filebot.vfs.MemoryFile; import net.sourceforge.filebot.web.SearchResult; import net.sourceforge.filebot.web.SubtitleDescriptor; import net.sourceforge.filebot.web.SubtitleProvider; - public final class SubtitleUtilities { - + public static Map matchSubtitles(Collection files, Collection subtitles, boolean strict) throws InterruptedException { Map subtitleByVideo = new LinkedHashMap(); - + // optimize for generic media <-> subtitle matching SimilarityMetric[] metrics = new SimilarityMetric[] { EpisodeFunnel, EpisodeBalancer, NameSubstringSequence, new MetricCascade(NameSubstringSequence, Name), Numeric, new NameSimilarityMetric() }; - - // subtitle verification metric specifically excluding SxE mismatches + + // subtitle verification metric specifically excluding SxE mismatches SimilarityMetric absoluteSeasonEpisode = new SimilarityMetric() { - + @Override public float getSimilarity(Object o1, Object o2) { float f = SeasonEpisode.getSimilarity(o1, o2); @@ -62,175 +59,167 @@ public final class SubtitleUtilities { } }; SimilarityMetric sanity = new MetricCascade(absoluteSeasonEpisode, AirDate, new MetricAvg(NameSubstringSequence, Name), getMovieMatchMetric()); - + // first match everything as best as possible, then filter possibly bad matches Matcher matcher = new Matcher(files, subtitles, false, metrics); - + for (Match it : matcher.match()) { if (sanity.getSimilarity(it.getValue(), it.getCandidate()) >= (strict ? 0.9f : 0.6f)) { subtitleByVideo.put(it.getValue(), it.getCandidate()); } } - + return subtitleByVideo; } - - - public static List findSubtitles(SubtitleProvider service, Collection querySet, String languageName) throws Exception { - List subtitles = new ArrayList(); - + + public static Set findSubtitles(SubtitleProvider service, Collection querySet, String languageName) throws Exception { + Set subtitles = new LinkedHashSet(); + // search for and automatically select movie / show entry Set resultSet = new HashSet(); for (String query : querySet) { resultSet.addAll(findProbableSearchResults(query, service.search(query))); } - + // fetch subtitles for all search results for (SearchResult it : resultSet) { subtitles.addAll(service.getSubtitleList(it, languageName)); } - + return subtitles; } - - + protected static Collection findProbableSearchResults(String query, Iterable searchResults) { // auto-select most probable search result Set probableMatches = new LinkedHashSet(); - + // use name similarity metric SimilarityMetric metric = new MetricAvg(new SequenceMatchSimilarity(), new NameSimilarityMetric()); - + // find probable matches using name similarity > threshold for (SearchResult result : searchResults) { if (metric.getSimilarity(query, removeTrailingBrackets(result.getName())) > 0.8f || result.getName().toLowerCase().startsWith(query.toLowerCase())) { probableMatches.add(result); } } - + return probableMatches; } - - + /** * Detect charset and parse subtitle file even if extension is invalid */ public static List decodeSubtitles(MemoryFile file) throws IOException { // gather all formats, put likely formats first LinkedList likelyFormats = new LinkedList(); - + for (SubtitleFormat format : SubtitleFormat.values()) { if (format.getFilter().accept(file.getName())) likelyFormats.addFirst(format); else likelyFormats.addLast(format); } - + // decode bytes String textfile = getText(file.getData()); - + // decode subtitle file with the first reader that seems to work for (SubtitleFormat format : likelyFormats) { // reset reader to position 0 SubtitleReader parser = format.newReader(new StringReader(textfile)); - + if (parser.hasNext()) { // correct format found List list = new ArrayList(500); - + // read subtitle file while (parser.hasNext()) { list.add(parser.next()); } - + return list; } } - + // unsupported subtitle format throw new IOException("Cannot read subtitle format"); } - - + public static ByteBuffer exportSubtitles(MemoryFile data, SubtitleFormat outputFormat, long outputTimingOffset, Charset outputEncoding) throws IOException { if (outputFormat != null && outputFormat != SubtitleFormat.SubRip) { throw new IllegalArgumentException("Format not supported"); } - + // convert to target format and target encoding if (outputFormat == SubtitleFormat.SubRip) { // output buffer StringBuilder buffer = new StringBuilder(4 * 1024); SubRipWriter out = new SubRipWriter(buffer); - + for (SubtitleElement it : decodeSubtitles(data)) { if (outputTimingOffset != 0) it = new SubtitleElement(max(0, it.getStart() + outputTimingOffset), max(0, it.getEnd() + outputTimingOffset), it.getText()); - + out.write(it); } - + return outputEncoding.encode(CharBuffer.wrap(buffer)); } - + // only change encoding return outputEncoding.encode(getText(data.getData())); } - - + public static SubtitleFormat getSubtitleFormat(File file) { for (SubtitleFormat it : SubtitleFormat.values()) { if (it.getFilter().accept(file)) return it; } - + return null; } - - + public static SubtitleFormat getSubtitleFormatByName(String name) { for (SubtitleFormat it : SubtitleFormat.values()) { // check by name if (it.name().equalsIgnoreCase(name)) return it; - + // check by extension if (it.getFilter().acceptExtension(name)) return it; } - + return null; } - - + public static String formatSubtitle(String name, String languageName, String type) { StringBuilder sb = new StringBuilder(name); - + if (languageName != null) { String lang = Language.getISO3LanguageCodeByName(languageName); - + if (lang == null) { // we probably won't get here, but just in case lang = languageName.replaceAll("\\W", ""); } - + sb.append('.').append(lang); } - + if (type != null) { sb.append('.').append(type); } - + return sb.toString(); } - - + public static MemoryFile fetchSubtitle(SubtitleDescriptor descriptor) throws Exception { ByteBuffer data = descriptor.fetch(); - + // extract subtitles from archive ArchiveType type = ArchiveType.forName(descriptor.getType()); - + if (type != ArchiveType.UNKOWN) { // extract subtitle from archive Iterator it = type.fromData(data).iterator(); @@ -241,17 +230,16 @@ public final class SubtitleUtilities { } } } - + // assume that the fetched data is the subtitle return new MemoryFile(descriptor.getPath(), data); } - - + /** * Dummy constructor to prevent instantiation. */ private SubtitleUtilities() { throw new UnsupportedOperationException(); } - + } diff --git a/source/net/sourceforge/filebot/ui/subtitle/SubtitleAutoMatchDialog.java b/source/net/sourceforge/filebot/ui/subtitle/SubtitleAutoMatchDialog.java index 2b53adcd..3e1f6ef4 100644 --- a/source/net/sourceforge/filebot/ui/subtitle/SubtitleAutoMatchDialog.java +++ b/source/net/sourceforge/filebot/ui/subtitle/SubtitleAutoMatchDialog.java @@ -892,7 +892,7 @@ class SubtitleAutoMatchDialog extends JDialog { @Override public String getName() { - return String.format("%s (by hash)", service.getName()); + return String.format("%s [via hash]", service.getName()); } @Override @@ -919,7 +919,7 @@ class SubtitleAutoMatchDialog extends JDialog { @Override public String getName() { - return String.format("%s (by name)", service.getName()); + return String.format("%s [via name]", service.getName()); } @Override @@ -946,7 +946,7 @@ class SubtitleAutoMatchDialog extends JDialog { } } - List subtitles = findSubtitles(service, querySet, languageName); + Set subtitles = findSubtitles(service, querySet, languageName); // dialog may have been cancelled by now if (Thread.interrupted()) { @@ -978,6 +978,7 @@ class SubtitleAutoMatchDialog extends JDialog { SimilarityMetric sanity = EpisodeMetrics.verificationMetric(); float minMatchSimilarity = 0.5f; + // this could be very slow, lets hope at this point there is not much left due to positive hash matches for (File file : files) { // add matching subtitles for (SubtitleDescriptor it : subtitles) {