* optimize subtitle collection

This commit is contained in:
Reinhard Pointner 2014-01-02 14:56:10 +00:00
parent 060229757a
commit 9e4b38ea9a
4 changed files with 72 additions and 78 deletions

View File

@ -828,7 +828,7 @@ public class CmdlineOperations implements CmdlineInterface {
private Map<File, SubtitleDescriptor> lookupSubtitleByFileName(SubtitleProvider service, Collection<String> querySet, Language language, Collection<File> videoFiles, boolean strict) throws Exception {
// search for subtitles
List<SubtitleDescriptor> subtitles = findSubtitles(service, querySet, language.getName());
Set<SubtitleDescriptor> subtitles = findSubtitles(service, querySet, language.getName());
// match subtitle files to video files
if (subtitles.size() > 0) {

View File

@ -1,19 +1,20 @@
package net.sourceforge.filebot.similarity;
import static java.lang.Math.*;
public class MetricCascade implements SimilarityMetric {
private final SimilarityMetric[] cascade;
private final boolean shortCircuit;
public MetricCascade(SimilarityMetric... cascade) {
this.cascade = cascade;
this(true, cascade);
}
public MetricCascade(boolean shortCircuit, SimilarityMetric... cascade) {
this.cascade = cascade;
this.shortCircuit = shortCircuit;
}
@Override
public float getSimilarity(Object o1, Object o2) {
@ -22,16 +23,20 @@ public class MetricCascade implements SimilarityMetric {
float similarity = metric.getSimilarity(o1, o2);
if (abs(similarity) >= abs(f)) {
// perfect match, ignore remaining metrics
if (similarity >= 1) {
return similarity;
if (shortCircuit) {
if (similarity >= 1) {
return similarity;
} else if (similarity <= -1) {
return similarity;
}
}
// possible match or perfect negative match
f = similarity;
}
}
return f;
}
}

View File

@ -1,7 +1,5 @@
package net.sourceforge.filebot.subtitle;
import static java.lang.Math.*;
import static net.sourceforge.filebot.MediaTypes.*;
import static net.sourceforge.filebot.media.MediaDetection.*;
@ -26,6 +24,7 @@ import java.util.List;
import java.util.Map;
import java.util.Set;
import net.sourceforge.filebot.Language;
import net.sourceforge.filebot.similarity.Match;
import net.sourceforge.filebot.similarity.Matcher;
import net.sourceforge.filebot.similarity.MetricAvg;
@ -33,25 +32,23 @@ import net.sourceforge.filebot.similarity.MetricCascade;
import net.sourceforge.filebot.similarity.NameSimilarityMetric;
import net.sourceforge.filebot.similarity.SequenceMatchSimilarity;
import net.sourceforge.filebot.similarity.SimilarityMetric;
import net.sourceforge.filebot.Language;
import net.sourceforge.filebot.vfs.ArchiveType;
import net.sourceforge.filebot.vfs.MemoryFile;
import net.sourceforge.filebot.web.SearchResult;
import net.sourceforge.filebot.web.SubtitleDescriptor;
import net.sourceforge.filebot.web.SubtitleProvider;
public final class SubtitleUtilities {
public static Map<File, SubtitleDescriptor> matchSubtitles(Collection<File> files, Collection<SubtitleDescriptor> subtitles, boolean strict) throws InterruptedException {
Map<File, SubtitleDescriptor> subtitleByVideo = new LinkedHashMap<File, SubtitleDescriptor>();
// optimize for generic media <-> subtitle matching
SimilarityMetric[] metrics = new SimilarityMetric[] { EpisodeFunnel, EpisodeBalancer, NameSubstringSequence, new MetricCascade(NameSubstringSequence, Name), Numeric, new NameSimilarityMetric() };
// subtitle verification metric specifically excluding SxE mismatches
// subtitle verification metric specifically excluding SxE mismatches
SimilarityMetric absoluteSeasonEpisode = new SimilarityMetric() {
@Override
public float getSimilarity(Object o1, Object o2) {
float f = SeasonEpisode.getSimilarity(o1, o2);
@ -62,175 +59,167 @@ public final class SubtitleUtilities {
}
};
SimilarityMetric sanity = new MetricCascade(absoluteSeasonEpisode, AirDate, new MetricAvg(NameSubstringSequence, Name), getMovieMatchMetric());
// first match everything as best as possible, then filter possibly bad matches
Matcher<File, SubtitleDescriptor> matcher = new Matcher<File, SubtitleDescriptor>(files, subtitles, false, metrics);
for (Match<File, SubtitleDescriptor> it : matcher.match()) {
if (sanity.getSimilarity(it.getValue(), it.getCandidate()) >= (strict ? 0.9f : 0.6f)) {
subtitleByVideo.put(it.getValue(), it.getCandidate());
}
}
return subtitleByVideo;
}
public static List<SubtitleDescriptor> findSubtitles(SubtitleProvider service, Collection<String> querySet, String languageName) throws Exception {
List<SubtitleDescriptor> subtitles = new ArrayList<SubtitleDescriptor>();
/**
 * Collects the subtitles a provider lists for a set of search queries.
 *
 * Each query is sent to {@code service.search}, the hits are narrowed down by
 * {@code findProbableSearchResults}, and the subtitle list of every retained
 * search result is requested for the given language.
 *
 * @param service subtitle provider that is searched and asked for subtitle lists
 * @param querySet search terms; each one is searched separately
 * @param languageName language name handed to {@code service.getSubtitleList}
 * @return the collected subtitle descriptors without duplicates; empty if nothing was found
 * @throws Exception whatever the provider calls throw is propagated unchanged
 */
public static Set<SubtitleDescriptor> findSubtitles(SubtitleProvider service, Collection<String> querySet, String languageName) throws Exception {
    // run every query and keep only the search results that plausibly match it
    Set<SearchResult> probableResults = new HashSet<SearchResult>();
    for (String query : querySet) {
        probableResults.addAll(findProbableSearchResults(query, service.search(query)));
    }

    // request the subtitle list of each retained search result; the set drops repeated descriptors
    Set<SubtitleDescriptor> collected = new LinkedHashSet<SubtitleDescriptor>();
    for (SearchResult result : probableResults) {
        collected.addAll(service.getSubtitleList(result, languageName));
    }
    return collected;
}
/**
 * Selects the search results whose name plausibly refers to {@code query}.
 *
 * A result is accepted when the average of {@link SequenceMatchSimilarity} and
 * {@link NameSimilarityMetric} between the query and the result name (after
 * {@code removeTrailingBrackets}) is greater than 0.8, or when the result name
 * starts with the query, ignoring case.
 *
 * @param query search term the results were returned for
 * @param searchResults candidate results to filter
 * @return the accepted results in iteration order, without duplicates; empty if none qualifies
 */
protected static Collection<SearchResult> findProbableSearchResults(String query, Iterable<? extends SearchResult> searchResults) {
    // auto-select most probable search results
    Set<SearchResult> probableMatches = new LinkedHashSet<SearchResult>();

    // use name similarity metric
    SimilarityMetric metric = new MetricAvg(new SequenceMatchSimilarity(), new NameSimilarityMetric());

    // find probable matches using name similarity > threshold
    for (SearchResult result : searchResults) {
        String name = result.getName();
        boolean similar = metric.getSimilarity(query, removeTrailingBrackets(name)) > 0.8f;

        // Case-insensitive prefix test via regionMatches: it compares character by character and
        // does not consult the default locale, whereas toLowerCase() without a Locale maps
        // 'I' to a dotless i under Turkish/Azeri locales and would make "TITANIC" miss "titanic".
        if (similar || name.regionMatches(true, 0, query, 0, query.length())) {
            probableMatches.add(result);
        }
    }
    return probableMatches;
}
/**
 * Parses an in-memory file as subtitles without relying on its extension being correct.
 *
 * Every {@link SubtitleFormat} is tried; formats whose filter accepts the file name are
 * tried before the others. The bytes are converted to text once via {@code getText}
 * (per the original note this is where the charset is detected), and the first format
 * whose reader reports at least one element is used to read the whole file.
 *
 * @param file subtitle data together with its file name
 * @return all elements produced by the first reader that yields anything
 * @throws IOException if no format's reader yields an element
 */
public static List<SubtitleElement> decodeSubtitles(MemoryFile file) throws IOException {
    // order the candidate formats: those matching the file name go to the front
    LinkedList<SubtitleFormat> candidates = new LinkedList<SubtitleFormat>();
    for (SubtitleFormat format : SubtitleFormat.values()) {
        boolean nameMatches = format.getFilter().accept(file.getName());
        if (nameMatches) {
            candidates.addFirst(format);
        } else {
            candidates.addLast(format);
        }
    }

    // decode bytes
    String text = getText(file.getData());

    // use the first reader that seems to work
    for (SubtitleFormat format : candidates) {
        // a fresh StringReader per attempt, so every reader starts at position 0
        SubtitleReader reader = format.newReader(new StringReader(text));
        if (!reader.hasNext()) {
            // this format does not recognize the content, try the next one
            continue;
        }

        // correct format found, read the whole file
        List<SubtitleElement> elements = new ArrayList<SubtitleElement>(500);
        while (reader.hasNext()) {
            elements.add(reader.next());
        }
        return elements;
    }

    // unsupported subtitle format
    throw new IOException("Cannot read subtitle format");
}
/**
 * Re-encodes subtitle data, optionally rewriting it as SubRip with shifted timings.
 *
 * With {@code outputFormat == null} the text obtained from {@code getText} is only
 * encoded with {@code outputEncoding}. With {@link SubtitleFormat#SubRip} the data is
 * decoded via {@code decodeSubtitles}, written through a {@code SubRipWriter} and then
 * encoded. A non-zero {@code outputTimingOffset} is added to the start and end of every
 * element, with negative results clamped to 0.
 *
 * @param data subtitle file to export
 * @param outputFormat {@code null} to keep the format, or {@code SubtitleFormat.SubRip}
 * @param outputTimingOffset value added to each element's start and end; 0 leaves timings untouched
 * @param outputEncoding charset used to encode the result
 * @return the encoded subtitle bytes
 * @throws IOException if the subtitle data cannot be decoded
 * @throws IllegalArgumentException if {@code outputFormat} is neither {@code null} nor SubRip
 */
public static ByteBuffer exportSubtitles(MemoryFile data, SubtitleFormat outputFormat, long outputTimingOffset, Charset outputEncoding) throws IOException {
    boolean toSubRip = (outputFormat == SubtitleFormat.SubRip);
    if (outputFormat != null && !toSubRip) {
        throw new IllegalArgumentException("Format not supported");
    }

    if (!toSubRip) {
        // no target format requested: only change encoding
        return outputEncoding.encode(getText(data.getData()));
    }

    // convert to target format and target encoding
    StringBuilder text = new StringBuilder(4 * 1024);
    SubRipWriter writer = new SubRipWriter(text);
    boolean shiftTimings = (outputTimingOffset != 0);

    for (SubtitleElement element : decodeSubtitles(data)) {
        SubtitleElement shifted = element;
        if (shiftTimings) {
            // apply the offset, never letting a timestamp drop below zero
            shifted = new SubtitleElement(max(0, element.getStart() + outputTimingOffset), max(0, element.getEnd() + outputTimingOffset), element.getText());
        }
        writer.write(shifted);
    }

    return outputEncoding.encode(CharBuffer.wrap(text));
}
/**
 * Looks up the subtitle format whose filter accepts the given file.
 *
 * @param file file to classify
 * @return the first {@link SubtitleFormat}, in declaration order, whose filter accepts
 *         {@code file}, or {@code null} if none does
 */
public static SubtitleFormat getSubtitleFormat(File file) {
    SubtitleFormat detected = null;
    for (SubtitleFormat candidate : SubtitleFormat.values()) {
        if (candidate.getFilter().accept(file)) {
            // first accepting format wins
            detected = candidate;
            break;
        }
    }
    return detected;
}
/**
 * Resolves a subtitle format from a textual identifier.
 *
 * For each {@link SubtitleFormat} in declaration order, the identifier is first compared
 * case-insensitively with the constant's name and then offered to the format filter's
 * {@code acceptExtension}; the first format satisfying either check is returned.
 *
 * @param name format name or extension
 * @return the matching format, or {@code null} if no format matches
 */
public static SubtitleFormat getSubtitleFormatByName(String name) {
    for (SubtitleFormat format : SubtitleFormat.values()) {
        // check by name first, then by extension
        boolean sameName = format.name().equalsIgnoreCase(name);
        if (sameName || format.getFilter().acceptExtension(name)) {
            return format;
        }
    }
    return null;
}
/**
 * Builds a subtitle file name of the form {@code name[.lang][.type]}.
 *
 * The language part is the value of {@code Language.getISO3LanguageCodeByName(languageName)};
 * if that lookup returns {@code null}, the language name with all non-word characters
 * removed is used instead.
 *
 * @param name base name the result starts with
 * @param languageName language name, or {@code null} to omit the language part
 * @param type extension to append, or {@code null} to omit it
 * @return the assembled name
 */
public static String formatSubtitle(String name, String languageName, String type) {
    StringBuilder result = new StringBuilder(name);

    if (languageName != null) {
        String code = Language.getISO3LanguageCodeByName(languageName);
        // we probably always get a code, but fall back to the sanitized language name just in case
        result.append('.').append(code != null ? code : languageName.replaceAll("\\W", ""));
    }

    if (type != null) {
        result.append('.').append(type);
    }

    return result.toString();
}
public static MemoryFile fetchSubtitle(SubtitleDescriptor descriptor) throws Exception {
ByteBuffer data = descriptor.fetch();
// extract subtitles from archive
ArchiveType type = ArchiveType.forName(descriptor.getType());
if (type != ArchiveType.UNKOWN) {
// extract subtitle from archive
Iterator<MemoryFile> it = type.fromData(data).iterator();
@ -241,17 +230,16 @@ public final class SubtitleUtilities {
}
}
}
// assume that the fetched data is the subtitle
return new MemoryFile(descriptor.getPath(), data);
}
/**
 * Private constructor that prevents instantiation of this class.
 *
 * @throws UnsupportedOperationException always, even when invoked from inside the class
 */
private SubtitleUtilities() {
throw new UnsupportedOperationException();
}
}

View File

@ -892,7 +892,7 @@ class SubtitleAutoMatchDialog extends JDialog {
@Override
public String getName() {
return String.format("%s (by hash)", service.getName());
return String.format("%s [via hash]", service.getName());
}
@Override
@ -919,7 +919,7 @@ class SubtitleAutoMatchDialog extends JDialog {
@Override
public String getName() {
return String.format("%s (by name)", service.getName());
return String.format("%s [via name]", service.getName());
}
@Override
@ -946,7 +946,7 @@ class SubtitleAutoMatchDialog extends JDialog {
}
}
List<SubtitleDescriptor> subtitles = findSubtitles(service, querySet, languageName);
Set<SubtitleDescriptor> subtitles = findSubtitles(service, querySet, languageName);
// dialog may have been cancelled by now
if (Thread.interrupted()) {
@ -978,6 +978,7 @@ class SubtitleAutoMatchDialog extends JDialog {
SimilarityMetric sanity = EpisodeMetrics.verificationMetric();
float minMatchSimilarity = 0.5f;
// this could be very slow; let's hope that by this point not much is left, thanks to positive hash matches
for (File file : files) {
// add matching subtitles
for (SubtitleDescriptor it : subtitles) {