From bc7cf8cba03a5ed9ff1eae2e6a766b6aa457aa98 Mon Sep 17 00:00:00 2001 From: Reinhard Pointner Date: Sat, 25 Jul 2015 22:46:28 +0000 Subject: [PATCH] * fine-tune subtitle matching @see https://www.filebot.net/forums/viewtopic.php?f=8&t=2869 --- .../filebot/similarity/EpisodeMetrics.java | 2 +- .../similarity/SequenceMatchSimilarity.java | 35 +++++------- .../net/filebot/subtitle/SubtitleMetrics.java | 54 ++++++++++++++++++- 3 files changed, 66 insertions(+), 25 deletions(-) diff --git a/source/net/filebot/similarity/EpisodeMetrics.java b/source/net/filebot/similarity/EpisodeMetrics.java index a9c7012b..1e308a51 100644 --- a/source/net/filebot/similarity/EpisodeMetrics.java +++ b/source/net/filebot/similarity/EpisodeMetrics.java @@ -660,7 +660,7 @@ public enum EpisodeMetrics implements SimilarityMetric { private static final Map transformCache = synchronizedMap(new HashMap(64, 4)); private static final Transliterator transliterator = Transliterator.getInstance("Any-Latin;Latin-ASCII;[:Diacritic:]remove"); - protected static String normalizeObject(Object object) { + public static String normalizeObject(Object object) { if (object == null) { return ""; } diff --git a/source/net/filebot/similarity/SequenceMatchSimilarity.java b/source/net/filebot/similarity/SequenceMatchSimilarity.java index 556c0cde..13768b73 100644 --- a/source/net/filebot/similarity/SequenceMatchSimilarity.java +++ b/source/net/filebot/similarity/SequenceMatchSimilarity.java @@ -1,62 +1,53 @@ - package net.filebot.similarity; - import static java.lang.Math.*; import static net.filebot.similarity.CommonSequenceMatcher.*; import static net.filebot.similarity.Normalization.*; import java.util.Locale; - public class SequenceMatchSimilarity implements SimilarityMetric { - + private final CommonSequenceMatcher commonSequenceMatcher; - - + public SequenceMatchSimilarity() { this(10, false); } - - + public SequenceMatchSimilarity(int commonSequenceMaxStartIndex, boolean returnFirstMatch) { this.commonSequenceMatcher = new CommonSequenceMatcher(getLenientCollator(Locale.ROOT), commonSequenceMaxStartIndex, returnFirstMatch); } - - + @Override public float getSimilarity(Object o1, Object o2) { String s1 = normalize(o1); String s2 = normalize(o2); - + // match common word sequence String match = match(s1, s2); - if (match == null) + if (match == null || match.isEmpty()) return 0; - + return similarity(match, s1, s2); } - - + protected float similarity(String match, String s1, String s2) { return (float) match.length() / min(s1.length(), s2.length()); } - - + protected String normalize(Object object) { // use string representation String name = object.toString(); - + // normalize separators name = normalizePunctuation(name); - + // normalize case and trim return name.trim().toLowerCase(); } - - + protected String match(String s1, String s2) { return commonSequenceMatcher.matchFirstCommonSequence(s1, s2); } - + } diff --git a/source/net/filebot/subtitle/SubtitleMetrics.java b/source/net/filebot/subtitle/SubtitleMetrics.java index ed742a28..105808ea 100644 --- a/source/net/filebot/subtitle/SubtitleMetrics.java +++ b/source/net/filebot/subtitle/SubtitleMetrics.java @@ -8,6 +8,7 @@ import static net.filebot.util.FileUtilities.*; import java.io.File; import java.util.HashMap; +import java.util.List; import java.util.Map; import java.util.WeakHashMap; import java.util.logging.Level; @@ -19,6 +20,7 @@ import net.filebot.media.MetaAttributes; import net.filebot.mediainfo.MediaInfo; import net.filebot.mediainfo.MediaInfo.StreamKind; import net.filebot.similarity.CrossPropertyMetric; +import net.filebot.similarity.EpisodeMetrics; import net.filebot.similarity.MetricAvg; import net.filebot.similarity.MetricCascade; import net.filebot.similarity.NameSimilarityMetric; @@ -68,6 +70,54 @@ public enum SubtitleMetrics implements SimilarityMetric { } }), + NameSubstringSequenceExists(new SequenceMatchSimilarity() { + + @Override + public float getSimilarity(Object o1, Object o2) { + String[] f1 = getNormalizedEffectiveIdentifiers(o1); + String[] f2 = getNormalizedEffectiveIdentifiers(o2); + + for (String s1 : f1) { + for (String s2 : f2) { + if (super.getSimilarity(s1, s2) >= 1) { + return 1; + } + } + } + + return 0; + } + + protected float similarity(String match, String s1, String s2) { + return match.length() > 0 ? 1 : 0; + } + + @Override + protected String normalize(Object object) { + return object.toString(); + } + + protected String[] getNormalizedEffectiveIdentifiers(Object object) { + List identifiers = getEffectiveIdentifiers(object); + String[] names = new String[identifiers.size()]; + + for (int i = 0; i < names.length; i++) { + names[i] = EpisodeMetrics.normalizeObject(identifiers.get(i)); + } + + return names; + } + + protected List getEffectiveIdentifiers(Object object) { + if (object instanceof OpenSubtitlesSubtitleDescriptor) { + return singletonList(((OpenSubtitlesSubtitleDescriptor) object).getName()); + } else if (object instanceof File) { + return listPathTail((File) object, 2, true); + } + return emptyList(); + } + }), + OriginalFileName(new SequenceMatchSimilarity() { @Override @@ -175,11 +225,11 @@ public enum SubtitleMetrics implements SimilarityMetric { } public static SimilarityMetric[] defaultSequence() { - return new SimilarityMetric[] { EpisodeFunnel, EpisodeBalancer, OriginalFileName, NameSubstringSequence, new MetricCascade(NameSubstringSequence, Name), Numeric, FileName, DiskNumber, VideoProperties, new NameSimilarityMetric() }; + return new SimilarityMetric[] { EpisodeFunnel, EpisodeBalancer, OriginalFileName, NameSubstringSequenceExists, new MetricAvg(NameSubstringSequenceExists, Name), Numeric, FileName, DiskNumber, VideoProperties, new NameSimilarityMetric() }; } public static SimilarityMetric verificationMetric() { - return new MetricCascade(AbsoluteSeasonEpisode, AirDate, new MetricAvg(NameSubstringSequence, Name), getMovieMatchMetric(), OriginalFileName); + return new MetricCascade(AbsoluteSeasonEpisode, AirDate, new MetricAvg(NameSubstringSequenceExists, Name), getMovieMatchMetric(), OriginalFileName); } }