From 52b302e3a47bf84fa47ebc09c1481329b5a70e7b Mon Sep 17 00:00:00 2001 From: Reinhard Pointner Date: Tue, 3 Jan 2012 09:23:03 +0000 Subject: [PATCH] * improved matching by using existing common word sequence name matching --- .../filebot/media/MediaDetection.java | 2 +- .../filebot/similarity/EpisodeMetrics.java | 36 +++++++++++- .../similarity/SequenceMatchSimilarity.java | 58 +++++++++++++++++++ .../filebot/similarity/SeriesNameMatcher.java | 12 +++- 4 files changed, 101 insertions(+), 7 deletions(-) create mode 100644 source/net/sourceforge/filebot/similarity/SequenceMatchSimilarity.java diff --git a/source/net/sourceforge/filebot/media/MediaDetection.java b/source/net/sourceforge/filebot/media/MediaDetection.java index 34f7b8c9..9aa7eabc 100644 --- a/source/net/sourceforge/filebot/media/MediaDetection.java +++ b/source/net/sourceforge/filebot/media/MediaDetection.java @@ -46,7 +46,7 @@ import net.sourceforge.filebot.web.TheTVDBClient.TheTVDBSearchResult; public class MediaDetection { - private static ReleaseInfo releaseInfo = new ReleaseInfo(); + private static final ReleaseInfo releaseInfo = new ReleaseInfo(); public static Map, Set> mapSeriesNamesByFiles(Collection files, Locale locale) throws Exception { diff --git a/source/net/sourceforge/filebot/similarity/EpisodeMetrics.java b/source/net/sourceforge/filebot/similarity/EpisodeMetrics.java index d619770f..47e7e691 100644 --- a/source/net/sourceforge/filebot/similarity/EpisodeMetrics.java +++ b/source/net/sourceforge/filebot/similarity/EpisodeMetrics.java @@ -174,6 +174,24 @@ public enum EpisodeMetrics implements SimilarityMetric { } }), + // Match via common word sequence in episode name and file name + SubstringSequence(new SequenceMatchSimilarity() { + + @Override + public float getSimilarity(Object o1, Object o2) { + // normalize absolute similarity to similarity rank (5 ranks in total), + // so we are less likely to fall for false positives in this pass, and move on to the next one + return (float) 
(floor(super.getSimilarity(o1, o2) * 5) / 5); + } + + + @Override + protected String normalize(Object object) { + // simplify file name, if possible + return normalizeObject(object); + } + }), + // Match by generic name similarity Name(new NameSimilarityMetric() { @@ -273,7 +291,15 @@ public enum EpisodeMetrics implements SimilarityMetric { } + private static final Map transformCache = synchronizedMap(new WeakHashMap(64, 4)); + + protected static String normalizeObject(Object object) { + String result = transformCache.get(object); + if (result != null) { + return result; + } + String name = object.toString(); // use name without extension @@ -289,7 +315,11 @@ public enum EpisodeMetrics implements SimilarityMetric { // remove/normalize special characters name = normalizePunctuation(name); - return name.toLowerCase(); + // normalize to lower case (String is immutable, so the result must be reassigned) + name = name.toLowerCase(); + + transformCache.put(object, name); + return name; } @@ -301,9 +331,9 @@ public enum EpisodeMetrics implements SimilarityMetric { // 6 pass: divide by generic numeric similarity // 7 pass: resolve remaining collisions via absolute string similarity if (includeFileMetrics) { - return new SimilarityMetric[] { FileSize, new MetricCascade(FileName, EpisodeFunnel), EpisodeBalancer, SubstringFields, Name, Numeric, new NameSimilarityMetric() }; + return new SimilarityMetric[] { FileSize, new MetricCascade(FileName, EpisodeFunnel), EpisodeBalancer, SubstringFields, new MetricCascade(SubstringSequence, Name), Numeric }; } else { - return new SimilarityMetric[] { EpisodeFunnel, EpisodeBalancer, SubstringFields, Name, Numeric, new NameSimilarityMetric() }; + return new SimilarityMetric[] { EpisodeFunnel, EpisodeBalancer, SubstringFields, new MetricCascade(SubstringSequence, Name), Numeric }; } } diff --git a/source/net/sourceforge/filebot/similarity/SequenceMatchSimilarity.java b/source/net/sourceforge/filebot/similarity/SequenceMatchSimilarity.java new file mode 100644 index 00000000..3b1f7c19 --- /dev/null +++ 
b/source/net/sourceforge/filebot/similarity/SequenceMatchSimilarity.java @@ -0,0 +1,58 @@ + +package net.sourceforge.filebot.similarity; + + +import static java.lang.Math.*; +import static net.sourceforge.filebot.similarity.Normalization.*; + +import java.text.Collator; +import java.util.Comparator; +import java.util.Locale; + + +public class SequenceMatchSimilarity implements SimilarityMetric { + + @Override + public float getSimilarity(Object o1, Object o2) { + String s1 = normalize(o1); + String s2 = normalize(o2); + + // match common word sequence + String match = match(s1, s2); + if (match == null) + return 0; + + return (float) match.length() / min(s1.length(), s2.length()); + } + + + protected String normalize(Object object) { + // use string representation + String name = object.toString(); + + // normalize separators + name = normalizePunctuation(name); + + // normalize case and trim + return name.trim().toLowerCase(); + } + + + protected String match(String s1, String s2) { + // compare words with a root-locale collator (full decomposition, tertiary strength) + Collator collator = Collator.getInstance(Locale.ROOT); + collator.setDecomposition(Collator.FULL_DECOMPOSITION); + collator.setStrength(Collator.TERTIARY); + + @SuppressWarnings("unchecked") + SeriesNameMatcher matcher = new SeriesNameMatcher((Comparator) collator, 10) { + + @Override + protected String normalize(String name) { + return name; // assume normalization has been done, no need to do that here again + }; + }; + + return matcher.matchByFirstCommonWordSequence(s1, s2); + } +} diff --git a/source/net/sourceforge/filebot/similarity/SeriesNameMatcher.java b/source/net/sourceforge/filebot/similarity/SeriesNameMatcher.java index bfdd76bb..e5cf2f21 100644 --- a/source/net/sourceforge/filebot/similarity/SeriesNameMatcher.java +++ b/source/net/sourceforge/filebot/similarity/SeriesNameMatcher.java @@ -33,17 +33,23 @@ public class SeriesNameMatcher { protected SeasonEpisodeMatcher seasonEpisodeMatcher = new SeasonEpisodeMatcher(new 
SeasonEpisodeFilter(30, 50, -1), true); protected NameSimilarityMetric nameSimilarityMetric = new NameSimilarityMetric(); - protected int commonWordSequenceMaxStartIndex = 3; + protected int commonWordSequenceMaxStartIndex; protected Comparator commonWordComparator; public SeriesNameMatcher() { - this(String.CASE_INSENSITIVE_ORDER); + this(String.CASE_INSENSITIVE_ORDER, 3); } public SeriesNameMatcher(Comparator comparator) { - this.commonWordComparator = comparator; + this(comparator, 3); + } + + + public SeriesNameMatcher(Comparator commonWordComparator, int commonWordSequenceMaxStartIndex) { + this.commonWordSequenceMaxStartIndex = commonWordSequenceMaxStartIndex; + this.commonWordComparator = commonWordComparator; }