From 24f9b8d92a430242741d62cf7a0dd544c872e1e7 Mon Sep 17 00:00:00 2001 From: Reinhard Pointner Date: Fri, 15 Feb 2013 09:50:23 +0000 Subject: [PATCH] * fine-tune generic numeric sequence matching (e.g. Bones Staffel 1 Folge 5) --- .../filebot/similarity/EpisodeMetrics.java | 44 ++++++++++++++++--- .../filebot/similarity/Matcher.java | 5 ++- .../similarity/SequenceMatchSimilarity.java | 5 +++ .../filebot/subtitle/SubtitleUtilities.java | 4 +- .../net/sourceforge/tuned/FileUtilities.java | 19 +++++++- website/data/query-blacklist.txt | 1 + website/data/series-mappings.txt | 3 +- website/scripts/amc.groovy | 12 ++--- 8 files changed, 75 insertions(+), 18 deletions(-) diff --git a/source/net/sourceforge/filebot/similarity/EpisodeMetrics.java b/source/net/sourceforge/filebot/similarity/EpisodeMetrics.java index bed9e137..7fb8ee96 100644 --- a/source/net/sourceforge/filebot/similarity/EpisodeMetrics.java +++ b/source/net/sourceforge/filebot/similarity/EpisodeMetrics.java @@ -8,11 +8,15 @@ import static java.util.Collections.*; import static net.sourceforge.filebot.Settings.*; import static net.sourceforge.filebot.similarity.Normalization.*; import static net.sourceforge.tuned.FileUtilities.*; +import static net.sourceforge.tuned.StringUtilities.*; import java.io.File; +import java.util.ArrayList; import java.util.Collection; import java.util.HashMap; +import java.util.List; import java.util.Map; +import java.util.Scanner; import net.sourceforge.filebot.similarity.SeasonEpisodeMatcher.SxE; import net.sourceforge.filebot.vfs.FileInfo; @@ -187,7 +191,7 @@ public enum EpisodeMetrics implements SimilarityMetric { }), // Match via common word sequence in episode name and file name - SubstringSequence(new SequenceMatchSimilarity() { + NameSubstringSequence(new SequenceMatchSimilarity() { @Override public float getSimilarity(Object o1, Object o2) { @@ -199,12 +203,19 @@ public enum EpisodeMetrics implements SimilarityMetric { @Override protected String normalize(Object object) 
{ + if (object instanceof Episode) { + object = removeTrailingBrackets(((Episode) object).getSeriesName()); + } else if (object instanceof Movie) { + object = ((Movie) object).getName(); + } else if (object instanceof File) { + object = getNameWithoutExtension(getRelativePathTail((File) object, 3).getPath()); + } // simplify file name, if possible return normalizeObject(object); } }), - // Match by generic name similarity + // Match by generic name similarity (round rank) Name(new NameSimilarityMetric() { @Override @@ -222,12 +233,33 @@ public enum EpisodeMetrics implements SimilarityMetric { } }), - NumericSequence(new SequenceMatchSimilarity() { + // Match by generic name similarity (absolute) + AbsoluteName(new NameSimilarityMetric() { @Override protected String normalize(Object object) { // simplify file name, if possible - return normalizeObject(object).replaceAll("\\D+", " ").trim(); + return normalizeObject(object); + } + }), + + NumericSequence(new SequenceMatchSimilarity() { + + @Override + protected String normalize(Object object) { + if (object instanceof Episode) { + object = EpisodeFormat.SeasonEpisode.formatSxE((Episode) object); + } else if (object instanceof Movie) { + object = ((Movie) object).getYear(); + } + + // simplify file name if possible and extract numbers + List numbers = new ArrayList(4); + Scanner scanner = new Scanner(normalizeObject(object)).useDelimiter("\\D+"); + while (scanner.hasNextInt()) { + numbers.add(scanner.nextInt()); + } + return join(numbers, " "); } }), @@ -411,9 +443,9 @@ public enum EpisodeMetrics implements SimilarityMetric { // 7 pass: prefer episodes that were aired closer to the last modified date of the file // 8 pass: resolve remaining collisions via absolute string similarity if (includeFileMetrics) { - return new SimilarityMetric[] { FileSize, new MetricCascade(FileName, EpisodeFunnel), EpisodeBalancer, SubstringFields, MetaAttributes, new MetricCascade(SubstringSequence, Name), Numeric, NumericSequence, Name, 
TimeStamp, new NameSimilarityMetric() }; + return new SimilarityMetric[] { FileSize, new MetricCascade(FileName, EpisodeFunnel), EpisodeBalancer, SubstringFields, MetaAttributes, new MetricCascade(NameSubstringSequence, Name), Numeric, NumericSequence, Name, TimeStamp, AbsoluteName }; } else { - return new SimilarityMetric[] { EpisodeFunnel, EpisodeBalancer, SubstringFields, MetaAttributes, new MetricCascade(SubstringSequence, Name), Numeric, NumericSequence, Name, TimeStamp, new NameSimilarityMetric() }; + return new SimilarityMetric[] { EpisodeFunnel, EpisodeBalancer, SubstringFields, MetaAttributes, new MetricCascade(NameSubstringSequence, Name), Numeric, NumericSequence, Name, TimeStamp, AbsoluteName }; } } diff --git a/source/net/sourceforge/filebot/similarity/Matcher.java b/source/net/sourceforge/filebot/similarity/Matcher.java index 594db393..47b5380f 100644 --- a/source/net/sourceforge/filebot/similarity/Matcher.java +++ b/source/net/sourceforge/filebot/similarity/Matcher.java @@ -148,13 +148,14 @@ public class Matcher { for (Match possibleMatch : possibleMatches) { float similarity = metric.getSimilarity(possibleMatch.getValue(), possibleMatch.getCandidate()); - Set> matchSet = similarityMap.get(similarity); + // DEBUG + // System.out.format("%s: %.04f: %s%n", metric, similarity, possibleMatch); + Set> matchSet = similarityMap.get(similarity); if (matchSet == null) { matchSet = new LinkedHashSet>(); similarityMap.put(similarity, matchSet); } - matchSet.add(possibleMatch); // unwind this thread if we have been interrupted diff --git a/source/net/sourceforge/filebot/similarity/SequenceMatchSimilarity.java b/source/net/sourceforge/filebot/similarity/SequenceMatchSimilarity.java index 16ac324c..52896b47 100644 --- a/source/net/sourceforge/filebot/similarity/SequenceMatchSimilarity.java +++ b/source/net/sourceforge/filebot/similarity/SequenceMatchSimilarity.java @@ -24,6 +24,11 @@ public class SequenceMatchSimilarity implements SimilarityMetric { if (match == 
null) return 0; + return similarity(match, s1, s2); + } + + + protected float similarity(String match, String s1, String s2) { return (float) match.length() / min(s1.length(), s2.length()); } diff --git a/source/net/sourceforge/filebot/subtitle/SubtitleUtilities.java b/source/net/sourceforge/filebot/subtitle/SubtitleUtilities.java index d874ff77..930de9ff 100644 --- a/source/net/sourceforge/filebot/subtitle/SubtitleUtilities.java +++ b/source/net/sourceforge/filebot/subtitle/SubtitleUtilities.java @@ -47,7 +47,7 @@ public final class SubtitleUtilities { Map subtitleByVideo = new LinkedHashMap(); // optimize for generic media <-> subtitle matching - SimilarityMetric[] metrics = new SimilarityMetric[] { EpisodeFunnel, EpisodeBalancer, SubstringSequence, new MetricCascade(SubstringSequence, Name), Numeric, new NameSimilarityMetric() }; + SimilarityMetric[] metrics = new SimilarityMetric[] { EpisodeFunnel, EpisodeBalancer, NameSubstringSequence, new MetricCascade(NameSubstringSequence, Name), Numeric, new NameSimilarityMetric() }; // subtitle verification metric specifically excluding SxE mismatches SimilarityMetric absoluteSeasonEpisode = new SimilarityMetric() { @@ -61,7 +61,7 @@ public final class SubtitleUtilities { return f < 1 ? 
-1 : 1; } }; - SimilarityMetric sanity = new MetricCascade(absoluteSeasonEpisode, AirDate, new MetricAvg(SubstringSequence, Name)); + SimilarityMetric sanity = new MetricCascade(absoluteSeasonEpisode, AirDate, new MetricAvg(NameSubstringSequence, Name)); // first match everything as best as possible, then filter possibly bad matches Matcher matcher = new Matcher(files, subtitles, false, metrics); diff --git a/source/net/sourceforge/tuned/FileUtilities.java b/source/net/sourceforge/tuned/FileUtilities.java index da40bde6..b9d3362b 100644 --- a/source/net/sourceforge/tuned/FileUtilities.java +++ b/source/net/sourceforge/tuned/FileUtilities.java @@ -352,9 +352,15 @@ public final class FileUtilities { public static List listPath(File file) { + return listPathTail(file, Integer.MAX_VALUE); + } + + + public static List listPathTail(File file, int tailSize) { LinkedList nodes = new LinkedList(); - for (File node = file; node != null && !UNC_PREFIX.equals(node.toString()); node = node.getParentFile()) { + File node = file; + for (int i = 0; node != null && i < tailSize && !UNC_PREFIX.equals(node.toString()); i++, node = node.getParentFile()) { nodes.addFirst(node); } @@ -362,6 +368,17 @@ public final class FileUtilities { } + public static File getRelativePathTail(File file, int tailSize) { + File f = null; + for (File it : listPathTail(file, tailSize)) { + if (it.getParentFile() != null) { + f = new File(f, it.getName()); + } + } + return f; + } + + public static List listFiles(Iterable folders, int maxDepth, boolean listHiddenFiles) { List files = new ArrayList(); diff --git a/website/data/query-blacklist.txt b/website/data/query-blacklist.txt index 2d59e20a..77949f1d 100644 --- a/website/data/query-blacklist.txt +++ b/website/data/query-blacklist.txt @@ -123,6 +123,7 @@ Hard.Subbed HBO hd HDRip +Hi10P Hindi History.Channel HQ diff --git a/website/data/series-mappings.txt b/website/data/series-mappings.txt index 8c9530ba..1bb00a83 100644 --- 
a/website/data/series-mappings.txt +++ b/website/data/series-mappings.txt @@ -1,2 +1,3 @@ HIMYM How I Met your Mother -Hml8p Homeland \ No newline at end of file +Hml8p Homeland +NCIS.LA NCIS: Los Angeles \ No newline at end of file diff --git a/website/scripts/amc.groovy b/website/scripts/amc.groovy index 8726403e..91bbcac3 100644 --- a/website/scripts/amc.groovy +++ b/website/scripts/amc.groovy @@ -3,8 +3,8 @@ def input = [] def failOnError = _args.conflict == 'fail' // print input parameters -_args.bindings?.each{ _log.finest("Parameter: $it.key = $it.value") } -args.each{ _log.finest("Argument: $it") } +_args.bindings?.each{ _log.fine("Parameter: $it.key = $it.value") } +args.each{ _log.fine("Argument: $it") } args.findAll{ !it.exists() }.each{ throw new Exception("File not found: $it") } // check user-defined pre-condition @@ -34,7 +34,7 @@ def format = [ tvs: tryQuietly{ seriesFormat } ?: '''TV Shows/{n}/{episode.special ? "Special" : "Season "+s}/{n} - {episode.special ? "S00E"+special.pad(2) : s00e00} - {t}{".$lang"}''', anime: tryQuietly{ animeFormat } ?: '''Anime/{n}/{n} - {sxe} - {t}''', mov: tryQuietly{ movieFormat } ?: '''Movies/{n} ({y})/{n} ({y}){" CD$pi"}{".$lang"}''', - music: tryQuietly{ musicFormat } ?: '''Music/{n}/{album}/{n} - {t}''' + music: tryQuietly{ musicFormat } ?: '''Music/{n}/{album+'/'}{pi.pad(2)+'. 
'}{artist} - {t}''' ] @@ -117,7 +117,7 @@ def groups = input.groupBy{ f -> def tvs = detectSeriesName(f) def mov = detectMovie(f, false) - println "$f.name [series: $tvs, movie: $mov]" + _log.fine("$f.name [series: $tvs, movie: $mov]") // DECIDE EPISODE VS MOVIE (IF NOT CLEAR) if (tvs && mov) { @@ -129,10 +129,10 @@ def groups = input.groupBy{ f -> // S00E00 | 2012.07.21 | One Piece 217 | Firefly - Serenity | [Taken 1, Taken 2, Taken 3, Taken 4, ..., Taken 10] if (parseEpisodeNumber(fn, true) || parseDate(fn) || (fn =~ sn && parseEpisodeNumber(fn.after(sn), false)) || fn.after(sn) =~ / - .+/ || f.dir.listFiles{ it.isVideo() && norm(it.name) =~ sn && it.name =~ /\b\d{1,3}\b/}.size() >= 10) { - println "Exclude Movie: $mov" + _log.fine("Exclude Movie: $mov") mov = null } else if ((detectMovie(f, true) && [dn, fn].find{ it =~ /(19|20)\d{2}/ }) || [dn, fn].find{ it =~ mn && !(it.after(mn) =~ /\b\d{1,3}\b/) }) { - println "Exclude Series: $tvs" + _log.fine("Exclude Series: $tvs") tvs = null } }