diff --git a/source/net/sourceforge/filebot/similarity/SeriesNameMatcher.java b/source/net/sourceforge/filebot/similarity/SeriesNameMatcher.java index ffc0730e..b3089289 100644 --- a/source/net/sourceforge/filebot/similarity/SeriesNameMatcher.java +++ b/source/net/sourceforge/filebot/similarity/SeriesNameMatcher.java @@ -2,6 +2,7 @@ package net.sourceforge.filebot.similarity; +import static java.util.Collections.*; import static net.sourceforge.tuned.StringUtilities.*; import java.io.File; @@ -9,7 +10,6 @@ import java.util.AbstractCollection; import java.util.ArrayList; import java.util.Arrays; import java.util.Collection; -import java.util.Collections; import java.util.Comparator; import java.util.Iterator; import java.util.LinkedHashMap; @@ -18,6 +18,8 @@ import java.util.Map; import java.util.Scanner; import java.util.TreeMap; import java.util.Map.Entry; +import java.util.regex.Matcher; +import java.util.regex.Pattern; import net.sourceforge.tuned.FileUtilities; @@ -25,11 +27,9 @@ import net.sourceforge.tuned.FileUtilities; public class SeriesNameMatcher { protected final SeasonEpisodeMatcher seasonEpisodeMatcher = new SeasonEpisodeMatcher(); + protected final NameSimilarityMetric nameSimilarityMetric = new NameSimilarityMetric(); - - public String match(File file) { - return match(file.getName(), file.getParent()); - } + protected final int commonWordSequenceMaxStartIndex = 3; public Collection matchAll(File[] files) { @@ -42,9 +42,10 @@ public class SeriesNameMatcher { for (String nameMatch : matchAll(names)) { String commonMatch = matchByFirstCommonWordSequence(nameMatch, parent); + float similarity = commonMatch == null ? 0 : nameSimilarityMetric.getSimilarity(commonMatch, nameMatch); - // prefer common match, but use name match if there is no matching word sequence - seriesNames.add(commonMatch != null ? commonMatch : nameMatch); + // prefer common match, but only if it's very similar to the original match + seriesNames.add(similarity > 0.7 ? commonMatch : nameMatch); } } @@ -58,11 +59,15 @@ public class SeriesNameMatcher { // allow matching of a small number of episodes, by setting threshold = length if length < 5 int threshold = Math.min(names.length, 5); - // 1. use pattern matching with frequency threshold - seriesNames.addAll(flatMatchAll(names, threshold)); + // match common word sequences (likely series names) + SeriesNameCollection whitelist = new SeriesNameCollection(); + whitelist.addAll(deepMatchAll(names, threshold)); - // 2. match common word sequences - seriesNames.addAll(deepMatchAll(names, threshold)); + // 1. use pattern matching + seriesNames.addAll(flatMatchAll(names, threshold, Pattern.compile(join(whitelist, "|"), Pattern.CASE_INSENSITIVE))); + + // 2. use common word sequences + seriesNames.addAll(whitelist); return seriesNames; } @@ -75,18 +80,22 @@ public class SeriesNameMatcher { * @return series names that have been matched one or multiple times depending on the * threshold */ - private Collection flatMatchAll(String[] names, int threshold) { - ThresholdCollection seriesNames = new ThresholdCollection(threshold, String.CASE_INSENSITIVE_ORDER); + private Collection flatMatchAll(String[] names, int threshold, Pattern whitelist) { + ThresholdCollection results = new ThresholdCollection(threshold, String.CASE_INSENSITIVE_ORDER); for (String name : names) { - String match = matchBySeasonEpisodePattern(name); + // use normalized name + name = normalize(name); - if (match != null) { - seriesNames.add(match); + Matcher matcher = whitelist.matcher(name); + int seasonEpisodePosition = seasonEpisodeMatcher.find(name, matcher.find() ? matcher.end() : 0); + + if (seasonEpisodePosition > 0) { + results.add(name.substring(0, seasonEpisodePosition).trim()); } } - return seriesNames; + return results; } @@ -99,14 +108,14 @@ public class SeriesNameMatcher { private Collection deepMatchAll(String[] names, int threshold) { // can't use common word sequence matching for less than 2 names if (names.length < 2 || names.length < threshold) { - return Collections.emptySet(); + return emptySet(); } String common = matchByFirstCommonWordSequence(names); if (common != null) { // common word sequence found - return Collections.singleton(common); + return singleton(common); } // recursive divide and conquer @@ -120,29 +129,6 @@ public class SeriesNameMatcher { } - /** - * Match series name using season episode pattern and then try to find a common word - * sequence between the first match and the given parent. - * - * @param name episode name - * @param parent a string that contains the series name - * @return a likely series name - */ - public String match(String name, String parent) { - String nameMatch = matchBySeasonEpisodePattern(name); - - if (nameMatch != null) { - String commonMatch = matchByFirstCommonWordSequence(nameMatch, parent); - - if (commonMatch != null) { - return commonMatch; - } - } - - return nameMatch; - } - - /** * Try to match a series name from the given episode name using known season episode * patterns. @@ -185,7 +171,7 @@ public class SeriesNameMatcher { common = words; } else { // find common sequence - common = firstCommonSequence(common, words, String.CASE_INSENSITIVE_ORDER); + common = firstCommonSequence(common, words, commonWordSequenceMaxStartIndex, String.CASE_INSENSITIVE_ORDER); if (common == null) { // no common sequence @@ -202,11 +188,9 @@ public class SeriesNameMatcher { protected String normalize(String name) { - // normalize brackets, convert (...) to [...] - name = name.replace('(', '[').replace(')', ']'); - - // remove group names, any [...] - name = name.replaceAll("\\[[^\\[]+\\]", ""); + // remove group names and checksums, any [...] or (...) + name = name.replaceAll("\\([^\\(]*\\)", ""); + name = name.replaceAll("\\[[^\\[]*\\]", ""); // remove special characters name = name.replaceAll("[\\p{Punct}\\p{Space}]+", " "); @@ -215,9 +199,9 @@ public class SeriesNameMatcher { } - protected T[] firstCommonSequence(T[] seq1, T[] seq2, Comparator equalsComparator) { - for (int i = 0; i < seq1.length; i++) { - for (int j = 0; j < seq2.length; j++) { + protected T[] firstCommonSequence(T[] seq1, T[] seq2, int maxStartIndex, Comparator equalsComparator) { + for (int i = 0; i < seq1.length && i <= maxStartIndex; i++) { + for (int j = 0; j < seq2.length && j <= maxStartIndex; j++) { // common sequence length int len = 0; @@ -289,12 +273,11 @@ public class SeriesNameMatcher { @Override public boolean add(String value) { - String key = value.toLowerCase(); - String current = data.get(key); + String current = data.get(key(value)); // prefer strings with similar upper/lower case ration (e.g. prefer Roswell over roswell) if (current == null || firstCharacterCaseBalance(current) < firstCharacterCaseBalance(value)) { - data.put(key, value); + data.put(key(value), value); return true; } @@ -302,6 +285,11 @@ public class SeriesNameMatcher { } + protected String key(Object value) { + return value.toString().toLowerCase(); + } + + protected float firstCharacterCaseBalance(String s) { int upper = 0; int lower = 0; @@ -323,8 +311,8 @@ public class SeriesNameMatcher { @Override - public boolean contains(Object o) { - return data.containsKey(o.toString().toLowerCase()); + public boolean contains(Object value) { + return data.containsKey(key(value)); } diff --git a/source/net/sourceforge/filebot/ui/panel/rename/AutoFetchEpisodeListMatcher.java b/source/net/sourceforge/filebot/ui/panel/rename/AutoFetchEpisodeListMatcher.java index 3c560239..98b81d2d 100644 --- a/source/net/sourceforge/filebot/ui/panel/rename/AutoFetchEpisodeListMatcher.java +++ b/source/net/sourceforge/filebot/ui/panel/rename/AutoFetchEpisodeListMatcher.java @@ -78,7 +78,7 @@ class AutoFetchEpisodeListMatcher extends SwingWorker> } // auto-select most probable search result - final List probableMatches = new LinkedList(); + List probableMatches = new LinkedList(); // use name similarity metric SimilarityMetric metric = new NameSimilarityMetric(); @@ -100,11 +100,8 @@ class AutoFetchEpisodeListMatcher extends SwingWorker> @Override public SearchResult call() throws Exception { - // display only probable matches if possible - List options = probableMatches.isEmpty() ? searchResults : probableMatches; - // multiple results have been found, user must select one - SelectDialog selectDialog = new SelectDialog(null, options); + SelectDialog selectDialog = new SelectDialog(null, searchResults); selectDialog.getHeaderLabel().setText(String.format("Shows matching '%s':", query)); selectDialog.getCancelAction().putValue(Action.NAME, "Ignore"); diff --git a/source/net/sourceforge/filebot/web/IMDbClient.java b/source/net/sourceforge/filebot/web/IMDbClient.java index b72e20b2..94adf889 100644 --- a/source/net/sourceforge/filebot/web/IMDbClient.java +++ b/source/net/sourceforge/filebot/web/IMDbClient.java @@ -57,7 +57,7 @@ public class IMDbClient implements EpisodeListProvider { Document dom = getHtmlDocument(openConnection(searchUrl)); - List nodes = selectNodes("//TABLE//A[following-sibling::SMALL[contains(.,'TV series')]]", dom); + List nodes = selectNodes("//TABLE//A[following-sibling::SMALL[contains(.,'series')]]", dom); List results = new ArrayList(nodes.size()); diff --git a/test/net/sourceforge/filebot/similarity/SeriesNameMatcherTest.java b/test/net/sourceforge/filebot/similarity/SeriesNameMatcherTest.java index d3575c8b..b44eb46b 100644 --- a/test/net/sourceforge/filebot/similarity/SeriesNameMatcherTest.java +++ b/test/net/sourceforge/filebot/similarity/SeriesNameMatcherTest.java @@ -3,19 +3,32 @@ package net.sourceforge.filebot.similarity; import static org.junit.Assert.*; -import net.sourceforge.filebot.similarity.SeriesNameMatcher.SeriesNameCollection; import org.junit.Test; +import net.sourceforge.filebot.similarity.SeriesNameMatcher.SeriesNameCollection; + public class SeriesNameMatcherTest { private static SeriesNameMatcher matcher = new SeriesNameMatcher(); - + @Test - public void match() { - assertEquals("Test Series", matcher.match("My Test Series - 1x01", "Test Series - Season 1")); + public void whitelist() { + // ignore recurring word sequences when matching episode patterns + String[] names = new String[] { "Test 101 - 01", "Test 101 - 02" }; + + assertArrayEquals(new String[] { "Test 101" }, matcher.matchAll(names).toArray()); + } + + + @Test + public void threshold() { + // ignore recurring word sequences when matching episode patterns + String[] names = new String[] { "Test 1 of 101", "Test 2 of 101", "Test 3 of 101" }; + + assertArrayEquals(new String[] { "Test" }, matcher.matchAll(names).toArray()); } @@ -34,7 +47,7 @@ public class SeriesNameMatcherTest { assertEquals("The Test", matcher.normalize("_The_Test_-_ ...")); // brackets - assertEquals("Luffy", matcher.normalize("[strawhat] Luffy [D.] [#Monkey]")); + assertEquals("Luffy", matcher.normalize("[strawhat] Luffy (D.) [#Monkey]")); // invalid brackets assertEquals("strawhat Luffy", matcher.normalize("(strawhat [Luffy (#Monkey)")); @@ -43,10 +56,15 @@ public class SeriesNameMatcherTest { @Test public void firstCommonSequence() { - String[] seq1 = "[abc] Common Name 1".split("\\s"); - String[] seq2 = "[xyz] Common Name 2".split("\\s"); + String[] seq1 = "Common Name 1 Any Title".split("\\s"); + String[] seq2 = "abc xyz Common Name 2 Any Title".split("\\s"); - assertArrayEquals(new String[] { "Common", "Name" }, matcher.firstCommonSequence(seq1, seq2, String.CASE_INSENSITIVE_ORDER)); + // check if common sequence can be determined + assertArrayEquals(new String[] { "Common", "Name" }, matcher.firstCommonSequence(seq1, seq2, 2, String.CASE_INSENSITIVE_ORDER)); + + // check if max start index is working + assertArrayEquals(null, matcher.firstCommonSequence(seq1, seq2, 0, String.CASE_INSENSITIVE_ORDER)); + assertArrayEquals(null, matcher.firstCommonSequence(seq2, seq1, 1, String.CASE_INSENSITIVE_ORDER)); } diff --git a/test/net/sourceforge/filebot/web/IMDbClientTest.java b/test/net/sourceforge/filebot/web/IMDbClientTest.java index 55530eda..c6c28660 100644 --- a/test/net/sourceforge/filebot/web/IMDbClientTest.java +++ b/test/net/sourceforge/filebot/web/IMDbClientTest.java @@ -28,6 +28,18 @@ public class IMDbClientTest { } + @Test + public void searchMiniSeries() throws Exception { + List results = imdb.search("generation kill"); + + MovieDescriptor movie = (MovieDescriptor) results.get(0); + + assertEquals("Generation Kill", movie.getName()); + assertEquals(2008, movie.getYear()); + assertEquals(995832, movie.getImdbId(), 0); + } + + @Test public void searchNoMatch() throws Exception { List results = imdb.search("i will not find anything for this query string");