1
0
mirror of https://github.com/mitb-archive/filebot synced 2024-12-23 08:18:52 -05:00

* improved series name matching

* include "tv mini-series" when searching for any series on imdb
This commit is contained in:
Reinhard Pointner 2009-08-12 19:35:24 +00:00
parent 3d9839a73f
commit d98df7d7f1
5 changed files with 85 additions and 70 deletions

View File

@ -2,6 +2,7 @@
package net.sourceforge.filebot.similarity;
import static java.util.Collections.*;
import static net.sourceforge.tuned.StringUtilities.*;
import java.io.File;
@ -9,7 +10,6 @@ import java.util.AbstractCollection;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.Comparator;
import java.util.Iterator;
import java.util.LinkedHashMap;
@ -18,6 +18,8 @@ import java.util.Map;
import java.util.Scanner;
import java.util.TreeMap;
import java.util.Map.Entry;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import net.sourceforge.tuned.FileUtilities;
@ -25,11 +27,9 @@ import net.sourceforge.tuned.FileUtilities;
public class SeriesNameMatcher {
protected final SeasonEpisodeMatcher seasonEpisodeMatcher = new SeasonEpisodeMatcher();
protected final NameSimilarityMetric nameSimilarityMetric = new NameSimilarityMetric();
public String match(File file) {
return match(file.getName(), file.getParent());
}
protected final int commonWordSequenceMaxStartIndex = 3;
public Collection<String> matchAll(File[] files) {
@ -42,9 +42,10 @@ public class SeriesNameMatcher {
for (String nameMatch : matchAll(names)) {
String commonMatch = matchByFirstCommonWordSequence(nameMatch, parent);
float similarity = commonMatch == null ? 0 : nameSimilarityMetric.getSimilarity(commonMatch, nameMatch);
// prefer common match, but use name match if there is no matching word sequence
seriesNames.add(commonMatch != null ? commonMatch : nameMatch);
// prefer common match, but only if it's very similar to the original match
seriesNames.add(similarity > 0.7 ? commonMatch : nameMatch);
}
}
@ -58,11 +59,15 @@ public class SeriesNameMatcher {
// allow matching of a small number of episodes, by setting threshold = length if length < 5
int threshold = Math.min(names.length, 5);
// 1. use pattern matching with frequency threshold
seriesNames.addAll(flatMatchAll(names, threshold));
// match common word sequences (likely series names)
SeriesNameCollection whitelist = new SeriesNameCollection();
whitelist.addAll(deepMatchAll(names, threshold));
// 2. match common word sequences
seriesNames.addAll(deepMatchAll(names, threshold));
// 1. use pattern matching
seriesNames.addAll(flatMatchAll(names, threshold, Pattern.compile(join(whitelist, "|"), Pattern.CASE_INSENSITIVE)));
// 2. use common word sequences
seriesNames.addAll(whitelist);
return seriesNames;
}
@ -75,18 +80,22 @@ public class SeriesNameMatcher {
* @return series names that have been matched one or multiple times depending on the
* threshold
*/
private Collection<String> flatMatchAll(String[] names, int threshold) {
ThresholdCollection<String> seriesNames = new ThresholdCollection<String>(threshold, String.CASE_INSENSITIVE_ORDER);
private Collection<String> flatMatchAll(String[] names, int threshold, Pattern whitelist) {
ThresholdCollection<String> results = new ThresholdCollection<String>(threshold, String.CASE_INSENSITIVE_ORDER);
for (String name : names) {
String match = matchBySeasonEpisodePattern(name);
// use normalized name
name = normalize(name);
if (match != null) {
seriesNames.add(match);
Matcher matcher = whitelist.matcher(name);
int seasonEpisodePosition = seasonEpisodeMatcher.find(name, matcher.find() ? matcher.end() : 0);
if (seasonEpisodePosition > 0) {
results.add(name.substring(0, seasonEpisodePosition).trim());
}
}
return seriesNames;
return results;
}
@ -99,14 +108,14 @@ public class SeriesNameMatcher {
private Collection<String> deepMatchAll(String[] names, int threshold) {
// can't use common word sequence matching for less than 2 names
if (names.length < 2 || names.length < threshold) {
return Collections.emptySet();
return emptySet();
}
String common = matchByFirstCommonWordSequence(names);
if (common != null) {
// common word sequence found
return Collections.singleton(common);
return singleton(common);
}
// recursive divide and conquer
@ -120,29 +129,6 @@ public class SeriesNameMatcher {
}
/**
* Match series name using season episode pattern and then try to find a common word
* sequence between the first match and the given parent.
*
* @param name episode name
* @param parent a string that contains the series name
* @return a likely series name
*/
public String match(String name, String parent) {
String nameMatch = matchBySeasonEpisodePattern(name);
if (nameMatch != null) {
String commonMatch = matchByFirstCommonWordSequence(nameMatch, parent);
if (commonMatch != null) {
return commonMatch;
}
}
return nameMatch;
}
/**
* Try to match a series name from the given episode name using known season episode
* patterns.
@ -185,7 +171,7 @@ public class SeriesNameMatcher {
common = words;
} else {
// find common sequence
common = firstCommonSequence(common, words, String.CASE_INSENSITIVE_ORDER);
common = firstCommonSequence(common, words, commonWordSequenceMaxStartIndex, String.CASE_INSENSITIVE_ORDER);
if (common == null) {
// no common sequence
@ -202,11 +188,9 @@ public class SeriesNameMatcher {
protected String normalize(String name) {
// normalize brackets, convert (...) to [...]
name = name.replace('(', '[').replace(')', ']');
// remove group names, any [...]
name = name.replaceAll("\\[[^\\[]+\\]", "");
// remove group names and checksums, any [...] or (...)
name = name.replaceAll("\\([^\\(]*\\)", "");
name = name.replaceAll("\\[[^\\[]*\\]", "");
// remove special characters
name = name.replaceAll("[\\p{Punct}\\p{Space}]+", " ");
@ -215,9 +199,9 @@ public class SeriesNameMatcher {
}
protected <T> T[] firstCommonSequence(T[] seq1, T[] seq2, Comparator<T> equalsComparator) {
for (int i = 0; i < seq1.length; i++) {
for (int j = 0; j < seq2.length; j++) {
protected <T> T[] firstCommonSequence(T[] seq1, T[] seq2, int maxStartIndex, Comparator<T> equalsComparator) {
for (int i = 0; i < seq1.length && i <= maxStartIndex; i++) {
for (int j = 0; j < seq2.length && j <= maxStartIndex; j++) {
// common sequence length
int len = 0;
@ -289,12 +273,11 @@ public class SeriesNameMatcher {
@Override
public boolean add(String value) {
String key = value.toLowerCase();
String current = data.get(key);
String current = data.get(key(value));
// prefer strings with similar upper/lower case ratio (e.g. prefer Roswell over roswell)
if (current == null || firstCharacterCaseBalance(current) < firstCharacterCaseBalance(value)) {
data.put(key, value);
data.put(key(value), value);
return true;
}
@ -302,6 +285,11 @@ public class SeriesNameMatcher {
}
/**
 * Returns the case-insensitive lookup key for the given value.
 *
 * @param value object whose string representation is turned into a key, must not be null
 * @return the lower-cased string representation of the value
 */
protected String key(Object value) {
	// lower-case with a fixed locale so that keys do not depend on the JVM default locale
	// (e.g. with a Turkish default locale "TITLE" would otherwise become "tıtle", not "title")
	return value.toString().toLowerCase(java.util.Locale.ROOT);
}
protected float firstCharacterCaseBalance(String s) {
int upper = 0;
int lower = 0;
@ -323,8 +311,8 @@ public class SeriesNameMatcher {
@Override
public boolean contains(Object o) {
return data.containsKey(o.toString().toLowerCase());
public boolean contains(Object value) {
return data.containsKey(key(value));
}

View File

@ -78,7 +78,7 @@ class AutoFetchEpisodeListMatcher extends SwingWorker<List<Match<File, Episode>>
}
// auto-select most probable search result
final List<SearchResult> probableMatches = new LinkedList<SearchResult>();
List<SearchResult> probableMatches = new LinkedList<SearchResult>();
// use name similarity metric
SimilarityMetric metric = new NameSimilarityMetric();
@ -100,11 +100,8 @@ class AutoFetchEpisodeListMatcher extends SwingWorker<List<Match<File, Episode>>
@Override
public SearchResult call() throws Exception {
// display only probable matches if possible
List<SearchResult> options = probableMatches.isEmpty() ? searchResults : probableMatches;
// multiple results have been found, user must select one
SelectDialog<SearchResult> selectDialog = new SelectDialog<SearchResult>(null, options);
SelectDialog<SearchResult> selectDialog = new SelectDialog<SearchResult>(null, searchResults);
selectDialog.getHeaderLabel().setText(String.format("Shows matching '%s':", query));
selectDialog.getCancelAction().putValue(Action.NAME, "Ignore");

View File

@ -57,7 +57,7 @@ public class IMDbClient implements EpisodeListProvider {
Document dom = getHtmlDocument(openConnection(searchUrl));
List<Node> nodes = selectNodes("//TABLE//A[following-sibling::SMALL[contains(.,'TV series')]]", dom);
List<Node> nodes = selectNodes("//TABLE//A[following-sibling::SMALL[contains(.,'series')]]", dom);
List<SearchResult> results = new ArrayList<SearchResult>(nodes.size());

View File

@ -3,19 +3,32 @@ package net.sourceforge.filebot.similarity;
import static org.junit.Assert.*;
import net.sourceforge.filebot.similarity.SeriesNameMatcher.SeriesNameCollection;
import org.junit.Test;
import net.sourceforge.filebot.similarity.SeriesNameMatcher.SeriesNameCollection;
public class SeriesNameMatcherTest {
private static SeriesNameMatcher matcher = new SeriesNameMatcher();
@Test
public void match() {
assertEquals("Test Series", matcher.match("My Test Series - 1x01", "Test Series - Season 1"));
public void whitelist() {
// ignore recurring word sequences when matching episode patterns
String[] names = new String[] { "Test 101 - 01", "Test 101 - 02" };
assertArrayEquals(new String[] { "Test 101" }, matcher.matchAll(names).toArray());
}
@Test
public void threshold() {
// all three names start with the word "Test" and the expected result contains only that common name;
// presumably the per-name pattern matches (e.g. "Test 1 of") each occur just once and are therefore
// dropped for being below the frequency threshold -- TODO confirm against SeasonEpisodeMatcher
String[] names = new String[] { "Test 1 of 101", "Test 2 of 101", "Test 3 of 101" };
assertArrayEquals(new String[] { "Test" }, matcher.matchAll(names).toArray());
}
@ -34,7 +47,7 @@ public class SeriesNameMatcherTest {
assertEquals("The Test", matcher.normalize("_The_Test_-_ ..."));
// brackets
assertEquals("Luffy", matcher.normalize("[strawhat] Luffy [D.] [#Monkey]"));
assertEquals("Luffy", matcher.normalize("[strawhat] Luffy (D.) [#Monkey]"));
// invalid brackets
assertEquals("strawhat Luffy", matcher.normalize("(strawhat [Luffy (#Monkey)"));
@ -43,10 +56,15 @@ public class SeriesNameMatcherTest {
@Test
public void firstCommonSequence() {
String[] seq1 = "[abc] Common Name 1".split("\\s");
String[] seq2 = "[xyz] Common Name 2".split("\\s");
String[] seq1 = "Common Name 1 Any Title".split("\\s");
String[] seq2 = "abc xyz Common Name 2 Any Title".split("\\s");
assertArrayEquals(new String[] { "Common", "Name" }, matcher.firstCommonSequence(seq1, seq2, String.CASE_INSENSITIVE_ORDER));
// check if common sequence can be determined
assertArrayEquals(new String[] { "Common", "Name" }, matcher.firstCommonSequence(seq1, seq2, 2, String.CASE_INSENSITIVE_ORDER));
// check if max start index is working
assertArrayEquals(null, matcher.firstCommonSequence(seq1, seq2, 0, String.CASE_INSENSITIVE_ORDER));
assertArrayEquals(null, matcher.firstCommonSequence(seq2, seq1, 1, String.CASE_INSENSITIVE_ORDER));
}

View File

@ -28,6 +28,18 @@ public class IMDbClientTest {
}
@Test
public void searchMiniSeries() throws Exception {
List<SearchResult> results = imdb.search("generation kill");
MovieDescriptor movie = (MovieDescriptor) results.get(0);
assertEquals("Generation Kill", movie.getName());
assertEquals(2008, movie.getYear());
assertEquals(995832, movie.getImdbId(), 0);
}
@Test
public void searchNoMatch() throws Exception {
List<SearchResult> results = imdb.search("i will not find anything for this query string");