mirror of
https://github.com/mitb-archive/filebot
synced 2025-01-11 05:48:01 -05:00
* improved series name matching
* include "tv mini-series" when searching for any series on imdb
This commit is contained in:
parent
3d9839a73f
commit
d98df7d7f1
@ -2,6 +2,7 @@
|
||||
package net.sourceforge.filebot.similarity;
|
||||
|
||||
|
||||
import static java.util.Collections.*;
|
||||
import static net.sourceforge.tuned.StringUtilities.*;
|
||||
|
||||
import java.io.File;
|
||||
@ -9,7 +10,6 @@ import java.util.AbstractCollection;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.Collection;
|
||||
import java.util.Collections;
|
||||
import java.util.Comparator;
|
||||
import java.util.Iterator;
|
||||
import java.util.LinkedHashMap;
|
||||
@ -18,6 +18,8 @@ import java.util.Map;
|
||||
import java.util.Scanner;
|
||||
import java.util.TreeMap;
|
||||
import java.util.Map.Entry;
|
||||
import java.util.regex.Matcher;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
import net.sourceforge.tuned.FileUtilities;
|
||||
|
||||
@ -25,11 +27,9 @@ import net.sourceforge.tuned.FileUtilities;
|
||||
public class SeriesNameMatcher {
|
||||
|
||||
protected final SeasonEpisodeMatcher seasonEpisodeMatcher = new SeasonEpisodeMatcher();
|
||||
protected final NameSimilarityMetric nameSimilarityMetric = new NameSimilarityMetric();
|
||||
|
||||
|
||||
public String match(File file) {
|
||||
return match(file.getName(), file.getParent());
|
||||
}
|
||||
protected final int commonWordSequenceMaxStartIndex = 3;
|
||||
|
||||
|
||||
public Collection<String> matchAll(File[] files) {
|
||||
@ -42,9 +42,10 @@ public class SeriesNameMatcher {
|
||||
|
||||
for (String nameMatch : matchAll(names)) {
|
||||
String commonMatch = matchByFirstCommonWordSequence(nameMatch, parent);
|
||||
float similarity = commonMatch == null ? 0 : nameSimilarityMetric.getSimilarity(commonMatch, nameMatch);
|
||||
|
||||
// prefer common match, but use name match if there is no matching word sequence
|
||||
seriesNames.add(commonMatch != null ? commonMatch : nameMatch);
|
||||
// prefer common match, but only if it's very similar to the original match
|
||||
seriesNames.add(similarity > 0.7 ? commonMatch : nameMatch);
|
||||
}
|
||||
}
|
||||
|
||||
@ -58,11 +59,15 @@ public class SeriesNameMatcher {
|
||||
// allow matching of a small number of episodes, by setting threshold = length if length < 5
|
||||
int threshold = Math.min(names.length, 5);
|
||||
|
||||
// 1. use pattern matching with frequency threshold
|
||||
seriesNames.addAll(flatMatchAll(names, threshold));
|
||||
// match common word sequences (likely series names)
|
||||
SeriesNameCollection whitelist = new SeriesNameCollection();
|
||||
whitelist.addAll(deepMatchAll(names, threshold));
|
||||
|
||||
// 2. match common word sequences
|
||||
seriesNames.addAll(deepMatchAll(names, threshold));
|
||||
// 1. use pattern matching
|
||||
seriesNames.addAll(flatMatchAll(names, threshold, Pattern.compile(join(whitelist, "|"), Pattern.CASE_INSENSITIVE)));
|
||||
|
||||
// 2. use common word sequences
|
||||
seriesNames.addAll(whitelist);
|
||||
|
||||
return seriesNames;
|
||||
}
|
||||
@ -75,18 +80,22 @@ public class SeriesNameMatcher {
|
||||
* @return series names that have been matched one or multiple times depending on the
|
||||
* threshold
|
||||
*/
|
||||
private Collection<String> flatMatchAll(String[] names, int threshold) {
|
||||
ThresholdCollection<String> seriesNames = new ThresholdCollection<String>(threshold, String.CASE_INSENSITIVE_ORDER);
|
||||
private Collection<String> flatMatchAll(String[] names, int threshold, Pattern whitelist) {
|
||||
ThresholdCollection<String> results = new ThresholdCollection<String>(threshold, String.CASE_INSENSITIVE_ORDER);
|
||||
|
||||
for (String name : names) {
|
||||
String match = matchBySeasonEpisodePattern(name);
|
||||
// use normalized name
|
||||
name = normalize(name);
|
||||
|
||||
if (match != null) {
|
||||
seriesNames.add(match);
|
||||
Matcher matcher = whitelist.matcher(name);
|
||||
int seasonEpisodePosition = seasonEpisodeMatcher.find(name, matcher.find() ? matcher.end() : 0);
|
||||
|
||||
if (seasonEpisodePosition > 0) {
|
||||
results.add(name.substring(0, seasonEpisodePosition).trim());
|
||||
}
|
||||
}
|
||||
|
||||
return seriesNames;
|
||||
return results;
|
||||
}
|
||||
|
||||
|
||||
@ -99,14 +108,14 @@ public class SeriesNameMatcher {
|
||||
private Collection<String> deepMatchAll(String[] names, int threshold) {
|
||||
// can't use common word sequence matching for less than 2 names
|
||||
if (names.length < 2 || names.length < threshold) {
|
||||
return Collections.emptySet();
|
||||
return emptySet();
|
||||
}
|
||||
|
||||
String common = matchByFirstCommonWordSequence(names);
|
||||
|
||||
if (common != null) {
|
||||
// common word sequence found
|
||||
return Collections.singleton(common);
|
||||
return singleton(common);
|
||||
}
|
||||
|
||||
// recursive divide and conquer
|
||||
@ -120,29 +129,6 @@ public class SeriesNameMatcher {
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Match series name using season episode pattern and then try to find a common word
|
||||
* sequence between the first match and the given parent.
|
||||
*
|
||||
* @param name episode name
|
||||
* @param parent a string that contains the series name
|
||||
* @return a likely series name
|
||||
*/
|
||||
public String match(String name, String parent) {
|
||||
String nameMatch = matchBySeasonEpisodePattern(name);
|
||||
|
||||
if (nameMatch != null) {
|
||||
String commonMatch = matchByFirstCommonWordSequence(nameMatch, parent);
|
||||
|
||||
if (commonMatch != null) {
|
||||
return commonMatch;
|
||||
}
|
||||
}
|
||||
|
||||
return nameMatch;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Try to match a series name from the given episode name using known season episode
|
||||
* patterns.
|
||||
@ -185,7 +171,7 @@ public class SeriesNameMatcher {
|
||||
common = words;
|
||||
} else {
|
||||
// find common sequence
|
||||
common = firstCommonSequence(common, words, String.CASE_INSENSITIVE_ORDER);
|
||||
common = firstCommonSequence(common, words, commonWordSequenceMaxStartIndex, String.CASE_INSENSITIVE_ORDER);
|
||||
|
||||
if (common == null) {
|
||||
// no common sequence
|
||||
@ -202,11 +188,9 @@ public class SeriesNameMatcher {
|
||||
|
||||
|
||||
protected String normalize(String name) {
|
||||
// normalize brackets, convert (...) to [...]
|
||||
name = name.replace('(', '[').replace(')', ']');
|
||||
|
||||
// remove group names, any [...]
|
||||
name = name.replaceAll("\\[[^\\[]+\\]", "");
|
||||
// remove group names and checksums, any [...] or (...)
|
||||
name = name.replaceAll("\\([^\\(]*\\)", "");
|
||||
name = name.replaceAll("\\[[^\\[]*\\]", "");
|
||||
|
||||
// remove special characters
|
||||
name = name.replaceAll("[\\p{Punct}\\p{Space}]+", " ");
|
||||
@ -215,9 +199,9 @@ public class SeriesNameMatcher {
|
||||
}
|
||||
|
||||
|
||||
protected <T> T[] firstCommonSequence(T[] seq1, T[] seq2, Comparator<T> equalsComparator) {
|
||||
for (int i = 0; i < seq1.length; i++) {
|
||||
for (int j = 0; j < seq2.length; j++) {
|
||||
protected <T> T[] firstCommonSequence(T[] seq1, T[] seq2, int maxStartIndex, Comparator<T> equalsComparator) {
|
||||
for (int i = 0; i < seq1.length && i <= maxStartIndex; i++) {
|
||||
for (int j = 0; j < seq2.length && j <= maxStartIndex; j++) {
|
||||
// common sequence length
|
||||
int len = 0;
|
||||
|
||||
@ -289,12 +273,11 @@ public class SeriesNameMatcher {
|
||||
|
||||
@Override
|
||||
public boolean add(String value) {
|
||||
String key = value.toLowerCase();
|
||||
String current = data.get(key);
|
||||
String current = data.get(key(value));
|
||||
|
||||
// prefer strings with similar upper/lower case ration (e.g. prefer Roswell over roswell)
|
||||
if (current == null || firstCharacterCaseBalance(current) < firstCharacterCaseBalance(value)) {
|
||||
data.put(key, value);
|
||||
data.put(key(value), value);
|
||||
return true;
|
||||
}
|
||||
|
||||
@ -302,6 +285,11 @@ public class SeriesNameMatcher {
|
||||
}
|
||||
|
||||
|
||||
protected String key(Object value) {
|
||||
return value.toString().toLowerCase();
|
||||
}
|
||||
|
||||
|
||||
protected float firstCharacterCaseBalance(String s) {
|
||||
int upper = 0;
|
||||
int lower = 0;
|
||||
@ -323,8 +311,8 @@ public class SeriesNameMatcher {
|
||||
|
||||
|
||||
@Override
|
||||
public boolean contains(Object o) {
|
||||
return data.containsKey(o.toString().toLowerCase());
|
||||
public boolean contains(Object value) {
|
||||
return data.containsKey(key(value));
|
||||
}
|
||||
|
||||
|
||||
|
@ -78,7 +78,7 @@ class AutoFetchEpisodeListMatcher extends SwingWorker<List<Match<File, Episode>>
|
||||
}
|
||||
|
||||
// auto-select most probable search result
|
||||
final List<SearchResult> probableMatches = new LinkedList<SearchResult>();
|
||||
List<SearchResult> probableMatches = new LinkedList<SearchResult>();
|
||||
|
||||
// use name similarity metric
|
||||
SimilarityMetric metric = new NameSimilarityMetric();
|
||||
@ -100,11 +100,8 @@ class AutoFetchEpisodeListMatcher extends SwingWorker<List<Match<File, Episode>>
|
||||
|
||||
@Override
|
||||
public SearchResult call() throws Exception {
|
||||
// display only probable matches if possible
|
||||
List<SearchResult> options = probableMatches.isEmpty() ? searchResults : probableMatches;
|
||||
|
||||
// multiple results have been found, user must select one
|
||||
SelectDialog<SearchResult> selectDialog = new SelectDialog<SearchResult>(null, options);
|
||||
SelectDialog<SearchResult> selectDialog = new SelectDialog<SearchResult>(null, searchResults);
|
||||
|
||||
selectDialog.getHeaderLabel().setText(String.format("Shows matching '%s':", query));
|
||||
selectDialog.getCancelAction().putValue(Action.NAME, "Ignore");
|
||||
|
@ -57,7 +57,7 @@ public class IMDbClient implements EpisodeListProvider {
|
||||
|
||||
Document dom = getHtmlDocument(openConnection(searchUrl));
|
||||
|
||||
List<Node> nodes = selectNodes("//TABLE//A[following-sibling::SMALL[contains(.,'TV series')]]", dom);
|
||||
List<Node> nodes = selectNodes("//TABLE//A[following-sibling::SMALL[contains(.,'series')]]", dom);
|
||||
|
||||
List<SearchResult> results = new ArrayList<SearchResult>(nodes.size());
|
||||
|
||||
|
@ -3,19 +3,32 @@ package net.sourceforge.filebot.similarity;
|
||||
|
||||
|
||||
import static org.junit.Assert.*;
|
||||
import net.sourceforge.filebot.similarity.SeriesNameMatcher.SeriesNameCollection;
|
||||
|
||||
import org.junit.Test;
|
||||
|
||||
import net.sourceforge.filebot.similarity.SeriesNameMatcher.SeriesNameCollection;
|
||||
|
||||
|
||||
public class SeriesNameMatcherTest {
|
||||
|
||||
private static SeriesNameMatcher matcher = new SeriesNameMatcher();
|
||||
|
||||
|
||||
|
||||
@Test
|
||||
public void match() {
|
||||
assertEquals("Test Series", matcher.match("My Test Series - 1x01", "Test Series - Season 1"));
|
||||
public void whitelist() {
|
||||
// ignore recurring word sequences when matching episode patterns
|
||||
String[] names = new String[] { "Test 101 - 01", "Test 101 - 02" };
|
||||
|
||||
assertArrayEquals(new String[] { "Test 101" }, matcher.matchAll(names).toArray());
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
public void threshold() {
|
||||
// ignore recurring word sequences when matching episode patterns
|
||||
String[] names = new String[] { "Test 1 of 101", "Test 2 of 101", "Test 3 of 101" };
|
||||
|
||||
assertArrayEquals(new String[] { "Test" }, matcher.matchAll(names).toArray());
|
||||
}
|
||||
|
||||
|
||||
@ -34,7 +47,7 @@ public class SeriesNameMatcherTest {
|
||||
assertEquals("The Test", matcher.normalize("_The_Test_-_ ..."));
|
||||
|
||||
// brackets
|
||||
assertEquals("Luffy", matcher.normalize("[strawhat] Luffy [D.] [#Monkey]"));
|
||||
assertEquals("Luffy", matcher.normalize("[strawhat] Luffy (D.) [#Monkey]"));
|
||||
|
||||
// invalid brackets
|
||||
assertEquals("strawhat Luffy", matcher.normalize("(strawhat [Luffy (#Monkey)"));
|
||||
@ -43,10 +56,15 @@ public class SeriesNameMatcherTest {
|
||||
|
||||
@Test
|
||||
public void firstCommonSequence() {
|
||||
String[] seq1 = "[abc] Common Name 1".split("\\s");
|
||||
String[] seq2 = "[xyz] Common Name 2".split("\\s");
|
||||
String[] seq1 = "Common Name 1 Any Title".split("\\s");
|
||||
String[] seq2 = "abc xyz Common Name 2 Any Title".split("\\s");
|
||||
|
||||
assertArrayEquals(new String[] { "Common", "Name" }, matcher.firstCommonSequence(seq1, seq2, String.CASE_INSENSITIVE_ORDER));
|
||||
// check if common sequence can be determined
|
||||
assertArrayEquals(new String[] { "Common", "Name" }, matcher.firstCommonSequence(seq1, seq2, 2, String.CASE_INSENSITIVE_ORDER));
|
||||
|
||||
// check if max start index is working
|
||||
assertArrayEquals(null, matcher.firstCommonSequence(seq1, seq2, 0, String.CASE_INSENSITIVE_ORDER));
|
||||
assertArrayEquals(null, matcher.firstCommonSequence(seq2, seq1, 1, String.CASE_INSENSITIVE_ORDER));
|
||||
}
|
||||
|
||||
|
||||
|
@ -28,6 +28,18 @@ public class IMDbClientTest {
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
public void searchMiniSeries() throws Exception {
|
||||
List<SearchResult> results = imdb.search("generation kill");
|
||||
|
||||
MovieDescriptor movie = (MovieDescriptor) results.get(0);
|
||||
|
||||
assertEquals("Generation Kill", movie.getName());
|
||||
assertEquals(2008, movie.getYear());
|
||||
assertEquals(995832, movie.getImdbId(), 0);
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
public void searchNoMatch() throws Exception {
|
||||
List<SearchResult> results = imdb.search("i will not find anything for this query string");
|
||||
|
Loading…
Reference in New Issue
Block a user