* supercharge movie auto-detection by using a local movie index for common-word-sequence matching

* use a language-specific Collator wherever movie names are matched against file / folder names, so we get perfect matching even with accents, umlauts, half-width/full-width characters, etc.
This commit is contained in:
Reinhard Pointner 2012-01-02 03:48:24 +00:00
parent 6707a94518
commit 90cc0a06fa
12 changed files with 270 additions and 96 deletions

View File

@ -98,7 +98,7 @@ public class CmdlineOperations implements CmdlineInterface {
int cws = 0; // common word sequence int cws = 0; // common word sequence
double max = mediaFiles.size(); double max = mediaFiles.size();
SeriesNameMatcher nameMatcher = new SeriesNameMatcher(); SeriesNameMatcher nameMatcher = new SeriesNameMatcher(getLenientCollator(locale));
Collection<String> cwsList = emptySet(); Collection<String> cwsList = emptySet();
if (max >= 5) { if (max >= 5) {
cwsList = nameMatcher.matchAll(mediaFiles.toArray(new File[0])); cwsList = nameMatcher.matchAll(mediaFiles.toArray(new File[0]));
@ -137,7 +137,7 @@ public class CmdlineOperations implements CmdlineInterface {
List<Match<File, Episode>> matches = new ArrayList<Match<File, Episode>>(); List<Match<File, Episode>> matches = new ArrayList<Match<File, Episode>>();
// auto-determine optimal batch sets // auto-determine optimal batch sets
for (Entry<Set<File>, Set<String>> sameSeriesGroup : mapSeriesNamesByFiles(mediaFiles).entrySet()) { for (Entry<Set<File>, Set<String>> sameSeriesGroup : mapSeriesNamesByFiles(mediaFiles, locale).entrySet()) {
List<List<File>> batchSets = new ArrayList<List<File>>(); List<List<File>> batchSets = new ArrayList<List<File>>();
if (sameSeriesGroup.getValue() != null && sameSeriesGroup.getValue().size() > 0) { if (sameSeriesGroup.getValue() != null && sameSeriesGroup.getValue().size() > 0) {
@ -150,7 +150,7 @@ public class CmdlineOperations implements CmdlineInterface {
for (List<File> batch : batchSets) { for (List<File> batch : batchSets) {
// auto-detect series name if not given // auto-detect series name if not given
Collection<String> seriesNames = (query == null) ? detectQuery(batch, strict) : singleton(query); Collection<String> seriesNames = (query == null) ? detectQuery(batch, locale, strict) : singleton(query);
// fetch episode data // fetch episode data
Set<Episode> episodes = fetchEpisodeSet(db, seriesNames, locale, strict); Set<Episode> episodes = fetchEpisodeSet(db, seriesNames, locale, strict);
@ -297,6 +297,7 @@ public class CmdlineOperations implements CmdlineInterface {
// unknown hash, try via imdb id from nfo file // unknown hash, try via imdb id from nfo file
if (movie == null) { if (movie == null) {
CLILogger.fine(format("Auto-detect movie from context: [%s]", movieFiles[i]));
Collection<Movie> results = detectMovie(movieFiles[i], null, service, locale, strict); Collection<Movie> results = detectMovie(movieFiles[i], null, service, locale, strict);
movie = (Movie) selectSearchResult(query, results, strict).get(0); movie = (Movie) selectSearchResult(query, results, strict).get(0);
@ -463,7 +464,7 @@ public class CmdlineOperations implements CmdlineInterface {
// lookup subtitles via text search, only perform hash lookup in strict mode // lookup subtitles via text search, only perform hash lookup in strict mode
if ((query != null || !strict) && !collector.isComplete()) { if ((query != null || !strict) && !collector.isComplete()) {
// auto-detect search query // auto-detect search query
Collection<String> querySet = (query == null) ? detectQuery(filter(files, VIDEO_FILES), false) : singleton(query); Collection<String> querySet = (query == null) ? detectQuery(filter(files, VIDEO_FILES), language.toLocale(), false) : singleton(query);
for (SubtitleProvider service : WebServices.getSubtitleProviders()) { for (SubtitleProvider service : WebServices.getSubtitleProviders()) {
if (collector.isComplete()) { if (collector.isComplete()) {
@ -618,9 +619,9 @@ public class CmdlineOperations implements CmdlineInterface {
} }
private List<String> detectQuery(Collection<File> mediaFiles, boolean strict) throws Exception { private List<String> detectQuery(Collection<File> mediaFiles, Locale locale, boolean strict) throws Exception {
// detect series name by common word sequence // detect series name by common word sequence
List<String> names = detectSeriesNames(mediaFiles); List<String> names = detectSeriesNames(mediaFiles, locale);
if (names.isEmpty() || (strict && names.size() > 1)) { if (names.isEmpty() || (strict && names.size() > 1)) {
throw new Exception("Unable to auto-select query: " + names); throw new Exception("Unable to auto-select query: " + names);

View File

@ -10,6 +10,7 @@ import java.io.File;
import java.io.IOException; import java.io.IOException;
import java.net.MalformedURLException; import java.net.MalformedURLException;
import java.net.URL; import java.net.URL;
import java.text.Collator;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.Collection; import java.util.Collection;
import java.util.Comparator; import java.util.Comparator;
@ -24,6 +25,9 @@ import java.util.Map.Entry;
import java.util.Set; import java.util.Set;
import java.util.SortedMap; import java.util.SortedMap;
import java.util.TreeSet; import java.util.TreeSet;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit;
import java.util.logging.Level; import java.util.logging.Level;
import java.util.logging.Logger; import java.util.logging.Logger;
import java.util.regex.Matcher; import java.util.regex.Matcher;
@ -42,15 +46,18 @@ import net.sourceforge.filebot.web.TheTVDBClient.TheTVDBSearchResult;
public class MediaDetection { public class MediaDetection {
public static Map<Set<File>, Set<String>> mapSeriesNamesByFiles(Collection<File> files) throws Exception { private static ReleaseInfo releaseInfo = new ReleaseInfo();
public static Map<Set<File>, Set<String>> mapSeriesNamesByFiles(Collection<File> files, Locale locale) throws Exception {
SortedMap<File, List<File>> filesByFolder = mapByFolder(filter(files, VIDEO_FILES, SUBTITLE_FILES)); SortedMap<File, List<File>> filesByFolder = mapByFolder(filter(files, VIDEO_FILES, SUBTITLE_FILES));
// map series names by folder // map series names by folder
Map<File, Set<String>> seriesNamesByFolder = new HashMap<File, Set<String>>(); Map<File, Set<String>> seriesNamesByFolder = new HashMap<File, Set<String>>();
for (Entry<File, List<File>> it : filesByFolder.entrySet()) { for (Entry<File, List<File>> it : filesByFolder.entrySet()) {
Set<String> namesForFolder = new TreeSet<String>(String.CASE_INSENSITIVE_ORDER); Set<String> namesForFolder = new TreeSet<String>(getLenientCollator(locale));
namesForFolder.addAll(detectSeriesNames(it.getValue())); namesForFolder.addAll(detectSeriesNames(it.getValue(), locale));
seriesNamesByFolder.put(it.getKey(), namesForFolder); seriesNamesByFolder.put(it.getKey(), namesForFolder);
} }
@ -74,7 +81,7 @@ public class MediaDetection {
Map<Set<File>, Set<String>> batchSets = new HashMap<Set<File>, Set<String>>(); Map<Set<File>, Set<String>> batchSets = new HashMap<Set<File>, Set<String>>();
while (seriesNamesByFolder.size() > 0) { while (seriesNamesByFolder.size() > 0) {
Set<String> combinedNameSet = new TreeSet<String>(String.CASE_INSENSITIVE_ORDER); Set<String> combinedNameSet = new TreeSet<String>(getLenientCollator(locale));
Set<File> combinedFolderSet = new HashSet<File>(); Set<File> combinedFolderSet = new HashSet<File>();
// build combined match set // build combined match set
@ -116,12 +123,12 @@ public class MediaDetection {
} }
public static List<String> detectSeriesNames(Collection<File> files) throws Exception { public static List<String> detectSeriesNames(Collection<File> files, Locale locale) throws Exception {
// don't allow duplicates // don't allow duplicates
Map<String, String> names = new LinkedHashMap<String, String>(); Map<String, String> names = new LinkedHashMap<String, String>();
try { try {
for (SearchResult it : lookupSeriesNameByInfoFile(files, Locale.ENGLISH)) { for (SearchResult it : lookupSeriesNameByInfoFile(files, locale)) {
names.put(it.getName().toLowerCase(), it.getName()); names.put(it.getName().toLowerCase(), it.getName());
} }
} catch (Exception e) { } catch (Exception e) {
@ -129,10 +136,10 @@ public class MediaDetection {
} }
// match common word sequence and clean detected word sequence from unwanted elements // match common word sequence and clean detected word sequence from unwanted elements
Collection<String> matches = new SeriesNameMatcher().matchAll(files.toArray(new File[files.size()])); Collection<String> matches = new SeriesNameMatcher(getLenientCollator(locale)).matchAll(files.toArray(new File[files.size()]));
try { try {
matches = stripReleaseInfo(matches); matches = stripReleaseInfo(matches, true);
} catch (Exception e) { } catch (Exception e) {
Logger.getLogger(MediaDetection.class.getClass().getName()).log(Level.WARNING, "Failed to clean matches: " + e.getMessage(), e); Logger.getLogger(MediaDetection.class.getClass().getName()).log(Level.WARNING, "Failed to clean matches: " + e.getMessage(), e);
} }
@ -148,6 +155,7 @@ public class MediaDetection {
public static Collection<Movie> detectMovie(File movieFile, MovieIdentificationService hashLookupService, MovieIdentificationService queryLookupService, Locale locale, boolean strict) throws Exception { public static Collection<Movie> detectMovie(File movieFile, MovieIdentificationService hashLookupService, MovieIdentificationService queryLookupService, Locale locale, boolean strict) throws Exception {
Set<Movie> options = new LinkedHashSet<Movie>(); Set<Movie> options = new LinkedHashSet<Movie>();
// lookup by file hash
if (hashLookupService != null) { if (hashLookupService != null) {
for (Movie movie : hashLookupService.getMovieDescriptors(new File[] { movieFile }, locale)) { for (Movie movie : hashLookupService.getMovieDescriptors(new File[] { movieFile }, locale)) {
if (movie != null) { if (movie != null) {
@ -156,58 +164,128 @@ public class MediaDetection {
} }
} }
// lookup by id from nfo file
if (queryLookupService != null) { if (queryLookupService != null) {
// try to grep imdb id from nfo files // try to grep imdb id from nfo files
for (int imdbid : grepImdbIdFor(movieFile)) { for (int imdbid : grepImdbIdFor(movieFile)) {
Movie movie = queryLookupService.getMovieDescriptor(imdbid, locale); Movie movie = queryLookupService.getMovieDescriptor(imdbid, locale);
if (movie != null) { if (movie != null) {
options.add(movie); options.add(movie);
} }
} }
} }
if (queryLookupService != null && !strict && options.isEmpty()) { // search by file name or folder name
// search by file name or folder name List<String> files = new ArrayList<String>();
Collection<String> searchQueries = new LinkedHashSet<String>(); files.add(getName(movieFile));
searchQueries.add(getName(movieFile)); files.add(getName(movieFile.getParentFile()));
searchQueries.add(getName(movieFile.getParentFile()));
long t = System.currentTimeMillis();
// remove blacklisted terms List<Movie> movieNameMatches = matchMovieName(files, locale, strict);
searchQueries = stripReleaseInfo(searchQueries); System.out.println(System.currentTimeMillis() - t);
final SimilarityMetric metric = new NameSimilarityMetric(); // skip further queries if collected matches are already sufficient
final Map<Movie, Float> probabilityMap = new LinkedHashMap<Movie, Float>(); if (options.size() > 0 && movieNameMatches.size() > 0) {
for (String query : searchQueries) { options.addAll(movieNameMatches);
for (Movie movie : queryLookupService.searchMovie(query, locale)) { return options;
probabilityMap.put(movie, metric.getSimilarity(query, movie)); }
}
} // continue gathering more matches if possible
options.addAll(movieNameMatches);
// sort by similarity to original query (descending)
List<Movie> results = new ArrayList<Movie>(probabilityMap.keySet()); // query by file / folder name
sort(results, new Comparator<Movie>() { if (queryLookupService != null && !strict) {
options.addAll(queryMovieByFileName(files, queryLookupService, locale));
@Override
public int compare(Movie a, Movie b) {
return probabilityMap.get(b).compareTo(probabilityMap.get(a));
}
});
options.addAll(results);
} }
return options; return options;
} }
public static String stripReleaseInfo(String name) throws IOException { private static List<Movie> matchMovieName(final List<String> files, final Locale locale, final boolean strict) throws Exception {
return new ReleaseInfo().cleanRelease(name); // cross-reference file / folder name with movie list
final SeriesNameMatcher nameMatcher = new SeriesNameMatcher(String.CASE_INSENSITIVE_ORDER); // use simple comparator for speed (2-3x faster)
final Map<Movie, String> matchMap = synchronizedMap(new HashMap<Movie, String>());
ExecutorService executor = Executors.newFixedThreadPool(Runtime.getRuntime().availableProcessors());
for (final Movie movie : releaseInfo.getMovieList()) {
executor.submit(new Runnable() {
@Override
public void run() {
for (String name : files) {
String movieIdentifier = movie.getName();
String commonName = nameMatcher.matchByFirstCommonWordSequence(name, movieIdentifier);
if (commonName != null && commonName.length() >= movieIdentifier.length()) {
String strictMovieIdentifier = movie.getName() + " " + movie.getYear();
String strictCommonName = nameMatcher.matchByFirstCommonWordSequence(name, strictMovieIdentifier);
if (strictCommonName != null && strictCommonName.length() >= strictMovieIdentifier.length()) {
// prefer strict match
matchMap.put(movie, strictCommonName);
} else if (!strict) {
// make sure the common identifier is not just the year
matchMap.put(movie, commonName);
}
}
}
}
});
}
// wait for last task to finish
executor.shutdown();
executor.awaitTermination(1, TimeUnit.MINUTES);
// sort by length of name match (descending)
List<Movie> results = new ArrayList<Movie>(matchMap.keySet());
sort(results, new Comparator<Movie>() {
@Override
public int compare(Movie a, Movie b) {
return Integer.compare(matchMap.get(b).length(), matchMap.get(a).length());
}
});
return results;
} }
public static List<String> stripReleaseInfo(Collection<String> names) throws IOException { private static Collection<Movie> queryMovieByFileName(List<String> files, MovieIdentificationService queryLookupService, Locale locale) throws Exception {
return new ReleaseInfo().cleanRelease(names); // remove blacklisted terms
Set<String> querySet = new LinkedHashSet<String>();
querySet.addAll(stripReleaseInfo(files, true));
querySet.addAll(stripReleaseInfo(files, false));
final SimilarityMetric metric = new NameSimilarityMetric();
final Map<Movie, Float> probabilityMap = new LinkedHashMap<Movie, Float>();
for (String query : querySet) {
for (Movie movie : queryLookupService.searchMovie(query, locale)) {
probabilityMap.put(movie, metric.getSimilarity(query, movie));
}
}
// sort by similarity to original query (descending)
List<Movie> results = new ArrayList<Movie>(probabilityMap.keySet());
sort(results, new Comparator<Movie>() {
@Override
public int compare(Movie a, Movie b) {
return probabilityMap.get(b).compareTo(probabilityMap.get(a));
}
});
return results;
}
/**
 * Remove release info from a single name.
 *
 * Delegates to the shared {@code releaseInfo} instance; the single-name variant
 * always requests strict cleaning.
 *
 * @param name the name to clean
 * @return the cleaned name as returned by {@code ReleaseInfo.cleanRelease}
 * @throws IOException if the cleaning patterns cannot be loaded
 */
public static String stripReleaseInfo(String name) throws IOException {
	// this overload has no strict parameter, strict mode is always on
	final boolean strict = true;
	return releaseInfo.cleanRelease(name, strict);
}
public static List<String> stripReleaseInfo(Collection<String> names, boolean strict) throws IOException {
return releaseInfo.cleanRelease(names, strict);
} }
@ -284,4 +362,13 @@ public class MediaDetection {
return collection; return collection;
} }
/**
 * Create a lenient, locale-aware comparator for matching names.
 *
 * The comparator only considers primary differences (base letters), so strings
 * that differ merely in case or accents compare as equal. This keeps the
 * case-insensitive behaviour of {@code String.CASE_INSENSITIVE_ORDER}, which this
 * comparator replaces, and additionally ignores accents / umlauts.
 *
 * @param locale the locale whose collation rules are used
 * @return a comparator backed by a {@link Collator} for the given locale
 */
public static Comparator<String> getLenientCollator(Locale locale) {
	final Collator collator = Collator.getInstance(locale);

	// decompose canonical and compatibility variants (e.g. half-width / full-width forms) before comparing
	collator.setDecomposition(Collator.FULL_DECOMPOSITION);

	// PRIMARY is the most lenient strength: case and accent differences are ignored
	// (TERTIARY would treat "Amelie", "amelie" and "Amélie" as three different names)
	collator.setStrength(Collator.PRIMARY);

	// Collator implements Comparator<Object> and is only ever handed Strings here, so the raw cast is safe
	@SuppressWarnings({ "unchecked", "rawtypes" })
	final Comparator<String> comparator = (Comparator) collator;
	return comparator;
}
} }

View File

@ -4,6 +4,7 @@ package net.sourceforge.filebot.media;
import static java.util.ResourceBundle.*; import static java.util.ResourceBundle.*;
import static java.util.regex.Pattern.*; import static java.util.regex.Pattern.*;
import static net.sourceforge.filebot.similarity.Normalization.*;
import static net.sourceforge.tuned.StringUtilities.*; import static net.sourceforge.tuned.StringUtilities.*;
import java.io.File; import java.io.File;
@ -11,33 +12,40 @@ import java.io.IOException;
import java.nio.ByteBuffer; import java.nio.ByteBuffer;
import java.nio.charset.Charset; import java.nio.charset.Charset;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashSet;
import java.util.List; import java.util.List;
import java.util.Locale; import java.util.Locale;
import java.util.Scanner;
import java.util.Set; import java.util.Set;
import java.util.TreeSet; import java.util.TreeSet;
import java.util.regex.Matcher; import java.util.regex.Matcher;
import java.util.regex.Pattern; import java.util.regex.Pattern;
import java.util.zip.GZIPInputStream;
import net.sourceforge.filebot.web.CachedResource; import net.sourceforge.filebot.web.CachedResource;
import net.sourceforge.filebot.web.Movie;
import net.sourceforge.tuned.ByteBufferInputStream;
public class ReleaseInfo { public class ReleaseInfo {
public String getVideoSource(File file) { public String getVideoSource(File file) {
// check parent and itself for group names // check parent and itself for group names
return matchLast(getVideoSourcePattern(), file.getParent(), file.getName()); return matchLast(getVideoSourcePattern(), getBundle(getClass().getName()).getString("pattern.video.source").split("[|]"), file.getParent(), file.getName());
} }
public String getReleaseGroup(File file) throws IOException { public String getReleaseGroup(File file) throws IOException {
// check parent and itself for group names // check parent and itself for group names
return matchLast(getReleaseGroupPattern(), file.getParent(), file.getName()); return matchLast(getReleaseGroupPattern(false), releaseGroupResource.get(), file.getParent(), file.getName());
} }
protected String matchLast(Pattern pattern, CharSequence... sequence) { protected String matchLast(Pattern pattern, String[] standardValues, CharSequence... sequence) {
String lastMatch = null; String lastMatch = null;
// match last occurrence
for (CharSequence name : sequence) { for (CharSequence name : sequence) {
if (name == null) if (name == null)
continue; continue;
@ -48,24 +56,36 @@ public class ReleaseInfo {
} }
} }
// prefer standard value over matched value
if (lastMatch != null) {
for (String standard : standardValues) {
if (standard.equalsIgnoreCase(lastMatch)) {
return standard;
}
}
}
return lastMatch; return lastMatch;
} }
public List<String> cleanRelease(Iterable<String> items) throws IOException { public List<String> cleanRelease(Iterable<String> items, boolean strict) throws IOException {
return clean(items, getReleaseGroupPattern(), getLanguageSuffixPattern(), getVideoSourcePattern(), getVideoFormatPattern(), getResolutionPattern(), getBlacklistPattern()); return clean(items, getReleaseGroupPattern(strict), getLanguageSuffixPattern(), getVideoSourcePattern(), getVideoFormatPattern(), getResolutionPattern(), getBlacklistPattern(false));
} }
public String cleanRelease(String item) throws IOException { public String cleanRelease(String item, boolean strict) throws IOException {
return clean(item, getReleaseGroupPattern(), getLanguageSuffixPattern(), getVideoSourcePattern(), getVideoFormatPattern(), getResolutionPattern(), getBlacklistPattern()); return clean(item, getReleaseGroupPattern(strict), getLanguageSuffixPattern(), getVideoSourcePattern(), getVideoFormatPattern(), getResolutionPattern(), getBlacklistPattern(false));
} }
public List<String> clean(Iterable<String> items, Pattern... blacklisted) { public List<String> clean(Iterable<String> items, Pattern... blacklisted) {
List<String> cleanedItems = new ArrayList<String>(); List<String> cleanedItems = new ArrayList<String>();
for (String it : items) { for (String it : items) {
cleanedItems.add(clean(it, blacklisted)); String cleanedItem = clean(it, blacklisted);
if (cleanedItem.length() > 0) {
cleanedItems.add(cleanedItem);
}
} }
return cleanedItems; return cleanedItems;
@ -77,7 +97,7 @@ public class ReleaseInfo {
item = it.matcher(item).replaceAll(""); item = it.matcher(item).replaceAll("");
} }
return item.replaceAll("[\\p{Punct}\\p{Space}]+", " ").trim(); return normalizePunctuation(item);
} }
@ -88,14 +108,16 @@ public class ReleaseInfo {
Locale locale = new Locale(code); Locale locale = new Locale(code);
tokens.add(locale.getLanguage()); tokens.add(locale.getLanguage());
tokens.add(locale.getISO3Language()); tokens.add(locale.getISO3Language());
tokens.add(locale.getDisplayLanguage(Locale.ENGLISH)); for (Locale language : new HashSet<Locale>(Arrays.asList(Locale.ENGLISH, Locale.getDefault()))) {
tokens.add(locale.getDisplayLanguage(language));
}
} }
// remove illegal tokens // remove illegal tokens
tokens.remove(""); tokens.remove("");
// .{language}[.srt] // .{language}[.srt]
return compile("(?<=[.])(" + join(tokens, "|") + ")(?=$)", CASE_INSENSITIVE); return compile("(?<=\\p{Punct})(" + join(tokens, "|") + ")(?=$)", CASE_INSENSITIVE | UNICODE_CASE | CANON_EQ);
} }
@ -119,21 +141,27 @@ public class ReleaseInfo {
} }
public Pattern getReleaseGroupPattern() throws IOException { public synchronized Pattern getReleaseGroupPattern(boolean strict) throws IOException {
// pattern matching any release group name enclosed in separators // pattern matching any release group name enclosed in separators
return compile("(?<!\\p{Alnum})(" + join(releaseGroupResource.get(), "|") + ")(?!\\p{Alnum})", CASE_INSENSITIVE); return compile("(?<!\\p{Alnum})(" + join(releaseGroupResource.get(), "|") + ")(?!\\p{Alnum})", strict ? 0 : CASE_INSENSITIVE | UNICODE_CASE | CANON_EQ);
} }
public Pattern getBlacklistPattern() throws IOException { public synchronized Pattern getBlacklistPattern(boolean strict) throws IOException {
// pattern matching any release group name enclosed in separators // pattern matching any release group name enclosed in separators
return compile("(?<!\\p{Alnum})(" + join(queryBlacklistResource.get(), "|") + ")(?!\\p{Alnum})", CASE_INSENSITIVE); return compile("(?<!\\p{Alnum})(" + join(queryBlacklistResource.get(), "|") + ")(?!\\p{Alnum})", strict ? 0 : CASE_INSENSITIVE | UNICODE_CASE | CANON_EQ);
}
public synchronized Movie[] getMovieList() throws IOException {
return movieListResource.get();
} }
// fetch release group names online and try to update the data every other day // fetch release group names online and try to update the data every other day
protected final PatternResource releaseGroupResource = new PatternResource(getBundle(getClass().getName()).getString("url.release-groups")); protected final CachedResource<String[]> releaseGroupResource = new PatternResource(getBundle(getClass().getName()).getString("url.release-groups"));
protected final PatternResource queryBlacklistResource = new PatternResource(getBundle(getClass().getName()).getString("url.query-blacklist")); protected final CachedResource<String[]> queryBlacklistResource = new PatternResource(getBundle(getClass().getName()).getString("url.query-blacklist"));
protected final CachedResource<Movie[]> movieListResource = new MovieResource(getBundle(getClass().getName()).getString("url.movie-list"));
protected static class PatternResource extends CachedResource<String[]> { protected static class PatternResource extends CachedResource<String[]> {
@ -149,4 +177,28 @@ public class ReleaseInfo {
} }
} }
/**
 * Cached resource that provides the movie index as an array of {@code Movie} objects.
 *
 * The raw data is expected to be gzip-compressed UTF-8 text with one movie per line
 * and tab-separated fields in the order: imdb id, name, year.
 */
protected static class MovieResource extends CachedResource<Movie[]> {

	public MovieResource(String resource) {
		super(resource, Movie[].class, 24 * 60 * 60 * 1000); // 24h update interval
	}


	/**
	 * Parse the gzip-compressed movie list.
	 *
	 * @param data the raw (compressed) resource data
	 * @return all movies in the order they appear in the list
	 * @throws IOException if the data cannot be decompressed or read completely
	 */
	@Override
	public Movie[] process(ByteBuffer data) throws IOException {
		// fields are separated by tabs, records by line breaks (tolerate CRLF as well as LF)
		Scanner scanner = new Scanner(new GZIPInputStream(new ByteBufferInputStream(data)), "UTF-8").useDelimiter("\t|\r?\n");

		try {
			List<Movie> movies = new ArrayList<Movie>();

			while (scanner.hasNext()) {
				int imdbid = scanner.nextInt();
				String name = scanner.next();
				int year = scanner.nextInt();
				movies.add(new Movie(name, year, imdbid));
			}

			// Scanner swallows read errors and just stops iterating, so make sure
			// that a corrupt or truncated stream is not mistaken for a complete list
			IOException readError = scanner.ioException();
			if (readError != null) {
				throw readError;
			}

			return movies.toArray(new Movie[0]);
		} finally {
			// closing the scanner also closes the underlying gzip stream
			scanner.close();
		}
	}
}
} }

View File

@ -2,10 +2,13 @@
pattern.video.source: CAMRip|CAM|TS|TELESYNC|PDVD|TS|TELESYNC|PDVD|PPV|PPVRip|Screener|SCR|SCREENER|DVDSCR|DVDSCREENER|BDSCR|R5|R5LINE|DVDRip|DVDR|TVRip|DSR|PDTV|HDTV|DVBRip|DTHRip|VODRip|VODR|BDRip|BRRip|BluRay|BDR|WorkPrint|VHS|VCD pattern.video.source: CAMRip|CAM|TS|TELESYNC|PDVD|TS|TELESYNC|PDVD|PPV|PPVRip|Screener|SCR|SCREENER|DVDSCR|DVDSCREENER|BDSCR|R5|R5LINE|DVDRip|DVDR|TVRip|DSR|PDTV|HDTV|DVBRip|DTHRip|VODRip|VODR|BDRip|BRRip|BluRay|BDR|WorkPrint|VHS|VCD
# additional release info patterns # additional release info patterns
pattern.video.format: DivX|Xvid|AVC|x264|h264|3ivx|mpeg|mpeg4|mp3|aac|ac3|2ch|6ch|ws|hr|720p|1080p pattern.video.format: DivX|Xvid|AVC|x264|h264|3ivx|mpeg|mpeg4|mp3|aac|ac3|2ch|6ch|WS|HR|720p|1080p
# group names mostly copied from [http://scenelingo.wordpress.com/list-of-scene-release-groups] # group names mostly copied from [http://scenelingo.wordpress.com/list-of-scene-release-groups]
url.release-groups: http://filebot.sourceforge.net/data/release-groups.txt url.release-groups: http://filebot.sourceforge.net/data/release-groups.txt
# blacklisted terms that will be ignored # blacklisted terms that will be ignored
url.query-blacklist: http://filebot.sourceforge.net/data/query-blacklist.txt url.query-blacklist: http://filebot.sourceforge.net/data/query-blacklist.txt
# list of all movies (id, name, year)
url.movie-list: http://filebot.sourceforge.net/data/movies.txt.gz

View File

@ -3,6 +3,8 @@ package net.sourceforge.filebot.similarity;
import static java.util.Collections.*; import static java.util.Collections.*;
import static java.util.regex.Pattern.*;
import static net.sourceforge.filebot.similarity.Normalization.*;
import static net.sourceforge.tuned.StringUtilities.*; import static net.sourceforge.tuned.StringUtilities.*;
import java.io.File; import java.io.File;
@ -28,10 +30,21 @@ import net.sourceforge.tuned.FileUtilities;
public class SeriesNameMatcher { public class SeriesNameMatcher {
protected final SeasonEpisodeMatcher seasonEpisodeMatcher = new SeasonEpisodeMatcher(new SeasonEpisodeFilter(30, 50, -1), true); protected SeasonEpisodeMatcher seasonEpisodeMatcher = new SeasonEpisodeMatcher(new SeasonEpisodeFilter(30, 50, -1), true);
protected final NameSimilarityMetric nameSimilarityMetric = new NameSimilarityMetric(); protected NameSimilarityMetric nameSimilarityMetric = new NameSimilarityMetric();
protected final int commonWordSequenceMaxStartIndex = 3; protected int commonWordSequenceMaxStartIndex = 3;
protected Comparator<String> commonWordComparator;
/**
 * Create a matcher that compares words using {@code String.CASE_INSENSITIVE_ORDER}.
 */
public SeriesNameMatcher() {
this(String.CASE_INSENSITIVE_ORDER);
}
/**
 * Create a matcher that uses the given comparator for word comparison.
 *
 * @param comparator stored as {@code commonWordComparator}; no null check is performed here
 */
public SeriesNameMatcher(Comparator<String> comparator) {
this.commonWordComparator = comparator;
}
public Collection<String> matchAll(File[] files) { public Collection<String> matchAll(File[] files) {
@ -75,7 +88,7 @@ public class SeriesNameMatcher {
whitelist.addAll(deepMatchAll(focus, threshold)); whitelist.addAll(deepMatchAll(focus, threshold));
// 1. use pattern matching // 1. use pattern matching
seriesNames.addAll(flatMatchAll(names, Pattern.compile(join(whitelist, "|"), Pattern.CASE_INSENSITIVE), threshold, false)); seriesNames.addAll(flatMatchAll(names, compile(join(whitelist, "|"), CASE_INSENSITIVE | UNICODE_CASE | CANON_EQ), threshold, false));
// 2. use common word sequences // 2. use common word sequences
seriesNames.addAll(whitelist); seriesNames.addAll(whitelist);
@ -92,7 +105,7 @@ public class SeriesNameMatcher {
* threshold * threshold
*/ */
private Collection<String> flatMatchAll(String[] names, Pattern prefixPattern, int threshold, boolean strict) { private Collection<String> flatMatchAll(String[] names, Pattern prefixPattern, int threshold, boolean strict) {
ThresholdCollection<String> thresholdCollection = new ThresholdCollection<String>(threshold, String.CASE_INSENSITIVE_ORDER); ThresholdCollection<String> thresholdCollection = new ThresholdCollection<String>(threshold, commonWordComparator);
for (String name : names) { for (String name : names) {
// use normalized name // use normalized name
@ -191,7 +204,7 @@ public class SeriesNameMatcher {
common = words; common = words;
} else { } else {
// find common sequence // find common sequence
common = firstCommonSequence(common, words, commonWordSequenceMaxStartIndex, String.CASE_INSENSITIVE_ORDER); common = firstCommonSequence(common, words, commonWordSequenceMaxStartIndex, commonWordComparator);
if (common == null) { if (common == null) {
// no common sequence // no common sequence
@ -209,14 +222,12 @@ public class SeriesNameMatcher {
protected String normalize(String name) { protected String normalize(String name) {
// remove group names and checksums, any [...] or (...) // remove group names and checksums, any [...] or (...)
name = name.replaceAll("\\([^\\(]*\\)", " "); name = normalizeBrackets(name);
name = name.replaceAll("\\[[^\\[]*\\]", " ");
// remove/normalize special characters // remove/normalize special characters
name = name.replaceAll("['`´]+", ""); name = normalizePunctuation(name);
name = name.replaceAll("[\\p{Punct}\\p{Space}]+", " ");
return name.trim(); return name;
} }

View File

@ -173,7 +173,7 @@ class EpisodeListMatcher implements AutoCompleteMatcher {
List<Callable<List<Match<File, ?>>>> taskPerFolder = new ArrayList<Callable<List<Match<File, ?>>>>(); List<Callable<List<Match<File, ?>>>> taskPerFolder = new ArrayList<Callable<List<Match<File, ?>>>>();
// detect series names and create episode list fetch tasks // detect series names and create episode list fetch tasks
for (Entry<Set<File>, Set<String>> sameSeriesGroup : mapSeriesNamesByFiles(mediaFiles).entrySet()) { for (Entry<Set<File>, Set<String>> sameSeriesGroup : mapSeriesNamesByFiles(mediaFiles, locale).entrySet()) {
List<List<File>> batchSets = new ArrayList<List<File>>(); List<List<File>> batchSets = new ArrayList<List<File>>();
if (sameSeriesGroup.getValue() != null && sameSeriesGroup.getValue().size() > 0) { if (sameSeriesGroup.getValue() != null && sameSeriesGroup.getValue().size() > 0) {
@ -219,7 +219,7 @@ class EpisodeListMatcher implements AutoCompleteMatcher {
// detect series name and fetch episode list // detect series name and fetch episode list
if (autodetection) { if (autodetection) {
Collection<String> names = detectSeriesNames(files); Collection<String> names = detectSeriesNames(files, locale);
if (names.size() > 0) { if (names.size() > 0) {
// only allow one fetch session at a time so later requests can make use of cached results // only allow one fetch session at a time so later requests can make use of cached results
synchronized (provider) { synchronized (provider) {

View File

@ -192,6 +192,7 @@ class MovieHashMatcher implements AutoCompleteMatcher {
selectDialog.setTitle(movieFile.getPath()); selectDialog.setTitle(movieFile.getPath());
selectDialog.getHeaderLabel().setText(String.format("Movies matching '%s':", stripReleaseInfo(getName(movieFile)))); selectDialog.getHeaderLabel().setText(String.format("Movies matching '%s':", stripReleaseInfo(getName(movieFile))));
selectDialog.getCancelAction().putValue(Action.NAME, "Ignore"); selectDialog.getCancelAction().putValue(Action.NAME, "Ignore");
selectDialog.pack();
// show dialog // show dialog
selectDialog.setLocation(getOffsetLocation(selectDialog.getOwner())); selectDialog.setLocation(getOffsetLocation(selectDialog.getOwner()));

View File

@ -26,6 +26,7 @@ import java.util.Collection;
import java.util.HashMap; import java.util.HashMap;
import java.util.Iterator; import java.util.Iterator;
import java.util.List; import java.util.List;
import java.util.Locale;
import java.util.Map; import java.util.Map;
import java.util.Map.Entry; import java.util.Map.Entry;
import java.util.TreeSet; import java.util.TreeSet;
@ -973,7 +974,7 @@ class SubtitleAutoMatchDialog extends JDialog {
} }
// auto-detect query and search for subtitles // auto-detect query and search for subtitles
Collection<String> querySet = detectSeriesNames(files); Collection<String> querySet = detectSeriesNames(files, Locale.ENGLISH);
List<SubtitleDescriptor> subtitles = findSubtitles(service, querySet, languageName); List<SubtitleDescriptor> subtitles = findSubtitles(service, querySet, languageName);
// if auto-detection fails, ask user for input // if auto-detection fails, ask user for input

View File

@ -11,52 +11,61 @@ public class ByteBufferInputStream extends InputStream {
private final ByteBuffer buffer; private final ByteBuffer buffer;
public ByteBufferInputStream(ByteBuffer buffer) { public ByteBufferInputStream(ByteBuffer buffer) {
this.buffer = buffer; this.buffer = buffer;
} }
@Override @Override
public int read() throws IOException { public int read() throws IOException {
if (buffer.remaining() <= 0) return (buffer.position() < buffer.limit()) ? (buffer.get() & 0xff) : -1;
return -1;
return buffer.get();
} }
@Override @Override
public int read(byte[] b, int off, int len) throws IOException { public int read(byte[] b, int off, int len) throws IOException {
if (buffer.remaining() <= 0) if (b == null) {
throw new NullPointerException();
} else if (off < 0 || len < 0 || len > b.length - off) {
throw new IndexOutOfBoundsException();
}
if (buffer.position() >= buffer.limit()) {
return -1; return -1;
}
int length = Math.min(len, buffer.remaining()); if (len > buffer.remaining()) {
len = buffer.remaining();
}
buffer.get(b, off, length); if (len <= 0) {
return 0;
}
return length; buffer.get(b, off, len);
return len;
} }
@Override @Override
public int available() throws IOException { public int available() throws IOException {
return buffer.remaining(); return buffer.remaining();
} }
@Override @Override
public boolean markSupported() { public boolean markSupported() {
return true; return true;
} }
@Override @Override
public void mark(int readlimit) { public void mark(int readlimit) {
buffer.mark(); buffer.mark();
} }
@Override @Override
public void reset() throws IOException { public void reset() throws IOException {
buffer.reset(); buffer.reset();

BIN
website/data/movies.txt.gz Normal file

Binary file not shown.

View File

@ -1,10 +1,17 @@
PROPER
RETAIL
^(TV.)?(Show|Serie|Anime)[s]?$
^Movie[s]?$
^Video[s]?$
CD[1-3] CD[1-3]
Demonoid Demonoid
ExtraScene ExtraScene
ExtraTorrent ExtraTorrent
PROPER Hard.Subbed
mkvonly
MVGroup.org
READNFO READNFO
REPACK REPACK
RETAIL
ShareReactor ShareReactor
ShareZONE ShareZONE
UsaBit.com

View File

@ -25,6 +25,7 @@ BAJSKORV
BamHD BamHD
Barba Barba
BaSS BaSS
BAUM
BDiSC BDiSC
BiA BiA
BlueTV BlueTV
@ -167,6 +168,7 @@ LMAO
LoD LoD
LOL LOL
LOLCATS LOLCATS
LTT
MAiN MAiN
MainEvent MainEvent
MARiNES MARiNES