* auto-determine optimal series/files match sets (combine all files per show)

* fine-tune name similarity metric to 5 separation ranks
This commit is contained in:
Reinhard Pointner 2011-12-23 18:17:20 +00:00
parent 3668b02ed5
commit fe74476232
8 changed files with 215 additions and 140 deletions

View File

@ -8,7 +8,7 @@ import static net.sourceforge.filebot.MediaTypes.*;
import static net.sourceforge.filebot.WebServices.*;
import static net.sourceforge.filebot.cli.CLILogging.*;
import static net.sourceforge.filebot.hash.VerificationUtilities.*;
import static net.sourceforge.filebot.mediainfo.ReleaseInfo.*;
import static net.sourceforge.filebot.mediainfo.MediaDetection.*;
import static net.sourceforge.filebot.subtitle.SubtitleUtilities.*;
import static net.sourceforge.tuned.FileUtilities.*;

View File

@ -140,7 +140,7 @@ def parseDate(path) {
}
/**
 * Detect the series name for the given files.
 * Only files for which isVideo() or isSubtitle() is true are passed on to detectSeriesNames.
 *
 * @param files the files to inspect
 * @return the first detected name, or null if the detector returned null or an empty list
 */
def detectSeriesName(files) {
// NOTE(review): this diff hunk lists the previous (ReleaseInfo) and the replacement (MediaDetection)
// call back to back; presumably only the MediaDetection line exists in the committed script -- confirm
def names = ReleaseInfo.detectSeriesNames(files.findAll { it.isVideo() || it.isSubtitle() })
def names = MediaDetection.detectSeriesNames(files.findAll { it.isVideo() || it.isSubtitle() })
return names == null || names.isEmpty() ? null : names[0]
}

View File

@ -0,0 +1,205 @@
package net.sourceforge.filebot.mediainfo;
import static net.sourceforge.filebot.MediaTypes.*;
import static net.sourceforge.tuned.FileUtilities.*;
import java.io.File;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedHashMap;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import java.util.SortedMap;
import java.util.TreeSet;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import net.sourceforge.filebot.MediaTypes;
import net.sourceforge.filebot.WebServices;
import net.sourceforge.filebot.similarity.SeriesNameMatcher;
import net.sourceforge.filebot.web.SearchResult;
import net.sourceforge.filebot.web.TheTVDBClient.TheTVDBSearchResult;
/**
 * Static helpers that detect series names for media files and group files that
 * share detected series names into common match sets.
 */
public class MediaDetection {

    /** IMDb id pattern like tt1234567; matches only the 7 digits that directly follow "tt". */
    private static final Pattern IMDB_ID_PATTERN = Pattern.compile("(?<=tt)\\d{7}");

    /** Separators used to break free text into tokens that may be URLs. */
    private static final Pattern URL_TOKEN_SEPARATOR = Pattern.compile("[\\s\"<>|]+");

    /** Digits of an "id" parameter inside a URL query (e.g. tab=series&id=78874&lid=14 yields 78874). */
    private static final Pattern QUERY_ID_PATTERN = Pattern.compile("(?<=(^|\\W)id=)\\d+");


    /**
     * Group the video and subtitle files among the given files into match sets. Folders are
     * merged into the same match set whenever they share a detected series name
     * (compared case-insensitively), directly or through a chain of other folders.
     *
     * @param files files to group; anything not accepted by VIDEO_FILES or SUBTITLE_FILES is ignored
     * @return map from the combined (sorted) file set of each group to the combined,
     *         case-insensitive set of series names detected for that group
     * @throws Exception if series name detection fails for a folder
     */
    public static Map<Set<File>, Set<String>> mapFoldersBySeriesNames(Collection<File> files) throws Exception {
        SortedMap<File, List<File>> filesByFolder = mapByFolder(filter(files, VIDEO_FILES, SUBTITLE_FILES));

        // map series names by folder; insertion order follows the sorted folder order,
        // so the groups below are built in a reproducible order
        Map<File, Set<String>> seriesNamesByFolder = new LinkedHashMap<File, Set<String>>();
        for (Entry<File, List<File>> it : filesByFolder.entrySet()) {
            Set<String> namesForFolder = new TreeSet<String>(String.CASE_INSENSITIVE_ORDER);
            namesForFolder.addAll(detectSeriesNames(it.getValue()));
            seriesNamesByFolder.put(it.getKey(), namesForFolder);
        }

        // reverse map folders by series name
        Map<String, Set<File>> foldersBySeriesName = new HashMap<String, Set<File>>();
        for (Set<String> nameSet : seriesNamesByFolder.values()) {
            for (String name : nameSet) {
                if (foldersBySeriesName.containsKey(name)) {
                    continue; // same spelling already resolved, the result would be identical
                }

                Set<File> foldersForSeries = new HashSet<File>();
                for (Entry<File, Set<String>> it : seriesNamesByFolder.entrySet()) {
                    // case-insensitive lookup, because the name sets use CASE_INSENSITIVE_ORDER
                    if (it.getValue().contains(name)) {
                        foldersForSeries.add(it.getKey());
                    }
                }
                foldersBySeriesName.put(name, foldersForSeries);
            }
        }

        // join both sets
        Map<Set<File>, Set<String>> matchSets = new LinkedHashMap<Set<File>, Set<String>>();

        while (seriesNamesByFolder.size() > 0) {
            Set<String> combinedNameSet = new TreeSet<String>(String.CASE_INSENSITIVE_ORDER);
            Set<File> combinedFolderSet = new HashSet<File>();

            // seed the group with any folder that has not been accounted for yet
            combinedFolderSet.add(seriesNamesByFolder.keySet().iterator().next());

            // alternately pull in all names of the grouped folders and all folders of the
            // grouped names, until a full pass adds nothing new
            boolean modified;
            do {
                modified = false;
                for (File folder : combinedFolderSet) {
                    modified |= combinedNameSet.addAll(seriesNamesByFolder.get(folder));
                }
                for (String name : combinedNameSet) {
                    modified |= combinedFolderSet.addAll(foldersBySeriesName.get(name));
                }
            } while (modified);

            // build result entry
            Set<File> combinedFileSet = new TreeSet<File>();
            for (File folder : combinedFolderSet) {
                combinedFileSet.addAll(filesByFolder.get(folder));
            }
            matchSets.put(combinedFileSet, combinedNameSet);

            // set folders as accounted for
            seriesNamesByFolder.keySet().removeAll(combinedFolderSet);
        }

        return matchSets;
    }


    /**
     * Detect candidate series names for the given files. Names found via ids in sibling nfo
     * files come first, followed by names matched from the files themselves by
     * SeriesNameMatcher and passed through ReleaseInfo.cleanRG. Either step is best-effort:
     * a failure is logged as a warning and detection continues.
     *
     * @param files files to inspect
     * @return detected names without duplicates (compared ignoring case); the later spelling
     *         replaces the earlier one but keeps its position; never null
     * @throws Exception declared for callers; lookup and cleanup failures are logged, not thrown
     */
    public static List<String> detectSeriesNames(Collection<File> files) throws Exception {
        // don't allow duplicates; use a fixed locale so that the keys don't depend on the default locale
        Map<String, String> names = new LinkedHashMap<String, String>();

        try {
            for (SearchResult it : lookupSeriesNameByInfoFile(files, Locale.ENGLISH)) {
                names.put(it.getName().toLowerCase(Locale.ENGLISH), it.getName());
            }
        } catch (Exception e) {
            // MediaDetection.class.getClass() would be java.lang.Class, so use the class literal directly
            Logger.getLogger(MediaDetection.class.getName()).log(Level.WARNING, "Failed to lookup info by id: " + e.getMessage(), e);
        }

        // match common word sequence and clean detected word sequence from unwanted elements
        Collection<String> matches = new SeriesNameMatcher().matchAll(files.toArray(new File[files.size()]));

        try {
            matches = new ReleaseInfo().cleanRG(matches);
        } catch (Exception e) {
            Logger.getLogger(MediaDetection.class.getName()).log(Level.WARNING, "Failed to clean matches: " + e.getMessage(), e);
        }

        for (String it : matches) {
            names.put(it.toLowerCase(Locale.ENGLISH), it);
        }

        return new ArrayList<String>(names.values());
    }


    /**
     * Collect the IMDb ids mentioned in the nfo files that sit next to the given file.
     *
     * @param file file whose parent folder is scanned for nfo files
     * @return ids in order of first occurrence; empty if the file has no parent
     *         or the parent folder cannot be listed
     * @throws Exception if an nfo file cannot be read
     */
    public static Set<Integer> grepImdbIdFor(File file) throws Exception {
        Set<Integer> collection = new LinkedHashSet<Integer>();

        File folder = file.getParentFile();
        // listFiles returns null if the folder is not a directory or cannot be read
        File[] nfoFiles = (folder == null) ? null : folder.listFiles(MediaTypes.getDefaultFilter("application/nfo"));
        if (nfoFiles == null) {
            return collection;
        }

        for (File nfo : nfoFiles) {
            String text = new String(readFile(nfo), "UTF-8");
            collection.addAll(grepImdbId(text));
        }

        return collection;
    }


    /**
     * Look up series via TheTVDB for every IMDb id and TheTVDB id found in nfo files that are
     * located in the folders of the given files.
     *
     * @param files files whose folders are scanned for nfo files
     * @param language language passed on to the TheTVDB lookups
     * @return search results in order of discovery, without duplicates; never null
     * @throws Exception if an nfo file cannot be read or a lookup fails
     */
    public static Set<SearchResult> lookupSeriesNameByInfoFile(Collection<File> files, Locale language) throws Exception {
        Set<SearchResult> names = new LinkedHashSet<SearchResult>();

        // search for id in sibling nfo files
        for (File folder : mapByFolder(files).keySet()) {
            File[] nfoFiles = folder.listFiles(MediaTypes.getDefaultFilter("application/nfo"));
            if (nfoFiles == null) {
                continue; // folder is not a directory or cannot be listed
            }

            for (File nfo : nfoFiles) {
                String text = new String(readFile(nfo), "UTF-8");

                for (int imdbid : grepImdbId(text)) {
                    TheTVDBSearchResult series = WebServices.TheTVDB.lookupByIMDbID(imdbid, language);
                    if (series != null) {
                        names.add(series);
                    }
                }

                for (int tvdbid : grepTheTvdbId(text)) {
                    TheTVDBSearchResult series = WebServices.TheTVDB.lookupByID(tvdbid, language);
                    if (series != null) {
                        names.add(series);
                    }
                }
            }
        }

        return names;
    }


    /**
     * Scan text for IMDb id patterns like tt1234567.
     *
     * @param text text to scan
     * @return numeric ids (the 7 digits following "tt") in order of first occurrence
     */
    public static Set<Integer> grepImdbId(CharSequence text) {
        Matcher imdbMatch = IMDB_ID_PATTERN.matcher(text);
        Set<Integer> collection = new LinkedHashSet<Integer>();

        while (imdbMatch.find()) {
            collection.add(Integer.parseInt(imdbMatch.group()));
        }

        return collection;
    }


    /**
     * Scan text for TheTVDB urls like http://www.thetvdb.com/?tab=series&id=78874&lid=14
     * and extract the value of the id parameter.
     *
     * @param text text to scan
     * @return ids in order of first occurrence; tokens that are not valid urls, urls whose host
     *         does not contain "thetvdb" and ids that do not fit into an int are ignored
     */
    public static Set<Integer> grepTheTvdbId(CharSequence text) {
        Set<Integer> collection = new LinkedHashSet<Integer>();

        for (String token : URL_TOKEN_SEPARATOR.split(text)) {
            try {
                URL url = new URL(token);
                if (url.getHost().contains("thetvdb") && url.getQuery() != null) {
                    Matcher idMatch = QUERY_ID_PATTERN.matcher(url.getQuery());
                    while (idMatch.find()) {
                        try {
                            collection.add(Integer.parseInt(idMatch.group()));
                        } catch (NumberFormatException e) {
                            // digit sequence too large for an int, can't be a usable id
                        }
                    }
                }
            } catch (MalformedURLException e) {
                // parse for thetvdb urls, ignore everything else
            }
        }

        return collection;
    }

}

View File

@ -5,144 +5,22 @@ package net.sourceforge.filebot.mediainfo;
import static java.util.ResourceBundle.*;
import static java.util.concurrent.TimeUnit.*;
import static java.util.regex.Pattern.*;
import static net.sourceforge.tuned.FileUtilities.*;
import static net.sourceforge.tuned.StringUtilities.*;
import java.io.File;
import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URL;
import java.nio.ByteBuffer;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.Collection;
import java.util.LinkedHashMap;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import net.sourceforge.filebot.MediaTypes;
import net.sourceforge.filebot.WebServices;
import net.sourceforge.filebot.similarity.SeriesNameMatcher;
import net.sourceforge.filebot.web.CachedResource;
import net.sourceforge.filebot.web.SearchResult;
import net.sourceforge.filebot.web.TheTVDBClient.TheTVDBSearchResult;
public class ReleaseInfo {
/**
 * Detect candidate series names for the given files. Names found via ids in sibling nfo
 * files come first, followed by names matched from the files themselves by
 * SeriesNameMatcher and passed through cleanRG. Either step is best-effort: a failure is
 * logged as a warning and detection continues.
 *
 * @param files files to inspect
 * @return detected names without duplicates (compared ignoring case); never null
 * @throws Exception declared for callers; lookup and cleanup failures are logged, not thrown
 */
public static List<String> detectSeriesNames(Collection<File> files) throws Exception {
    ReleaseInfo releaseInfo = new ReleaseInfo();

    // don't allow duplicates; use a fixed locale so that the keys don't depend on the default locale
    Map<String, String> names = new LinkedHashMap<String, String>();

    try {
        for (SearchResult it : releaseInfo.lookupSeriesNameByInfoFile(files, Locale.ENGLISH)) {
            names.put(it.getName().toLowerCase(Locale.ENGLISH), it.getName());
        }
    } catch (Exception e) {
        // ReleaseInfo.class.getClass() would be java.lang.Class, so use the class literal directly
        Logger.getLogger(ReleaseInfo.class.getName()).log(Level.WARNING, "Failed to lookup info by id: " + e.getMessage(), e);
    }

    // match common word sequence and clean detected word sequence from unwanted elements
    Collection<String> matches = new SeriesNameMatcher().matchAll(files.toArray(new File[files.size()]));

    try {
        matches = releaseInfo.cleanRG(matches);
    } catch (Exception e) {
        Logger.getLogger(ReleaseInfo.class.getName()).log(Level.WARNING, "Failed to clean matches: " + e.getMessage(), e);
    }

    for (String it : matches) {
        names.put(it.toLowerCase(Locale.ENGLISH), it);
    }

    return new ArrayList<String>(names.values());
}
/**
 * Collect the IMDb ids mentioned in the nfo files that sit next to the given file.
 *
 * @param file file whose parent folder is scanned for nfo files
 * @return ids in order of first occurrence; empty if the file has no parent
 *         or the parent folder cannot be listed
 * @throws Exception if an nfo file cannot be read
 */
public static Set<Integer> grepImdbIdFor(File file) throws Exception {
    ReleaseInfo releaseInfo = new ReleaseInfo();
    Set<Integer> collection = new LinkedHashSet<Integer>();

    File folder = file.getParentFile();
    // listFiles returns null if the folder is not a directory or cannot be read
    File[] nfoFiles = (folder == null) ? null : folder.listFiles(MediaTypes.getDefaultFilter("application/nfo"));
    if (nfoFiles == null) {
        return collection;
    }

    for (File nfo : nfoFiles) {
        String text = new String(readFile(nfo), "UTF-8");
        collection.addAll(releaseInfo.grepImdbId(text));
    }

    return collection;
}
/**
 * Look up series via TheTVDB for every IMDb id and TheTVDB id found in nfo files that are
 * located in the folders of the given files.
 *
 * @param files files whose folders are scanned for nfo files
 * @param language language passed on to the TheTVDB lookups
 * @return search results in order of discovery, without duplicates; never null
 * @throws Exception if an nfo file cannot be read or a lookup fails
 */
public Set<SearchResult> lookupSeriesNameByInfoFile(Collection<File> files, Locale language) throws Exception {
    Set<SearchResult> names = new LinkedHashSet<SearchResult>();

    // search for id in sibling nfo files
    for (File folder : mapByFolder(files).keySet()) {
        File[] nfoFiles = folder.listFiles(MediaTypes.getDefaultFilter("application/nfo"));
        if (nfoFiles == null) {
            continue; // folder is not a directory or cannot be listed
        }

        for (File nfo : nfoFiles) {
            String text = new String(readFile(nfo), "UTF-8");

            for (int imdbid : grepImdbId(text)) {
                TheTVDBSearchResult series = WebServices.TheTVDB.lookupByIMDbID(imdbid, language);
                if (series != null) {
                    names.add(series);
                }
            }

            for (int tvdbid : grepTheTvdbId(text)) {
                TheTVDBSearchResult series = WebServices.TheTVDB.lookupByID(tvdbid, language);
                if (series != null) {
                    names.add(series);
                }
            }
        }
    }

    return names;
}
/**
 * Find all IMDb ids in the given text.
 *
 * @param text text to scan
 * @return the numeric part of every id in order of first occurrence
 */
public Set<Integer> grepImdbId(CharSequence text) {
    // an id is exactly seven digits directly preceded by "tt", e.g. tt1234567
    final Pattern idPattern = Pattern.compile("(?<=tt)\\d{7}");

    final Set<Integer> ids = new LinkedHashSet<Integer>();
    for (Matcher m = idPattern.matcher(text); m.find();) {
        ids.add(Integer.valueOf(m.group()));
    }
    return ids;
}
/**
 * Scan text for TheTVDB urls like http://www.thetvdb.com/?tab=series&id=78874&lid=14
 * and extract the value of the id parameter.
 *
 * @param text text to scan
 * @return ids in order of first occurrence; tokens that are not valid urls, urls whose host
 *         does not contain "thetvdb" and ids that do not fit into an int are ignored
 */
public Set<Integer> grepTheTvdbId(CharSequence text) {
    // compile once per call instead of once per token
    Pattern separator = Pattern.compile("[\\s\"<>|]+");
    Pattern idPattern = Pattern.compile("(?<=(^|\\W)id=)\\d+");

    Set<Integer> collection = new LinkedHashSet<Integer>();

    for (String token : separator.split(text)) {
        try {
            URL url = new URL(token);
            if (url.getHost().contains("thetvdb") && url.getQuery() != null) {
                Matcher idMatch = idPattern.matcher(url.getQuery());
                while (idMatch.find()) {
                    try {
                        collection.add(Integer.parseInt(idMatch.group()));
                    } catch (NumberFormatException e) {
                        // digit sequence too large for an int, can't be a usable id
                    }
                }
            }
        } catch (MalformedURLException e) {
            // parse for thetvdb urls, ignore everything else
        }
    }

    return collection;
}
public String getVideoSource(File file) {
// check parent and itself for group names
return matchLast(getVideoSourcePattern(), file.getParent(), file.getName());

View File

@ -179,9 +179,9 @@ public enum EpisodeMetrics implements SimilarityMetric {
/**
 * Round the similarity computed by the super implementation down to a fixed number of
 * evenly spaced ranks: the value is multiplied by the rank count, floored, and divided
 * by the rank count again.
 *
 * NOTE(review): this diff hunk lists the previous (6 ranks) and the replacement (5 ranks)
 * comment and return statement back to back; presumably only the 5-rank lines exist in
 * the committed source -- confirm.
 */
@Override
public float getSimilarity(Object o1, Object o2) {
// normalize absolute similarity to similarity rank (6 ranks in total),
// normalize absolute similarity to similarity rank (5 ranks in total),
// so we are less likely to fall for false positives in this pass, and move on to the next one
return (float) (floor(super.getSimilarity(o1, o2) * 6) / 6);
return (float) (floor(super.getSimilarity(o1, o2) * 5) / 5);
}

View File

@ -4,7 +4,7 @@ package net.sourceforge.filebot.ui.rename;
import static java.util.Collections.*;
import static net.sourceforge.filebot.MediaTypes.*;
import static net.sourceforge.filebot.mediainfo.ReleaseInfo.*;
import static net.sourceforge.filebot.mediainfo.MediaDetection.*;
import static net.sourceforge.filebot.web.EpisodeUtilities.*;
import static net.sourceforge.tuned.FileUtilities.*;
import static net.sourceforge.tuned.ui.TunedUtilities.*;
@ -20,7 +20,6 @@ import java.util.LinkedHashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutorService;
@ -44,7 +43,6 @@ import net.sourceforge.filebot.ui.SelectDialog;
import net.sourceforge.filebot.web.Episode;
import net.sourceforge.filebot.web.EpisodeListProvider;
import net.sourceforge.filebot.web.SearchResult;
import net.sourceforge.tuned.FileUtilities;
class EpisodeListMatcher implements AutoCompleteMatcher {
@ -169,24 +167,18 @@ class EpisodeListMatcher implements AutoCompleteMatcher {
@Override
public List<Match<File, ?>> match(final List<File> files, final Locale locale, final boolean autodetection, final Component parent) throws Exception {
// focus on movie and subtitle files
final List<File> mediaFiles = FileUtilities.filter(files, VIDEO_FILES, SUBTITLE_FILES);
final Map<File, List<File>> filesByFolder = mapByFolder(mediaFiles);
// do matching all at once
if (filesByFolder.keySet().size() <= 5 || detectSeriesNames(mediaFiles).size() <= 5) {
return matchEpisodeSet(mediaFiles, locale, autodetection, parent);
}
final List<File> mediaFiles = filter(files, VIDEO_FILES, SUBTITLE_FILES);;
// assume that many shows will be matched, do it folder by folder
List<Callable<List<Match<File, ?>>>> taskPerFolder = new ArrayList<Callable<List<Match<File, ?>>>>();
// detect series names and create episode list fetch tasks
for (final List<File> folder : filesByFolder.values()) {
for (final Set<File> folder : mapFoldersBySeriesNames(mediaFiles).keySet()) {
taskPerFolder.add(new Callable<List<Match<File, ?>>>() {
@Override
public List<Match<File, ?>> call() throws Exception {
return matchEpisodeSet(folder, locale, autodetection, parent);
return matchEpisodeSet(new ArrayList<File>(folder), locale, autodetection, parent);
}
});
}

View File

@ -5,7 +5,7 @@ package net.sourceforge.filebot.ui.rename;
import static java.util.Arrays.*;
import static java.util.Collections.*;
import static net.sourceforge.filebot.MediaTypes.*;
import static net.sourceforge.filebot.mediainfo.ReleaseInfo.*;
import static net.sourceforge.filebot.mediainfo.MediaDetection.*;
import static net.sourceforge.tuned.FileUtilities.*;
import static net.sourceforge.tuned.ui.TunedUtilities.*;

View File

@ -4,7 +4,7 @@ package net.sourceforge.filebot.ui.subtitle;
import static javax.swing.BorderFactory.*;
import static javax.swing.JOptionPane.*;
import static net.sourceforge.filebot.mediainfo.ReleaseInfo.*;
import static net.sourceforge.filebot.mediainfo.MediaDetection.*;
import static net.sourceforge.filebot.subtitle.SubtitleUtilities.*;
import static net.sourceforge.tuned.FileUtilities.*;
import static net.sourceforge.tuned.StringUtilities.*;