mirror of
https://github.com/mitb-archive/filebot
synced 2025-01-11 13:58:16 -05:00
* performance improvements / switch to series.list.gz
* use before-rule when cleaning up tokens from movie filenames
* added series.list.gz script
This commit is contained in:
parent
4d3c2c6f55
commit
806ffdc91d
22
BuildData.groovy
Normal file
22
BuildData.groovy
Normal file
@ -0,0 +1,22 @@
|
||||
// Builds the series name list: collects names from the TheTVDB series listing page
// and from the AniDB title list, filters and normalizes them, and writes them
// gzip-compressed (one name per line) to the target given as first script argument.

def listingUrl = new URL('http://thetvdb.com/?string=&searchseriesid=&tab=listseries&function=Search')
def html = listingUrl.fetch().getHtml('utf-8')

// find the table with id "listtable", keep the 3-cell rows whose second cell reads
// 'English', and take the link text of the first cell as series name
def listTable = html.depthFirst().TABLE.find{ table -> table['@id'] == "listtable" }
def englishRows = listTable.depthFirst().TR.findAll{ row -> row.TD.size() == 3 && row.TD[1].text() == 'English' }
def names = englishRows.findResults{ row -> row.TD[0].A.text() }

// add primary and official English anime titles (findResults drops null titles)
def anime = net.sourceforge.filebot.WebServices.AniDB.getAnimeTitles()
names = names + anime.findResults{ entry -> entry.getPrimaryTitle() }
names = names + anime.findResults{ entry -> entry.getOfficialTitle('en') }

// keep only names that start with A-Z and contain at least 3 consecutive letters
def accepted = names.findAll{ title -> title =~ /^[A-Z]/ && title =~ /[\p{Alpha}]{3}/ }
names = accepted.findResults{ title -> net.sourceforge.filebot.similarity.Normalization.normalizePunctuation(title) }

// normalization may produce duplicates, so sort first and then drop them
names = names.sort().unique()

// NOTE(review): args[0] is presumably a File provided by the script host - confirm
args[0].withOutputStream{ out ->
    def gzip = new java.util.zip.GZIPOutputStream(out)
    gzip.withWriter('utf-8'){ writer ->
        for (title in names) {
            writer.append(title).append('\n')
        }
    }
}

println "Series Count: " + names.size()
|
@ -63,8 +63,8 @@ import static net.sourceforge.filebot.web.WebRequest.*
|
||||
|
||||
URL.metaClass.get = { readAll(getReader(delegate.openConnection())) }
|
||||
URL.metaClass.fetch = { fetch(delegate) }
|
||||
URL.metaClass.getHtml = { new XmlParser(false, false).parseText(getXmlString(getHtmlDocument(delegate))) }
|
||||
ByteBuffer.metaClass.getHtml = { csn = "utf-8" -> new XmlParser(false, false).parseText(getXmlString(getHtmlDocument(new StringReader(Charset.forName(csn).decode(delegate.duplicate()).toString())))) }
|
||||
URL.metaClass.getHtml = { new XmlParser(new org.cyberneko.html.parsers.SAXParser()).parseText(readAll(getReader(delegate.openConnection()))) }
|
||||
ByteBuffer.metaClass.getHtml = { csn = "utf-8" -> new XmlParser(new org.cyberneko.html.parsers.SAXParser()).parseText(Charset.forName(csn).decode(delegate.duplicate()).toString()) }
|
||||
|
||||
URL.metaClass.post = { Map parameters -> post(delegate.openConnection(), parameters) }
|
||||
URL.metaClass.post = { byte[] data, contentType = 'application/octet-stream' -> post(delegate.openConnection(), data, contentType) }
|
||||
|
@ -25,6 +25,7 @@ import java.util.List;
|
||||
import java.util.Locale;
|
||||
import java.util.Map;
|
||||
import java.util.Map.Entry;
|
||||
import java.util.NoSuchElementException;
|
||||
import java.util.Set;
|
||||
import java.util.SortedMap;
|
||||
import java.util.TreeSet;
|
||||
@ -41,7 +42,6 @@ import net.sourceforge.filebot.similarity.NameSimilarityMetric;
|
||||
import net.sourceforge.filebot.similarity.SeriesNameMatcher;
|
||||
import net.sourceforge.filebot.similarity.SimilarityComparator;
|
||||
import net.sourceforge.filebot.similarity.SimilarityMetric;
|
||||
import net.sourceforge.filebot.web.AnidbClient.AnidbSearchResult;
|
||||
import net.sourceforge.filebot.web.Movie;
|
||||
import net.sourceforge.filebot.web.MovieIdentificationService;
|
||||
import net.sourceforge.filebot.web.SearchResult;
|
||||
@ -155,8 +155,8 @@ public class MediaDetection {
|
||||
}
|
||||
|
||||
// match folder names against known series names
|
||||
for (TheTVDBSearchResult match : matchSeriesByName(filenames.toArray(new String[0]))) {
|
||||
names.put(match.getName().toLowerCase(), match.getName());
|
||||
for (String match : matchSeriesByName(filenames.toArray(new String[0]))) {
|
||||
names.put(match.toLowerCase(), match);
|
||||
}
|
||||
} catch (Exception e) {
|
||||
Logger.getLogger(MediaDetection.class.getClass().getName()).log(Level.WARNING, "Failed to match folder structure: " + e.getMessage(), e);
|
||||
@ -177,75 +177,29 @@ public class MediaDetection {
|
||||
}
|
||||
|
||||
|
||||
private static final HashMap<TheTVDBSearchResult, String> seriesNameIndex = new HashMap<TheTVDBSearchResult, String>(32768);
|
||||
public static List<String> matchSeriesByName(String... names) throws Exception {
|
||||
HighPerformanceMatcher nameMatcher = new HighPerformanceMatcher(0);
|
||||
List<String> matches = new ArrayList<String>();
|
||||
|
||||
|
||||
public static List<TheTVDBSearchResult> matchSeriesByName(String... names) throws Exception {
|
||||
final HighPerformanceMatcher nameMatcher = new HighPerformanceMatcher(0);
|
||||
final Map<TheTVDBSearchResult, String> matchMap = new HashMap<TheTVDBSearchResult, String>();
|
||||
|
||||
synchronized (seriesNameIndex) {
|
||||
if (seriesNameIndex.isEmpty()) {
|
||||
for (TheTVDBSearchResult entry : releaseInfo.getSeriesList()) {
|
||||
seriesNameIndex.put(entry, nameMatcher.normalize(entry.getName()));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
for (Entry<TheTVDBSearchResult, String> it : seriesNameIndex.entrySet()) {
|
||||
for (String identifier : releaseInfo.getSeriesList()) {
|
||||
for (String name : names) {
|
||||
String identifier = it.getValue();
|
||||
String commonName = nameMatcher.matchFirstCommonSequence(name, identifier);
|
||||
if (commonName != null && commonName.length() >= identifier.length()) {
|
||||
matchMap.put(it.getKey(), commonName);
|
||||
matches.add(commonName);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// sort by length of name match (descending)
|
||||
List<TheTVDBSearchResult> results = new ArrayList<TheTVDBSearchResult>(matchMap.keySet());
|
||||
sort(results, new Comparator<TheTVDBSearchResult>() {
|
||||
sort(matches, new Comparator<String>() {
|
||||
|
||||
@Override
|
||||
public int compare(TheTVDBSearchResult a, TheTVDBSearchResult b) {
|
||||
return Integer.valueOf(matchMap.get(b).length()).compareTo(Integer.valueOf(matchMap.get(a).length()));
|
||||
public int compare(String a, String b) {
|
||||
return Integer.valueOf(b.length()).compareTo(Integer.valueOf(a.length()));
|
||||
}
|
||||
});
|
||||
|
||||
return results;
|
||||
}
|
||||
|
||||
|
||||
public static Collection<AnidbSearchResult> matchAnimeByName(String... names) throws Exception {
|
||||
final HighPerformanceMatcher nameMatcher = new HighPerformanceMatcher(0);
|
||||
final Map<AnidbSearchResult, String> matchMap = new HashMap<AnidbSearchResult, String>();
|
||||
|
||||
for (final AnidbSearchResult entry : WebServices.AniDB.getAnimeTitles()) {
|
||||
for (String identifier : new String[] { entry.getPrimaryTitle(), entry.getOfficialTitle("en") }) {
|
||||
if (identifier == null || identifier.isEmpty())
|
||||
continue;
|
||||
|
||||
identifier = nameMatcher.normalize(identifier);
|
||||
for (String name : names) {
|
||||
String commonName = nameMatcher.matchFirstCommonSequence(name, identifier);
|
||||
if (commonName != null && commonName.length() >= identifier.length()) {
|
||||
matchMap.put(entry, commonName);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// sort by length of name match (descending)
|
||||
List<AnidbSearchResult> results = new ArrayList<AnidbSearchResult>(matchMap.keySet());
|
||||
sort(results, new Comparator<AnidbSearchResult>() {
|
||||
|
||||
@Override
|
||||
public int compare(AnidbSearchResult a, AnidbSearchResult b) {
|
||||
return Integer.valueOf(matchMap.get(b).length()).compareTo(Integer.valueOf(matchMap.get(a).length()));
|
||||
}
|
||||
});
|
||||
|
||||
return results;
|
||||
return matches;
|
||||
}
|
||||
|
||||
|
||||
@ -366,7 +320,11 @@ public class MediaDetection {
|
||||
|
||||
|
||||
public static String stripReleaseInfo(String name) throws IOException {
|
||||
return releaseInfo.cleanRelease(name, true);
|
||||
try {
|
||||
return releaseInfo.cleanRelease(singleton(name), true).iterator().next();
|
||||
} catch (NoSuchElementException e) {
|
||||
return ""; // default value in case all tokens are stripped away
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
@ -6,11 +6,13 @@ import static java.util.Arrays.*;
|
||||
import static java.util.ResourceBundle.*;
|
||||
import static java.util.regex.Pattern.*;
|
||||
import static net.sourceforge.filebot.similarity.Normalization.*;
|
||||
import static net.sourceforge.tuned.FileUtilities.*;
|
||||
import static net.sourceforge.tuned.StringUtilities.*;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.FileFilter;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStreamReader;
|
||||
import java.nio.ByteBuffer;
|
||||
import java.nio.charset.Charset;
|
||||
import java.text.Collator;
|
||||
@ -32,7 +34,6 @@ import java.util.zip.GZIPInputStream;
|
||||
|
||||
import net.sourceforge.filebot.web.CachedResource;
|
||||
import net.sourceforge.filebot.web.Movie;
|
||||
import net.sourceforge.filebot.web.TheTVDBClient.TheTVDBSearchResult;
|
||||
import net.sourceforge.tuned.ByteBufferInputStream;
|
||||
|
||||
|
||||
@ -89,28 +90,32 @@ public class ReleaseInfo {
|
||||
}
|
||||
|
||||
|
||||
public List<String> cleanRelease(Iterable<String> items, boolean strict) throws IOException {
|
||||
public List<String> cleanRelease(Collection<String> items, boolean strict) throws IOException {
|
||||
Set<String> languages = getLanguageMap(Locale.ENGLISH, Locale.getDefault()).keySet();
|
||||
return clean(items, getReleaseGroupPattern(strict), getLanguageSuffixPattern(languages), getVideoSourcePattern(), getVideoFormatPattern(), getResolutionPattern(), getBlacklistPattern(), getLanguageOptionPattern(languages));
|
||||
}
|
||||
|
||||
Pattern releaseGroup = getReleaseGroupPattern(strict);
|
||||
Pattern languageSuffix = getLanguageSuffixPattern(languages);
|
||||
Pattern languageTag = getLanguageTagPattern(languages);
|
||||
Pattern videoSource = getVideoSourcePattern();
|
||||
Pattern videoFormat = getVideoFormatPattern();
|
||||
Pattern resolution = getResolutionPattern();
|
||||
Pattern queryBlacklist = getBlacklistPattern();
|
||||
|
||||
public String cleanRelease(String item, boolean strict) throws IOException {
|
||||
Set<String> languages = getLanguageMap(Locale.ENGLISH, Locale.getDefault()).keySet();
|
||||
return clean(item, getReleaseGroupPattern(strict), getLanguageSuffixPattern(languages), getVideoSourcePattern(), getVideoFormatPattern(), getResolutionPattern(), getBlacklistPattern(), getLanguageOptionPattern(languages));
|
||||
}
|
||||
Pattern[] blacklist = new Pattern[] { releaseGroup, languageSuffix, languageTag, videoSource, videoFormat, resolution, queryBlacklist };
|
||||
Pattern[] stopwords = new Pattern[] { getReleaseGroupPattern(true), languageSuffix, languageTag, videoSource, videoFormat, resolution };
|
||||
|
||||
|
||||
public List<String> clean(Iterable<String> items, Pattern... blacklisted) {
|
||||
List<String> cleanedItems = new ArrayList<String>();
|
||||
List<String> output = new ArrayList<String>(items.size());
|
||||
for (String it : items) {
|
||||
String cleanedItem = clean(it, blacklisted);
|
||||
if (cleanedItem.length() > 0) {
|
||||
cleanedItems.add(cleanedItem);
|
||||
it = substringBefore(it, stopwords);
|
||||
it = clean(it, blacklist);
|
||||
|
||||
// ignore empty values
|
||||
if (it.length() > 0) {
|
||||
output.add(it);
|
||||
}
|
||||
}
|
||||
|
||||
return cleanedItems;
|
||||
return output;
|
||||
}
|
||||
|
||||
|
||||
@ -123,7 +128,20 @@ public class ReleaseInfo {
|
||||
}
|
||||
|
||||
|
||||
public Pattern getLanguageOptionPattern(Collection<String> languages) {
|
||||
public String substringBefore(String item, Pattern... stopwords) {
|
||||
for (Pattern it : stopwords) {
|
||||
Matcher matcher = it.matcher(item);
|
||||
if (matcher.find()) {
|
||||
return item.substring(0, matcher.start()); // use substring before the matched stopword
|
||||
}
|
||||
}
|
||||
|
||||
// no stopword found, keep original string
|
||||
return item;
|
||||
}
|
||||
|
||||
|
||||
public Pattern getLanguageTagPattern(Collection<String> languages) {
|
||||
// [en]
|
||||
return compile("(?<=[-\\[{(])(" + join(quoteAll(languages), "|") + ")(?=\\p{Punct})", CASE_INSENSITIVE | UNICODE_CASE | CANON_EQ);
|
||||
}
|
||||
@ -172,7 +190,7 @@ public class ReleaseInfo {
|
||||
}
|
||||
|
||||
|
||||
public synchronized TheTVDBSearchResult[] getSeriesList() throws IOException {
|
||||
public synchronized String[] getSeriesList() throws IOException {
|
||||
return seriesListResource.get();
|
||||
}
|
||||
|
||||
@ -186,7 +204,7 @@ public class ReleaseInfo {
|
||||
protected final CachedResource<String[]> releaseGroupResource = new PatternResource(getBundle(getClass().getName()).getString("url.release-groups"));
|
||||
protected final CachedResource<String[]> queryBlacklistResource = new PatternResource(getBundle(getClass().getName()).getString("url.query-blacklist"));
|
||||
protected final CachedResource<Movie[]> movieListResource = new MovieResource(getBundle(getClass().getName()).getString("url.movie-list"));
|
||||
protected final CachedResource<TheTVDBSearchResult[]> seriesListResource = new SeriesResource(getBundle(getClass().getName()).getString("url.series-list"));
|
||||
protected final CachedResource<String[]> seriesListResource = new SeriesResource(getBundle(getClass().getName()).getString("url.series-list"));
|
||||
|
||||
|
||||
protected static class PatternResource extends CachedResource<String[]> {
|
||||
@ -206,7 +224,7 @@ public class ReleaseInfo {
|
||||
protected static class MovieResource extends CachedResource<Movie[]> {
|
||||
|
||||
public MovieResource(String resource) {
|
||||
super(resource, Movie[].class, 24 * 60 * 60 * 1000); // 24h update interval
|
||||
super(resource, Movie[].class, 7 * 24 * 60 * 60 * 1000); // check for updates once a week
|
||||
}
|
||||
|
||||
|
||||
@ -227,25 +245,16 @@ public class ReleaseInfo {
|
||||
}
|
||||
|
||||
|
||||
protected static class SeriesResource extends CachedResource<TheTVDBSearchResult[]> {
|
||||
protected static class SeriesResource extends CachedResource<String[]> {
|
||||
|
||||
public SeriesResource(String resource) {
|
||||
super(resource, TheTVDBSearchResult[].class, 24 * 60 * 60 * 1000); // 24h update interval
|
||||
super(resource, String[].class, 7 * 24 * 60 * 60 * 1000); // check for updates once a week
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public TheTVDBSearchResult[] process(ByteBuffer data) throws IOException {
|
||||
Scanner scanner = new Scanner(new GZIPInputStream(new ByteBufferInputStream(data)), "UTF-8").useDelimiter("\t|\n");
|
||||
|
||||
List<TheTVDBSearchResult> tvshows = new ArrayList<TheTVDBSearchResult>();
|
||||
while (scanner.hasNext()) {
|
||||
int sid = scanner.nextInt();
|
||||
String name = scanner.next();
|
||||
tvshows.add(new TheTVDBSearchResult(name, sid));
|
||||
}
|
||||
|
||||
return tvshows.toArray(new TheTVDBSearchResult[0]);
|
||||
public String[] process(ByteBuffer data) throws IOException {
|
||||
return readAll(new InputStreamReader(new GZIPInputStream(new ByteBufferInputStream(data)), "utf-8")).split("\\n");
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -12,7 +12,7 @@ url.query-blacklist: http://filebot.sourceforge.net/data/query-blacklist.txt
|
||||
|
||||
# list of all movies (id, name, year)
|
||||
url.movie-list: http://filebot.sourceforge.net/data/movies.txt.gz
|
||||
url.series-list: http://filebot.sourceforge.net/data/tvshows.txt.gz
|
||||
url.series-list: http://filebot.sourceforge.net/data/series.list.gz
|
||||
|
||||
# disk folder matcher
|
||||
pattern.diskfolder.entry: ^BDMV$|^HVDVD_TS$|^VIDEO_TS$|^AUDIO_TS$|^VCD$
|
||||
|
@ -2,22 +2,34 @@
|
||||
package net.sourceforge.filebot.similarity;
|
||||
|
||||
|
||||
import static java.util.regex.Pattern.*;
|
||||
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
|
||||
public class Normalization {
|
||||
|
||||
private static final Pattern apostrophe = compile("['`´‘’ʻ]+");
|
||||
private static final Pattern punctuation = compile("[\\p{Punct}\\p{Space}]+");
|
||||
|
||||
private static final Pattern[] brackets = new Pattern[] { compile("\\([^\\(]*\\)"), compile("\\[[^\\[]*\\]"), compile("\\{[^\\{]*\\}") };
|
||||
|
||||
private static final Pattern checksum = compile("[\\(\\[]\\p{XDigit}{8}[\\]\\)]");
|
||||
|
||||
|
||||
public static String normalizePunctuation(String name) {
|
||||
// remove/normalize special characters
|
||||
name = name.replaceAll("[`´‘’ʻ]+", "");
|
||||
name = name.replaceAll("[\\p{Punct}\\p{Space}]+", " ");
|
||||
|
||||
name = apostrophe.matcher(name).replaceAll("");
|
||||
name = punctuation.matcher(name).replaceAll(" ");
|
||||
return name.trim();
|
||||
}
|
||||
|
||||
|
||||
public static String normalizeBrackets(String name) {
|
||||
// remove group names and checksums, any [...] or (...)
|
||||
name = name.replaceAll("\\([^\\(]*\\)", " ");
|
||||
name = name.replaceAll("\\[[^\\[]*\\]", " ");
|
||||
name = name.replaceAll("\\{[^\\{]*\\}", " ");
|
||||
for (Pattern it : brackets) {
|
||||
name = it.matcher(name).replaceAll(" ");
|
||||
}
|
||||
|
||||
return name;
|
||||
}
|
||||
@ -25,7 +37,7 @@ public class Normalization {
|
||||
|
||||
public static String removeEmbeddedChecksum(String string) {
|
||||
// match embedded checksum and surrounding brackets
|
||||
return string.replaceAll("[\\(\\[]\\p{XDigit}{8}[\\]\\)]", "");
|
||||
return checksum.matcher(string).replaceAll("");
|
||||
}
|
||||
|
||||
}
|
||||
|
BIN
website/data/series.list.gz
Normal file
BIN
website/data/series.list.gz
Normal file
Binary file not shown.
Loading…
Reference in New Issue
Block a user