mirror of
https://github.com/mitb-archive/filebot
synced 2024-11-02 08:25:02 -04:00
+ rebuild movie index with imdb AND tmdb IDs
This commit is contained in:
parent
28df8ff69a
commit
75c897bae5
@ -1,6 +1,7 @@
|
|||||||
import org.tukaani.xz.*
|
import org.tukaani.xz.*
|
||||||
|
|
||||||
// ------------------------------------------------------------------------- //
|
|
||||||
|
/* ------------------------------------------------------------------------- */
|
||||||
|
|
||||||
|
|
||||||
def sortRegexList(path) {
|
def sortRegexList(path) {
|
||||||
@ -9,7 +10,6 @@ def sortRegexList(path) {
|
|||||||
// check if regex compiles
|
// check if regex compiles
|
||||||
set += java.util.regex.Pattern.compile(it.trim()).pattern()
|
set += java.util.regex.Pattern.compile(it.trim()).pattern()
|
||||||
}
|
}
|
||||||
|
|
||||||
def out = set.join('\n').saveAs(path)
|
def out = set.join('\n').saveAs(path)
|
||||||
println "$out\n$out.text\n"
|
println "$out\n$out.text\n"
|
||||||
}
|
}
|
||||||
@ -22,11 +22,14 @@ sortRegexList("website/data/exclude-blacklist.txt")
|
|||||||
sortRegexList("website/data/series-mappings.txt")
|
sortRegexList("website/data/series-mappings.txt")
|
||||||
|
|
||||||
|
|
||||||
// ------------------------------------------------------------------------- //
|
/* ------------------------------------------------------------------------- */
|
||||||
|
|
||||||
|
|
||||||
def reviews = []
|
def reviews = []
|
||||||
new File('reviews.csv').eachLine('UTF-8'){ def s = it.split(';', 3); reviews << [user: s[0], date: s[1], text: s[2].replaceAll(/^["]|["]$/, '').replaceAll(/["]{2}/, '"')] }
|
new File('reviews.csv').eachLine('UTF-8'){
|
||||||
|
def s = it.split(';', 3)
|
||||||
|
reviews << [user: s[0], date: s[1], text: s[2].replaceAll(/^\"|\"$/, '').replaceAll(/["]{2}/, '"') ]
|
||||||
|
}
|
||||||
reviews = reviews.sort{ it.date }
|
reviews = reviews.sort{ it.date }
|
||||||
|
|
||||||
def json = new groovy.json.JsonBuilder()
|
def json = new groovy.json.JsonBuilder()
|
||||||
@ -35,10 +38,10 @@ json.toPrettyString().saveAs('website/reviews.json')
|
|||||||
println "Reviews: " + reviews.size()
|
println "Reviews: " + reviews.size()
|
||||||
|
|
||||||
|
|
||||||
// ------------------------------------------------------------------------- //
|
/* ------------------------------------------------------------------------- */
|
||||||
|
|
||||||
|
|
||||||
def movies_out = new File("website/data/movies.txt")
|
def movies_out = new File("website/data/moviedb.txt")
|
||||||
def thetvdb_out = new File("website/data/thetvdb.txt")
|
def thetvdb_out = new File("website/data/thetvdb.txt")
|
||||||
def anidb_out = new File("website/data/anidb.txt")
|
def anidb_out = new File("website/data/anidb.txt")
|
||||||
|
|
||||||
@ -51,9 +54,19 @@ def pack(file, lines) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
// ------------------------------------------------------------------------- //
|
/* ------------------------------------------------------------------------- */
|
||||||
|
|
||||||
|
|
||||||
|
// BUILD moviedb index
|
||||||
|
def treeSort(list, keyFunction) {
|
||||||
|
def sorter = new TreeMap(String.CASE_INSENSITIVE_ORDER)
|
||||||
|
list.each{
|
||||||
|
sorter.put(keyFunction(it), it)
|
||||||
|
}
|
||||||
|
return sorter.values()
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
// BUILD movies.txt.gz
|
|
||||||
def omdb = new TreeSet({ a, b -> a[0].compareTo(b[0]) } as Comparator)
|
def omdb = new TreeSet({ a, b -> a[0].compareTo(b[0]) } as Comparator)
|
||||||
new File('omdb.txt').eachLine('Windows-1252'){
|
new File('omdb.txt').eachLine('Windows-1252'){
|
||||||
def line = it.split(/\t/)
|
def line = it.split(/\t/)
|
||||||
@ -67,27 +80,57 @@ new File('omdb.txt').eachLine('Windows-1252'){
|
|||||||
def votes = tryQuietly{ line[12].replaceAll(/\D/, '').toInteger() } ?: 0
|
def votes = tryQuietly{ line[12].replaceAll(/\D/, '').toInteger() } ?: 0
|
||||||
|
|
||||||
if ((year >= 1970 && (runtime =~ /h/ || votes >= 200) && rating >= 1 && votes >= 50) || (year >= 1950 && votes >= 5000)) {
|
if ((year >= 1970 && (runtime =~ /h/ || votes >= 200) && rating >= 1 && votes >= 50) || (year >= 1950 && votes >= 5000)) {
|
||||||
omdb << [imdbid, name, year]
|
omdb << [imdbid.pad(7), name, year]
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
omdb = omdb.findAll{ it[0] <= 9999999 && it[1] =~ /^[A-Z0-9]/ && it[1] =~ /[\p{Alpha}]{3}/ && it[1].length() >= 4}.collect{ [it[0].pad(7), it[1], it[2]] }
|
def isValidMovieName = { s -> s =~ /^[A-Z0-9]/ && s =~ /[\p{Alpha}]{3}/ }
|
||||||
|
omdb = omdb.findAll{ (it[0] as int) <= 9999999 && isValidMovieName(it[1]) }
|
||||||
|
|
||||||
// save movie data
|
|
||||||
def movies = omdb.findAll{ it.size() >= 3 && !it[1].startsWith('"') }
|
|
||||||
def movieSorter = new TreeMap(String.CASE_INSENSITIVE_ORDER)
|
|
||||||
movies.each{ movieSorter.put([it[1], it[2], it[0]].join('\t'), [it[0], it[2], it[1]]) } // ORDER => ID, YEAR, NAME
|
|
||||||
movies = movieSorter.values().collect{ it.join('\t') }
|
|
||||||
|
|
||||||
pack(movies_out, movies)
|
def tmdb_txt = new File('tmdb.txt')
|
||||||
|
def tmdb_index = csv(tmdb_txt, '\t', 1, [0..-1])
|
||||||
|
|
||||||
|
def tmdb = omdb.findResults{ m ->
|
||||||
|
if (tmdb_index.containsKey(m[0])) {
|
||||||
|
return tmdb_index[m[0]]
|
||||||
|
}
|
||||||
|
|
||||||
|
def sync = System.currentTimeMillis()
|
||||||
|
def row = [sync, m[0].pad(7), 0, m[2], m[1]]
|
||||||
|
try {
|
||||||
|
def info = net.sourceforge.filebot.WebServices.TMDb.getMovieInfo("tt${m[0]}", Locale.ENGLISH, false)
|
||||||
|
def names = [info.name, info.originalName, m[1]]
|
||||||
|
row = [sync, m[0].pad(7), info.id.pad(7), info.released?.year ?: m[2]] + names.findResults{ it ?: '' }
|
||||||
|
} catch(FileNotFoundException e) {
|
||||||
|
}
|
||||||
|
|
||||||
|
println row
|
||||||
|
tmdb_txt << row.join('\t') << '\n'
|
||||||
|
return row
|
||||||
|
}
|
||||||
|
|
||||||
|
movies = tmdb.findResults{
|
||||||
|
def ity = it[1..3] // imdb id, tmdb id, year
|
||||||
|
def names = it[4..-1].findAll{ isValidMovieName(it) }.unique{ it.toLowerCase().normalizePunctuation() }
|
||||||
|
if (ity[1].toInteger() > 0 && names.size() > 0)
|
||||||
|
return ity + names
|
||||||
|
else
|
||||||
|
return null
|
||||||
|
}
|
||||||
|
movies = treeSort(movies, { it[3, 2].join(' ') })
|
||||||
|
|
||||||
|
pack(movies_out, movies.findResults{ it.join('\t') })
|
||||||
println "Movie Count: " + movies.size()
|
println "Movie Count: " + movies.size()
|
||||||
|
|
||||||
// sanity check
|
// sanity check
|
||||||
if (movies.size() < 50000) { throw new Exception('Movie index sanity failed') }
|
if (movies.size() < 40000) { throw new Exception('Movie index sanity failed') }
|
||||||
|
|
||||||
// ------------------------------------------------------------------------- //
|
|
||||||
|
|
||||||
// BUILD thetvdb-index.gz
|
/* ------------------------------------------------------------------------- */
|
||||||
|
|
||||||
|
|
||||||
|
// BUILD tvdb index
|
||||||
def tvdb = new HashMap()
|
def tvdb = new HashMap()
|
||||||
def tvdb_txt = new File('tvdb.txt')
|
def tvdb_txt = new File('tvdb.txt')
|
||||||
new File('tvdb.txt').eachLine{
|
new File('tvdb.txt').eachLine{
|
||||||
@ -139,9 +182,7 @@ tvdb.values().each{
|
|||||||
|
|
||||||
|
|
||||||
thetvdb_index = thetvdb_index.findResults{ [it[0] as Integer, it[1].replaceAll(/\s+/, ' ').trim()] }.findAll{ !(it[1] =~ /(?i:duplicate)/ || it[1] =~ /\d{6,}/ || it[1].startsWith('*') || it[1].endsWith('*') || it[1].length() < 2) }
|
thetvdb_index = thetvdb_index.findResults{ [it[0] as Integer, it[1].replaceAll(/\s+/, ' ').trim()] }.findAll{ !(it[1] =~ /(?i:duplicate)/ || it[1] =~ /\d{6,}/ || it[1].startsWith('*') || it[1].endsWith('*') || it[1].length() < 2) }
|
||||||
thetvdb_index = thetvdb_index.sort(new Comparator() {
|
thetvdb_index = thetvdb_index.sort(new Comparator() { int compare(a, b) { a[0] <=> b[0] } })
|
||||||
int compare(a, b) { a[0] <=> b[0] }
|
|
||||||
})
|
|
||||||
|
|
||||||
// join and sort
|
// join and sort
|
||||||
def thetvdb_txt = thetvdb_index.groupBy{ it[0] }.findResults{ k, v -> ([k.pad(6)] + v*.getAt(1).unique()).join('\t') }
|
def thetvdb_txt = thetvdb_index.groupBy{ it[0] }.findResults{ k, v -> ([k.pad(6)] + v*.getAt(1).unique()).join('\t') }
|
||||||
@ -152,8 +193,10 @@ println "TheTVDB Index: " + thetvdb_txt.size()
|
|||||||
if (thetvdb_txt.size() < 30000) { throw new Exception('TheTVDB index sanity failed') }
|
if (thetvdb_txt.size() < 30000) { throw new Exception('TheTVDB index sanity failed') }
|
||||||
|
|
||||||
|
|
||||||
|
/* ------------------------------------------------------------------------- */
|
||||||
|
|
||||||
// BUILD anidb-index.gz
|
|
||||||
|
// BUILD anidb index
|
||||||
def anidb = new net.sourceforge.filebot.web.AnidbClient(null, 0).getAnimeTitles()
|
def anidb = new net.sourceforge.filebot.web.AnidbClient(null, 0).getAnimeTitles()
|
||||||
|
|
||||||
def anidb_index = anidb.findResults{
|
def anidb_index = anidb.findResults{
|
||||||
|
@ -3,17 +3,11 @@ import static net.sourceforge.tuned.FileUtilities.*
|
|||||||
import java.util.regex.Pattern
|
import java.util.regex.Pattern
|
||||||
|
|
||||||
|
|
||||||
// simplified switch/case pattern matching
|
|
||||||
def c(c) { try { c.call() } catch (Throwable e) { null } }
|
|
||||||
def csv(path, delim = ';', keyIndex = 0, valueIndex = 1) { def f = path as File; def values = [:]; f.splitEachLine(delim) { line -> values.put(line[keyIndex], valueIndex < line.size() ? line[valueIndex] : null) }; return values }
|
|
||||||
Object.metaClass.match = { Map cases -> def val = delegate; cases.findResult { switch(val) { case it.key: return it.value} } }
|
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Allow getAt() for File paths
|
* Allow getAt() for File paths
|
||||||
*
|
*
|
||||||
* e.g. file[0] -> "F:"
|
* e.g. file[0] -> "F:"
|
||||||
*/
|
*/
|
||||||
File.metaClass.getAt = { Range range -> listPath(delegate).collect{ replacePathSeparators(getName(it)).trim() }.getAt(range).join(File.separator) }
|
File.metaClass.getAt = { Range range -> listPath(delegate).collect{ replacePathSeparators(getName(it)).trim() }.getAt(range).join(File.separator) }
|
||||||
File.metaClass.getAt = { int index -> listPath(delegate).collect{ replacePathSeparators(getName(it)).trim() }.getAt(index) }
|
File.metaClass.getAt = { int index -> listPath(delegate).collect{ replacePathSeparators(getName(it)).trim() }.getAt(index) }
|
||||||
File.metaClass.getRoot = { listPath(delegate)[0] }
|
File.metaClass.getRoot = { listPath(delegate)[0] }
|
||||||
@ -22,7 +16,7 @@ File.metaClass.getDiskSpace = { listPath(delegate).reverse().find{ it.exists() }
|
|||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Convenience methods for String.toLowerCase()and String.toUpperCase()
|
* Convenience methods for String.toLowerCase() and String.toUpperCase()
|
||||||
*/
|
*/
|
||||||
String.metaClass.lower = { toLowerCase() }
|
String.metaClass.lower = { toLowerCase() }
|
||||||
String.metaClass.upper = { toUpperCase() }
|
String.metaClass.upper = { toUpperCase() }
|
||||||
@ -93,10 +87,10 @@ String.metaClass.upperInitial = { replaceAll(/(?<=[&()+.,-;<=>?\[\]_{|}~ ]|^)[a-
|
|||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Get acronym, i.e. first letter of each word.
|
* Get acronym, i.e. first letter of each word.
|
||||||
*
|
*
|
||||||
* e.g. "Deep Space 9" -> "DS9"
|
* e.g. "Deep Space 9" -> "DS9"
|
||||||
*/
|
*/
|
||||||
String.metaClass.acronym = { delegate.sortName('$2').findAll(/(?<=[&()+.,-;<=>?\[\]_{|}~ ]|^)[\p{Alnum}]/).join().toUpperCase() }
|
String.metaClass.acronym = { delegate.sortName('$2').findAll(/(?<=[&()+.,-;<=>?\[\]_{|}~ ]|^)[\p{Alnum}]/).join().toUpperCase() }
|
||||||
String.metaClass.sortName = { replacement = '$2, $1' -> delegate.replaceFirst(/^(?i)(The|A|An)\s(.+)/, replacement).trim() }
|
String.metaClass.sortName = { replacement = '$2, $1' -> delegate.replaceFirst(/^(?i)(The|A|An)\s(.+)/, replacement).trim() }
|
||||||
|
|
||||||
@ -165,19 +159,49 @@ String.metaClass.transliterate = { transformIdentifier -> com.ibm.icu.text.Trans
|
|||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Convert Unicode to ASCII as best as possible. Works with most alphabets/scripts used in the world.
|
* Convert Unicode to ASCII as best as possible. Works with most alphabets/scripts used in the world.
|
||||||
*
|
*
|
||||||
* e.g. "Österreich" -> "Osterreich"
|
* e.g. "Österreich" -> "Osterreich"
|
||||||
* "カタカナ" -> "katakana"
|
* "カタカナ" -> "katakana"
|
||||||
*/
|
*/
|
||||||
String.metaClass.ascii = { fallback = ' ' -> delegate.transliterate("Any-Latin;Latin-ASCII;[:Diacritic:]remove").replaceAll("[^\\p{ASCII}]+", fallback) }
|
String.metaClass.ascii = { fallback = ' ' -> delegate.transliterate("Any-Latin;Latin-ASCII;[:Diacritic:]remove").replaceAll("[^\\p{ASCII}]+", fallback) }
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* General helpers and utilities
|
||||||
|
*/
|
||||||
|
def c(c) {
|
||||||
|
try {
|
||||||
|
return c.call()
|
||||||
|
} catch (Throwable e) {
|
||||||
|
return null
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
def csv(path, delim = ';', keyIndex = 0, valueIndex = 1) {
|
||||||
|
def f = path as File
|
||||||
|
def values = [:]
|
||||||
|
if (f.isFile()) {
|
||||||
|
f.splitEachLine(delim) { line ->
|
||||||
|
values.put(line[keyIndex], c{ line[valueIndex] })
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return values
|
||||||
|
}
|
||||||
|
|
||||||
|
Object.metaClass.match = { Map cases ->
|
||||||
|
def val = delegate;
|
||||||
|
cases.findResult {
|
||||||
|
switch(val) { case it.key: return it.value}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Web and File IO helpers
|
* Web and File IO helpers
|
||||||
*/
|
*/
|
||||||
import net.sourceforge.filebot.web.WebRequest
|
import net.sourceforge.filebot.web.WebRequest
|
||||||
import net.sourceforge.tuned.FileUtilities
|
import net.sourceforge.tuned.FileUtilities
|
||||||
import net.sourceforge.tuned.XPathUtilities
|
import net.sourceforge.tuned.XPathUtilities
|
||||||
@ -190,8 +214,8 @@ URL.metaClass.scrapeAll = { xpath -> XPathUtilities.selectNodes(xpath, WebReques
|
|||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* XML / XPath utility functions
|
* XML / XPath utility functions
|
||||||
*/
|
*/
|
||||||
import javax.xml.xpath.XPathFactory
|
import javax.xml.xpath.XPathFactory
|
||||||
import javax.xml.xpath.XPathConstants
|
import javax.xml.xpath.XPathConstants
|
||||||
|
|
||||||
|
@ -328,10 +328,11 @@ public class ReleaseInfo {
|
|||||||
|
|
||||||
for (String[] row : rows) {
|
for (String[] row : rows) {
|
||||||
int imdbid = parseInt(row[0]);
|
int imdbid = parseInt(row[0]);
|
||||||
int year = parseInt(row[1]);
|
int tmdbid = parseInt(row[1]);
|
||||||
String name = row[2];
|
int year = parseInt(row[2]);
|
||||||
String[] aliasNames = copyOfRange(row, 3, row.length);
|
String name = row[3];
|
||||||
movies.add(new Movie(name, aliasNames, year, imdbid, -1));
|
String[] aliasNames = copyOfRange(row, 4, row.length);
|
||||||
|
movies.add(new Movie(name, aliasNames, year, imdbid > 0 ? imdbid : -1, tmdbid > 0 ? tmdbid : -1));
|
||||||
}
|
}
|
||||||
|
|
||||||
return movies.toArray(new Movie[0]);
|
return movies.toArray(new Movie[0]);
|
||||||
|
@ -20,7 +20,7 @@ number.clutter.maxfilesize: 262144000
|
|||||||
url.series-mappings: http://filebot.net/data/series-mappings.txt
|
url.series-mappings: http://filebot.net/data/series-mappings.txt
|
||||||
|
|
||||||
# list of all movies (id, name, year)
|
# list of all movies (id, name, year)
|
||||||
url.movie-list: http://filebot.net/data/movies.txt.xz
|
url.movie-list: http://filebot.net/data/moviedb.txt.xz
|
||||||
|
|
||||||
# TheTVDB index
|
# TheTVDB index
|
||||||
url.thetvdb-index: http://filebot.net/data/thetvdb.txt.xz
|
url.thetvdb-index: http://filebot.net/data/thetvdb.txt.xz
|
||||||
|
Loading…
Reference in New Issue
Block a user