mirror of
https://github.com/mitb-archive/filebot
synced 2024-12-23 08:18:52 -05:00
164 lines
5.5 KiB
Groovy
164 lines
5.5 KiB
Groovy
// filebot -script BuildData.groovy
|
|
|
|
|
|
// UPDATE release-groups.txt FROM http://scenegrouplist.com/lists_sgl.php
|
|
@Grab('org.jsoup:jsoup')
|
|
import org.jsoup.*
|
|
|
|
def sgl = []
|
|
for (def page = 0; true; page++) {
|
|
def dom = Jsoup.parse(new URL('http://scenegrouplist.com/lists_sgl.php?pageNum_RSSGL=' + page), 10000)
|
|
def table = dom.select("table[border=1] tr").collect{ it.select("td")*.text()*.trim() }.findAll{ it[2] =~ /\d{4}/ }
|
|
sgl += table
|
|
if (table.empty) break
|
|
}
|
|
sgl = sgl.findAll{ it[1] =~ /\b(DVD|DVDR|HD|TV)\b/ && it[0] =~ /\p{Alpha}/}.findResults{ it[0] }
|
|
sgl = sgl.collect{ it.before(/ - /).trim().space('.') }.collect{ it ==~ /\p{Upper}?\p{Lower}+/ ? it.toUpperCase() : it }
|
|
|
|
// append release group names
|
|
new File('website/data/release-groups.txt') << '\n' << sgl.join('\n')
|
|
|
|
|
|
// ------------------------------------------------------------------------- //
|
|
|
|
|
|
def sortRegexList(path) {
|
|
def set = new TreeSet(String.CASE_INSENSITIVE_ORDER)
|
|
new File(path).eachLine('UTF-8'){
|
|
// check if regex compiles
|
|
set += java.util.regex.Pattern.compile(it).pattern()
|
|
}
|
|
|
|
def out = set.join('\n').saveAs(path)
|
|
println "$out\n$out.text\n"
|
|
}
|
|
|
|
|
|
// sort and check shared regex collections
|
|
sortRegexList("website/data/release-groups.txt")
|
|
sortRegexList("website/data/query-blacklist.txt")
|
|
sortRegexList("website/data/exclude-blacklist.txt")
|
|
sortRegexList("website/data/series-mappings.txt")
|
|
|
|
|
|
// ------------------------------------------------------------------------- //
|
|
|
|
|
|
def series_out = new File("website/data/series.list.gz")
|
|
def movies_out = new File("website/data/movies.txt.gz")
|
|
def thetvdb_out = new File("website/data/thetvdb.txt.gz")
|
|
|
|
def gz(file, lines) {
|
|
file.withOutputStream{ out ->
|
|
new java.util.zip.GZIPOutputStream(out).withWriter('UTF-8'){ writer ->
|
|
lines.each{ writer.append(it).append('\n') }
|
|
}
|
|
}
|
|
}
|
|
|
|
|
|
// ------------------------------------------------------------------------- //
|
|
|
|
|
|
// LOAD osdb-imdb.txt (already verified data)
|
|
def imdb_tsv = new File("osdb-imdb.txt")
|
|
def imdb = [].asSynchronized() // thread-safe list
|
|
|
|
imdb_tsv.getText('UTF-8').eachLine{
|
|
imdb << it.split(/\t/)
|
|
}
|
|
imdb_ids = new HashSet(imdb.collect{ it[0] })
|
|
|
|
// BUILD movies.txt.gz
|
|
def osdb_tsv = new URL("http://www.opensubtitles.org/addons/export_movie.php")
|
|
def osdb = []
|
|
osdb_tsv.getText('UTF-8').eachLine{
|
|
def line = it.split(/\t/)*.replaceAll(/\s+/, ' ')*.trim()
|
|
if (line.size() == 4 && line[0] =~ /\d+/) {
|
|
osdb << [line[1].toInteger(), line[2], line[3].toInteger()]
|
|
}
|
|
}
|
|
osdb = osdb.findAll{ it[0] <= 9999999 && it[2] >= 1930 && it[1] =~ /^[A-Z0-9]/ && it[1] =~ /[\p{Alpha}]{3}/ }.collect{ [it[0].pad(7), it[1], it[2]] }
|
|
|
|
|
|
parallel(osdb.collect{ row ->
|
|
return {
|
|
// update new data
|
|
if (!imdb_ids.contains(row[0])) {
|
|
// get original title and english title
|
|
[Locale.ROOT, Locale.ENGLISH].collect{ locale -> net.sourceforge.filebot.WebServices.IMDb.getMovieDescriptor(row[0] as int, locale) }.unique{ it as String }.each { mov ->
|
|
if (mov != null && mov.name.length() > 0 && mov.year > 0) {
|
|
println "Adding $mov"
|
|
imdb << [row[0], mov.name, mov.year]
|
|
} else {
|
|
println "Blacklisting $row"
|
|
imdb << [row[0], null]
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}, 20)
|
|
|
|
// save updated imdb data
|
|
imdb.collect{ it.join('\t') }.sort().join('\n').saveAs(imdb_tsv)
|
|
|
|
// save movie data
|
|
def movies = imdb.findAll{ it.size() >= 3 && !it[1].startsWith('"') }
|
|
|
|
def movieSorter = new TreeMap(String.CASE_INSENSITIVE_ORDER)
|
|
movies.each{ movieSorter.put([it[1], it[2], it[0]].join('\t'), it) }
|
|
movies = movieSorter.values().collect{ it.join('\t') }
|
|
|
|
gz(movies_out, movies)
|
|
println "Movie Count: " + movies.size()
|
|
|
|
|
|
// ------------------------------------------------------------------------- //
|
|
|
|
// BUILD thetvdb-index.gz
|
|
def thetvdb_index_url = new URL('http://thetvdb.com/?string=&searchseriesid=&tab=listseries&function=Search')
|
|
def thetvdb_index = thetvdb_index_url.fetch().getHtml('UTF-8')
|
|
.depthFirst().TABLE.find{it['@id'] == "listtable"}
|
|
.depthFirst().TR.findAll{ it.TD.size() == 3 && it.TD[1].text() == 'English' && it.TD[0].A.text() }
|
|
.findResults{ [it.TD[2].text(), it.TD[0].A.text()] }
|
|
|
|
thetvdb_index = thetvdb_index.findResults{ [it[0] as Integer, it[1].replaceAll(/\s+/, ' ').trim()] }.findAll{ !(it[1] =~ /(?i:duplicate)/ || it[1] =~ /\d{6,}/ || it[1].startsWith('*') || it[1].endsWith('*') || it[1].empty) } as HashSet
|
|
|
|
// join and sort
|
|
def thetvdb = thetvdb_index.findResults{ [it[0].pad(6), it[1]].join('\t') }.sort()
|
|
gz(thetvdb_out, thetvdb)
|
|
println "TheTVDB Index: " + thetvdb.size()
|
|
|
|
|
|
// BUILD series.list.gz
|
|
|
|
// TheTVDB
|
|
def thetvdb_names = thetvdb_index.findResults{ it[1] }
|
|
|
|
// AniDB
|
|
def anidb_names = net.sourceforge.filebot.WebServices.AniDB.getAnimeTitles().findResults{ [it.getPrimaryTitle(), it.getOfficialTitle('en')] }.flatten()
|
|
|
|
/*
|
|
// IMDb series list
|
|
def imdb_series_names = imdb.findAll{ it.size() >= 3 && it[1].startsWith('"') }.collect{ it[1] }
|
|
|
|
// Dokuwiki list
|
|
def dokuwiki_index = new URL('http://docuwiki.net/postbot/getList.php?subject=Name')
|
|
def doku_names = []
|
|
dokuwiki_index.getText('UTF-8').eachLine{
|
|
doku_names << it.trim().replaceTrailingBrackets()
|
|
}
|
|
*/
|
|
|
|
def names = [thetvdb_names, anidb_names]
|
|
names.each{ if (it.size() == 0) throw new Exception("Failed to scrape series names") } // sanity check
|
|
names = names.flatten().findAll{ it =~ /^[A-Z0-9]/ && it =~ /[\p{Alpha}]{3}/}.findResults{ net.sourceforge.filebot.similarity.Normalization.normalizePunctuation(it) } // collect and normalize names
|
|
|
|
def seriesSorter = new TreeSet(String.CASE_INSENSITIVE_ORDER)
|
|
seriesSorter.addAll(names)
|
|
names = seriesSorter as List
|
|
|
|
|
|
gz(series_out, names)
|
|
println "Series Count: " + names.size()
|