filebot/BuildData.groovy

// filebot -script BuildData.groovy


// UPDATE release-groups.txt FROM http://scenegrouplist.com/lists_sgl.php
@Grab('org.jsoup:jsoup')
import org.jsoup.*

def sgl = []
// walk the paginated listing; stop at the first page that yields no row whose third cell contains a 4-digit number
for (def page = 0; true; page++) {
	def pageUrl = new URL('http://scenegrouplist.com/lists_sgl.php?pageNum_RSSGL=' + page)
	def dom = Jsoup.parse(pageUrl, 10000)
	// one list of trimmed cell texts per table row
	def rows = dom.select("table[border=1] tr").collect{ tr -> tr.select("td")*.text()*.trim() }
	def table = rows.findAll{ cells -> cells[2] =~ /\d{4}/ }
	if (table.empty) {
		break
	}
	sgl.addAll(table)
}

// keep rows whose second cell mentions DVD, DVDR, HD or TV and whose first cell contains a letter; only the first cell is used from here on
sgl = sgl.findAll{ cells -> cells[1] =~ /\b(DVD|DVDR|HD|TV)\b/ && cells[0] =~ /\p{Alpha}/ }.findResults{ cells -> cells[0] }

// before(...) and space(...) are not defined in this file (presumably FileBot string helpers — confirm);
// names made only of lower-case letters with an optional leading capital are upper-cased
sgl = sgl.collect{ name ->
	def group = name.before(/ - /).trim().space('.')
	group ==~ /\p{Upper}?\p{Lower}+/ ? group.toUpperCase() : group
}

// append release group names
def releaseGroupsFile = new File('website/data/release-groups.txt')
releaseGroupsFile << '\n' << sgl.join('\n')


// ------------------------------------------------------------------------- //


// Validate, de-duplicate and sort the regex list stored in the given file.
//
// Every line of the UTF-8 file is compiled as a java.util.regex.Pattern, so a line
// that is not a valid regex aborts the script with the compile error. The patterns
// are collected in a case-insensitive sorted set, joined with newlines and handed to
// saveAs(path); the saved result and its text are then printed.
//
// path: location of the list file, one regex per line
def sortRegexList(path) {
	def set = new TreeSet(String.CASE_INSENSITIVE_ORDER)
	new File(path).eachLine('UTF-8'){ line ->
		// check if regex compiles
		// add in place: `set += ...` would rebind `set` to a fresh copy of the whole set for every single line
		set.add(java.util.regex.Pattern.compile(line).pattern())
	}
	
	// saveAs is not defined in this file (presumably supplied by the FileBot scripting runtime — confirm)
	def out = set.join('\n').saveAs(path)
	println "$out\n$out.text\n"
}


// sort and check shared regex collections, one file after the other
for (regexListPath in [
	"website/data/release-groups.txt",
	"website/data/query-blacklist.txt",
	"website/data/exclude-blacklist.txt",
	"website/data/series-mappings.txt"
]) {
	sortRegexList(regexListPath)
}


// ------------------------------------------------------------------------- //


// gzip output files that are written further below via gz(...)
def series_out  = new File("website/data", "series.list.gz")
def movies_out  = new File("website/data", "movies.txt.gz")
def thetvdb_out = new File("website/data", "thetvdb.txt.gz")

// Write the given lines to the given file as gzip-compressed UTF-8 text,
// terminating every line (including the last one) with '\n'.
//
// file:  target java.io.File, its previous content is replaced
// lines: the strings to write, in iteration order
def gz(file, lines) {
	file.withOutputStream{ rawStream ->
		def zipStream = new java.util.zip.GZIPOutputStream(rawStream)
		// withWriter closes the writer afterwards, which also finishes the gzip stream
		zipStream.withWriter('UTF-8'){ sink ->
			lines.each{ line ->
				sink.append(line)
				sink.append('\n')
			}
		}
	}
}


// ------------------------------------------------------------------------- //


// LOAD osdb-imdb.txt (already verified data)
def imdb_tsv = new File("osdb-imdb.txt")
def imdb = [].asSynchronized() // thread-safe list

// one entry per line, each entry being the array of that line's tab-separated fields
for (record in imdb_tsv.getText('UTF-8').readLines()) {
	imdb << record.split(/\t/)
}
// first field of every entry loaded so far; assigned without `def`, exactly as before
imdb_ids = new HashSet(imdb.collect{ fields -> fields[0] })

// BUILD movies.txt.gz
def osdb_tsv = new URL("http://www.opensubtitles.org/addons/export_movie.php")
// rows are compared by their first element only, so a row whose first element is already present is not added again
def osdb = new TreeSet({ left, right -> left[0].compareTo(right[0]) } as Comparator)

// opensubtitles export: only lines with exactly 4 tab-separated columns whose first column contains a digit are used
osdb_tsv.getText('UTF-8').eachLine{ record ->
	def line = record.split(/\t/)*.replaceAll(/\s+/, ' ')*.trim()
	if (line.size() == 4 && line[0] =~ /\d+/) {
		osdb << [line[1].toInteger(), line[2], line[3].toInteger()]
	}
}

// omdb dump: every check below must pass, and they are evaluated in this order
// (columns 11 and 12 presumably hold rating and vote count — confirm against the dump format)
new File('omdb.txt').eachLine{ record ->
	def line = record.split(/\t/)
	if (line.length <= 11) return
	if (!(line[5] =~ /h/)) return
	if (!(line[3].toInteger() >= 1970)) return
	// tryQuietly is not defined in this file; a null or zero result falls back to 0
	if (!((tryQuietly{ line[11].toFloat() } ?: 0) > 1)) return
	if (!((tryQuietly{ line[12].toInteger() } ?: 0) >= 10)) return

	line = line*.replaceAll(/\s+/, ' ')*.trim()
	osdb << [line[0].toInteger(), line[2], line[3].toInteger()]
}

// keep rows with an id of at most 7 digits, a third element of 1930 or later, and a name that
// starts with A-Z or 0-9 and contains 3 consecutive letters; then render the id via pad(7)
osdb = osdb.findAll{ movie ->
	movie[0] <= 9999999 && movie[2] >= 1930 && movie[1] =~ /^[A-Z0-9]/ && movie[1] =~ /[\p{Alpha}]{3}/
}.collect{ movie ->
	[movie[0].pad(7), movie[1], movie[2]]
}

// one task per osdb row; parallel(...) is not defined in this file
// (NOTE(review): 20 is presumably the number of concurrent workers — confirm)
parallel(osdb.collect{ row ->
	return {
		// update new data only: ids already present in imdb_ids are skipped
		if (imdb_ids.contains(row[0])) {
			return
		}

		// get original title and english title
		def lookups = [Locale.ROOT, Locale.ENGLISH].collect{ locale ->
			net.sourceforge.filebot.WebServices.IMDb.getMovieDescriptor(row[0] as int, locale)
		}

		// results with the same String form are handled only once
		lookups.unique{ it as String }.each{ mov ->
			def usable = mov != null && mov.name.length() > 0 && mov.year > 0
			if (!usable) {
				// record the id with a null name
				println "Blacklisting $row"
				imdb << [row[0], null]
				return
			}
			println "Adding $mov"
			imdb << [row[0], mov.name, mov.year]
		}
	}
}, 20)

// save updated imdb data: one tab-joined line per entry, sorted, handed to saveAs (not defined in this file)
def imdbLines = imdb.collect{ fields -> fields.join('\t') }
imdbLines.sort().join('\n').saveAs(imdb_tsv)

// save movie data: entries with at least 3 fields whose second field does not start with a double quote
def movies = imdb.findAll{ fields -> fields.size() >= 3 && !fields[1].startsWith('"') }

// order case-insensitively by "second field, third field, first field";
// entries whose keys differ only in case replace each other
def moviesByKey = new TreeMap(String.CASE_INSENSITIVE_ORDER)
for (fields in movies) {
	def sortKey = [fields[1], fields[2], fields[0]].join('\t')
	moviesByKey.put(sortKey, fields)
}
movies = moviesByKey.values().collect{ fields -> fields.join('\t') }

gz(movies_out, movies)
println "Movie Count: " + movies.size()


// ------------------------------------------------------------------------- //

// BUILD thetvdb-index.gz
def thetvdb_index_url = new URL('http://thetvdb.com/?string=&searchseriesid=&tab=listseries&function=Search')

// fetch() and getHtml() are not defined in this file; the table with id "listtable" is located in the parsed page
def thetvdb_listtable = thetvdb_index_url.fetch().getHtml('UTF-8').depthFirst().TABLE.find{ table -> table['@id'] == "listtable" }

// rows with exactly 3 cells, 'English' in the second cell and a non-empty link text in the first cell
def thetvdb_rows = thetvdb_listtable.depthFirst().TR.findAll{ tr ->
	tr.TD.size() == 3 && tr.TD[1].text() == 'English' && tr.TD[0].A.text()
}
def thetvdb_index = thetvdb_rows.findResults{ tr -> [tr.TD[2].text(), tr.TD[0].A.text()] }

// parse the first element as Integer and collapse whitespace in the name
def thetvdb_parsed = thetvdb_index.findResults{ entry ->
	[entry[0] as Integer, entry[1].replaceAll(/\s+/, ' ').trim()]
}
// drop names that mention "duplicate" (any case), contain 6 or more consecutive digits,
// start or end with '*', or are empty
def thetvdb_kept = thetvdb_parsed.findAll{ entry ->
	def name = entry[1]
	!(name =~ /(?i:duplicate)/) && !(name =~ /\d{6,}/) && !name.startsWith('*') && !name.endsWith('*') && !name.empty
}
thetvdb_index = thetvdb_kept as HashSet

// join and sort
def thetvdb = thetvdb_index.findResults{ entry ->
	[entry[0].pad(6), entry[1]].join('\t')
}.sort()
gz(thetvdb_out, thetvdb)
println "TheTVDB Index: " + thetvdb.size()


// BUILD series.list.gz

// TheTVDB: the name is the second element of every index entry
def thetvdb_names = thetvdb_index.findResults{ entry -> entry[1] }

// AniDB: primary title and official 'en' title of every anime, as one flat list
def anidb_titles = net.sourceforge.filebot.WebServices.AniDB.getAnimeTitles()
def anidb_names = anidb_titles.findResults{ anime -> [anime.getPrimaryTitle(), anime.getOfficialTitle('en')] }.flatten()

/*
// IMDb series list
def imdb_series_names = imdb.findAll{ it.size() >= 3 && it[1].startsWith('"') }.collect{ it[1] }

// Dokuwiki list
def dokuwiki_index = new URL('http://docuwiki.net/postbot/getList.php?subject=Name')
def doku_names = []
dokuwiki_index.getText('UTF-8').eachLine{
	doku_names << it.trim().replaceTrailingBrackets()
}
*/

def names = [thetvdb_names, anidb_names]

// sanity check: every source must have produced at least one name
for (source in names) {
	if (source.size() == 0) {
		throw new Exception("Failed to scrape series names")
	}
}

// collect and normalize names: keep those starting with A-Z or 0-9 that contain 3 consecutive letters
names = names.flatten().findAll{ name -> name =~ /^[A-Z0-9]/ && name =~ /[\p{Alpha}]{3}/ }
names = names.findResults{ name -> net.sourceforge.filebot.similarity.Normalization.normalizePunctuation(name) }

// case-insensitive sorted set: sorts the names and keeps only one of several that differ just in case
def sortedNames = new TreeSet(String.CASE_INSENSITIVE_ORDER)
names.each{ name -> sortedNames.add(name) }
names = sortedNames as List


gz(series_out, names)
println "Series Count: " + names.size()
* build my own imdb index from osdb movie data for ids and using my imdb scraper to get the original aka names * lots of extra RG names and blacklisted terms (esp useful for dokus) * updated cleaner script to handle video clutter like samples etc 2012-07-13 07:41:50 -04:00			`// filebot -script BuildData.groovy`
* various little fixes / improvements 2012-02-24 08:39:32 -05:00
* build my own imdb index from osdb movie data for ids and using my imdb scraper to get the original aka names * lots of extra RG names and blacklisted terms (esp useful for dokus) * updated cleaner script to handle video clutter like samples etc 2012-07-13 07:41:50 -04:00
UPDATE release-groups.txt FROM http://scenegrouplist.com/lists_sgl.php 2012-08-10 21:20:19 -04:00			`// UPDATE release-groups.txt FROM http://scenegrouplist.com/lists_sgl.php`
			`@Grab('org.jsoup:jsoup')`
			`import org.jsoup.*`

			`def sgl = []`
			`for (def page = 0; true; page++) {`
			`def dom = Jsoup.parse(new URL('http://scenegrouplist.com/lists_sgl.php?pageNum_RSSGL=' + page), 10000)`
			`def table = dom.select("table[border=1] tr").collect{ it.select("td")*.text()*.trim() }.findAll{ it[2] =~ /\d{4}/ }`
			`sgl += table`
			`if (table.empty) break`
			`}`
			`sgl = sgl.findAll{ it[1] =~ /\b(DVD\|DVDR\|HD\|TV)\b/ && it[0] =~ /\p{Alpha}/}.findResults{ it[0] }`
			`sgl = sgl.collect{ it.before(/ - /).trim().space('.') }.collect{ it ==~ /\p{Upper}?\p{Lower}+/ ? it.toUpperCase() : it }`

			`// append release group names`
			`new File('website/data/release-groups.txt') << '\n' << sgl.join('\n')`


			`// ------------------------------------------------------------------------- //`


* build my own imdb index from osdb movie data for ids and using my imdb scraper to get the original aka names * lots of extra RG names and blacklisted terms (esp useful for dokus) * updated cleaner script to handle video clutter like samples etc 2012-07-13 07:41:50 -04:00			`def sortRegexList(path) {`
			`def set = new TreeSet(String.CASE_INSENSITIVE_ORDER)`
* added extra release info 2012-07-15 22:36:49 -04:00			`new File(path).eachLine('UTF-8'){`
* build my own imdb index from osdb movie data for ids and using my imdb scraper to get the original aka names * lots of extra RG names and blacklisted terms (esp useful for dokus) * updated cleaner script to handle video clutter like samples etc 2012-07-13 07:41:50 -04:00			`// check if regex compiles`
			`set += java.util.regex.Pattern.compile(it).pattern()`
			`}`

			`def out = set.join('\n').saveAs(path)`
			`println "$out\n$out.text\n"`
			`}`


			`// sort and check shared regex collections`
			`sortRegexList("website/data/release-groups.txt")`
			`sortRegexList("website/data/query-blacklist.txt")`
* switch to using an online exclude pattern list that can be updated anytime for everybody 2012-07-26 04:45:15 -04:00			`sortRegexList("website/data/exclude-blacklist.txt")`
+ basic logic for hard-coding filename->series lookup (designed primarily as a workaround for database search limitations and issues) 2012-10-09 11:04:14 -04:00			`sortRegexList("website/data/series-mappings.txt")`
* build my own imdb index from osdb movie data for ids and using my imdb scraper to get the original aka names * lots of extra RG names and blacklisted terms (esp useful for dokus) * updated cleaner script to handle video clutter like samples etc 2012-07-13 07:41:50 -04:00

			`// ------------------------------------------------------------------------- //`


+ TheTVDB: extend API search with LocalSearch from cached database index 2012-10-14 07:57:25 -04:00			`def series_out = new File("website/data/series.list.gz")`
			`def movies_out = new File("website/data/movies.txt.gz")`
			`def thetvdb_out = new File("website/data/thetvdb.txt.gz")`
* update docs & samples 2012-02-26 12:02:54 -05:00
			`def gz(file, lines) {`
			`file.withOutputStream{ out ->`
* added extra release info 2012-07-15 22:36:49 -04:00			`new java.util.zip.GZIPOutputStream(out).withWriter('UTF-8'){ writer ->`
* update docs & samples 2012-02-26 12:02:54 -05:00			`lines.each{ writer.append(it).append('\n') }`
			`}`
			`}`
			`}`


			`// ------------------------------------------------------------------------- //`


* build my own imdb index from osdb movie data for ids and using my imdb scraper to get the original aka names * lots of extra RG names and blacklisted terms (esp useful for dokus) * updated cleaner script to handle video clutter like samples etc 2012-07-13 07:41:50 -04:00			`// LOAD osdb-imdb.txt (already verified data)`
* deploy data files into FRS to ease load on projectweb 2012-10-11 09:50:37 -04:00			`def imdb_tsv = new File("osdb-imdb.txt")`
* build my own imdb index from osdb movie data for ids and using my imdb scraper to get the original aka names * lots of extra RG names and blacklisted terms (esp useful for dokus) * updated cleaner script to handle video clutter like samples etc 2012-07-13 07:41:50 -04:00			`def imdb = [].asSynchronized() // thread-safe list`
* update docs & samples 2012-02-26 12:02:54 -05:00
* build my own imdb index from osdb movie data for ids and using my imdb scraper to get the original aka names * lots of extra RG names and blacklisted terms (esp useful for dokus) * updated cleaner script to handle video clutter like samples etc 2012-07-13 07:41:50 -04:00			`imdb_tsv.getText('UTF-8').eachLine{`
			`imdb << it.split(/\t/)`
			`}`
			`imdb_ids = new HashSet(imdb.collect{ it[0] })`

			`// BUILD movies.txt.gz`
			`def osdb_tsv = new URL("http://www.opensubtitles.org/addons/export_movie.php")`
* include omdb dump in movie index * allow foreach for AssociativeScriptObjects * added sysenv script used for printing out system properties and environment variables 2012-12-05 04:36:30 -05:00			`def osdb = new TreeSet({ a, b -> a[0].compareTo(b[0]) } as Comparator)`
* build my own imdb index from osdb movie data for ids and using my imdb scraper to get the original aka names * lots of extra RG names and blacklisted terms (esp useful for dokus) * updated cleaner script to handle video clutter like samples etc 2012-07-13 07:41:50 -04:00			`osdb_tsv.getText('UTF-8').eachLine{`
* update docs & samples 2012-02-26 12:02:54 -05:00			`def line = it.split(/\t/)*.replaceAll(/\s+/, ' ')*.trim()`
			`if (line.size() == 4 && line[0] =~ /\d+/) {`
* build my own imdb index from osdb movie data for ids and using my imdb scraper to get the original aka names * lots of extra RG names and blacklisted terms (esp useful for dokus) * updated cleaner script to handle video clutter like samples etc 2012-07-13 07:41:50 -04:00			`osdb << [line[1].toInteger(), line[2], line[3].toInteger()]`
* update docs & samples 2012-02-26 12:02:54 -05:00			`}`
			`}`
* include omdb dump in movie index * allow foreach for AssociativeScriptObjects * added sysenv script used for printing out system properties and environment variables 2012-12-05 04:36:30 -05:00			`new File('omdb.txt').eachLine{`
			`def line = it.split(/\t/)`
			`if (line.length > 11 && line[5] =~ /h/ && line[3].toInteger() >= 1970 && (tryQuietly{ line[11].toFloat() } ?: 0) > 1 && (tryQuietly{ line[12].toInteger() } ?: 0) >= 10) {`
			`line = line*.replaceAll(/\s+/, ' ')*.trim()`
			`osdb << [line[0].toInteger(), line[2], line[3].toInteger()]`
			`}`
			`}`
* build my own imdb index from osdb movie data for ids and using my imdb scraper to get the original aka names * lots of extra RG names and blacklisted terms (esp useful for dokus) * updated cleaner script to handle video clutter like samples etc 2012-07-13 07:41:50 -04:00
* include omdb dump in movie index * allow foreach for AssociativeScriptObjects * added sysenv script used for printing out system properties and environment variables 2012-12-05 04:36:30 -05:00			`osdb = osdb.findAll{ it[0] <= 9999999 && it[2] >= 1930 && it[1] =~ /^[A-Z0-9]/ && it[1] =~ /[\p{Alpha}]{3}/ }.collect{ [it[0].pad(7), it[1], it[2]] }`
* build my own imdb index from osdb movie data for ids and using my imdb scraper to get the original aka names * lots of extra RG names and blacklisted terms (esp useful for dokus) * updated cleaner script to handle video clutter like samples etc 2012-07-13 07:41:50 -04:00
* revert to Groovy 2.0.2 to fix scripting compatibility issues 2012-11-05 04:41:48 -05:00			`parallel(osdb.collect{ row ->`
			`return {`
			`// update new data`
			`if (!imdb_ids.contains(row[0])) {`
			`// get original title and english title`
			`[Locale.ROOT, Locale.ENGLISH].collect{ locale -> net.sourceforge.filebot.WebServices.IMDb.getMovieDescriptor(row[0] as int, locale) }.unique{ it as String }.each { mov ->`
			`if (mov != null && mov.name.length() > 0 && mov.year > 0) {`
			`println "Adding $mov"`
			`imdb << [row[0], mov.name, mov.year]`
			`} else {`
			`println "Blacklisting $row"`
			`imdb << [row[0], null]`
			`}`
* build my own imdb index from osdb movie data for ids and using my imdb scraper to get the original aka names * lots of extra RG names and blacklisted terms (esp useful for dokus) * updated cleaner script to handle video clutter like samples etc 2012-07-13 07:41:50 -04:00			`}`
			`}`
			`}`
* revert to Groovy 2.0.2 to fix scripting compatibility issues 2012-11-05 04:41:48 -05:00			`}, 20)`
* build my own imdb index from osdb movie data for ids and using my imdb scraper to get the original aka names * lots of extra RG names and blacklisted terms (esp useful for dokus) * updated cleaner script to handle video clutter like samples etc 2012-07-13 07:41:50 -04:00
			`// save updated imdb data`
* sort 2012-07-20 10:09:44 -04:00			`imdb.collect{ it.join('\t') }.sort().join('\n').saveAs(imdb_tsv)`
* update docs & samples 2012-02-26 12:02:54 -05:00
* build my own imdb index from osdb movie data for ids and using my imdb scraper to get the original aka names * lots of extra RG names and blacklisted terms (esp useful for dokus) * updated cleaner script to handle video clutter like samples etc 2012-07-13 07:41:50 -04:00			`// save movie data`
			`def movies = imdb.findAll{ it.size() >= 3 && !it[1].startsWith('"') }`
* update docs & samples 2012-02-26 12:02:54 -05:00
* build my own imdb index from osdb movie data for ids and using my imdb scraper to get the original aka names * lots of extra RG names and blacklisted terms (esp useful for dokus) * updated cleaner script to handle video clutter like samples etc 2012-07-13 07:41:50 -04:00			`def movieSorter = new TreeMap(String.CASE_INSENSITIVE_ORDER)`
* index original title as well as english AKA title 2012-07-20 07:45:18 -04:00			`movies.each{ movieSorter.put([it[1], it[2], it[0]].join('\t'), it) }`
* build my own imdb index from osdb movie data for ids and using my imdb scraper to get the original aka names * lots of extra RG names and blacklisted terms (esp useful for dokus) * updated cleaner script to handle video clutter like samples etc 2012-07-13 07:41:50 -04:00			`movies = movieSorter.values().collect{ it.join('\t') }`

			`gz(movies_out, movies)`
* update docs & samples 2012-02-26 12:02:54 -05:00			`println "Movie Count: " + movies.size()`


			`// ------------------------------------------------------------------------- //`

+ TheTVDB: extend API search with LocalSearch from cached database index 2012-10-14 07:57:25 -04:00			`// BUILD thetvdb-index.gz`
			`def thetvdb_index_url = new URL('http://thetvdb.com/?string=&searchseriesid=&tab=listseries&function=Search')`
			`def thetvdb_index = thetvdb_index_url.fetch().getHtml('UTF-8')`
			`.depthFirst().TABLE.find{it['@id'] == "listtable"}`
			`.depthFirst().TR.findAll{ it.TD.size() == 3 && it.TD[1].text() == 'English' && it.TD[0].A.text() }`
			`.findResults{ [it.TD[2].text(), it.TD[0].A.text()] }`

* improve stability 2012-12-03 13:08:02 -05:00			`thetvdb_index = thetvdb_index.findResults{ [it[0] as Integer, it[1].replaceAll(/\s+/, ' ').trim()] }.findAll{ !(it[1] =~ /(?i:duplicate)/ \|\| it[1] =~ /\d{6,}/ \|\| it[1].startsWith('*') \|\| it[1].endsWith('*') \|\| it[1].empty) } as HashSet`

+ TheTVDB: extend API search with LocalSearch from cached database index 2012-10-14 07:57:25 -04:00			`// join and sort`
			`def thetvdb = thetvdb_index.findResults{ [it[0].pad(6), it[1]].join('\t') }.sort()`
			`gz(thetvdb_out, thetvdb)`
			`println "TheTVDB Index: " + thetvdb.size()`

* update docs & samples 2012-02-26 12:02:54 -05:00
			`// BUILD series.list.gz`
* added extra release info 2012-07-15 22:36:49 -04:00
			`// TheTVDB`
+ TheTVDB: extend API search with LocalSearch from cached database index 2012-10-14 07:57:25 -04:00			`def thetvdb_names = thetvdb_index.findResults{ it[1] }`
* performance improvements / switch to series.list.gz * use before-rule when cleaning up tokens from movie filenames * added series.list.gz script 2012-02-23 13:48:35 -05:00
* added extra release info 2012-07-15 22:36:49 -04:00			`// AniDB`
* build my own imdb index from osdb movie data for ids and using my imdb scraper to get the original aka names * lots of extra RG names and blacklisted terms (esp useful for dokus) * updated cleaner script to handle video clutter like samples etc 2012-07-13 07:41:50 -04:00			`def anidb_names = net.sourceforge.filebot.WebServices.AniDB.getAnimeTitles().findResults{ [it.getPrimaryTitle(), it.getOfficialTitle('en')] }.flatten()`

			`/*`
* added extra release info 2012-07-15 22:36:49 -04:00			`// IMDb series list`
			`def imdb_series_names = imdb.findAll{ it.size() >= 3 && it[1].startsWith('"') }.collect{ it[1] }`

			`// Dokuwiki list`
* build my own imdb index from osdb movie data for ids and using my imdb scraper to get the original aka names * lots of extra RG names and blacklisted terms (esp useful for dokus) * updated cleaner script to handle video clutter like samples etc 2012-07-13 07:41:50 -04:00			`def dokuwiki_index = new URL('http://docuwiki.net/postbot/getList.php?subject=Name')`
			`def doku_names = []`
			`dokuwiki_index.getText('UTF-8').eachLine{`
			`doku_names << it.trim().replaceTrailingBrackets()`
			`}`
			`*/`
* performance improvements / switch to series.list.gz * use before-rule when cleaning up tokens from movie filenames * added series.list.gz script 2012-02-23 13:48:35 -05:00
* added extra release info 2012-07-15 22:36:49 -04:00			`def names = [thetvdb_names, anidb_names]`
* build my own imdb index from osdb movie data for ids and using my imdb scraper to get the original aka names * lots of extra RG names and blacklisted terms (esp useful for dokus) * updated cleaner script to handle video clutter like samples etc 2012-07-13 07:41:50 -04:00			`names.each{ if (it.size() == 0) throw new Exception("Failed to scrape series names") } // sanity check`
* fine-tuning 2012-10-15 08:10:36 -04:00			`names = names.flatten().findAll{ it =~ /^[A-Z0-9]/ && it =~ /[\p{Alpha}]{3}/}.findResults{ net.sourceforge.filebot.similarity.Normalization.normalizePunctuation(it) } // collect and normalize names`
* fix bugs in the multi-episode logic 2012-03-20 14:18:34 -04:00
* build my own imdb index from osdb movie data for ids and using my imdb scraper to get the original aka names * lots of extra RG names and blacklisted terms (esp useful for dokus) * updated cleaner script to handle video clutter like samples etc 2012-07-13 07:41:50 -04:00			`def seriesSorter = new TreeSet(String.CASE_INSENSITIVE_ORDER)`
			`seriesSorter.addAll(names)`
			`names = seriesSorter as List`
* performance improvements / switch to series.list.gz * use before-rule when cleaning up tokens from movie filenames * added series.list.gz script 2012-02-23 13:48:35 -05:00

* build my own imdb index from osdb movie data for ids and using my imdb scraper to get the original aka names * lots of extra RG names and blacklisted terms (esp useful for dokus) * updated cleaner script to handle video clutter like samples etc 2012-07-13 07:41:50 -04:00			`gz(series_out, names)`
* performance improvements / switch to series.list.gz * use before-rule when cleaning up tokens from movie filenames * added series.list.gz script 2012-02-23 13:48:35 -05:00			`println "Series Count: " + names.size()`