filebot/build-data/BuildData.groovy

#!/usr/bin/env filebot -script

import java.util.regex.*
import org.tukaani.xz.*


/* ------------------------------------------------------------------------- */


def dir_root    = ".."
def dir_website = "${dir_root}/website"
def dir_data    = "${dir_website}/data"

// make sure the output folder exists before anything is written into it
new File(dir_data).mkdirs()

// sort and check shared regex collections
def sharedCollections = [
	'add-series-alias.txt',
	'exclude-blacklist.txt',
	'query-blacklist.txt',
	'release-groups.txt',
	'series-mappings.txt'
]

sharedCollections.each{
	def input = new URL("https://raw.githubusercontent.com/filebot/data/master/${it}")
	def output = new File("${dir_data}/${it}")

	log.fine "Fetch $input"

	// case-insensitive sorted set: orders the entries and drops duplicates
	def lines = new TreeSet(String.CASE_INSENSITIVE_ORDER)
	for (rawLine in input.getText('UTF-8').split(/\R/)) {
		def entry = rawLine.trim()
		if (entry.length() > 0) {
			// compiling each entry fails fast on a syntactically invalid pattern
			lines << Pattern.compile(entry).pattern()
		}
	}

	lines.each{ log.finest "$it" }
	pack(output, lines)
}


/* ------------------------------------------------------------------------- */


// read reviews.tsv (user <TAB> date <TAB> text) and publish it as reviews.json
def reviews = []
new File("${dir_root}/reviews.tsv").eachLine('UTF-8'){ row ->
	// split into at most 3 columns so that tabs inside the review text are kept
	def (user, date, text) = row.split(/\t/, 3)*.trim()
	reviews << [user: user, date: date, text: text]
}
reviews = reviews.sort{ review -> review.date }

def json = new groovy.json.JsonBuilder()
json.call(reviews as List)

def reviewsJson = json.toPrettyString()
reviewsJson.saveAs("${dir_website}/reviews.json")
log.info "Reviews: " + reviews.size()


/* ------------------------------------------------------------------------- */


// Target files for the generated indexes, all located in the website data folder.
// Each one is handed to pack(), which writes the plain file plus an .xz copy next to it.
def moviedb_out = new File("${dir_data}/moviedb.txt")
def thetvdb_out = new File("${dir_data}/thetvdb.txt")
def anidb_out   = new File("${dir_data}/anidb.txt")
def osdb_out    = new File("${dir_data}/osdb.txt")


/**
 * Write the given lines to `file` as UTF-8 text (one entry per line, '\n' terminated)
 * and write the same content XZ-compressed to a sibling file named `<file name>.xz`.
 * Afterwards log the number of rows and the widest tab-separated column count.
 *
 * @param file  target File for the uncompressed output
 * @param lines collection of text lines to write
 */
def pack(file, lines) {
	// same writer logic for the plain and the compressed copy
	def writeAll = { writer ->
		for (line in lines) {
			writer.append(line).append('\n')
		}
	}

	// plain text copy
	file.withOutputStream{ out ->
		out.withWriter('UTF-8'){ writer -> writeAll(writer) }
	}

	// xz compressed copy in the same folder
	def xzFile = new File(file.parentFile, file.name + '.xz')
	xzFile.withOutputStream{ out ->
		def xz = new XZOutputStream(out, new LZMA2Options(LZMA2Options.PRESET_DEFAULT))
		xz.withWriter('UTF-8'){ writer -> writeAll(writer) }
	}

	def rows = lines.size()
	def columns = lines.collect{ line -> line.split(/\t/).length }.max()
	log.info "${file.canonicalFile} ($rows rows, $columns columns)"
}


/* ------------------------------------------------------------------------- */


/**
 * Decide whether a name is usable for the movie index.
 *
 * A name is accepted when its punctuation-normalized form has at least 4 characters,
 * or when it starts with an upper-case ASCII letter or digit and contains a run of
 * 3 alphanumeric characters.
 */
def isValidMovieName(s) {
	// long enough after normalization => accept right away
	if (s.normalizePunctuation().length() >= 4) {
		return true
	}
	// short names need a capital/digit start and 3 consecutive alphanumerics
	return (s=~ /^[A-Z0-9]/ && s =~ /[\p{Alnum}]{3}/)
}

/**
 * Expand a list of names into the set of name variations used by the indexes.
 *
 * For every name the original is kept and a simplified variant is added
 * (trailing ", The" / ", A" removed, " & " spelled out as " and ", trailing
 * parenthesized suffix removed, leading "The " / "A " removed). Variations that are
 * equal after lower-casing and punctuation normalization are collapsed, and names
 * that are too short, start with a lower-case letter or an unsupported character,
 * or match the media folder structure pattern are dropped.
 *
 * @param names list of raw names
 * @return filtered list of name variations
 */
def getNamePermutations(names) {
	def normalize = { s -> s.toLowerCase().normalizePunctuation() }.memoize()

	// derive the simplified variant next to each original name
	def variations = names*.trim().unique().collectMany{ original ->
		def simplified = original.trim()
		simplified = simplified.replaceAll(/([,]\s(The|A)$)/, '')
		simplified = simplified.replaceAll(/\s&\s/, ' and ')
		simplified = simplified.replaceAll(/\s\([^\)]*\)$/, '').trim()
		simplified = simplified.replaceAll(/^(?i:The|A)\s/, '').trim()
		return [original, simplified]
	}

	// collapse variations that normalize to the same text and drop empty ones
	def out = variations.unique{ normalize(it) }.findAll{ it.length() > 0 }

	// MUST START WITH UNICODE LETTER
	out = out.findAll{ name ->
		if (name.length() < 2) return false
		if (name ==~ /[1][0-9][1-9]/) return false
		if (name =~ /^[a-z]/) return false
		return (name =~ /^[@.\p{L}\p{Digit}]/).find()
	}

	// IGNORE NAMES THAT OVERLAP WITH MEDIA FOLDER NAMES
	def structureRoot = MediaDetection.releaseInfo.structureRootPattern
	out = out.findAll{ name -> !structureRoot.matcher(name).matches() }

	// out = out.findAll{ a -> names.take(1).contains(a) || out.findAll{ b -> normalize(a).startsWith(normalize(b) + ' ') }.size() == 0 } // TRY TO EXCLUDE REDUNDANT SUBSTRING DUPLICATES

	return out
}

/**
 * Sort items by a case-insensitive String key.
 *
 * Items whose keys are equal ignoring case collapse into one entry; the item
 * added last wins.
 *
 * @param list        items to sort
 * @param keyFunction closure that maps an item to its String sort key
 * @return the values of the sorted map, in key order
 */
def treeSort(list, keyFunction) {
	def byKey = new TreeMap(String.CASE_INSENSITIVE_ORDER)
	for (item in list) {
		byKey[keyFunction(item)] = item
	}
	return byKey.values()
}

/**
 * Load a delimited UTF-8 text file into a map.
 *
 * @param f          File to read; when it is not an existing regular file the result is empty
 * @param delim      regex used to split each line into columns
 * @param keyIndex   column index used as map key
 * @param valueIndex column index (or index collection) used as map value;
 *                   the lookup is wrapped in tryQuietly
 * @return map of key column to value column(s), later lines overriding earlier ones
 */
def csv(f, delim, keyIndex, valueIndex) {
	def table = [:]
	if (!f.isFile()) {
		return table
	}
	f.splitEachLine(delim, 'UTF-8') { columns ->
		table[columns[keyIndex]] = tryQuietly{ columns[valueIndex] }
	}
	return table
}


/* ------------------------------------------------------------------------- */

// stop here (before any index is built) when the script is run with mode "no-index"
def skipIndexBuild = (_args.mode == /no-index/)
if (skipIndexBuild) {
	return
}

/* ------------------------------------------------------------------------- */


// BUILD moviedb index
// Pre-select movie candidates [imdbid, name, year] from the tab-separated OMDb dump.
def omdb = []
new File('omdbMovies.txt').eachLine('Windows-1252'){ row ->
	def line = row.split(/\t/)

	// only consider rows with enough columns, a numeric first column and a 4-digit year
	def wellFormed = (line.length > 11) && (line[0] ==~ /\d+/) && (line[3] ==~ /\d{4}/)
	if (!wellFormed) {
		return
	}

	def imdbid = line[1].substring(2).toInteger()
	def name = line[2].replaceAll(/\s+/, ' ').trim()
	def year = line[3].toInteger()
	def runtime = line[5]
	def genres = line[6]
	def rating = tryQuietly{ line[12].toFloat() } ?: 0
	def votes = tryQuietly{ line[13].replaceAll(/\D/, '').toInteger() } ?: 0

	// shorts and entries with few votes or a very low rating are never indexed
	def rejected = ((genres =~ /Short/) || votes <= 100 || rating <= 2)
	// 1970+ entries need a plausible runtime or many votes, 1950+ entries need a lot of votes
	def recentEnough = (year >= 1970 && ((runtime =~ /(\d.h)|(\d{2,3}.min)/) || votes >= 1000))
	def popularClassic = (year >= 1950 && votes >= 20000)

	if (!rejected && (recentEnough || popularClassic)) {
		omdb << [imdbid.pad(7), name, year]
	}
}
omdb = omdb.findAll{ movie -> (movie[0] as int) <= 9999999 && isValidMovieName(movie[1]) }


// local cache of TheMovieDB lookups, keyed by column 1 (imdb id); value is the complete row
def tmdb_txt = new File('tmdb.txt')
def tmdb_index = csv(tmdb_txt, '\t', 1, [0..-1])

def tmdb = []
omdb.each{ m ->
	def sync = System.currentTimeMillis()

	// keep the cached row while it is recent enough (360 days for pre-2000 movies, 120 days otherwise)
	if (tmdb_index.containsKey(m[0])) {
		def cachedRow = tmdb_index[m[0]]
		def age = sync - cachedRow[0].toLong()
		def maxAge = (m[2].toInteger() < 2000 ? 360 : 120) * 24 * 60 * 60 * 1000L
		if (age < maxAge) {
			tmdb << cachedRow
			return
		}
	}

	try {
		def info = WebServices.TheMovieDB.getMovieInfo("tt${m[0]}", Locale.ENGLISH, true)

		if (info.votes <= 1 || info.rating <= 2)
			throw new IllegalArgumentException('Insufficient movie data: ' + info)

		def names = [info.name, info.originalName] + info.alternativeTitles

		// one row per distinct year (TheMovieDB release year and OMDb year)
		def years = [info?.released?.year, m[2]].findResults{ it?.toInteger() }.unique()
		years.each{ y ->
			def row = [sync, m[0].pad(7), info.id.pad(7), y.pad(4)] + names
			log.info "Update ${m[0..2]}: $row"
			tmdb << row
		}
	} catch(IllegalArgumentException | FileNotFoundException e) {
		// remember the failed lookup as a row with tmdb id 0
		printException(e, false)
		def row = [sync, m[0].pad(7), 0, m[2], m[1]]
		log.info "[BAD] Update ${m[0..2]}: $row"
		tmdb << row
	}
}

// persist the refreshed cache as tab-separated rows
def tmdb_rows = tmdb*.join('\t')
tmdb_rows.join('\n').saveAs(tmdb_txt)

// turn cached rows into index rows: imdb id, tmdb id, year, followed by the valid name variations
movies = tmdb.findResults{ row ->
	def ity = row[1..3] // imdb id, tmdb id, year
	def names = getNamePermutations(row[4..-1]).findAll{ isValidMovieName(it) }

	// rows without both ids or without any usable name are dropped
	def usable = ity[0].toInteger() > 0 && ity[1].toInteger() > 0 && names.size() > 0
	return usable ? ity + names : null
}

// sort (and de-duplicate) by "<first name> <year>"
movies = treeSort(movies, { movie -> movie[3, 2].join(' ') })

// sanity check
if (movies.size() < 20000) {
	die('Movie index sanity failed:' + movies.size())
}
pack(moviedb_out, movies*.join('\t'))


/* ------------------------------------------------------------------------- */


// BUILD tvdb index
// Load previously fetched series rows: time, id, imdbid, rating, votes, names...
def tvdb_txt = new File('tvdb.txt')
def tvdb = [:]

if (tvdb_txt.exists()) {
	tvdb_txt.eachLine('UTF-8'){ row ->
		def line = row.split('\t').toList()
		def names = line.subList(5, line.size())
		def id = line[1] as Integer
		def head = [line[0] as Long, id, line[2], line[3] as Float, line[4] as Float]
		tvdb[id] = head + names
	}
}

// collect series id => [id, time] from the updates dump, sorted by id
def tvdb_updates = new TreeMap()
new File('updates_all.xml').eachLine('UTF-8'){ xmlLine ->
	def series = (xmlLine =~ '<Series><id>(\\d+)</id><time>(\\d+)</time></Series>')
	while (series.find()) {
		def id = series.group(1) as Integer
		def time = series.group(2) as Integer
		tvdb_updates.put(id, [id: id, time: time])
	}
}

// blacklist crap entries
[219901, 256135].each{ badId -> tvdb_updates.remove(badId) }


// Refresh the cached row of every series that is either not cached yet or whose
// update timestamp is newer than the timestamp stored in column 0 of the cached row.
tvdb_updates.values().each{ update ->
	if (tvdb[update.id] == null || update.time > tvdb[update.id][0]) {
		try {
			// NOTE(review): retry(2, 60000) is defined outside this file; presumably 2 attempts with a 60s wait — confirm
			retry(2, 60000) {
				def seriesNames = []
				// base series record from the TheTVDB XML API
				def xml = new XmlSlurper().parse("http://thetvdb.com/api/BA864DEE427E384A/series/${update.id}/en.xml")
				def imdbid = xml.Series.IMDB_ID.text()
				seriesNames += xml.Series.SeriesName.text()

				// rating / vote count are optional; parsing is wrapped in tryQuietly and falls back to 0 when the row is built
				def rating = tryQuietly{ xml.Series.Rating.text().toFloat() }
				def votes = tryQuietly{ xml.Series.RatingCount.text().toFloat() }

				// only retrieve additional data for reasonably popular shows
				if (votes >= 5 && rating >= 4) {
					// add the OMDb name when the series has an IMDb id; failures are logged, not fatal
					tryLogCatch{
						if (imdbid =~ /tt(\d+)/) {
							seriesNames += OMDb.getMovieDescriptor(new Movie(null, 0, imdbid.match(/tt(\d+)/) as int, -1), Locale.ENGLISH).getName()
						}
					}

					// scrape extra alias titles from webpage (not supported yet by API)
					// the page is fetched through the persistent cache and expires after Cache.ONE_MONTH
					def html = Cache.getCache('thetvdb_series_page', CacheType.Persistent).text(update.id) { 
						return new URL("http://thetvdb.com/?tab=series&id=${it}")
					}.expire(Cache.ONE_MONTH).get()

					def jsoup = org.jsoup.Jsoup.parse(html)
					// alias table rows that have a cell with the exact text "en"; the first cell is the alias
					def akaseries = jsoup.select('#akaseries table tr table tr')
												.findAll{ it.select('td').any{ it.text() ==~ /en/ } }
												.findResults{ it.select('td').first().text() }
												.findAll{ it?.length() > 0 }
					// form inputs whose name contains "SeriesName", ordered by the number in the input name
					def intlseries = jsoup.select('#seriesform input')
												.findAll{ it.attr('name') =~ /SeriesName/ }
												.sort{ it.attr('name').match(/\d+/) as int }
												.collect{ it.attr('value') }
												.findAll{ it?.length() > 0 }

					log.info "Scraped data $akaseries and $intlseries for series $seriesNames"
					seriesNames += akaseries
					seriesNames += intlseries
				}

				// row layout: time, id, imdbid, rating, votes, followed by all non-empty names
				def data = [update.time, update.id, imdbid, rating ?: 0, votes ?: 0] + seriesNames.findAll{ it != null && it.length() > 0 }
				tvdb.put(update.id, data)
				log.info "Update $update => $data"
			}
		}
		catch(Throwable e) {
			// on any failure store a placeholder row without names, stamped with the update time
			printException(e, false)
			def data = [update.time, update.id, '', 0, 0]
			tvdb.put(update.id, data)
			log.info "[BAD] Update $update => $data"
		}
	}
}

// remove entries that have become invalid
// (iterate over a copy of the ids so that entries can be removed from the map)
def staleIds = tvdb.keySet().toList().findAll{ cachedId -> tvdb_updates[cachedId] == null }
staleIds.each{ id ->
	log.info "Invalid ID found: ${tvdb[id]}"
	tvdb.remove(id)
}

// persist the cache: one tab-separated row per series, with tabs stripped from the values
def tvdb_lines = tvdb.values().collect{ row ->
	row.collect{ value -> value.toString().replace('\t', '').trim() }.join('\t')
}
tvdb_lines.join('\n').saveAs(tvdb_txt)

// additional custom mappings
// csv() takes (File, delimiter, keyIndex, valueIndex) and calls File methods on its first
// argument, so the alias file must be passed as a File together with the column layout.
// NOTE(review): assumes add-series-alias.txt is tab-separated with the series name in
// column 0 and one or more aliases in the following columns — confirm against the data file
def extraAliasNames = csv(new File("${dir_data}/add-series-alias.txt"), '\t', 0, [1..-1])

// collect [tvdb id, name] pairs for every series that is popular enough or has a custom alias
def thetvdb_index = []
tvdb.values().each{ r ->
	// cached row layout: time, id, imdbid, rating, votes, names...
	def tvdb_id = r[1]
	def rating = r[3]
	def votes = r[4]
	def names = r.subList(5, r.size())

	// custom aliases are looked up by the primary series name
	def alias = extraAliasNames[names[0]]
	if (alias) {
		log.fine "Add alias ${names[0]} => ${alias}"
		names += alias
	}

	// series with a custom alias are always included; otherwise fewer votes require a higher rating
	if (alias !=null || (votes >= 5 && rating >= 4) || (votes >= 2 && rating >= 6) || (votes >= 1 && rating >= 10)) {
		getNamePermutations(names).each{ n ->
			thetvdb_index << [tvdb_id, n]
		}
	}
}

// normalize whitespace and drop placeholder / junk names
thetvdb_index = thetvdb_index.findResults{ [it[0] as Integer, it[1].replaceAll(/\s+/, ' ').trim()] }.findAll{ !(it[1] =~ /(?i:duplicate|Series.Not.Permitted)/ || it[1] =~ /\d{6,}/ || it[1].startsWith('*') || it[1].endsWith('*') || it[1].length() < 2) }
thetvdb_index = thetvdb_index.sort{ a, b -> a[0] <=> b[0] }

// join and sort
// one line per series: padded id followed by its names (unique ignoring case)
def thetvdb_txt = thetvdb_index.groupBy{ it[0] }.findResults{ k, v -> ([k.pad(6)] + v*.getAt(1).unique{ it.toLowerCase() }).join('\t') }

// sanity check
if (thetvdb_txt.size() < 4000) { die('TheTVDB index sanity failed: ' + thetvdb_txt.size()) }
pack(thetvdb_out, thetvdb_txt)


/* ------------------------------------------------------------------------- */


// BUILD osdb index
def osdb = []

new File('osdb.txt').eachLine('UTF-8'){ row ->
	def fields = row.split(/\t/)*.trim()

	// 0 IDMovie, 1 IDMovieImdb, 2 MovieName, 3 MovieYear, 4 MovieKind, 5 MoviePriority
	def wellFormed = (fields.size() == 6) && (fields[1] ==~ /\d+/) && (fields[3] ==~ /\d{4}/)
	if (!wellFormed) {
		return
	}

	// only movies / tv series with a valid name, from 1970 onwards, with priority 500 or higher
	def relevant = (fields[4] ==~ /movie|tv.series/) && isValidMovieName(fields[2]) && (fields[3] as int) >= 1970 && (fields[5] as int) >= 500
	if (!relevant) {
		return
	}

	def kind = '?'
	if (fields[4] == /movie/) {
		kind = 'm'
	} else if (fields[4] == /tv series/) {
		kind = 's'
	}

	// 0 imdbid, 1 name, 2 year, 3 kind, 4 priority
	osdb << [fields[1] as int, fields[2], fields[3] as int, kind, fields[5] as int]
}

// sort reverse by score
osdb.sort{ left, right -> right[4] <=> left[4] }

// reset score/priority because it's currently not used
osdb.each{ entry -> entry[4] = 0 }

// map by imdbid
def tvdb_index = tvdb.values().findAll{ series -> series[2] =~ /tt(\d+)/ }.collectEntries{ series ->
	[series[2].substring(2).pad(7), series]
}

// collect final output data
osdb = osdb.findResults{ entry ->
	def (imdbid, name, year, kind, score) = entry
	def names = [name]

	// enrich with the alias names from the matching movie / series cache row
	switch (kind) {
		case 'm':
			def tmdb_entry = tmdb_index[imdbid.pad(7)]
			if (tmdb_entry != null && tmdb_entry.size() > 4) {
				names += tmdb_entry[4..-1]
			}
			break
		case 's':
			def tvdb_entry = tvdb_index[imdbid.pad(7)]
			if (tvdb_entry != null && tvdb_entry.size() > 5) {
				names += tvdb_entry[5..-1]
			}
			break
	}

	// 0 kind, 1 score, 2 imdbid, 3 year, 4-n names
	return [kind, score, imdbid, year] + names.unique()
}

// sanity check
if (osdb.size() < 15000) {
	die('OSDB index sanity failed:' + osdb.size())
}
pack(osdb_out, osdb*.join('\t'))


/* ------------------------------------------------------------------------- */


// BUILD anidb index
def anidb = new AnidbClient('filebot', 6).getAnimeTitles() as List
def animeExcludes = [] as Set

// exclude anime movies from anime index
// a line marks a movie when it is mapped to tvdbid="movie" or carries an IMDb id (imdbid="tt<digits>");
// the IMDb pattern must use \d+ — the previous "ttd\+" only matched the literal text "ttd+" and never fired
new File('anime-list.xml').eachLine('UTF-8') {
	if (it =~ /tvdbid="movie"/ || it =~ /imdbid="tt\d+"/) {
		animeExcludes << it.match(/anidbid="(\d+)"/).toInteger()
	}
}

// one row per anime: padded id followed by up to 4 usable name variations
def anidb_index = anidb.findResults{
	if (animeExcludes.contains(it.id))
		return null

	// normalize whitespace and the various apostrophe characters before building variations
	def names = it.effectiveNames*.replaceAll(/\s+/, ' ')*.trim()*.replaceAll(/['`´‘’ʻ]+/, /'/)
	names = getNamePermutations(names)
	// drop names that are empty once release info has been stripped
	names = names.findAll{ stripReleaseInfo(it)?.length() > 0 }

	return names.empty ? null : [it.id.pad(5)] + names.take(4)
}

// join and sort
def anidb_txt = anidb_index.findResults{ row -> row.join('\t') }.sort().unique()

// sanity check
if (anidb_txt.size() < 8000 || animeExcludes.size() < 500) { die('AniDB index sanity failed:' + anidb_txt.size()) }
pack(anidb_out, anidb_txt)
-												* added build-data scripts
											
										
										
											2015-05-11 06:35:45 -04:00
+								#!/usr/bin/env filebot -script
-												Move data file maintenance to GitHub
											
										
										
											2016-02-24 14:33:04 -05:00
+								import java.util.regex.*
-												* fine-tune exclude patterns
											
										
										
											2014-01-08 03:36:32 -05:00
+								import org.tukaani.xz.*
-												+ rebuild movie index with imdb AND tmdb IDs
											
										
										
											2013-11-20 05:07:25 -05:00
-												Move data file maintenance to GitHub
											
										
										
											2016-02-24 14:27:31 -05:00
-												+ rebuild movie index with imdb AND tmdb IDs
											
										
										
											2013-11-20 05:07:25 -05:00
+								/* ------------------------------------------------------------------------- */
-												* generate review.json and easily collect good reviews in a separate csv file
											
										
										
											2013-08-10 01:23:14 -04:00
-												Move data file maintenance to GitHub
											
										
										
											2016-02-24 14:27:31 -05:00
-												* implement guessMovie feature
											
										
										
											2015-05-11 09:57:04 -04:00
+								def dir_root    = ".."
 								def dir_website = "${dir_root}/website"
-												* added build-data scripts
											
										
										
											2015-05-11 06:35:45 -04:00
+								def dir_data    = "${dir_website}/data"
-												* generate review.json and easily collect good reviews in a separate csv file
											
										
										
											2013-08-10 01:23:14 -04:00
-												Added -Durl.refresh=PT0S system property for testing
											
										
										
											2016-03-12 08:46:42 -05:00
+								new File(dir_data).mkdirs()
-												Move data file maintenance to GitHub
											
										
										
											2016-02-24 14:27:31 -05:00
+								// sort and check shared regex collections
 								['add-series-alias.txt',
 								 'exclude-blacklist.txt',
 								 'query-blacklist.txt',
 								 'release-groups.txt',
 								 'series-mappings.txt'
 								].each{
 									def input = new URL("https://raw.githubusercontent.com/filebot/data/master/${it}")
 									def output = new File("${dir_data}/${it}")
-												Support log levels
											
										
										
											2016-03-26 06:25:45 -04:00
+									log.fine "Fetch $input"
-												Experiment with new CachedResource framework
											
										
										
											2016-03-08 07:59:24 -05:00
+									def lines = new TreeSet(String.CASE_INSENSITIVE_ORDER)
-												Move data file maintenance to GitHub
											
										
										
											2016-02-24 14:38:03 -05:00
+									input.getText('UTF-8').split(/\R/)*.trim().findAll{ it.length() > 0 }.each{
-												Support log levels
											
										
										
											2016-03-26 06:25:45 -04:00
+										lines << Pattern.compile(it).pattern()
-												* generate review.json and easily collect good reviews in a separate csv file
											
										
										
											2013-08-10 01:23:14 -04:00
+									}
-												Support log levels
											
										
										
											2016-03-26 06:25:45 -04:00
+									lines.each{ log.finest "$it" }
-												Experiment with new CachedResource framework
											
										
										
											2016-03-08 07:59:24 -05:00
+									pack(output, lines)
-												Move data file maintenance to GitHub
											
										
										
											2016-02-24 14:27:31 -05:00
+								}
-												* generate review.json and easily collect good reviews in a separate csv file
											
										
										
											2013-08-10 01:23:14 -04:00
-												+ rebuild movie index with imdb AND tmdb IDs
											
										
										
											2013-11-20 05:07:25 -05:00
+								/* ------------------------------------------------------------------------- */
-												* generate review.json and easily collect good reviews in a separate csv file
											
										
										
											2013-08-10 01:23:14 -04:00
 								def reviews = []
-												* format reviews
											
										
										
											2015-05-27 12:47:17 -04:00
+								new File("${dir_root}/reviews.tsv").eachLine('UTF-8'){
-												Fix CSV format issues

											
										
										
											2016-04-25 03:23:12 -04:00
+									def s = it.split(/\t/, 3)*.trim()
-												* format reviews
											
										
										
											2015-05-27 12:47:17 -04:00
+									reviews << [user: s[0], date: s[1], text: s[2]]
-												+ rebuild movie index with imdb AND tmdb IDs
											
										
										
											2013-11-20 05:07:25 -05:00
+								}
-												* generate review.json and easily collect good reviews in a separate csv file
											
										
										
											2013-08-10 01:23:14 -04:00
+								reviews = reviews.sort{ it.date }
 								def json = new groovy.json.JsonBuilder()
 								json.call(reviews as List)
-												* added build-data scripts
											
										
										
											2015-05-11 06:35:45 -04:00
+								json.toPrettyString().saveAs("${dir_website}/reviews.json")
-												Support log levels
											
										
										
											2016-03-26 06:02:27 -04:00
+								log.info "Reviews: " + reviews.size()
-												* generate review.json and easily collect good reviews in a separate csv file
											
										
										
											2013-08-10 01:23:14 -04:00
-												+ rebuild movie index with imdb AND tmdb IDs
											
										
										
											2013-11-20 05:07:25 -05:00
+								/* ------------------------------------------------------------------------- */
-												* generate review.json and easily collect good reviews in a separate csv file
											
										
										
											2013-08-10 01:23:14 -04:00
-												* added build-data scripts
											
										
										
											2015-05-11 06:35:45 -04:00
+								def moviedb_out = new File("${dir_data}/moviedb.txt")
 								def thetvdb_out = new File("${dir_data}/thetvdb.txt")
 								def anidb_out   = new File("${dir_data}/anidb.txt")
 								def osdb_out    = new File("${dir_data}/osdb.txt")
-												+ use OpenSubtitles dump for OpenSubtitles local search
											
										
										
											2015-05-11 05:13:35 -04:00
-												* generate review.json and easily collect good reviews in a separate csv file
											
										
										
											2013-08-10 01:23:14 -04:00
-												* use XZ Utils for packing online database files
											
										
										
											2013-08-10 03:56:11 -04:00
+								def pack(file, lines) {
-												Update build
											
										
										
											2016-03-09 22:53:49 -05:00
+									file.withOutputStream{ out ->
 										out.withWriter('UTF-8'){ writer ->
 											lines.each{ writer.append(it).append('\n') }
 										}
 									}
-												* use XZ Utils for packing online database files
											
										
										
											2013-08-10 03:56:11 -04:00
+									new File(file.parentFile, file.name + '.xz').withOutputStream{ out ->
 										new XZOutputStream(out, new LZMA2Options(LZMA2Options.PRESET_DEFAULT)).withWriter('UTF-8'){ writer ->
-												* generate review.json and easily collect good reviews in a separate csv file
											
										
										
											2013-08-10 01:23:14 -04:00
+											lines.each{ writer.append(it).append('\n') }
 										}
 									}
-												* update local db files
											
										
										
											2014-01-07 10:21:38 -05:00
+									def rows = lines.size()
 									def columns = lines.collect{ it.split(/\t/).length }.max()
-												Support log levels
											
										
										
											2016-03-26 06:02:27 -04:00
+									log.info "${file.canonicalFile} ($rows rows, $columns columns)"
-												* generate review.json and easily collect good reviews in a separate csv file
											
										
										
											2013-08-10 01:23:14 -04:00
+								}
-												+ rebuild movie index with imdb AND tmdb IDs
											
										
										
											2013-11-20 05:07:25 -05:00
+								/* ------------------------------------------------------------------------- */
-												+ rebuild movie index with imdb AND tmdb IDs
											
										
										
											2013-11-21 09:31:31 -05:00
+								def isValidMovieName(s) {
-												* fix movie index
											
										
										
											2014-03-06 10:50:14 -05:00
+									return (s.normalizePunctuation().length() >= 4) || (s=~ /^[A-Z0-9]/ && s =~ /[\p{Alnum}]{3}/)
-												+ rebuild movie index with imdb AND tmdb IDs
											
										
										
											2013-11-21 09:31:31 -05:00
+								}
-												+ make movie matching properly aware of alias names
											
										
										
											2014-01-06 18:22:31 -05:00
+								def getNamePermutations(names) {
-												* revert, and keep this unless it turns out unreliable
											
										
										
											2014-09-15 19:49:19 -04:00
+									def normalize = { s -> s.toLowerCase().normalizePunctuation() }.memoize()
-												+ make movie matching properly aware of alias names
											
										
										
											2014-01-06 18:22:31 -05:00
-												* fix movie index for name variations
											
										
										
											2014-09-11 16:04:24 -04:00
+									def out = names*.trim().unique().collectMany{ original ->
-												Don't add non The|A alias titles if what remains is a single word (e.g. The Voice VS Voice)
											
										
										
											2016-03-12 12:02:13 -05:00
+										def s = original.trim()
-												Fix ranking issues
											
										
										
											2016-03-12 13:15:07 -05:00
+										s = s.replaceAll(/([,]\s(The|A)$)/, '')
-												Don't add non The|A alias titles if what remains is a single word (e.g. The Voice VS Voice)
											
										
										
											2016-03-12 12:02:13 -05:00
+										s = s.replaceAll(/\s&\s/, ' and ')
 										s = s.replaceAll(/\s\([^\)]*\)$/, '').trim()
-												Fix ranking issues
											
										
										
											2016-03-12 13:15:07 -05:00
+										s = s.replaceAll(/^(?i:The|A)\s/, '').trim()
 										return [original, s]
-												* reduce index size by reducing entries with redundant substring sequences
											
										
										
											2014-09-15 15:04:25 -04:00
+									}.unique{ normalize(it) }.findAll{ it.length() > 0 }
-												* custom add aliases
											
										
										
											2014-03-18 16:08:06 -04:00
+									out = out.findAll{ it.length() >= 2 && !(it ==~ /[1][0-9][1-9]/) && !(it =~ /^[a-z]/) && it =~ /^[@.\p{L}\p{Digit}]/ } // MUST START WITH UNICODE LETTER
-												* fine-tune media index
											
										
										
											2014-01-11 04:04:49 -05:00
+									out = out.findAll{ !MediaDetection.releaseInfo.structureRootPattern.matcher(it).matches() } // IGNORE NAMES THAT OVERLAP WITH MEDIA FOLDER NAMES
-												* not simplifying alias list for each entry should't cause any noticeable performance differences
											
										
										
											2014-10-10 14:17:31 -04:00
+									// out = out.findAll{ a -> names.take(1).contains(a) || out.findAll{ b -> normalize(a).startsWith(normalize(b) + ' ') }.size() == 0 } // TRY TO EXCLUDE REDUNDANT SUBSTRING DUPLICATES
-												* revert, and keep this unless it turns out unreliable
											
										
										
											2014-09-15 19:49:19 -04:00
-												* reduce index size by reducing entries with redundant substring sequences
											
										
										
											2014-09-15 15:04:25 -04:00
+									return out
-												+ make movie matching properly aware of alias names
											
										
										
											2014-01-06 18:22:31 -05:00
+								}
-												+ rebuild movie index with imdb AND tmdb IDs
											
										
										
											2013-11-20 05:07:25 -05:00
+								def treeSort(list, keyFunction) {
 									def sorter = new TreeMap(String.CASE_INSENSITIVE_ORDER)
 									list.each{
 										sorter.put(keyFunction(it), it)
 									}
 									return sorter.values()
 								}
-												+ Groovy engine extensions rewrite complete :)
											
										
										
											2014-04-18 15:41:39 -04:00
+								def csv(f, delim, keyIndex, valueIndex) {
 									def values = [:]
 									if (f.isFile()) {
 										f.splitEachLine(delim, 'UTF-8') { line ->
 											values.put(line[keyIndex], tryQuietly{ line[valueIndex] })
 										}
 									}
 									return values
 								}
-												* generate review.json and easily collect good reviews in a separate csv file
											
										
										
											2013-08-10 01:23:14 -04:00
-												Added -Durl.refresh=PT0S system property for testing
											
										
										
											2016-03-12 08:46:42 -05:00
+								/* ------------------------------------------------------------------------- */
 								if (_args.mode == /no-index/) {
 									return
 								}
-												* only include popular or high-rated shows in the series index
											
										
										
											2014-03-09 08:50:03 -04:00
+								/* ------------------------------------------------------------------------- */
 								// BUILD moviedb index
-												+ rebuild movie index with imdb AND tmdb IDs
											
										
										
											2013-11-21 09:31:31 -05:00
+								def omdb = []
-												* implement guessMovie feature
											
										
										
											2015-05-11 09:57:04 -04:00
+								new File('omdbMovies.txt').eachLine('Windows-1252'){
-												* generate review.json and easily collect good reviews in a separate csv file
											
										
										
											2013-08-10 01:23:14 -04:00
+									def line = it.split(/\t/)
-												* fix & ignore 1970-1975 style series patterns
											
										
										
											2014-09-26 12:41:42 -04:00
+									if (line.length > 11 && line[0] ==~ /\d+/ && line[3] ==~ /\d{4}/) {
-												* generate review.json and easily collect good reviews in a separate csv file
											
										
										
											2013-08-10 01:23:14 -04:00
+										def imdbid = line[1].substring(2).toInteger()
-												* update data files and enable "Movie 43" sample data
											
										
										
											2013-09-10 06:12:55 -04:00
+										def name = line[2].replaceAll(/\s+/, ' ').trim()
-												* generate review.json and easily collect good reviews in a separate csv file
											
										
										
											2013-08-10 01:23:14 -04:00
+										def year = line[3].toInteger()
 										def runtime = line[5]
-												* make sure to ignore Short movies from internal index (to reduce messing with similarly named good movies)
											
										
										
											2014-10-15 18:16:11 -04:00
+										def genres = line[6]
-												* update data
											
										
										
											2014-02-18 01:55:45 -05:00
+										def rating = tryQuietly{ line[12].toFloat() } ?: 0
 										def votes = tryQuietly{ line[13].replaceAll(/\D/, '').toInteger() } ?: 0
-												Support log levels
											
										
										
											2016-03-26 06:02:27 -04:00
-												* update build-data filter
											
										
										
											2015-11-12 07:20:25 -05:00
+										if (!(genres =~ /Short/ || votes <= 100 || rating <= 2) && ((year >= 1970 && (runtime =~ /(\d.h)|(\d{2,3}.min)/ || votes >= 1000)) || (year >= 1950 && votes >= 20000))) {
-												+ rebuild movie index with imdb AND tmdb IDs
											
										
										
											2013-11-20 05:07:25 -05:00
+											omdb << [imdbid.pad(7), name, year]
-												* generate review.json and easily collect good reviews in a separate csv file
											
										
										
											2013-08-10 01:23:14 -04:00
+										}
 									}
 								}
-												+ rebuild movie index with imdb AND tmdb IDs
											
										
										
											2013-11-20 05:07:25 -05:00
+								omdb = omdb.findAll{ (it[0] as int) <= 9999999 && isValidMovieName(it[1]) }
-												* generate review.json and easily collect good reviews in a separate csv file
											
										
										
											2013-08-10 01:23:14 -04:00
-												+ rebuild movie index with imdb AND tmdb IDs
											
										
										
											2013-11-20 05:07:25 -05:00
+								def tmdb_txt = new File('tmdb.txt')
 								def tmdb_index = csv(tmdb_txt, '\t', 1, [0..-1])
-												* generate review.json and easily collect good reviews in a separate csv file
											
										
										
											2013-08-10 01:23:14 -04:00
-												* improved support for movies with multiple valid release year values
											
										
										
											2014-08-27 14:26:06 -04:00
+								def tmdb = []
 								omdb.each{ m ->
-												* support and include TheMovieDB alternative_titles data in search and index
											
										
										
											2014-01-23 13:18:25 -05:00
+									def sync = System.currentTimeMillis()
-												* update index for recent movies more regularly to catch new alternative names quickly
											
										
										
											2014-09-17 03:30:33 -04:00
+									if (tmdb_index.containsKey(m[0]) && (sync - tmdb_index[m[0]][0].toLong()) < ((m[2].toInteger() < 2000 ? 360 : 120) * 24 * 60 * 60 * 1000L) ) {
-												* improved support for movies with multiple valid release year values
											
										
										
											2014-08-27 14:26:06 -04:00
+										tmdb << tmdb_index[m[0]]
 										return
-												+ rebuild movie index with imdb AND tmdb IDs
											
										
										
											2013-11-20 05:07:25 -05:00
+									}
 									try {
-												* fixit
											
										
										
											2014-08-22 02:59:30 -04:00
+										def info = WebServices.TheMovieDB.getMovieInfo("tt${m[0]}", Locale.ENGLISH, true)
-												* more strict movie indexing
											
										
										
											2014-10-31 06:36:18 -04:00
 										if (info.votes <= 1 || info.rating <= 2)
-												* improved logging
											
										
										
											2015-11-12 06:43:13 -05:00
+											throw new IllegalArgumentException('Insufficient movie data: ' + info)
-												* more strict movie indexing
											
										
										
											2014-10-31 06:36:18 -04:00
-												* support and include TheMovieDB alternative_titles data in search and index
											
										
										
											2014-01-23 13:18:25 -05:00
+										def names = [info.name, info.originalName] + info.alternativeTitles
-												* improved support for movies with multiple valid release year values
											
										
										
											2014-08-27 14:26:06 -04:00
+										[info?.released?.year, m[2]].findResults{ it?.toInteger() }.unique().each{ y ->
 											def row = [sync, m[0].pad(7), info.id.pad(7), y.pad(4)] + names
-												Support log levels
											
										
										
											2016-03-26 06:25:45 -04:00
+											log.info "Update ${m[0..2]}: $row"
-												* improved support for movies with multiple valid release year values
											
										
										
											2014-08-27 14:26:06 -04:00
+											tmdb << row
-												* support and include TheMovieDB alternative_titles data in search and index
											
										
										
											2014-01-23 13:18:25 -05:00
+										}
-												* improved logging
											
										
										
											2015-11-12 06:43:13 -05:00
+									} catch(IllegalArgumentException | FileNotFoundException e) {
-												* improved logging
											
										
										
											2015-11-12 06:30:49 -05:00
+										printException(e, false)
-												* improved support for movies with multiple valid release year values
											
										
										
											2014-08-27 14:26:06 -04:00
+										def row = [sync, m[0].pad(7), 0, m[2], m[1]]
-												Support log levels
											
										
										
											2016-03-26 06:25:45 -04:00
+										log.info "[BAD] Update ${m[0..2]}: $row"
-												* improved support for movies with multiple valid release year values
											
										
										
											2014-08-27 14:26:06 -04:00
+										tmdb << row
-												+ rebuild movie index with imdb AND tmdb IDs
											
										
										
											2013-11-20 05:07:25 -05:00
+									}
 								}
-												+ rebuild movie index with imdb AND tmdb IDs
											
										
										
											2013-11-21 09:31:31 -05:00
+								tmdb*.join('\t').join('\n').saveAs(tmdb_txt)
-												+ rebuild movie index with imdb AND tmdb IDs
											
										
										
											2013-11-20 05:07:25 -05:00
 								movies = tmdb.findResults{
 									def ity = it[1..3] // imdb id, tmdb id, year
-												+ make movie matching properly aware of alias names
											
										
										
											2014-01-06 18:22:31 -05:00
+									def names = getNamePermutations(it[4..-1]).findAll{ isValidMovieName(it) }
-												* final fixes for new moviedb index
											
										
										
											2013-11-21 11:31:09 -05:00
+									if (ity[0].toInteger() > 0 && ity[1].toInteger() > 0 && names.size() > 0)
-												+ rebuild movie index with imdb AND tmdb IDs
											
										
										
											2013-11-20 05:07:25 -05:00
+										return ity + names
 									else
 										return null
 								}
 								movies = treeSort(movies, { it[3, 2].join(' ') })
-												* generate review.json and easily collect good reviews in a separate csv file
											
										
										
											2013-08-10 01:23:14 -04:00
+								// sanity check
-												* fix sanity checks
											
										
										
											2014-10-31 11:59:16 -04:00
+								if (movies.size() < 20000) { die('Movie index sanity failed:' + movies.size()) }
-												* fix some GUI movie auto-selection issues
											
										
										
											2014-01-24 12:31:33 -05:00
+								pack(moviedb_out, movies*.join('\t'))
-												+ rebuild movie index with imdb AND tmdb IDs
											
										
										
											2013-11-20 05:07:25 -05:00
-												* generate review.json and easily collect good reviews in a separate csv file
											
										
										
											2013-08-10 01:23:14 -04:00
-												+ rebuild movie index with imdb AND tmdb IDs
											
										
										
											2013-11-20 05:07:25 -05:00
+								/* ------------------------------------------------------------------------- */
-												* generate review.json and easily collect good reviews in a separate csv file
											
										
										
											2013-08-10 01:23:14 -04:00
-												+ rebuild movie index with imdb AND tmdb IDs
											
										
										
											2013-11-20 05:07:25 -05:00
 								// BUILD tvdb index
-												* generate review.json and easily collect good reviews in a separate csv file
											
										
										
											2013-08-10 01:23:14 -04:00
+								def tvdb_txt = new File('tvdb.txt')
-												* account for TVDB entries being removed sometimes because of duplicates
											
										
										
											2013-12-02 13:25:06 -05:00
+								def tvdb = [:]
-												* only include popular or high-rated shows in the series index
											
										
										
											2014-03-09 08:50:03 -04:00
 								if (tvdb_txt.exists()) {
-												* XmlSlurper wastes too much memory
											
										
										
											2014-08-14 13:29:34 -04:00
+									tvdb_txt.eachLine('UTF-8'){
-												* only include popular or high-rated shows in the series index
											
										
										
											2014-03-09 08:50:03 -04:00
+										def line = it.split('\t').toList()
-												* fix fix
											
										
										
											2014-08-16 12:28:40 -04:00
+										def names = line.subList(5, line.size())
 										tvdb.put(line[1] as Integer, [line[0] as Long, line[1] as Integer, line[2], line[3] as Float, line[4] as Float] + names)
-												* only include popular or high-rated shows in the series index
											
										
										
											2014-03-09 08:50:03 -04:00
+									}
-												* generate review.json and easily collect good reviews in a separate csv file
											
										
										
											2013-08-10 01:23:14 -04:00
+								}
-												* fix
											
										
										
											2014-08-15 23:07:51 -04:00
+								def tvdb_updates = [:] as TreeMap
-												* XmlSlurper wastes too much memory
											
										
										
											2014-08-14 13:29:34 -04:00
+								new File('updates_all.xml').eachLine('UTF-8'){
 									def m = (it =~ '<Series><id>(\\d+)</id><time>(\\d+)</time></Series>')
-												* fix efficiency issues
											
										
										
											2014-08-15 22:40:39 -04:00
+									while(m.find()) {
 										def id = m.group(1) as Integer
 										// parse the update timestamp as Long: the same column is read back from tvdb.txt as Long,
 										// and an Integer conversion throws NumberFormatException for any value above 2^31-1
 										def time = m.group(2) as Long
 										tvdb_updates[id] = [id: id, time: time]
-												* XmlSlurper wastes too much memory
											
										
										
											2014-08-14 13:29:34 -04:00
+									}
 								}
-												// blacklist crap entries
											
										
										
											2014-12-05 11:21:13 -05:00
+								// blacklist crap entries
-												// blacklist crap entries
											
										
										
											2014-12-05 12:28:56 -05:00
+								tvdb_updates.remove(219901)
 								tvdb_updates.remove(256135)
-												// blacklist crap entries
											
										
										
											2014-12-05 11:21:13 -05:00
-												* XmlSlurper wastes too much memory
											
										
										
											2014-08-14 13:29:34 -04:00
-												* fix efficiency issues
											
										
										
											2014-08-15 22:40:39 -04:00
+								tvdb_updates.values().each{ update ->
-												* fix series index
											
										
										
											2014-03-10 01:34:53 -04:00
+									if (tvdb[update.id] == null || update.time > tvdb[update.id][0]) {
-												* generate review.json and easily collect good reviews in a separate csv file
											
										
										
											2013-08-10 01:23:14 -04:00
+										try {
-												* increase timeout for improved automated processing
											
										
										
											2015-02-22 05:52:33 -05:00
+											retry(2, 60000) {
-												* update series indexer
											
										
										
											2014-08-15 05:58:42 -04:00
+												def seriesNames = []
-												+ Groovy engine extensions rewrite complete :)
											
										
										
											2014-04-18 15:41:39 -04:00
+												def xml = new XmlSlurper().parse("http://thetvdb.com/api/BA864DEE427E384A/series/${update.id}/en.xml")
 												def imdbid = xml.Series.IMDB_ID.text()
-												* update series indexer
											
										
										
											2014-08-15 05:58:42 -04:00
+												seriesNames += xml.Series.SeriesName.text()
-												Support log levels
											
										
										
											2016-03-26 06:02:27 -04:00
-												+ Groovy engine extensions rewrite complete :)
											
										
										
											2014-04-18 15:41:39 -04:00
+												def rating = tryQuietly{ xml.Series.Rating.text().toFloat() }
-												* fixfix
											
										
										
											2014-05-15 04:18:50 -04:00
+												def votes = tryQuietly{ xml.Series.RatingCount.text().toFloat() }
-												Support log levels
											
										
										
											2016-03-26 06:02:27 -04:00
-												* update series indexer
											
										
										
											2014-08-15 05:58:42 -04:00
+												// only retrieve additional data for reasonably popular shows
 												if (votes >= 5 && rating >= 4) {
 													tryLogCatch{
 														if (imdbid =~ /tt(\d+)/) {
-												* fix for interface changes
											
										
										
											2014-09-15 16:41:51 -04:00
+															seriesNames += OMDb.getMovieDescriptor(new Movie(null, 0, imdbid.match(/tt(\d+)/) as int, -1), Locale.ENGLISH).getName()
-												* update series indexer
											
										
										
											2014-08-15 05:58:42 -04:00
+														}
-												* generate review.json and easily collect good reviews in a separate csv file
											
										
										
											2013-08-10 01:23:14 -04:00
+													}
-												* update series indexer
											
										
										
											2014-08-15 05:58:42 -04:00
 													// scrape extra alias titles from webpage (not supported yet by API)
-												Cache html pages (in case build fails or is interrupted so we can quickly resume)
											
										
										
											2016-04-10 11:24:15 -04:00
+													def html = Cache.getCache('thetvdb_series_page', CacheType.Persistent).text(update.id) {
 														return new URL("http://thetvdb.com/?tab=series&id=${it}")
 													}.expire(Cache.ONE_MONTH).get()
 													def jsoup = org.jsoup.Jsoup.parse(html)
-												* include intl. series names in thetvdb index (as far as possible or reasonable)
											
										
										
											2014-11-28 03:27:37 -05:00
+													def akaseries = jsoup.select('#akaseries table tr table tr')
 																				.findAll{ it.select('td').any{ it.text() ==~ /en/ } }
 																				.findResults{ it.select('td').first().text() }
 																				.findAll{ it?.length() > 0 }
-												* fixit
											
										
										
											2014-11-28 10:59:47 -05:00
+													def intlseries = jsoup.select('#seriesform input')
-												* include intl. series names in thetvdb index (as far as possible or reasonable)
											
										
										
											2014-11-28 03:27:37 -05:00
+																				.findAll{ it.attr('name') =~ /SeriesName/ }
 																				.sort{ it.attr('name').match(/\d+/) as int }
 																				.collect{ it.attr('value') }
 																				.findAll{ it?.length() > 0 }
-												Support log levels
											
										
										
											2016-03-26 06:02:27 -04:00
 													log.info "Scraped data $akaseries and $intlseries for series $seriesNames"
-												* include intl. series names in thetvdb index (as far as possible or reasonable)
											
										
										
											2014-11-28 03:27:37 -05:00
+													seriesNames += akaseries
 													seriesNames += intlseries
-												* generate review.json and easily collect good reviews in a separate csv file
											
										
										
											2013-08-10 01:23:14 -04:00
+												}
-												* update series indexer
											
										
										
											2014-08-15 05:58:42 -04:00
 												def data = [update.time, update.id, imdbid, rating ?: 0, votes ?: 0] + seriesNames.findAll{ it != null && it.length() > 0 }
-												* generate review.json and easily collect good reviews in a separate csv file
											
										
										
											2013-08-10 01:23:14 -04:00
+												tvdb.put(update.id, data)
-												Support log levels
											
										
										
											2016-03-26 06:02:27 -04:00
+												log.info "Update $update => $data"
-												* generate review.json and easily collect good reviews in a separate csv file
											
										
										
											2013-08-10 01:23:14 -04:00
+											}
 										}
 										catch(Throwable e) {
-												* fix efficiency issues
											
										
										
											2014-08-15 22:40:39 -04:00
+											printException(e, false)
-												* update series indexer
											
										
										
											2014-08-15 05:58:42 -04:00
+											def data = [update.time, update.id, '', 0, 0]
-												* generate review.json and easily collect good reviews in a separate csv file
											
										
										
											2013-08-10 01:23:14 -04:00
+											tvdb.put(update.id, data)
-												Support log levels
											
										
										
											2016-03-26 06:25:45 -04:00
+											log.info "[BAD] Update $update => $data"
-												* generate review.json and easily collect good reviews in a separate csv file
											
										
										
											2013-08-10 01:23:14 -04:00
+										}
 									}
 								}
-												* account for TVDB entries being removed sometimes because of duplicates
											
										
										
											2013-12-02 13:25:06 -05:00
 								// remove entries that have become invalid
 								tvdb.keySet().toList().each{ id ->
-												* fix efficiency issues
											
										
										
											2014-08-15 22:40:39 -04:00
+									if (tvdb_updates[id] == null) {
-												Support log levels
											
										
										
											2016-03-26 06:02:27 -04:00
+										log.info "Invalid ID found: ${tvdb[id]}"
-												* account for TVDB entries being removed sometimes because of duplicates
											
										
										
											2013-12-02 13:25:06 -05:00
+										tvdb.remove(id)
 									}
 								}
-												* fix for TheTVDB data inconsistencies
											
										
										
											2014-05-12 15:18:31 -04:00
+								tvdb.values().findResults{ it.collect{ it.toString().replace('\t', '').trim() }.join('\t') }.join('\n').saveAs(tvdb_txt)
-												* generate review.json and easily collect good reviews in a separate csv file
											
										
										
											2013-08-10 01:23:14 -04:00
-												// make sure to add known shows to the index regardless of rating
											
										
										
											2016-04-12 05:27:28 -04:00
+								// additional custom mappings
 								def extraAliasNames = csv("${dir_data}/add-series-alias.txt")
-												* generate review.json and easily collect good reviews in a separate csv file
											
										
										
											2013-08-10 01:23:14 -04:00
 								def thetvdb_index = []
-												+ make movie matching properly aware of alias names
											
										
										
											2014-01-06 18:22:31 -05:00
+								tvdb.values().each{ r ->
-												* only include popular or high-rated shows in the series index
											
										
										
											2014-03-09 08:50:03 -04:00
+									def tvdb_id = r[1]
-												* update series indexer
											
										
										
											2014-08-15 05:58:42 -04:00
+									def rating = r[3]
 									def votes = r[4]
-												* fix fix
											
										
										
											2014-08-16 12:28:40 -04:00
+									def names = r.subList(5, r.size())
-												Support log levels
											
										
										
											2016-03-26 06:02:27 -04:00
-												// make sure to add known shows to the index regardless of rating
											
										
										
											2016-04-12 05:27:28 -04:00
+									def alias = extraAliasNames[names[0]]
 									if (alias) {
 										log.fine "Add alias ${names[0]} => ${alias}"
 										names += alias
 									}
 									if (alias !=null || (votes >= 5 && rating >= 4) || (votes >= 2 && rating >= 6) || (votes >= 1 && rating >= 10)) {
-												* update series indexer
											
										
										
											2014-08-15 05:58:42 -04:00
+										getNamePermutations(names).each{ n ->
-												* only include popular or high-rated shows in the series index
											
										
										
											2014-03-09 08:50:03 -04:00
+											thetvdb_index << [tvdb_id, n]
 										}
-												* generate review.json and easily collect good reviews in a separate csv file
											
										
										
											2013-08-10 01:23:14 -04:00
+									}
 								}
-												* update exclude rules
											
										
										
											2015-11-02 22:43:54 -05:00
+								thetvdb_index = thetvdb_index.findResults{ [it[0] as Integer, it[1].replaceAll(/\s+/, ' ').trim()] }.findAll{ !(it[1] =~ /(?i:duplicate|Series.Not.Permitted)/ || it[1] =~ /\d{6,}/ || it[1].startsWith('*') || it[1].endsWith('*') || it[1].length() < 2) }
-												* fix for TheTVDB data inconsistencies
											
										
										
											2014-05-12 15:18:31 -04:00
+								thetvdb_index = thetvdb_index.sort{ a, b -> a[0] <=> b[0] }
-												* generate review.json and easily collect good reviews in a separate csv file
											
										
										
											2013-08-10 01:23:14 -04:00
 								// join and sort
-												* allow more than 4 alias titles (e.g. in all languages) and see what happens
											
										
										
											2014-11-28 14:26:57 -05:00
+								def thetvdb_txt = thetvdb_index.groupBy{ it[0] }.findResults{ k, v -> ([k.pad(6)] + v*.getAt(1).unique{ it.toLowerCase() }).join('\t') }
-												* fix the "law.and.order.svu" special-fuckin-corner-case
											
										
										
											2013-12-15 13:35:41 -05:00
-												* generate review.json and easily collect good reviews in a separate csv file
											
										
										
											2013-08-10 01:23:14 -04:00
+								// sanity check
-												* some improvements for amc processing logic
											
										
										
											2014-04-19 12:54:25 -04:00
+								if (thetvdb_txt.size() < 4000) { die('TheTVDB index sanity failed: ' + thetvdb_txt.size()) }
-												* fix some GUI movie auto-selection issues
											
										
										
											2014-01-24 12:31:33 -05:00
+								pack(thetvdb_out, thetvdb_txt)
-												* generate review.json and easily collect good reviews in a separate csv file
											
										
										
											2013-08-10 01:23:14 -04:00
-												+ rebuild movie index with imdb AND tmdb IDs
											
										
										
											2013-11-20 05:07:25 -05:00
+								/* ------------------------------------------------------------------------- */
-												* generate review.json and easily collect good reviews in a separate csv file
											
										
										
											2013-08-10 01:23:14 -04:00
-												* include movie alias titles in osdb index
											
										
										
											2015-05-24 18:54:59 -04:00
+								// BUILD osdb index
 								def osdb = []
 								new File('osdb.txt').eachLine('UTF-8'){
 									def fields = it.split(/\t/)*.trim()
 									// 0 IDMovie, 1 IDMovieImdb, 2 MovieName, 3 MovieYear, 4 MovieKind, 5 MoviePriority
 									if (fields.size() == 6 && fields[1] ==~ /\d+/ && fields[3] ==~ /\d{4}/) {
-												* augment osdb index with thetvdb series alias data
											
										
										
											2015-05-25 04:28:38 -04:00
+										if (fields[4] ==~ /movie|tv.series/ && isValidMovieName(fields[2]) && (fields[3] as int) >= 1970 && (fields[5] as int) >= 500) {
-												* include movie alias titles in osdb index
											
										
										
											2015-05-24 18:54:59 -04:00
+											// 0 imdbid, 1 name, 2 year, 3 kind, 4 priority
 											osdb << [fields[1] as int, fields[2], fields[3] as int, fields[4] == /movie/ ? 'm' : fields[4] == /tv series/ ? 's' : '?', fields[5] as int]
 										}
 									}
 								}
 								// sort reverse by score
 								osdb.sort{ a, b -> b[4] <=> a[4] }
 								// reset score/priority because it's currently not used
 								osdb*.set(4, 0)
-												* augment osdb index with thetvdb series alias data
											
										
										
											2015-05-25 04:28:38 -04:00
+								// map by imdbid
 								def tvdb_index = tvdb.values().findAll{ it[2] =~ /tt(\d+)/ }.collectEntries{ [it[2].substring(2).pad(7), it] }
-												* include movie alias titles in osdb index
											
										
										
											2015-05-24 18:54:59 -04:00
+								// collect final output data
 								osdb = osdb.findResults{
 									def names = [it[1]]
 									if (it[3] == 'm') {
 										def tmdb_entry = tmdb_index[it[0].pad(7)]
-												* augment osdb index with thetvdb series alias data
											
										
										
											2015-05-25 04:28:38 -04:00
+										if (tmdb_entry != null && tmdb_entry.size() > 4) {
-												* include movie alias titles in osdb index
											
										
										
											2015-05-24 18:54:59 -04:00
+											names += tmdb_entry[4..-1]
 										}
-												* augment osdb index with thetvdb series alias data
											
										
										
											2015-05-25 04:28:38 -04:00
+									} else if (it[3] == 's') {
 										def tvdb_entry = tvdb_index[it[0].pad(7)]
 										if (tvdb_entry != null && tvdb_entry.size() > 5) {
 											names += tvdb_entry[5..-1]
 										}
-												* include movie alias titles in osdb index
											
										
										
											2015-05-24 18:54:59 -04:00
+									}
-												* augment osdb index with thetvdb series alias data
											
										
										
											2015-05-25 04:28:38 -04:00
+									// 0 kind, 1 score, 2 imdbid, 3 year, 4-n names
-												* include movie alias titles in osdb index
											
										
										
											2015-05-24 18:54:59 -04:00
+									return [it[3], it[4], it[0], it[2]] + names.unique()
 								}
 								// sanity check
-												* fix sanity requirements
											
										
										
											2015-06-13 01:03:55 -04:00
+								if (osdb.size() < 15000) { die('OSDB index sanity failed:' + osdb.size()) }
-												* include movie alias titles in osdb index
											
										
										
											2015-05-24 18:54:59 -04:00
+								pack(osdb_out, osdb*.join('\t'))
-												* refactor
											
										
										
											2015-05-25 04:41:33 -04:00
 								/* ------------------------------------------------------------------------- */
 								// BUILD anidb index
-												// make sure to add known shows to the index regardless of rating
											
										
										
											2016-04-12 05:27:28 -04:00
+								def anidb = new AnidbClient('filebot', 6).getAnimeTitles() as List
 								def animeExcludes = [] as Set
-												exclude anime movies from anime index
											
										
										
											2016-02-05 11:31:53 -05:00
 								// exclude anime movies from anime index
 								new File('anime-list.xml').eachLine('UTF-8') {
-												exclude anime movies from anime index
											
										
										
											2016-02-06 07:23:53 -05:00
+								    // exclude entries flagged as movies, or entries that carry an IMDb id (tt followed by digits);
+								    // the previous pattern "ttd\+" only matched the literal text "ttd+" and so never matched an id
+								    if (it =~ /tvdbid="movie"/ || it =~ /imdbid="tt\d+"/) {
-												exclude anime movies from anime index
											
										
										
											2016-02-05 11:31:53 -05:00
+								        animeExcludes << it.match(/anidbid="(\d+)"/).toInteger()
 								    }
 								}
-												* refactor
											
										
										
											2015-05-25 04:41:33 -04:00
 								def anidb_index = anidb.findResults{
-												Unify SearchResult classes
											
										
										
											2016-03-26 13:40:54 -04:00
+									if (animeExcludes.contains(it.id))
-												exclude anime movies from anime index
											
										
										
											2016-02-05 11:31:53 -05:00
+										return null
-												* refactor
											
										
										
											2015-05-25 04:41:33 -04:00
+									def names = it.effectiveNames*.replaceAll(/\s+/, ' ')*.trim()*.replaceAll(/['`´‘’ʻ]+/, /'/)
 									names = getNamePermutations(names)
 									names = names.findAll{ stripReleaseInfo(it)?.length() > 0 }
-												Unify SearchResult classes
											
										
										
											2016-03-26 13:40:54 -04:00
+									return names.empty ? null : [it.id.pad(5)] + names.take(4)
-												* refactor
											
										
										
											2015-05-25 04:41:33 -04:00
+								}
 								// join and sort
 								def anidb_txt = anidb_index.findResults{ row -> row.join('\t') }.sort().unique()
 								// sanity check
-												exclude anime movies from anime index
											
										
										
											2016-02-05 11:31:53 -05:00
+								// abort if either the title index or the movie exclude list is implausibly small;
+								// report both counts so the failing condition can be identified from the message
+								if (anidb_txt.size() < 8000 || animeExcludes.size() < 500) { die('AniDB index sanity failed:' + anidb_txt.size() + ' (excludes: ' + animeExcludes.size() + ')') }
-												* refactor
											
										
										
											2015-05-25 04:41:33 -04:00
+								pack(anidb_out, anidb_txt)