Refactor BuildData.groovy

2024-11-15 05:45:05 -05:00 · 2016-11-19 20:16:00 +08:00 · 2016-11-19 20:16:00 +08:00 · a424b55b8e
commit a424b55b8e
parent 592e3ae608
2 changed files with 47 additions and 35 deletions
--- a/build-data/BuildData.groovy
+++ b/build-data/BuildData.groovy
@ -7,11 +7,11 @@ import org.tukaani.xz.*
 /* ------------------------------------------------------------------------- */


-def dir_root    = ".."
-def dir_website = "${dir_root}/website"
-def dir_data    = "${dir_website}/data"
+def dir_root    = ('..' as File).getCanonicalFile()
+def dir_website = dir_root.resolve('website')
+def dir_data    = dir_website.resolve('data')

-new File(dir_data).mkdirs()
+dir_data.mkdirs()

 // sort and check shared regex collections
 def dir_data_master = System.getProperty('net.filebot.data.master', 'https://raw.githubusercontent.com/filebot/data/master')
@ -23,7 +23,7 @@ def dir_data_master = System.getProperty('net.filebot.data.master', 'https://raw
 'series-mappings.txt'
 ].each{
 	def input = new URL(dir_data_master + '/' + it)
-	def output = new File("${dir_data}/${it}")
+	def output = dir_data.resolve(it)

 	log.fine "Fetch $input"
 	def lines = new TreeSet(String.CASE_INSENSITIVE_ORDER)
@ -40,7 +40,8 @@ def dir_data_master = System.getProperty('net.filebot.data.master', 'https://raw


 def reviews = []
-new File("${dir_root}/reviews.tsv").eachLine('UTF-8'){
+
+dir_root.resolve('reviews.tsv').eachLine('UTF-8'){
 	def s = it.split(/\t/, 3)*.trim()
 	reviews << [user: s[0], date: s[1], text: s[2]]
 }
@ -48,17 +49,17 @@ reviews = reviews.sort{ it.date }

 def json = new groovy.json.JsonBuilder()
 json.call(reviews as List)
-json.toPrettyString().saveAs("${dir_website}/reviews.json")
+json.toPrettyString().saveAs(dir_website.resolve('reviews.json'))
 log.info "Reviews: " + reviews.size()


 /* ------------------------------------------------------------------------- */


-def moviedb_out = new File("${dir_data}/moviedb.txt")
-def thetvdb_out = new File("${dir_data}/thetvdb.txt")
-def anidb_out   = new File("${dir_data}/anidb.txt")
-def osdb_out    = new File("${dir_data}/osdb.txt")
+def moviedb_out = dir_data.resolve('moviedb.txt')
+def thetvdb_out = dir_data.resolve('thetvdb.txt')
+def anidb_out   = dir_data.resolve('anidb.txt')
+def osdb_out    = dir_data.resolve('osdb.txt')


 def pack(file, lines) {
@ -157,7 +158,7 @@ def omdb = new TreeMap()
 }


-def tmdb_txt = new File('tmdb.txt')
+def tmdb_txt = 'tmdb.txt' as File
 def tmdb_index = csv(tmdb_txt, '\t', 1, [0..-1])

 def tmdb = []
@ -207,19 +208,19 @@ pack(moviedb_out, movies*.join('\t'))


 // BUILD tvdb index
-def tvdb_txt = new File('tvdb.txt')
+def tvdb_txt = 'tvdb.txt' as File
 def tvdb = [:]

 if (tvdb_txt.exists()) {
 	tvdb_txt.eachLine('UTF-8'){
-		def line = it.split('\t').toList()
-		def names = line.subList(5, line.size())
-		tvdb.put(line[1] as Integer, [line[0] as Long, line[1] as Integer, line[2], line[3] as Float, line[4] as Float] + names)
+		def line = it.split('\t') as List
+
+		tvdb.put(line[1] as Integer, [line[0] as Long, line[1] as Integer, line[2], line[3] as Float, line[4] as Float, line[5] as Integer] + line[6..<line.size()])
 	}
 }

 def tvdb_updates = [:] as TreeMap
-new File('updates_all.xml').eachLine('UTF-8'){
+('updates_all.xml' as File).eachLine('UTF-8'){
 	def m = (it =~ '<Series><id>(\\d+)</id><time>(\\d+)</time></Series>')
 	while(m.find()) {
 		def id = m.group(1) as Integer
@ -239,19 +240,19 @@ tvdb_updates.values().each{ update ->
 			retry(2, 60000) {
 				def seriesNames = []
 				def xml = new XmlSlurper().parse("http://thetvdb.com/api/BA864DEE427E384A/series/${update.id}/en.xml")
-				def imdbid = xml.Series.IMDB_ID.text()
+				def imdbid = any{ xml.Series.IMDB_ID.text().match(/tt\d+/) }{ '' }
+
 				seriesNames += xml.Series.SeriesName.text()

-				def rating = tryQuietly{ xml.Series.Rating.text().toFloat() }
-				def votes = tryQuietly{ xml.Series.RatingCount.text().toFloat() }
+				def rating = any{ xml.Series.Rating.text().toFloat() }{ 0 }
+				def votes = any{ xml.Series.RatingCount.text().toFloat() }{ 0 }
+				def year = any{ xml.Series.FirstAired.text().match(/\d{4}/) as Integer }{ 0 }

 				// only retrieve additional data for reasonably popular shows
-				if (votes >= 5 && rating >= 4) {
+				if (imdbid && votes >= 5 && rating >= 4) {
 					tryLogCatch{
-						if (imdbid =~ /tt(\d+)/) {
 						seriesNames += OMDb.getMovieDescriptor(new Movie(imdbid.match(/tt(\d+)/) as int), Locale.ENGLISH).getName()
 					}
-					}

 					// scrape extra alias titles from webpage (not supported yet by API)
 					def html = Cache.getCache('thetvdb_series_page', CacheType.Persistent).text(update.id) { 
@ -274,14 +275,14 @@ tvdb_updates.values().each{ update ->
 					seriesNames += intlseries
 				}

-				def data = [update.time, update.id, imdbid, rating ?: 0, votes ?: 0] + seriesNames.findAll{ it != null && it.length() > 0 }
+				def data = [update.time, update.id, imdbid, rating, votes, year] + seriesNames.findAll{ it != null && it.length() > 0 }
 				tvdb.put(update.id, data)
 				log.info "Update $update => $data"
 			}
 		}
 		catch(Throwable e) {
 			printException(e)
-			def data = [update.time, update.id, '', 0, 0]
+			def data = [update.time, update.id, '', 0, 0, 0]
 			tvdb.put(update.id, data)
 			log.info "[BAD] Update $update => $data"
 		}
@ -291,21 +292,27 @@ tvdb_updates.values().each{ update ->
 // remove entries that have become invalid
 tvdb.keySet().toList().each{ id ->
 	if (tvdb_updates[id] == null) {
-		log.info "Invalid ID found: ${tvdb[id]}"
+		log.finest "Invalid ID found: ${tvdb[id]}"
 		tvdb.remove(id)
 	}
 }
 tvdb.values().findResults{ it.collect{ it.toString().replace('\t', '').trim() }.join('\t') }.join('\n').saveAs(tvdb_txt)

 // additional custom mappings
-def extraAliasNames = csv("${dir_data}/add-series-alias.txt")
+def extraAliasNames = csv(dir_data.resolve('add-series-alias.txt'), '\t', 0, [1..-1])

 def thetvdb_index = []
 tvdb.values().each{ r ->
 	def tvdb_id = r[1]
 	def rating = r[3]
 	def votes = r[4]
-	def names = r.subList(5, r.size())
+	def year = r[5]
+	def names = r[6..<r.size()]
+
+	// ignore invalid entries
+	if (names.isEmpty()) {
+		return
+	}

 	def alias = extraAliasNames[names[0]]
 	if (alias) {
@ -313,7 +320,12 @@ tvdb.values().each{ r ->
 		names += alias
 	}

-	if (alias !=null || (votes >= 5 && rating >= 4) || (votes >= 2 && rating >= 6) || (votes >= 1 && rating >= 10)) {
+	if (year > 0 && !names[0].endsWith(" ($year)")) {
+		names.add(1, names[0] + " ($year)")
+	}
+
+	// always include if alias has been manually added
+	if (alias != null || (votes >= 5 && rating >= 4) || (votes >= 2 && rating >= 6) || (votes >= 1 && rating >= 10)) {
 		getNamePermutations(names).each{ n ->
 			thetvdb_index << [tvdb_id, n]
 		}
@ -337,7 +349,7 @@ pack(thetvdb_out, thetvdb_txt)
 // BUILD osdb index
 def osdb = []

-new File('osdb.txt').eachLine('UTF-8'){
+('osdb.txt' as File).eachLine('UTF-8'){
 	def fields = it.split(/\t/)*.trim()

 	// 0 IDMovie, 1 IDMovieImdb, 2 MovieName, 3 MovieYear, 4 MovieKind, 5 MoviePriority
@ -389,7 +401,7 @@ def anidb = new AnidbClient('filebot', 6).getAnimeTitles() as List
 def animeExcludes = [] as Set

 // exclude anime movies from anime index
-new File('anime-list.xml').eachLine('UTF-8') {
+('anime-list.xml' as File).eachLine('UTF-8') {
    if (it =~ /tvdbid="movie"/ || it =~ /imdbid="ttd\+"/) {
        animeExcludes << it.match(/anidbid="(\d+)"/).toInteger()
    }
--- a/source/net/filebot/format/ExpressionFormatFunctions.java
+++ b/source/net/filebot/format/ExpressionFormatFunctions.java
@ -80,14 +80,14 @@ public class ExpressionFormatFunctions {
 	}

 	public static List<String> readLines(String path) throws IOException {
-		return FileUtilities.readLines(getFile(path));
+		return FileUtilities.readLines(getUserFile(path));
 	}

 	public static Object readXml(String path) throws Exception {
-		return new XmlSlurper().parse(getFile(path));
+		return new XmlSlurper().parse(getUserFile(path));
 	}

-	public static File getFile(String path) {
+	public static File getUserFile(String path) {
 		File f = new File(path);

 		if (!f.isAbsolute()) {