From 7226dd6fc4f9eedd3a98af5cb52e13ccce1e7d23 Mon Sep 17 00:00:00 2001
From: Reinhard Pointner
Date: Tue, 12 Mar 2013 13:43:35 +0000
Subject: [PATCH] * misc

---
 BuildData.groovy                         | 160 -----------------------
 source/net/sourceforge/filebot/Main.java |   2 +-
 website/data/release-groups.txt          |   2 +
 website/data/series-mappings.txt         |   1 +
 website/index.html                       |   4 +-
 5 files changed, 6 insertions(+), 163 deletions(-)
 delete mode 100644 BuildData.groovy

diff --git a/BuildData.groovy b/BuildData.groovy
deleted file mode 100644
index 52427700..00000000
--- a/BuildData.groovy
+++ /dev/null
@@ -1,160 +0,0 @@
-// filebot -script BuildData.groovy
-
-
-// UPDATE release-groups.txt FROM http://scenegrouplist.com/lists_sgl.php
-@Grab(group='org.jsoup', module='jsoup', version='1.7.1')
-import org.jsoup.*
-
-/*
-def sgl = []
-for (def page = 0; true; page++) {
-	def dom = Jsoup.parse(new URL('http://scenegrouplist.com/lists_sgl.php?pageNum_RSSGL=' + page), 10000)
-	def table = dom.select("table[border=1] tr").collect{ it.select("td")*.text()*.trim() }.findAll{ it[2] =~ /\d{4}/ }
-	sgl += table
-	if (table.empty) break
-}
-sgl = sgl.findAll{ it[1] =~ /\b(DVD|DVDR|HD|TV)\b/ && it[0] =~ /\p{Alpha}/}.findResults{ it[0] }
-sgl = sgl.collect{ it.before(/ - /).trim().space('.') }.collect{ it ==~ /\p{Upper}?\p{Lower}+/ ? it.toUpperCase() : it }
-
-// append release group names
-new File('website/data/release-groups.txt') << '\n' << sgl.join('\n')
-*/
-
-
-// ------------------------------------------------------------------------- //
-
-
-def sortRegexList(path) {
-	def set = new TreeSet(String.CASE_INSENSITIVE_ORDER)
-	new File(path).eachLine('UTF-8'){
-		// check if regex compiles
-		set += java.util.regex.Pattern.compile(it).pattern()
-	}
-
-	def out = set.join('\n').saveAs(path)
-	println "$out\n$out.text\n"
-}
-
-
-// sort and check shared regex collections
-sortRegexList("website/data/release-groups.txt")
-sortRegexList("website/data/query-blacklist.txt")
-sortRegexList("website/data/exclude-blacklist.txt")
-sortRegexList("website/data/series-mappings.txt")
-
-
-// ------------------------------------------------------------------------- //
-
-
-def series_out = new File("website/data/series.list.gz")
-def movies_out = new File("website/data/movies.txt.gz")
-def thetvdb_out = new File("website/data/thetvdb.txt.gz")
-
-def gz(file, lines) {
-	file.withOutputStream{ out ->
-		new java.util.zip.GZIPOutputStream(out).withWriter('UTF-8'){ writer ->
-			lines.each{ writer.append(it).append('\n') }
-		}
-	}
-}
-
-
-// ------------------------------------------------------------------------- //
-
-// BUILD movies.txt.gz
-def omdb = new TreeSet({ a, b -> a[0].compareTo(b[0]) } as Comparator)
-new File('omdb.txt').eachLine('Windows-1252'){
-	def line = it.split(/\t/)
-
-	if (line.length > 11 && line[0] ==~ /\d+/) {
-		def imdbid = line[1].substring(2).toInteger()
-		def name = line[2]
-		def year = line[3].toInteger()
-		def runtime = line[5]
-		def rating = tryQuietly{ line[11].toFloat() } ?: 0
-		def votes = tryQuietly{ line[12].replaceAll(/\D/, '').toInteger() } ?: 0
-
-		if ((year >= 1970 && runtime =~ /h/ && rating >= 1 && votes >= 50) || (votes >= 2000)) {
-			line = line*.replaceAll(/\s+/, ' ')*.trim()
-			omdb << [imdbid, name, year]
-		}
-	}
-}
-omdb = omdb.findAll{ it[0] <= 9999999 && it[1] =~ /^[A-Z0-9]/ && it[1] =~ /[\p{Alpha}]{3}/ }.collect{ [it[0].pad(7), it[1], it[2]] }
-
-// save movie data
-def movies = omdb.findAll{ it.size() >= 3 && !it[1].startsWith('"') }
-def movieSorter = new TreeMap(String.CASE_INSENSITIVE_ORDER)
-movies.each{ movieSorter.put([it[1], it[2], it[0]].join('\t'), it) }
-movies = movieSorter.values().collect{ it.join('\t') }
-
-gz(movies_out, movies)
-println "Movie Count: " + movies.size()
-
-
-// ------------------------------------------------------------------------- //
-
-// BUILD thetvdb-index.gz
-def thetvdb_index_url = new URL('http://thetvdb.com/?string=&searchseriesid=&tab=listseries&function=Search')
-def thetvdb_index = thetvdb_index_url.fetch().getHtml('UTF-8')
-.depthFirst().TABLE.find{it['@id'] == "listtable"}
-.depthFirst().TR.findAll{ it.TD.size() == 3 && it.TD[1].text() == 'English' && it.TD[0].A.text() }
-.findResults{ [it.TD[2].text(), it.TD[0].A.text()] }
-
-thetvdb_index = thetvdb_index.findResults{ [it[0] as Integer, it[1].replaceAll(/\s+/, ' ').trim()] }.findAll{ !(it[1] =~ /(?i:duplicate)/ || it[1] =~ /\d{6,}/ || it[1].startsWith('*') || it[1].endsWith('*') || it[1].empty) } as HashSet
-
-// join and sort
-def thetvdb = thetvdb_index.findResults{ [it[0].pad(6), it[1]].join('\t') }.sort()
-gz(thetvdb_out, thetvdb)
-println "TheTVDB Index: " + thetvdb.size()
-
-
-// BUILD series.list.gz
-
-// TheTVDB
-def thetvdb_names = thetvdb_index.findResults{ it[1] }
-
-// AniDB
-def anidb_names = net.sourceforge.filebot.WebServices.AniDB.getAnimeTitles().findResults{ [it.getPrimaryTitle(), it.getOfficialTitle('en')] }.flatten()
-
-/*
-// IMDb series list
-def imdb_series_names = imdb.findAll{ it.size() >= 3 && it[1].startsWith('"') }.collect{ it[1] }
-
-// Dokuwiki list
-def dokuwiki_index = new URL('http://docuwiki.net/postbot/getList.php?subject=Name')
-def doku_names = []
-dokuwiki_index.getText('UTF-8').eachLine{
-	doku_names << it.trim().replaceTrailingBrackets()
-}
-*/
-
-def names = [thetvdb_names, anidb_names]
-names.each{ if (it.size() == 0) throw new Exception("Failed to scrape series names") } // sanity check
-names = names.flatten().findAll{ it =~ /^[A-Z0-9]/ && it =~ /[\p{Alpha}]{3}/}.findResults{ net.sourceforge.filebot.similarity.Normalization.normalizePunctuation(it) } // collect and normalize names
-
-def seriesSorter = new TreeSet(String.CASE_INSENSITIVE_ORDER)
-seriesSorter.addAll(names)
-names = seriesSorter as List
-
-
-gz(series_out, names)
-println "Series Count: " + names.size()
-
-
-
-
-// prepare reviews from SF.net for website
-def reviewPage = retry(10, 1000){ Jsoup.connect('https://sourceforge.net/projects/filebot/reviews/?sort=usefulness&filter=thumbs_up').get() }
-def reviews = reviewPage.select('article[itemtype~=Review]').findResults{ article ->
-	article.select('*[itemprop=reviewBody]').findAll{ !(it.attr('class') =~ /spam/) }.findResults{ review ->
-		[user:article.select('*[itemprop=name]').text(), date:article.select('*[datetime]').text(), text:review.text()]
-	}
-}.flatten()
-
-reviews = reviews.findAll{ it.user && !(it.date =~ /[a-z]/) }
-
-use (groovy.json.JsonOutput) {
-	println "Reviews: ${reviews.size()}"
-	reviews.toJson().prettyPrint().saveAs('website/reviews.json')
-}
diff --git a/source/net/sourceforge/filebot/Main.java b/source/net/sourceforge/filebot/Main.java
index 4334c439..a95681a9 100644
--- a/source/net/sourceforge/filebot/Main.java
+++ b/source/net/sourceforge/filebot/Main.java
@@ -222,7 +222,7 @@ public class Main {
 			try {
 				PreferencesEntry<String> persistentDonateLv = Settings.forPackage(Main.class).entry("donate.lv").defaultValue("0");
 				int donateLv = Integer.parseInt(persistentDonateLv.getValue());
-				int donateStep = 10000;
+				int donateStep = 1000;
 				int usage = history.totalSize();
 
 				if (usage / donateStep > donateLv || new Random().nextInt(100) == 42) {
diff --git a/website/data/release-groups.txt b/website/data/release-groups.txt
index 10d04505..8a55ead3 100644
--- a/website/data/release-groups.txt
+++ b/website/data/release-groups.txt
@@ -224,6 +224,7 @@ BRMP
 BRUTUS
 BRZONE
 BS
+BTN
 BTSD
 BTSFilms
 BTT
@@ -331,6 +332,7 @@ CULTHD
 CuMBuCKeTS
 CYBERMEN
 CyberTyger
+Cyphanix
 D-Z0N3
 D0PE
 D2V
diff --git a/website/data/series-mappings.txt b/website/data/series-mappings.txt
index 1bb00a83..faf118d9 100644
--- a/website/data/series-mappings.txt
+++ b/website/data/series-mappings.txt
@@ -1,3 +1,4 @@
 HIMYM	How I Met your Mother
 Hml8p	Homeland
+hoc	House of Cards
 NCIS.LA	NCIS: Los Angeles
\ No newline at end of file
diff --git a/website/index.html b/website/index.html
index b2cffc29..1d150d3a 100644
--- a/website/index.html
+++ b/website/index.html
@@ -153,7 +153,7 @@