2012-07-13 07:41:50 -04:00
// filebot -script BuildData.groovy
2012-02-24 08:39:32 -05:00
2012-07-13 07:41:50 -04:00
2012-08-10 21:20:19 -04:00
// UPDATE release-groups.txt FROM http://scenegrouplist.com/lists_sgl.php
2012-12-14 10:23:38 -05:00
@Grab ( group = 'org.jsoup' , module = 'jsoup' , version = '1.7.1' )
2012-08-10 21:20:19 -04:00
import org.jsoup.*
2013-03-06 03:34:43 -05:00
/ *
2012-08-10 21:20:19 -04:00
def sgl = [ ]
for ( def page = 0 ; true ; page + + ) {
def dom = Jsoup . parse ( new URL ( 'http://scenegrouplist.com/lists_sgl.php?pageNum_RSSGL=' + page ) , 10000 )
def table = dom . select ( "table[border=1] tr" ) . collect { it . select ( "td" ) * . text ( ) * . trim ( ) } . findAll { it [ 2 ] = ~ /\d{4}/ }
sgl + = table
if ( table . empty ) break
}
sgl = sgl . findAll { it [ 1 ] = ~ /\b(DVD|DVDR|HD|TV)\b/ & & it [ 0 ] = ~ /\p{Alpha}/ } . findResults { it [ 0 ] }
sgl = sgl . collect { it . before ( / - / ) . trim ( ) . space ( '.' ) } . collect { it = = ~ /\p{Upper}?\p{Lower}+/ ? it . toUpperCase ( ) : it }
// append release group names
new File ( 'website/data/release-groups.txt' ) < < '\n' < < sgl . join ( '\n' )
2013-03-06 03:34:43 -05:00
* /
2012-08-10 21:20:19 -04:00
// ------------------------------------------------------------------------- //
2012-07-13 07:41:50 -04:00
/**
 * Validate and normalize a shared regex list file in place.
 *
 * Reads the file at `path` line by line as UTF-8 and compiles every line with
 * java.util.regex.Pattern.compile, so an invalid expression fails the script
 * right here. The patterns are collected in a TreeSet ordered by
 * String.CASE_INSENSITIVE_ORDER (which also drops entries that differ only by
 * case), joined with newlines and written back to `path` via saveAs.
 * Finally the value returned by saveAs and its `text` are printed.
 *
 * @param path path of the regex list file to check and rewrite
 */
def sortRegexList(path) {
    def set = new TreeSet(String.CASE_INSENSITIVE_ORDER)
    new File(path).eachLine('UTF-8') {
        // check if regex compiles
        // add in place: `set += x` would build a brand new set for every single line
        set << java.util.regex.Pattern.compile(it).pattern()
    }
    def out = set.join('\n').saveAs(path)
    println "$out\n$out.text\n"
}
// validate and sort each of the shared regex collections, in this order
def sharedRegexLists = [
    "website/data/release-groups.txt",
    "website/data/query-blacklist.txt",
    "website/data/exclude-blacklist.txt",
    "website/data/series-mappings.txt"
]
for (regexList in sharedRegexLists) {
    sortRegexList(regexList)
}
2012-07-13 07:41:50 -04:00
// ------------------------------------------------------------------------- //
2012-10-14 07:57:25 -04:00
// Output files of this script; each one is passed to a gz(...) call further down.
// series names
def series_out = new File ( "website/data/series.list.gz" )
// movie index
def movies_out = new File ( "website/data/movies.txt.gz" )
// TheTVDB index
def thetvdb_out = new File ( "website/data/thetvdb.txt.gz" )
2012-02-26 12:02:54 -05:00
/**
 * Write the given lines to a gzip-compressed UTF-8 text file.
 * Every element of `lines` is appended followed by a single '\n'.
 *
 * @param file  target file; its content is replaced
 * @param lines character sequences to write, one per line
 */
def gz(file, lines) {
    file.withOutputStream { rawStream ->
        def zipStream = new java.util.zip.GZIPOutputStream(rawStream)
        // withWriter closes the writer when the closure is done
        zipStream.withWriter('UTF-8') { sink ->
            lines.each { entry ->
                sink.append(entry).append('\n')
            }
        }
    }
}
// ------------------------------------------------------------------------- //
2012-07-13 07:41:50 -04:00
// BUILD movies.txt.gz
// Collect [imdbid, name, year] entries from the tab-separated omdb.txt file.
// The set is ordered by the numeric id, so a second row with the same id is dropped.
def omdb = new TreeSet({ a, b -> a[0].compareTo(b[0]) } as Comparator)
new File('omdb.txt').eachLine('Windows-1252') {
    def line = it.split(/\t/)
    // only rows with at least 12 columns and an all-digit first column are processed
    if (line.length > 11 && line[0] ==~ /\d+/) {
        def imdbid = line[1].substring(2).toInteger() // skip the first 2 characters of the id column
        def name = line[2]
        def year = line[3].toInteger()
        def runtime = line[5]
        // NOTE(review): tryQuietly presumably yields null when the closure throws
        // (e.g. unparsable value, or no column 12 at all), in which case 0 is used — confirm
        def rating = tryQuietly { line[11].toFloat() } ?: 0
        def votes = tryQuietly { line[12].replaceAll(/\D/, '').toInteger() } ?: 0
        if ((year >= 1970 && runtime =~ /h/ && rating >= 1 && votes >= 50) || (votes >= 2000)) {
            // Collapse whitespace on the value that is actually stored. Previously the
            // normalized copy of the row was assigned to `line` and never read again,
            // so names went into the index unnormalized.
            omdb << [imdbid, name.replaceAll(/\s+/, ' ').trim(), year]
        }
    }
}

// keep ids of at most 7 digits and names that start with A-Z or 0-9 and contain 3 consecutive letters
omdb = omdb.findAll { it[0] <= 9999999 && it[1] =~ /^[A-Z0-9]/ && it[1] =~ /[\p{Alpha}]{3}/ }.collect { [it[0].pad(7), it[1], it[2]] }

// save movie data
// names starting with a double quote are excluded from the movie list
def movies = omdb.findAll { it.size() >= 3 && !it[1].startsWith('"') }
// sort case-insensitively by "name<TAB>year<TAB>id"
def movieSorter = new TreeMap(String.CASE_INSENSITIVE_ORDER)
movies.each { movieSorter.put([it[1], it[2], it[0]].join('\t'), it) }
// each output line is "id<TAB>name<TAB>year"
movies = movieSorter.values().collect { it.join('\t') }
gz(movies_out, movies)
println "Movie Count: " + movies.size()
// ------------------------------------------------------------------------- //
2012-10-14 07:57:25 -04:00
// BUILD thetvdb.txt.gz
def thetvdb_index_url = new URL('http://thetvdb.com/?string=&searchseriesid=&tab=listseries&function=Search')

// locate the table with id "listtable" in the fetched page
def listingHtml = thetvdb_index_url.fetch().getHtml('UTF-8')
def listingTable = listingHtml.depthFirst().TABLE.find { it['@id'] == "listtable" }

// keep rows with exactly 3 cells, 'English' in the 2nd cell and a non-empty link text in the 1st cell
def englishRows = listingTable.depthFirst().TR.findAll { row ->
    row.TD.size() == 3 && row.TD[1].text() == 'English' && row.TD[0].A.text()
}

// [text of 3rd cell, link text of 1st cell]
def thetvdb_index = englishRows.findResults { row -> [row.TD[2].text(), row.TD[0].A.text()] }

// convert the first element to Integer, collapse whitespace in the name, then drop unwanted names
thetvdb_index = thetvdb_index.findResults { entry ->
    [entry[0] as Integer, entry[1].replaceAll(/\s+/, ' ').trim()]
}.findAll { entry ->
    def title = entry[1]
    !(title =~ /(?i:duplicate)/ || title =~ /\d{6,}/ || title.startsWith('*') || title.endsWith('*') || title.empty)
} as HashSet

// join and sort
def thetvdb = thetvdb_index.findResults { entry -> [entry[0].pad(6), entry[1]].join('\t') }.sort()
gz(thetvdb_out, thetvdb)
println "TheTVDB Index: " + thetvdb.size()
2012-02-26 12:02:54 -05:00
// BUILD series.list.gz

// TheTVDB: second element of every index entry
def thetvdb_names = thetvdb_index.findResults { entry -> entry[1] }

// AniDB: primary title and official 'en' title of every anime title record, as one flat list
def anidbTitles = net.sourceforge.filebot.WebServices.AniDB.getAnimeTitles()
def anidb_names = anidbTitles.findResults { title ->
    [title.getPrimaryTitle(), title.getOfficialTitle('en')]
}.flatten()
/ *
2012-07-15 22:36:49 -04:00
// IMDb series list
def imdb_series_names = imdb . findAll { it . size ( ) > = 3 & & it [ 1 ] . startsWith ( '"' ) } . collect { it [ 1 ] }
// Dokuwiki list
2012-07-13 07:41:50 -04:00
def dokuwiki_index = new URL ( 'http://docuwiki.net/postbot/getList.php?subject=Name' )
def doku_names = [ ]
dokuwiki_index . getText ( 'UTF-8' ) . eachLine {
doku_names < < it . trim ( ) . replaceTrailingBrackets ( )
}
* /
2012-02-23 13:48:35 -05:00
2012-07-15 22:36:49 -04:00
def names = [thetvdb_names, anidb_names]

// sanity check: abort if any of the sources came back empty
for (source in names) {
    if (source.size() == 0) throw new Exception("Failed to scrape series names")
}

// collect and normalize names:
// keep names that start with A-Z or 0-9 and contain 3 consecutive letters
def candidates = names.flatten().findAll { it =~ /^[A-Z0-9]/ && it =~ /[\p{Alpha}]{3}/ }
names = candidates.findResults { candidate ->
    net.sourceforge.filebot.similarity.Normalization.normalizePunctuation(candidate)
}

// sort case-insensitively; names that differ only by case collapse into one
def seriesSorter = new TreeSet(String.CASE_INSENSITIVE_ORDER)
seriesSorter.addAll(names)
names = seriesSorter as List

gz(series_out, names)
println "Series Count: " + names.size()
2012-12-14 10:23:38 -05:00
// prepare reviews from SF.net for website
def reviewPage = retry(10, 1000) { Jsoup.connect('https://sourceforge.net/projects/filebot/reviews/?sort=usefulness&filter=thumbs_up').get() }

// one [user, date, text] map per review body whose class attribute does not contain "spam"
def reviews = []
for (article in reviewPage.select('article[itemtype~=Review]')) {
    for (review in article.select('*[itemprop=reviewBody]')) {
        if (review.attr('class') =~ /spam/) continue
        reviews << [user: article.select('*[itemprop=name]').text(), date: article.select('*[datetime]').text(), text: review.text()]
    }
}

// keep reviews that have a user and whose date text contains no lowercase a-z letters
reviews = reviews.findAll { it.user && !(it.date =~ /[a-z]/) }

// JsonOutput is used as a category so that toJson() / prettyPrint() can be called as instance methods
use(groovy.json.JsonOutput) {
    println "Reviews: ${reviews.size()}"
    def json = reviews.toJson()
    json.prettyPrint().saveAs('website/reviews.json')
}