// filebot -script BuildData.groovy

// UPDATE release-groups.txt FROM http://scenegrouplist.com/lists_sgl.php
@Grab('org.jsoup:jsoup')
import org.jsoup.*

// scrape every result page until an empty table marks the end
def sgl = []
for (def page = 0; true; page++) {
	def dom = Jsoup.parse(new URL('http://scenegrouplist.com/lists_sgl.php?pageNum_RSSGL=' + page), 10000)
	// each data row: [group, section, year]; keep rows whose 3rd cell looks like a year
	def rows = dom.select('table[border=1] tr').collect{ it.select('td')*.text()*.trim() }.findAll{ it[2] =~ /\d{4}/ }
	sgl += rows
	if (rows.empty) break
}

// keep groups from DVD/HD/TV sections whose name contains at least one letter
sgl = sgl.findAll{ it[1] =~ /\b(DVD|DVDR|HD|TV)\b/ && it[0] =~ /\p{Alpha}/ }.findResults{ it[0] }
// strip " - " suffix, dot-join words; upper-case simple single-word names (scene convention)
sgl = sgl.collect{ it.before(/ - /).trim().space('.') }.collect{ it ==~ /\p{Upper}?\p{Lower}+/ ? it.toUpperCase() : it }

// append release group names
new File('website/data/release-groups.txt') << '\n' << sgl.join('\n')
// ------------------------------------------------------------------------- //

/**
 * Sort a regex-list file in place: one pattern per line, case-insensitive order,
 * duplicates removed. Every line is compiled via java.util.regex.Pattern so an
 * invalid regex fails loudly instead of being silently written back.
 */
def sortRegexList(path) {
	def patterns = new TreeSet(String.CASE_INSENSITIVE_ORDER)
	new File(path).eachLine('UTF-8') { line ->
		// check if regex compiles
		patterns += java.util.regex.Pattern.compile(line).pattern()
	}
	def out = patterns.join('\n').saveAs(path)
	println "$out\n$out.text\n"
}
// sort and check shared regex collections
['website/data/release-groups.txt',
 'website/data/query-blacklist.txt',
 'website/data/exclude-blacklist.txt'].each{ sortRegexList(it) }

// ------------------------------------------------------------------------- //
def series_out = new File("website/data/series.list.gz")
def movies_out = new File("website/data/movies.txt.gz")

/**
 * Write lines to file as GZIP-compressed UTF-8 text, one entry per line.
 * The writer closures close the streams, flushing the gzip trailer.
 */
def gz(file, lines) {
	file.withOutputStream { stream ->
		new java.util.zip.GZIPOutputStream(stream).withWriter('UTF-8') { writer ->
			lines.each { writer.append(it).append('\n') }
		}
	}
}
// ------------------------------------------------------------------------- //

// LOAD osdb-imdb.txt (already verified data)
def imdb_tsv = new File("website/data/osdb-imdb.txt")
def imdb = [].asSynchronized() // thread-safe list (appended to concurrently below)

imdb_tsv.getText('UTF-8').eachLine { line ->
	imdb << line.split(/\t/)
}
// fast membership check on the first column (imdb id)
imdb_ids = new HashSet(imdb.collect{ it[0] })
// BUILD movies.txt.gz
def osdb_tsv = new URL("http://www.opensubtitles.org/addons/export_movie.php")
def osdb = []

osdb_tsv.getText('UTF-8').eachLine { raw ->
	// normalize whitespace within each tab-separated field
	def line = raw.split(/\t/)*.replaceAll(/\s+/, ' ')*.trim()
	if (line.size() == 4 && line[0] =~ /\d+/) {
		// [imdb id, title, year]
		osdb << [line[1].toInteger(), line[2], line[3].toInteger()]
	}
}

// sane ids/years only; title must start with A-Z0-9 and contain 3+ letters; zero-pad id to 7 digits
osdb = osdb.findAll{ it[0] <= 9999999 && it[2] >= 1930 && it[1] =~ /^[A-Z0-9]/ && it[1] =~ /[\p{Alpha}]{3}/ }
           .collect{ [it[0].pad(7), it[1], it[2]] }
// look up movies we don't know yet, 20 lookups in parallel
parallel(osdb.collect{ row ->
	return {
		// update new data
		if (!imdb_ids.contains(row[0])) {
			// get original title and english title
			[Locale.ROOT, Locale.ENGLISH].collect{ locale ->
				net.sourceforge.filebot.WebServices.IMDb.getMovieDescriptor(row[0] as int, locale)
			}.unique{ it as String }.each{ mov ->
				if (mov != null && mov.name.length() > 0 && mov.year > 0) {
					println "Adding $mov"
					imdb << [row[0], mov.name, mov.year]
				} else {
					// remember failed lookups so they are not retried on the next run
					println "Blacklisting $row"
					imdb << [row[0], null]
				}
			}
		}
	}
}, 20)

// save updated imdb data
imdb.collect{ it.join('\t') }.sort().join('\n').saveAs(imdb_tsv)
// save movie data: complete rows only, excluding TV series (quoted titles in IMDb data)
def movies = imdb.findAll{ it.size() >= 3 && !it[1].startsWith('"') }

// order case-insensitively by [title, year, id]
def movieSorter = new TreeMap(String.CASE_INSENSITIVE_ORDER)
movies.each{ movieSorter.put([it[1], it[2], it[0]].join('\t'), it) }
movies = movieSorter.values().collect{ it.join('\t') }

gz(movies_out, movies)
println "Movie Count: " + movies.size()
// ------------------------------------------------------------------------- //

// BUILD series.list.gz

// TheTVDB: scrape the full English series listing from the search page
def thetvdb_index = new URL('http://thetvdb.com/?string=&searchseriesid=&tab=listseries&function=Search')
def thetvdb_names = thetvdb_index.fetch().getHtml('UTF-8')
	.depthFirst().TABLE.find{ it['@id'] == "listtable" }
	.depthFirst().TR.findAll{ it.TD.size() == 3 && it.TD[1].text() == 'English' }
	.findResults{ it.TD[0].A.text() }

// AniDB: primary title plus official English title for every anime
def anidb_names = net.sourceforge.filebot.WebServices.AniDB.getAnimeTitles().findResults{ [it.getPrimaryTitle(), it.getOfficialTitle('en')] }.flatten()

/*
// IMDb series list
def imdb_series_names = imdb.findAll{ it.size() >= 3 && it[1].startsWith('"') }.collect{ it[1] }

// Dokuwiki list
def dokuwiki_index = new URL('http://docuwiki.net/postbot/getList.php?subject=Name')
def doku_names = []
dokuwiki_index.getText('UTF-8').eachLine {
	doku_names << it.trim().replaceTrailingBrackets()
}
*/

def names = [thetvdb_names, anidb_names]
names.each{ if (it.size() == 0) throw new Exception("Failed to scrape series names") } // sanity check
names = names.flatten().findAll{ it =~ /^[A-Z0-9]/ && it =~ /[\p{Alpha}]{3}/ }.findResults{ net.sourceforge.filebot.similarity.Normalization.normalizePunctuation(it) } // collect and normalize names

// case-insensitive sort with duplicates removed
def seriesSorter = new TreeSet(String.CASE_INSENSITIVE_ORDER)
seriesSorter.addAll(names)
names = seriesSorter as List

gz(series_out, names)
println "Series Count: " + names.size()