2012-07-13 07:41:50 -04:00
// filebot -script BuildData.groovy
2012-02-24 08:39:32 -05:00
2012-07-13 07:41:50 -04:00
/**
 * Validates, de-duplicates and sorts a list of regular expressions.
 *
 * Each line of the file at the given path is compiled with java.util.regex.Pattern;
 * a line that does not compile raises PatternSyntaxException and stops the script.
 * The patterns are collected in a case-insensitive TreeSet (lines that differ only
 * in case collapse into one), joined with newlines and handed to saveAs(path).
 * The saved result and its text are printed afterwards.
 *
 * @param path path of the regex list file to read and to pass to saveAs
 */
def sortRegexList(path) {
    def set = new TreeSet(String.CASE_INSENSITIVE_ORDER)
    new File(path).eachLine {
        // check if regex compiles
        // add in place: `set += x` would build a copy of the whole set for every line
        set << java.util.regex.Pattern.compile(it).pattern()
    }
    def out = set.join('\n').saveAs(path)
    println "$out\n$out.text\n"
}
// sort and check shared regex collections
for (regexList in ["website/data/release-groups.txt", "website/data/query-blacklist.txt"]) {
    sortRegexList(regexList)
}

// ------------------------------------------------------------------------- //

// gzip output files, written via gz(...) further down in this script
def series_out = new File("website/data/series.list.gz")
def movies_out = new File("website/data/movies.txt.gz")
2012-02-26 12:02:54 -05:00
/**
 * Writes the given lines to a file as gzip-compressed UTF-8 text,
 * appending a '\n' after every line.
 *
 * @param file  target that provides withOutputStream (a java.io.File in this script)
 * @param lines values to write, one per output line
 */
def gz(file, lines) {
    file.withOutputStream { raw ->
        def compressed = new java.util.zip.GZIPOutputStream(raw)
        compressed.withWriter('utf-8') { sink ->
            lines.each { line ->
                sink.append(line).append('\n')
            }
        }
    }
}
// ------------------------------------------------------------------------- //
2012-07-13 07:41:50 -04:00
// LOAD osdb-imdb.txt (already verified data)
def imdb_tsv = new File("website/data/osdb-imdb.txt")

// thread-safe list, because the lookup closures passed to parallel(...) append to it
def imdb = [].asSynchronized()

// one entry per line; each entry is the array of tab-separated columns
imdb_tsv.getText('UTF-8').eachLine { record ->
    imdb << record.split(/\t/)
}

// first column of every loaded row, used to skip ids that are already on file
imdb_ids = new HashSet(imdb.collect { columns -> columns[0] })
// BUILD movies.txt.gz
def osdb_tsv = new URL("http://www.opensubtitles.org/addons/export_movie.php")
def osdb = []

// read the export line by line; rows with exactly 4 tab-separated columns are
// kept as [column 1 as int, column 2, column 3 as int]
osdb_tsv.getText('UTF-8').eachLine {
    // collapse runs of whitespace inside each column and trim it
    def line = it.split(/\t/)*.replaceAll(/\s+/, ' ')*.trim()

    // columns 1 and 3 are the ones converted with toInteger(), so they are checked
    // with isInteger() first; without that a single malformed row would abort the
    // whole build with a NumberFormatException
    if (line.size() == 4 && line[0] =~ /\d+/ && line[1].isInteger() && line[3].isInteger()) {
        osdb << [line[1].toInteger(), line[2], line[3].toInteger()]
    }
}

// keep rows with id <= 9999999, year >= 1930 and a name that starts with A-Z or 0-9
// and contains 3 consecutive letters; the id is then passed through pad(7)
osdb = osdb.findAll { it[0] <= 9999999 && it[2] >= 1930 && it[1] =~ /^[A-Z0-9]/ && it[1] =~ /[\p{Alpha}]{3}/ }.collect { [it[0].pad(7), it[1], it[2]] }
// one closure per export row, all of them handed to parallel(..., 20)
def lookups = osdb.collect { entry ->
    return {
        // update new data: only ids that are not on file yet are looked up
        if (!imdb_ids.contains(entry[0])) {
            def descriptor = net.sourceforge.filebot.WebServices.IMDb.getMovieDescriptor(entry[0] as int, null)
            def usable = descriptor != null && descriptor.name.length() > 0 && descriptor.year > 0
            if (usable) {
                println "Adding $descriptor"
                imdb << [entry[0], descriptor.name, descriptor.year]
            } else {
                // stored as a 2-column row, so the ">= 3 columns" filters below skip it
                println "Blacklisting $entry"
                imdb << [entry[0], null]
            }
        }
    }
}
parallel(lookups, 20)

// save updated imdb data
def imdb_rows = imdb.collect { it.join('\t') }
imdb_rows.join('\n').saveAs(imdb_tsv)
2012-02-26 12:02:54 -05:00
2012-07-13 07:41:50 -04:00
// save movie data
// movies are the imdb rows with at least 3 columns whose name does not start with a double quote
def movies = imdb.findAll { it.size() >= 3 && !it[1].startsWith('"') }

// sort case-insensitively by name; year and id are part of the key so that different
// movies sharing a title get distinct keys instead of overwriting each other in the map
// (a name-only key silently kept just one row per title)
def movieSorter = new TreeMap(String.CASE_INSENSITIVE_ORDER)
movies.each { movieSorter.put([it[1], it[2], it[0]].join('\t'), it) }
movies = movieSorter.values().collect { it.join('\t') }

gz(movies_out, movies)
println "Movie Count: " + movies.size()
// ------------------------------------------------------------------------- //
// BUILD series.list.gz
def thetvdb_index = new URL('http://thetvdb.com/?string=&searchseriesid=&tab=listseries&function=Search')

// walk the fetched page step by step:
// table with id "listtable" -> rows with 3 cells whose second cell reads 'English' -> link text of the first cell
def thetvdb_page = thetvdb_index.fetch().getHtml('UTF-8')
def thetvdb_table = thetvdb_page.depthFirst().TABLE.find { it['@id'] == "listtable" }
def thetvdb_rows = thetvdb_table.depthFirst().TR.findAll { it.TD.size() == 3 && it.TD[1].text() == 'English' }
def thetvdb_names = thetvdb_rows.findResults { it.TD[0].A.text() }

// imdb rows with at least 3 columns whose name starts with a double quote
def imdb_series_names = imdb.findAll { it.size() >= 3 && it[1].startsWith('"') }.collect { it[1] }

// getPrimaryTitle() and getOfficialTitle('en') of every AniDB title entry, flattened into one list
def anidb_titles = net.sourceforge.filebot.WebServices.AniDB.getAnimeTitles()
def anidb_names = anidb_titles.findResults { [it.getPrimaryTitle(), it.getOfficialTitle('en')] }.flatten()
/*
def dokuwiki_index = new URL('http://docuwiki.net/postbot/getList.php?subject=Name')
def doku_names = []
dokuwiki_index.getText('UTF-8').eachLine {
	doku_names << it.trim().replaceTrailingBrackets()
}
*/
2012-02-23 13:48:35 -05:00
2012-07-13 07:41:50 -04:00
// sanity check: every source must have produced at least one name
def names = [thetvdb_names, imdb_series_names, anidb_names]
if (names.any { it.size() == 0 }) {
    throw new Exception("Failed to scrape series names")
}

// collect and normalize names: keep those that start with A-Z or 0-9 and contain 3 consecutive letters
names = names.flatten().findAll { it =~ /^[A-Z0-9]/ && it =~ /[\p{Alpha}]{3}/ }
names = names.findResults { net.sourceforge.filebot.similarity.Normalization.normalizePunctuation(it) }

// the case-insensitive TreeSet removes duplicates and defines the output order
def seriesSorter = new TreeSet(String.CASE_INSENSITIVE_ORDER)
seriesSorter.addAll(names)
names = seriesSorter as List

gz(series_out, names)
println "Series Count: " + names.size()