From caafbca373ea108c00adaed54fd0f72f7ca56908 Mon Sep 17 00:00:00 2001 From: Reinhard Pointner Date: Sun, 14 Oct 2012 11:57:25 +0000 Subject: [PATCH] + TheTVDB: extend API search with LocalSearch from cached database index --- BuildData.groovy | 25 +++-- .../net/sourceforge/filebot/WebServices.java | 98 ++++++++++++++++++- .../filebot/media/ReleaseInfo.java | 38 ++++++- .../filebot/media/ReleaseInfo.properties | 3 + .../sourceforge/filebot/web/LocalSearch.java | 8 +- website/data/.htaccess | 1 + website/data/series-mappings.txt | 6 +- 7 files changed, 159 insertions(+), 20 deletions(-) diff --git a/BuildData.groovy b/BuildData.groovy index d3f2215c..a427c102 100644 --- a/BuildData.groovy +++ b/BuildData.groovy @@ -44,8 +44,9 @@ sortRegexList("website/data/series-mappings.txt") // ------------------------------------------------------------------------- // -def series_out = new File("website/data/series.list.gz") -def movies_out = new File("website/data/movies.txt.gz") +def series_out = new File("website/data/series.list.gz") +def movies_out = new File("website/data/movies.txt.gz") +def thetvdb_out = new File("website/data/thetvdb.txt.gz") def gz(file, lines) { file.withOutputStream{ out -> @@ -114,15 +115,23 @@ println "Movie Count: " + movies.size() // ------------------------------------------------------------------------- // +// BUILD thetvdb-index.gz +def thetvdb_index_url = new URL('http://thetvdb.com/?string=&searchseriesid=&tab=listseries&function=Search') +def thetvdb_index = thetvdb_index_url.fetch().getHtml('UTF-8') +.depthFirst().TABLE.find{it['@id'] == "listtable"} +.depthFirst().TR.findAll{ it.TD.size() == 3 && it.TD[1].text() == 'English' && it.TD[0].A.text() } +.findResults{ [it.TD[2].text(), it.TD[0].A.text()] } + +// join and sort +def thetvdb = thetvdb_index.findResults{ [it[0].pad(6), it[1]].join('\t') }.sort() +gz(thetvdb_out, thetvdb) +println "TheTVDB Index: " + thetvdb.size() + // BUILD series.list.gz // TheTVDB -def thetvdb_index = new URL('http://thetvdb.com/?string=&searchseriesid=&tab=listseries&function=Search') -def thetvdb_names = thetvdb_index.fetch().getHtml('UTF-8') -.depthFirst().TABLE.find{it['@id'] == "listtable"} -.depthFirst().TR.findAll{ it.TD.size() == 3 && it.TD[1].text() == 'English'} -.findResults{ it.TD[0].A.text() } +def thetvdb_names = thetvdb_index.findResults{ it[1] } // AniDB def anidb_names = net.sourceforge.filebot.WebServices.AniDB.getAnimeTitles().findResults{ [it.getPrimaryTitle(), it.getOfficialTitle('en')] }.flatten() @@ -141,7 +150,7 @@ dokuwiki_index.getText('UTF-8').eachLine{ def names = [thetvdb_names, anidb_names] names.each{ if (it.size() == 0) throw new Exception("Failed to scrape series names") } // sanity check -names = names.flatten().findAll{ it =~ /^[A-Z0-9]/ && it =~ /[\p{Alpha}]{3}/}.findResults{ net.sourceforge.filebot.similarity.Normalization.normalizePunctuation(it) } // collect and normalize names +names = names.flatten().findAll{ it =~ /^[A-Z0-9]/ && it =~ /[\p{Alpha}]{3}/}.findResults{ net.sourceforge.filebot.similarity.Normalization.normalizePunctuation(it).toLowerCase() } // collect and normalize names def seriesSorter = new TreeSet(String.CASE_INSENSITIVE_ORDER) seriesSorter.addAll(names) diff --git a/source/net/sourceforge/filebot/WebServices.java b/source/net/sourceforge/filebot/WebServices.java index f58e185a..a487891c 100644 --- a/source/net/sourceforge/filebot/WebServices.java +++ b/source/net/sourceforge/filebot/WebServices.java @@ -2,14 +2,33 @@ package net.sourceforge.filebot; +import static java.util.Arrays.*; +import static java.util.Collections.*; import static net.sourceforge.filebot.Settings.*; +import java.io.IOException; +import java.util.ArrayList; +import java.util.LinkedHashSet; +import java.util.List; +import java.util.Locale; +import java.util.Set; +import java.util.concurrent.Callable; +import java.util.concurrent.ExecutionException; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.Executors; +import java.util.concurrent.Future; +import java.util.logging.Level; +import java.util.logging.Logger; + +import net.sourceforge.filebot.media.MediaDetection; import net.sourceforge.filebot.web.AnidbClient; import net.sourceforge.filebot.web.EpisodeListProvider; import net.sourceforge.filebot.web.FanartTV; import net.sourceforge.filebot.web.IMDbClient; +import net.sourceforge.filebot.web.LocalSearch; import net.sourceforge.filebot.web.MovieIdentificationService; import net.sourceforge.filebot.web.OpenSubtitlesClient; +import net.sourceforge.filebot.web.SearchResult; import net.sourceforge.filebot.web.SerienjunkiesClient; import net.sourceforge.filebot.web.SublightSubtitleClient; import net.sourceforge.filebot.web.SubsceneSubtitleClient; @@ -28,9 +47,11 @@ public final class WebServices { // episode dbs public static final TVRageClient TVRage = new TVRageClient(); public static final AnidbClient AniDB = new AnidbClient(getApplicationName().toLowerCase(), 3); - public static final TheTVDBClient TheTVDB = new TheTVDBClient(getApplicationProperty("thetvdb.apikey")); public static final SerienjunkiesClient Serienjunkies = new SerienjunkiesClient(getApplicationProperty("serienjunkies.apikey")); + // extended TheTVDB module with local search + public static final TheTVDBClient TheTVDB = new TheTVDBClientWithLocalSearch(getApplicationProperty("thetvdb.apikey")); + // movie dbs public static final IMDbClient IMDb = new IMDbClient(); public static final TMDbClient TMDb = new TMDbClient(getApplicationProperty("themoviedb.apikey")); @@ -84,6 +105,81 @@ public final class WebServices { } + private static class TheTVDBClientWithLocalSearch extends TheTVDBClient { + + public TheTVDBClientWithLocalSearch(String apikey) { + super(apikey); + } + + // index of local thetvdb data dump + private static LocalSearch localIndex; + + + private synchronized LocalSearch getLocalIndex() throws IOException { + if (localIndex == null) { + // fetch data dump + TheTVDBSearchResult[] data = MediaDetection.releaseInfo.getTheTVDBIndex(); + + // index data dump + localIndex = new LocalSearch(asList(data)) { + + @Override + protected Set getFields(SearchResult object) { + return set(object.getName()); + } + }; + } + + return localIndex; + } + + + @SuppressWarnings("unchecked") + @Override + public List fetchSearchResult(final String query, final Locale locale) throws Exception { + Callable> apiSearch = new Callable>() { + + @Override + public List call() throws Exception { + return TheTVDBClientWithLocalSearch.super.fetchSearchResult(query, locale); + } + }; + Callable> localSearch = new Callable>() { + + @Override + public List call() throws Exception { + try { + return getLocalIndex().search(query); + } catch (Exception e) { + Logger.getLogger(TheTVDBClientWithLocalSearch.class.getName()).log(Level.SEVERE, e.getMessage(), e); + } + + // let local search fail gracefully without affecting API search + return emptyList(); + } + }; + + ExecutorService executor = Executors.newFixedThreadPool(2); + try { + Set results = new LinkedHashSet(); + + for (Future> resultSet : executor.invokeAll(asList(localSearch, apiSearch))) { + try { + results.addAll(resultSet.get()); + } catch (ExecutionException e) { + if (e.getCause() instanceof Exception) { + throw (Exception) e.getCause(); // unwrap cause + } + } + } + return new ArrayList(results); + } finally { + executor.shutdownNow(); + } + }; + } + + /** * Dummy constructor to prevent instantiation. */ diff --git a/source/net/sourceforge/filebot/media/ReleaseInfo.java b/source/net/sourceforge/filebot/media/ReleaseInfo.java index 7f32703b..a3ba60fb 100644 --- a/source/net/sourceforge/filebot/media/ReleaseInfo.java +++ b/source/net/sourceforge/filebot/media/ReleaseInfo.java @@ -37,6 +37,7 @@ import java.util.zip.GZIPInputStream; import net.sourceforge.filebot.web.CachedResource; import net.sourceforge.filebot.web.Movie; +import net.sourceforge.filebot.web.TheTVDBClient.TheTVDBSearchResult; import net.sourceforge.tuned.ByteBufferInputStream; @@ -242,6 +243,11 @@ public class ReleaseInfo { } + public TheTVDBSearchResult[] getTheTVDBIndex() throws IOException { + return theTVDBIndexResource.get(); + } + + public Map getSeriesDirectMappings() throws IOException { Map mappings = new LinkedHashMap(); for (String line : seriesDirectMappingsResource.get()) { @@ -268,8 +274,9 @@ public class ReleaseInfo { protected final CachedResource queryBlacklistResource = new PatternResource(getBundle(getClass().getName()).getString("url.query-blacklist")); protected final CachedResource excludeBlacklistResource = new PatternResource(getBundle(getClass().getName()).getString("url.exclude-blacklist")); protected final CachedResource movieListResource = new MovieResource(getBundle(getClass().getName()).getString("url.movie-list")); - protected final CachedResource seriesListResource = new SeriesResource(getBundle(getClass().getName()).getString("url.series-list")); + protected final CachedResource seriesListResource = new SeriesListResource(getBundle(getClass().getName()).getString("url.series-list")); protected final CachedResource seriesDirectMappingsResource = new PatternResource(getBundle(getClass().getName()).getString("url.series-mappings")); + protected final CachedResource theTVDBIndexResource = new TheTVDBIndexResource(getBundle(getClass().getName()).getString("url.thetvdb-index")); protected static class PatternResource extends CachedResource { @@ -310,16 +317,39 @@ public class ReleaseInfo { } - protected static class SeriesResource extends CachedResource { + protected static class SeriesListResource extends CachedResource { - public SeriesResource(String resource) { + public SeriesListResource(String resource) { super(resource, String[].class, 7 * 24 * 60 * 60 * 1000); // check for updates once a week } @Override public String[] process(ByteBuffer data) throws IOException { - return readAll(new InputStreamReader(new GZIPInputStream(new ByteBufferInputStream(data)), "utf-8")).split("\\n"); + return readAll(new InputStreamReader(new GZIPInputStream(new ByteBufferInputStream(data)), "UTF-8")).split("\\n"); + } + } + + + protected static class TheTVDBIndexResource extends CachedResource { + + public TheTVDBIndexResource(String resource) { + super(resource, TheTVDBSearchResult[].class, 7 * 24 * 60 * 60 * 1000); // check for updates once a week + } + + + @Override + public TheTVDBSearchResult[] process(ByteBuffer data) throws IOException { + Scanner scanner = new Scanner(new GZIPInputStream(new ByteBufferInputStream(data)), "UTF-8").useDelimiter("\t|\n"); + + List tvshows = new ArrayList(); + while (scanner.hasNext()) { + int id = scanner.nextInt(); + String name = scanner.next().trim(); + tvshows.add(new TheTVDBSearchResult(name, id)); + } + + return tvshows.toArray(new TheTVDBSearchResult[0]); } } diff --git a/source/net/sourceforge/filebot/media/ReleaseInfo.properties b/source/net/sourceforge/filebot/media/ReleaseInfo.properties index 91bf9f79..ac573496 100644 --- a/source/net/sourceforge/filebot/media/ReleaseInfo.properties +++ b/source/net/sourceforge/filebot/media/ReleaseInfo.properties @@ -22,5 +22,8 @@ url.movie-list: http://filebot.sourceforge.net/data/movies.txt.gz # list of tv show and anime names url.series-list: http://filebot.sourceforge.net/data/series.list.gz +# TheTVDB index +url.thetvdb-index: http://filebot.sourceforge.net/data/thetvdb.txt.gz + # disk folder matcher pattern.diskfolder.entry: BDMV|HVDVD_TS|VIDEO_TS|AUDIO_TS|VCD|movie.nfo diff --git a/source/net/sourceforge/filebot/web/LocalSearch.java b/source/net/sourceforge/filebot/web/LocalSearch.java index f89fecb9..3950d97a 100644 --- a/source/net/sourceforge/filebot/web/LocalSearch.java +++ b/source/net/sourceforge/filebot/web/LocalSearch.java @@ -24,7 +24,7 @@ import uk.ac.shef.wit.simmetrics.similaritymetrics.AbstractStringMetric; import uk.ac.shef.wit.simmetrics.similaritymetrics.QGramsDistance; -class LocalSearch { +public class LocalSearch { private final AbstractStringMetric metric = new QGramsDistance(); private final float resultMinimumSimilarity = 0.5f; @@ -75,9 +75,13 @@ class LocalSearch { if (entry.get() != null) { resultSet.add(entry.get()); } + + if (Thread.interrupted()) { + throw new InterruptedException(); + } } } finally { - executor.shutdown(); + executor.shutdownNow(); } // sort by similarity descending (best matches first) diff --git a/website/data/.htaccess b/website/data/.htaccess index 88b32a5e..57aa4bc8 100644 --- a/website/data/.htaccess +++ b/website/data/.htaccess @@ -2,3 +2,4 @@ options +indexes redirect 301 /data/movies.txt.gz http://sourceforge.net/projects/filebot/files/data/movies.txt.gz/download redirect 301 /data/series.list.gz http://sourceforge.net/projects/filebot/files/data/series.list.gz/download +redirect 301 /data/thetvdb.txt.gz http://sourceforge.net/projects/filebot/files/data/thetvdb.txt.gz/download diff --git a/website/data/series-mappings.txt b/website/data/series-mappings.txt index ab9402dc..0618b4b8 100644 --- a/website/data/series-mappings.txt +++ b/website/data/series-mappings.txt @@ -1,5 +1 @@ -Dark.Matters.Twisted.But.True Dark Matters -Franklin.and.Bash Franklin & Bash -HIMYM How I Met your Mother -Rizolli.and.Isles Rizzoli & Isles -Rizzoli.and.Isles Rizzoli & Isles \ No newline at end of file +HIMYM How I Met your Mother \ No newline at end of file