+ TheTVDB: extend API search with LocalSearch from cached database index

This commit is contained in:
Reinhard Pointner 2012-10-14 11:57:25 +00:00
parent ac958bd7d3
commit caafbca373
7 changed files with 159 additions and 20 deletions

View File

@ -44,8 +44,9 @@ sortRegexList("website/data/series-mappings.txt")
// ------------------------------------------------------------------------- //
def series_out = new File("website/data/series.list.gz")
def movies_out = new File("website/data/movies.txt.gz")
def series_out = new File("website/data/series.list.gz")
def movies_out = new File("website/data/movies.txt.gz")
def thetvdb_out = new File("website/data/thetvdb.txt.gz")
def gz(file, lines) {
file.withOutputStream{ out ->
@ -114,15 +115,23 @@ println "Movie Count: " + movies.size()
// ------------------------------------------------------------------------- //
// BUILD thetvdb.txt.gz
// Scrape the TheTVDB "list series" search page and keep one [id, name] pair per English series:
//  - locate the TABLE element whose id attribute is "listtable"
//  - keep only rows with exactly 3 cells whose second cell reads 'English' and whose first cell has non-empty link text
//  - map each row to [text of third cell, link text of first cell]
// NOTE(review): the third cell is presumably the numeric series id (the Java reader parses the first column as an int) -- confirm against the page layout
// NOTE(review): fetch() / getHtml() are script helpers defined elsewhere; find{} yields null if the table is missing, which would fail on the next call in the chain
def thetvdb_index_url = new URL('http://thetvdb.com/?string=&searchseriesid=&tab=listseries&function=Search')
def thetvdb_index = thetvdb_index_url.fetch().getHtml('UTF-8')
.depthFirst().TABLE.find{it['@id'] == "listtable"}
.depthFirst().TR.findAll{ it.TD.size() == 3 && it.TD[1].text() == 'English' && it.TD[0].A.text() }
.findResults{ [it.TD[2].text(), it.TD[0].A.text()] }
// join and sort: one "<id>\t<name>" line per series, sorted as plain strings
// NOTE(review): pad(6) is a helper defined elsewhere; presumably it left-pads the id to 6 characters so the string sort is stable -- confirm
def thetvdb = thetvdb_index.findResults{ [it[0].pad(6), it[1]].join('\t') }.sort()
// write the lines to thetvdb_out via the gz helper and report how many entries were written
gz(thetvdb_out, thetvdb)
println "TheTVDB Index: " + thetvdb.size()
// BUILD series.list.gz
// TheTVDB
def thetvdb_index = new URL('http://thetvdb.com/?string=&searchseriesid=&tab=listseries&function=Search')
def thetvdb_names = thetvdb_index.fetch().getHtml('UTF-8')
.depthFirst().TABLE.find{it['@id'] == "listtable"}
.depthFirst().TR.findAll{ it.TD.size() == 3 && it.TD[1].text() == 'English'}
.findResults{ it.TD[0].A.text() }
def thetvdb_names = thetvdb_index.findResults{ it[1] }
// AniDB
def anidb_names = net.sourceforge.filebot.WebServices.AniDB.getAnimeTitles().findResults{ [it.getPrimaryTitle(), it.getOfficialTitle('en')] }.flatten()
@ -141,7 +150,7 @@ dokuwiki_index.getText('UTF-8').eachLine{
def names = [thetvdb_names, anidb_names]
names.each{ if (it.size() == 0) throw new Exception("Failed to scrape series names") } // sanity check
names = names.flatten().findAll{ it =~ /^[A-Z0-9]/ && it =~ /[\p{Alpha}]{3}/}.findResults{ net.sourceforge.filebot.similarity.Normalization.normalizePunctuation(it) } // collect and normalize names
names = names.flatten().findAll{ it =~ /^[A-Z0-9]/ && it =~ /[\p{Alpha}]{3}/}.findResults{ net.sourceforge.filebot.similarity.Normalization.normalizePunctuation(it).toLowerCase() } // collect and normalize names
def seriesSorter = new TreeSet(String.CASE_INSENSITIVE_ORDER)
seriesSorter.addAll(names)

View File

@ -2,14 +2,33 @@
package net.sourceforge.filebot;
import static java.util.Arrays.*;
import static java.util.Collections.*;
import static net.sourceforge.filebot.Settings.*;
import java.io.IOException;
import java.util.ArrayList;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Locale;
import java.util.Set;
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;
import java.util.logging.Level;
import java.util.logging.Logger;
import net.sourceforge.filebot.media.MediaDetection;
import net.sourceforge.filebot.web.AnidbClient;
import net.sourceforge.filebot.web.EpisodeListProvider;
import net.sourceforge.filebot.web.FanartTV;
import net.sourceforge.filebot.web.IMDbClient;
import net.sourceforge.filebot.web.LocalSearch;
import net.sourceforge.filebot.web.MovieIdentificationService;
import net.sourceforge.filebot.web.OpenSubtitlesClient;
import net.sourceforge.filebot.web.SearchResult;
import net.sourceforge.filebot.web.SerienjunkiesClient;
import net.sourceforge.filebot.web.SublightSubtitleClient;
import net.sourceforge.filebot.web.SubsceneSubtitleClient;
@ -28,9 +47,11 @@ public final class WebServices {
// episode dbs
public static final TVRageClient TVRage = new TVRageClient();
public static final AnidbClient AniDB = new AnidbClient(getApplicationName().toLowerCase(), 3);
public static final TheTVDBClient TheTVDB = new TheTVDBClient(getApplicationProperty("thetvdb.apikey"));
public static final SerienjunkiesClient Serienjunkies = new SerienjunkiesClient(getApplicationProperty("serienjunkies.apikey"));
// extended TheTVDB module with local search
public static final TheTVDBClient TheTVDB = new TheTVDBClientWithLocalSearch(getApplicationProperty("thetvdb.apikey"));
// movie dbs
public static final IMDbClient IMDb = new IMDbClient();
public static final TMDbClient TMDb = new TMDbClient(getApplicationProperty("themoviedb.apikey"));
@ -84,6 +105,81 @@ public final class WebServices {
}
/**
 * TheTVDB client whose search combines the results of the regular API search
 * with the results of a local search over the cached TheTVDB index.
 */
private static class TheTVDBClientWithLocalSearch extends TheTVDBClient {

	public TheTVDBClientWithLocalSearch(String apikey) {
		super(apikey);
	}

	// index of local thetvdb data dump (static, therefore guarded by the class lock)
	private static LocalSearch<SearchResult> localIndex;

	/**
	 * Returns the local search index, building it from the TheTVDB data dump on first use.
	 *
	 * @return the shared local search index
	 * @throws IOException if the data dump cannot be fetched
	 */
	private LocalSearch<SearchResult> getLocalIndex() throws IOException {
		// the field is static, so lock on the class rather than on this instance
		synchronized (TheTVDBClientWithLocalSearch.class) {
			if (localIndex == null) {
				// fetch data dump
				TheTVDBSearchResult[] data = MediaDetection.releaseInfo.getTheTVDBIndex();

				// index data dump
				localIndex = new LocalSearch<SearchResult>(asList(data)) {

					@Override
					protected Set<String> getFields(SearchResult object) {
						return set(object.getName());
					}
				};
			}
			return localIndex;
		}
	}

	/**
	 * Runs the local index search and the API search concurrently and merges both result lists.
	 * Local results are added first; duplicates are dropped while keeping first-seen order.
	 *
	 * @param query the search query
	 * @param locale the locale passed on to the API search
	 * @return merged search results without duplicates
	 * @throws Exception the unwrapped failure of the API search, or the
	 *         {@link ExecutionException} itself if a search failed with a non-Exception cause
	 */
	@SuppressWarnings("unchecked")
	@Override
	public List<SearchResult> fetchSearchResult(final String query, final Locale locale) throws Exception {
		Callable<List<SearchResult>> apiSearch = new Callable<List<SearchResult>>() {

			@Override
			public List<SearchResult> call() throws Exception {
				return TheTVDBClientWithLocalSearch.super.fetchSearchResult(query, locale);
			}
		};
		Callable<List<SearchResult>> localSearch = new Callable<List<SearchResult>>() {

			@Override
			public List<SearchResult> call() throws Exception {
				try {
					return getLocalIndex().search(query);
				} catch (Exception e) {
					Logger.getLogger(TheTVDBClientWithLocalSearch.class.getName()).log(Level.SEVERE, e.getMessage(), e);
				}

				// let local search fail gracefully without affecting API search
				return emptyList();
			}
		};

		ExecutorService executor = Executors.newFixedThreadPool(2);
		try {
			Set<SearchResult> results = new LinkedHashSet<SearchResult>();
			for (Future<List<SearchResult>> resultSet : executor.invokeAll(asList(localSearch, apiSearch))) {
				try {
					results.addAll(resultSet.get());
				} catch (ExecutionException e) {
					Throwable cause = e.getCause();
					if (cause instanceof Exception) {
						throw (Exception) cause; // unwrap cause
					}
					// cause is an Error (or absent): never drop a failed search silently
					throw e;
				}
			}
			return new ArrayList<SearchResult>(results);
		} finally {
			executor.shutdownNow();
		}
	}
}
/**
* Dummy constructor to prevent instantiation.
*/

View File

@ -37,6 +37,7 @@ import java.util.zip.GZIPInputStream;
import net.sourceforge.filebot.web.CachedResource;
import net.sourceforge.filebot.web.Movie;
import net.sourceforge.filebot.web.TheTVDBClient.TheTVDBSearchResult;
import net.sourceforge.tuned.ByteBufferInputStream;
@ -242,6 +243,11 @@ public class ReleaseInfo {
}
/**
 * Returns the TheTVDB index entries supplied by {@code theTVDBIndexResource}.
 *
 * @return the array obtained from {@code theTVDBIndexResource.get()}
 * @throws IOException if the underlying resource lookup fails
 */
public TheTVDBSearchResult[] getTheTVDBIndex() throws IOException {
	final TheTVDBSearchResult[] index = theTVDBIndexResource.get();
	return index;
}
public Map<Pattern, String> getSeriesDirectMappings() throws IOException {
Map<Pattern, String> mappings = new LinkedHashMap<Pattern, String>();
for (String line : seriesDirectMappingsResource.get()) {
@ -268,8 +274,9 @@ public class ReleaseInfo {
protected final CachedResource<String[]> queryBlacklistResource = new PatternResource(getBundle(getClass().getName()).getString("url.query-blacklist"));
protected final CachedResource<String[]> excludeBlacklistResource = new PatternResource(getBundle(getClass().getName()).getString("url.exclude-blacklist"));
protected final CachedResource<Movie[]> movieListResource = new MovieResource(getBundle(getClass().getName()).getString("url.movie-list"));
protected final CachedResource<String[]> seriesListResource = new SeriesResource(getBundle(getClass().getName()).getString("url.series-list"));
protected final CachedResource<String[]> seriesListResource = new SeriesListResource(getBundle(getClass().getName()).getString("url.series-list"));
protected final CachedResource<String[]> seriesDirectMappingsResource = new PatternResource(getBundle(getClass().getName()).getString("url.series-mappings"));
protected final CachedResource<TheTVDBSearchResult[]> theTVDBIndexResource = new TheTVDBIndexResource(getBundle(getClass().getName()).getString("url.thetvdb-index"));
protected static class PatternResource extends CachedResource<String[]> {
@ -310,16 +317,39 @@ public class ReleaseInfo {
}
protected static class SeriesResource extends CachedResource<String[]> {
protected static class SeriesListResource extends CachedResource<String[]> {
public SeriesResource(String resource) {
public SeriesListResource(String resource) {
super(resource, String[].class, 7 * 24 * 60 * 60 * 1000); // check for updates once a week
}
@Override
public String[] process(ByteBuffer data) throws IOException {
return readAll(new InputStreamReader(new GZIPInputStream(new ByteBufferInputStream(data)), "utf-8")).split("\\n");
return readAll(new InputStreamReader(new GZIPInputStream(new ByteBufferInputStream(data)), "UTF-8")).split("\\n");
}
}
/**
 * Cached resource that decodes the gzipped TheTVDB index into search results.
 * The decoded text is read as alternating tokens separated by tab or newline:
 * an integer series id followed by the series name.
 */
protected static class TheTVDBIndexResource extends CachedResource<TheTVDBSearchResult[]> {

	public TheTVDBIndexResource(String resource) {
		super(resource, TheTVDBSearchResult[].class, 7 * 24 * 60 * 60 * 1000); // check for updates once a week
	}

	/**
	 * Parses the gzipped, UTF-8 encoded index data.
	 *
	 * @param data raw (gzip-compressed) resource bytes
	 * @return one search result per id/name record, in file order
	 * @throws IOException if the data cannot be decompressed or read, or if a record is malformed
	 */
	@Override
	public TheTVDBSearchResult[] process(ByteBuffer data) throws IOException {
		Scanner scanner = new Scanner(new GZIPInputStream(new ByteBufferInputStream(data)), "UTF-8").useDelimiter("\t|\n");
		try {
			List<TheTVDBSearchResult> tvshows = new ArrayList<TheTVDBSearchResult>();
			while (scanner.hasNext()) {
				int id = scanner.nextInt();
				String name = scanner.next().trim();
				tvshows.add(new TheTVDBSearchResult(name, id));
			}

			// Scanner swallows read errors and simply reports end of input;
			// surface them so a corrupt download is not mistaken for a short index
			IOException readError = scanner.ioException();
			if (readError != null) {
				throw readError;
			}
			return tvshows.toArray(new TheTVDBSearchResult[0]);
		} catch (java.util.NoSuchElementException e) {
			// also covers InputMismatchException (non-numeric id); prefer the underlying read error if there is one
			IOException readError = scanner.ioException();
			if (readError != null) {
				throw readError;
			}
			throw new IOException("Malformed TheTVDB index data", e);
		} finally {
			// closes the wrapped GZIPInputStream as well
			scanner.close();
		}
	}
}

View File

@ -22,5 +22,8 @@ url.movie-list: http://filebot.sourceforge.net/data/movies.txt.gz
# list of tv show and anime names
url.series-list: http://filebot.sourceforge.net/data/series.list.gz
# TheTVDB index
url.thetvdb-index: http://filebot.sourceforge.net/data/thetvdb.txt.gz
# disk folder matcher
pattern.diskfolder.entry: BDMV|HVDVD_TS|VIDEO_TS|AUDIO_TS|VCD|movie.nfo

View File

@ -24,7 +24,7 @@ import uk.ac.shef.wit.simmetrics.similaritymetrics.AbstractStringMetric;
import uk.ac.shef.wit.simmetrics.similaritymetrics.QGramsDistance;
class LocalSearch<T> {
public class LocalSearch<T> {
private final AbstractStringMetric metric = new QGramsDistance();
private final float resultMinimumSimilarity = 0.5f;
@ -75,9 +75,13 @@ class LocalSearch<T> {
if (entry.get() != null) {
resultSet.add(entry.get());
}
if (Thread.interrupted()) {
throw new InterruptedException();
}
}
} finally {
executor.shutdown();
executor.shutdownNow();
}
// sort by similarity descending (best matches first)

View File

@ -2,3 +2,4 @@ options +indexes
redirect 301 /data/movies.txt.gz http://sourceforge.net/projects/filebot/files/data/movies.txt.gz/download
redirect 301 /data/series.list.gz http://sourceforge.net/projects/filebot/files/data/series.list.gz/download
redirect 301 /data/thetvdb.txt.gz http://sourceforge.net/projects/filebot/files/data/thetvdb.txt.gz/download

View File

@ -1,5 +1 @@
Dark.Matters.Twisted.But.True Dark Matters
Franklin.and.Bash Franklin & Bash
HIMYM How I Met your Mother
Rizolli.and.Isles Rizzoli & Isles
Rizzoli.and.Isles Rizzoli & Isles
HIMYM How I Met your Mother