mirror of
https://github.com/mitb-archive/filebot
synced 2024-11-17 14:55:09 -05:00
+ removed broken IMDb Episode List Scraper
This commit is contained in:
parent
e1dea3b514
commit
7633260147
@ -41,7 +41,7 @@ public final class WebServices {
|
|||||||
|
|
||||||
|
|
||||||
public static EpisodeListProvider[] getEpisodeListProviders() {
|
public static EpisodeListProvider[] getEpisodeListProviders() {
|
||||||
return new EpisodeListProvider[] { TVRage, AniDB, IMDb, TheTVDB, Serienjunkies };
|
return new EpisodeListProvider[] { TVRage, AniDB, TheTVDB, Serienjunkies };
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@ -7,8 +7,6 @@ import static net.sourceforge.tuned.XPathUtilities.*;
|
|||||||
|
|
||||||
import java.io.File;
|
import java.io.File;
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.net.URI;
|
|
||||||
import java.net.URISyntaxException;
|
|
||||||
import java.net.URL;
|
import java.net.URL;
|
||||||
import java.net.URLConnection;
|
import java.net.URLConnection;
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
@ -26,11 +24,10 @@ import org.w3c.dom.Document;
|
|||||||
import org.w3c.dom.Node;
|
import org.w3c.dom.Node;
|
||||||
import org.xml.sax.SAXException;
|
import org.xml.sax.SAXException;
|
||||||
|
|
||||||
import net.sf.ehcache.CacheManager;
|
|
||||||
import net.sourceforge.filebot.ResourceManager;
|
import net.sourceforge.filebot.ResourceManager;
|
||||||
|
|
||||||
|
|
||||||
public class IMDbClient extends AbstractEpisodeListProvider implements MovieIdentificationService {
|
public class IMDbClient implements MovieIdentificationService {
|
||||||
|
|
||||||
private final String host = "www.imdb.com";
|
private final String host = "www.imdb.com";
|
||||||
|
|
||||||
@ -47,73 +44,6 @@ public class IMDbClient extends AbstractEpisodeListProvider implements MovieIden
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public ResultCache getCache() {
|
|
||||||
return new ResultCache(host, CacheManager.getInstance().getCache("web-datasource"));
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public List<SearchResult> fetchSearchResult(String query, Locale locale) throws IOException, SAXException {
|
|
||||||
Document dom = parsePage(new URL("http", host, "/find?s=tt&q=" + encode(query)));
|
|
||||||
|
|
||||||
List<Node> nodes = selectNodes("//TABLE//A[following-sibling::SMALL[contains(.,'series')]]", dom);
|
|
||||||
List<SearchResult> results = new ArrayList<SearchResult>(nodes.size());
|
|
||||||
|
|
||||||
for (Node node : nodes) {
|
|
||||||
String name = normalizeName(node.getTextContent().trim());
|
|
||||||
String year = node.getNextSibling().getTextContent().trim().replaceAll("\\D+", ""); // remove non-number characters
|
|
||||||
String href = getAttribute("href", node);
|
|
||||||
|
|
||||||
results.add(new Movie(name, Integer.parseInt(year), getImdbId(href)));
|
|
||||||
}
|
|
||||||
|
|
||||||
// we might have been redirected to the movie page
|
|
||||||
if (results.isEmpty()) {
|
|
||||||
Movie movie = scrapeMovie(dom);
|
|
||||||
if (movie != null) {
|
|
||||||
results.add(movie);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return results;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public List<Episode> fetchEpisodeList(SearchResult searchResult, Locale locale) throws IOException, SAXException {
|
|
||||||
Movie movie = (Movie) searchResult;
|
|
||||||
Document dom = parsePage(getEpisodeListLink(searchResult).toURL());
|
|
||||||
|
|
||||||
String seriesName = normalizeName(selectString("//H1/A", dom));
|
|
||||||
Date year = new Date(movie.getYear(), 0, 0);
|
|
||||||
|
|
||||||
List<Node> nodes = selectNodes("//TABLE//H3/A[preceding-sibling::text()]", dom);
|
|
||||||
List<Episode> episodes = new ArrayList<Episode>(nodes.size());
|
|
||||||
|
|
||||||
for (Node node : nodes) {
|
|
||||||
String title = getTextContent(node);
|
|
||||||
|
|
||||||
Scanner numberScanner = new Scanner(node.getPreviousSibling().getTextContent()).useDelimiter("\\D+");
|
|
||||||
Integer season = numberScanner.nextInt();
|
|
||||||
Integer episode = numberScanner.nextInt();
|
|
||||||
|
|
||||||
// e.g. 20 May 2003
|
|
||||||
Date airdate = Date.parse(selectString("./following::STRONG", node), "dd MMMMM yyyyy");
|
|
||||||
|
|
||||||
episodes.add(new Episode(seriesName, year, season, episode, title, null, null, airdate));
|
|
||||||
}
|
|
||||||
|
|
||||||
return episodes;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
protected String normalizeName(String name) {
|
|
||||||
// remove quotation marks
|
|
||||||
return name.replaceAll("\"", "");
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
protected int getImdbId(String link) {
|
protected int getImdbId(String link) {
|
||||||
Matcher matcher = Pattern.compile("tt(\\d{7})").matcher(link);
|
Matcher matcher = Pattern.compile("tt(\\d{7})").matcher(link);
|
||||||
|
|
||||||
@ -126,22 +56,6 @@ public class IMDbClient extends AbstractEpisodeListProvider implements MovieIden
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public URI getEpisodeListLink(SearchResult searchResult) {
|
|
||||||
return getEpisodeListLink(searchResult, 0);
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public URI getEpisodeListLink(SearchResult searchResult, int season) {
|
|
||||||
try {
|
|
||||||
return new URI("http", host, String.format("/title/tt%07d/episodes", ((Movie) searchResult).getImdbId()), season > 0 ? String.format("season-%d", season) : null);
|
|
||||||
} catch (URISyntaxException e) {
|
|
||||||
throw new RuntimeException(e);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public List<Movie> searchMovie(String query, Locale locale) throws Exception {
|
public List<Movie> searchMovie(String query, Locale locale) throws Exception {
|
||||||
Document dom = parsePage(new URL("http", host, "/find?s=tt&q=" + encode(query)));
|
Document dom = parsePage(new URL("http", host, "/find?s=tt&q=" + encode(query)));
|
||||||
@ -162,7 +76,7 @@ public class IMDbClient extends AbstractEpisodeListProvider implements MovieIden
|
|||||||
results.add(new Movie(name, Integer.parseInt(year), getImdbId(href)));
|
results.add(new Movie(name, Integer.parseInt(year), getImdbId(href)));
|
||||||
} catch (Exception e) {
|
} catch (Exception e) {
|
||||||
// ignore illegal movies (TV Shows, Videos, Video Games, etc)
|
// ignore illegal movies (TV Shows, Videos, Video Games, etc)
|
||||||
Logger.getLogger(getClass().getName()).log(Level.FINEST, e.getClass().getName() + ": " + e.getMessage());
|
Logger.getLogger(getClass().getName()).log(Level.WARNING, e.getClass().getName() + ": " + e.getMessage());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -180,7 +94,7 @@ public class IMDbClient extends AbstractEpisodeListProvider implements MovieIden
|
|||||||
|
|
||||||
protected Movie scrapeMovie(Document dom) {
|
protected Movie scrapeMovie(Document dom) {
|
||||||
try {
|
try {
|
||||||
String name = normalizeName(selectString("//H1/text()", dom));
|
String name = selectString("//H1/text()", dom);
|
||||||
String year = new Scanner(selectString("//H1//SPAN", dom)).useDelimiter("\\D+").next();
|
String year = new Scanner(selectString("//H1//SPAN", dom)).useDelimiter("\\D+").next();
|
||||||
String url = selectString("//LINK[@rel='canonical']/@href", dom);
|
String url = selectString("//LINK[@rel='canonical']/@href", dom);
|
||||||
return new Movie(name, Integer.parseInt(year), getImdbId(url));
|
return new Movie(name, Integer.parseInt(year), getImdbId(url));
|
||||||
|
@ -14,96 +14,6 @@ public class IMDbClientTest {
|
|||||||
private final IMDbClient imdb = new IMDbClient();
|
private final IMDbClient imdb = new IMDbClient();
|
||||||
|
|
||||||
|
|
||||||
@Test
|
|
||||||
public void search() throws Exception {
|
|
||||||
List<SearchResult> results = imdb.search("battlestar");
|
|
||||||
|
|
||||||
Movie movie = (Movie) results.get(0);
|
|
||||||
|
|
||||||
assertEquals("Battlestar Galactica", movie.getName());
|
|
||||||
assertEquals(2004, movie.getYear());
|
|
||||||
assertEquals(407362, movie.getImdbId(), 0);
|
|
||||||
|
|
||||||
assertEquals(8, results.size(), 0);
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
@Test
|
|
||||||
public void searchMiniSeries() throws Exception {
|
|
||||||
List<SearchResult> results = imdb.search("generation kill");
|
|
||||||
|
|
||||||
Movie movie = (Movie) results.get(0);
|
|
||||||
|
|
||||||
assertEquals("Generation Kill", movie.getName());
|
|
||||||
assertEquals(2008, movie.getYear());
|
|
||||||
assertEquals(995832, movie.getImdbId(), 0);
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
@Test
|
|
||||||
public void searchNoMatch() throws Exception {
|
|
||||||
List<SearchResult> results = imdb.search("i will not find anything for this query string");
|
|
||||||
|
|
||||||
assertTrue(results.isEmpty());
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
@Test
|
|
||||||
public void searchResultPageRedirect() throws Exception {
|
|
||||||
List<SearchResult> results = imdb.search("my name is earl");
|
|
||||||
|
|
||||||
// exactly one search result
|
|
||||||
assertEquals(1, results.size(), 0);
|
|
||||||
|
|
||||||
Movie movie = (Movie) results.get(0);
|
|
||||||
|
|
||||||
assertEquals("My Name Is Earl", movie.getName());
|
|
||||||
assertEquals(460091, movie.getImdbId(), 0);
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
@Test
|
|
||||||
public void getEpisodeList() throws Exception {
|
|
||||||
List<Episode> list = imdb.getEpisodeList(new Movie("Buffy", 1997, 118276));
|
|
||||||
|
|
||||||
assertEquals(145, list.size());
|
|
||||||
|
|
||||||
Episode first = list.get(0);
|
|
||||||
|
|
||||||
assertEquals("Buffy the Vampire Slayer", first.getSeriesName());
|
|
||||||
assertEquals("1997-00-00", first.getSeriesStartDate().toString());
|
|
||||||
assertEquals("Unaired Pilot", first.getTitle());
|
|
||||||
assertEquals("0", first.getEpisode().toString());
|
|
||||||
assertEquals("1", first.getSeason().toString());
|
|
||||||
assertEquals(null, first.airdate());
|
|
||||||
|
|
||||||
Episode last = list.get(144);
|
|
||||||
|
|
||||||
assertEquals("Buffy the Vampire Slayer", last.getSeriesName());
|
|
||||||
assertEquals("1997-00-00", first.getSeriesStartDate().toString());
|
|
||||||
assertEquals("Chosen", last.getTitle());
|
|
||||||
assertEquals("22", last.getEpisode().toString());
|
|
||||||
assertEquals("7", last.getSeason().toString());
|
|
||||||
assertEquals("2003-05-20", last.airdate().toString());
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
@Test
|
|
||||||
public void getEpisodeListWithUnknownSeason() throws Exception {
|
|
||||||
List<Episode> list = imdb.getEpisodeList(new Movie("Mushishi", 2005, 807832));
|
|
||||||
|
|
||||||
assertEquals(26, list.size());
|
|
||||||
|
|
||||||
Episode first = list.get(0);
|
|
||||||
|
|
||||||
assertEquals("Mushi-Shi", first.getSeriesName());
|
|
||||||
assertEquals("2005-00-00", first.getSeriesStartDate().toString());
|
|
||||||
assertEquals("Midori no za", first.getTitle());
|
|
||||||
assertEquals("1", first.getEpisode().toString());
|
|
||||||
assertEquals("1", first.getSeason().toString());
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void searchMovie() throws Exception {
|
public void searchMovie() throws Exception {
|
||||||
List<Movie> results = imdb.searchMovie("Avatar", null);
|
List<Movie> results = imdb.searchMovie("Avatar", null);
|
||||||
@ -139,10 +49,4 @@ public class IMDbClientTest {
|
|||||||
assertEquals(499549, movie.getImdbId(), 0);
|
assertEquals(499549, movie.getImdbId(), 0);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@Test
|
|
||||||
public void getEpisodeListLink() throws Exception {
|
|
||||||
assertEquals("http://www.imdb.com/title/tt0407362/episodes", imdb.getEpisodeListLink(new Movie("Battlestar Galactica", 2004, 407362)).toString());
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user