From 7633260147d0a0b48b8e2547d4274a66f665adec Mon Sep 17 00:00:00 2001 From: Reinhard Pointner Date: Wed, 8 Feb 2012 08:45:32 +0000 Subject: [PATCH] + removed broken IMDb Episode List Scraper --- .../net/sourceforge/filebot/WebServices.java | 2 +- .../sourceforge/filebot/web/IMDbClient.java | 92 +----------------- .../filebot/web/IMDbClientTest.java | 96 ------------------- 3 files changed, 4 insertions(+), 186 deletions(-) diff --git a/source/net/sourceforge/filebot/WebServices.java b/source/net/sourceforge/filebot/WebServices.java index 8d6fb54c..303b33b7 100644 --- a/source/net/sourceforge/filebot/WebServices.java +++ b/source/net/sourceforge/filebot/WebServices.java @@ -41,7 +41,7 @@ public final class WebServices { public static EpisodeListProvider[] getEpisodeListProviders() { - return new EpisodeListProvider[] { TVRage, AniDB, IMDb, TheTVDB, Serienjunkies }; + return new EpisodeListProvider[] { TVRage, AniDB, TheTVDB, Serienjunkies }; } diff --git a/source/net/sourceforge/filebot/web/IMDbClient.java b/source/net/sourceforge/filebot/web/IMDbClient.java index 971d6b04..21f535c5 100644 --- a/source/net/sourceforge/filebot/web/IMDbClient.java +++ b/source/net/sourceforge/filebot/web/IMDbClient.java @@ -7,8 +7,6 @@ import static net.sourceforge.tuned.XPathUtilities.*; import java.io.File; import java.io.IOException; -import java.net.URI; -import java.net.URISyntaxException; import java.net.URL; import java.net.URLConnection; import java.util.ArrayList; @@ -26,11 +24,10 @@ import org.w3c.dom.Document; import org.w3c.dom.Node; import org.xml.sax.SAXException; -import net.sf.ehcache.CacheManager; import net.sourceforge.filebot.ResourceManager; -public class IMDbClient extends AbstractEpisodeListProvider implements MovieIdentificationService { +public class IMDbClient implements MovieIdentificationService { private final String host = "www.imdb.com"; @@ -47,73 +44,6 @@ public class IMDbClient extends AbstractEpisodeListProvider implements MovieIden } - @Override - public ResultCache getCache() { - return new ResultCache(host, CacheManager.getInstance().getCache("web-datasource")); - } - - - @Override - public List fetchSearchResult(String query, Locale locale) throws IOException, SAXException { - Document dom = parsePage(new URL("http", host, "/find?s=tt&q=" + encode(query))); - - List nodes = selectNodes("//TABLE//A[following-sibling::SMALL[contains(.,'series')]]", dom); - List results = new ArrayList(nodes.size()); - - for (Node node : nodes) { - String name = normalizeName(node.getTextContent().trim()); - String year = node.getNextSibling().getTextContent().trim().replaceAll("\\D+", ""); // remove non-number characters - String href = getAttribute("href", node); - - results.add(new Movie(name, Integer.parseInt(year), getImdbId(href))); - } - - // we might have been redirected to the movie page - if (results.isEmpty()) { - Movie movie = scrapeMovie(dom); - if (movie != null) { - results.add(movie); - } - } - - return results; - } - - - @Override - public List fetchEpisodeList(SearchResult searchResult, Locale locale) throws IOException, SAXException { - Movie movie = (Movie) searchResult; - Document dom = parsePage(getEpisodeListLink(searchResult).toURL()); - - String seriesName = normalizeName(selectString("//H1/A", dom)); - Date year = new Date(movie.getYear(), 0, 0); - - List nodes = selectNodes("//TABLE//H3/A[preceding-sibling::text()]", dom); - List episodes = new ArrayList(nodes.size()); - - for (Node node : nodes) { - String title = getTextContent(node); - - Scanner numberScanner = new Scanner(node.getPreviousSibling().getTextContent()).useDelimiter("\\D+"); - Integer season = numberScanner.nextInt(); - Integer episode = numberScanner.nextInt(); - - // e.g. 20 May 2003 - Date airdate = Date.parse(selectString("./following::STRONG", node), "dd MMMMM yyyyy"); - - episodes.add(new Episode(seriesName, year, season, episode, title, null, null, airdate)); - } - - return episodes; - } - - - protected String normalizeName(String name) { - // remove quotation marks - return name.replaceAll("\"", ""); - } - - protected int getImdbId(String link) { Matcher matcher = Pattern.compile("tt(\\d{7})").matcher(link); @@ -126,22 +56,6 @@ public class IMDbClient extends AbstractEpisodeListProvider implements MovieIden } - @Override - public URI getEpisodeListLink(SearchResult searchResult) { - return getEpisodeListLink(searchResult, 0); - } - - - @Override - public URI getEpisodeListLink(SearchResult searchResult, int season) { - try { - return new URI("http", host, String.format("/title/tt%07d/episodes", ((Movie) searchResult).getImdbId()), season > 0 ? String.format("season-%d", season) : null); - } catch (URISyntaxException e) { - throw new RuntimeException(e); - } - } - - @Override public List searchMovie(String query, Locale locale) throws Exception { Document dom = parsePage(new URL("http", host, "/find?s=tt&q=" + encode(query))); @@ -162,7 +76,7 @@ public class IMDbClient extends AbstractEpisodeListProvider implements MovieIden results.add(new Movie(name, Integer.parseInt(year), getImdbId(href))); } catch (Exception e) { // ignore illegal movies (TV Shows, Videos, Video Games, etc) - Logger.getLogger(getClass().getName()).log(Level.FINEST, e.getClass().getName() + ": " + e.getMessage()); + Logger.getLogger(getClass().getName()).log(Level.WARNING, e.getClass().getName() + ": " + e.getMessage()); } } @@ -180,7 +94,7 @@ public class IMDbClient extends AbstractEpisodeListProvider implements MovieIden protected Movie scrapeMovie(Document dom) { try { - String name = normalizeName(selectString("//H1/text()", dom)); + String name = selectString("//H1/text()", dom); String year = new Scanner(selectString("//H1//SPAN", dom)).useDelimiter("\\D+").next(); String url = selectString("//LINK[@rel='canonical']/@href", dom); return new Movie(name, Integer.parseInt(year), getImdbId(url)); diff --git a/test/net/sourceforge/filebot/web/IMDbClientTest.java b/test/net/sourceforge/filebot/web/IMDbClientTest.java index 809c3d29..f15432bd 100644 --- a/test/net/sourceforge/filebot/web/IMDbClientTest.java +++ b/test/net/sourceforge/filebot/web/IMDbClientTest.java @@ -14,96 +14,6 @@ public class IMDbClientTest { private final IMDbClient imdb = new IMDbClient(); - @Test - public void search() throws Exception { - List results = imdb.search("battlestar"); - - Movie movie = (Movie) results.get(0); - - assertEquals("Battlestar Galactica", movie.getName()); - assertEquals(2004, movie.getYear()); - assertEquals(407362, movie.getImdbId(), 0); - - assertEquals(8, results.size(), 0); - } - - - @Test - public void searchMiniSeries() throws Exception { - List results = imdb.search("generation kill"); - - Movie movie = (Movie) results.get(0); - - assertEquals("Generation Kill", movie.getName()); - assertEquals(2008, movie.getYear()); - assertEquals(995832, movie.getImdbId(), 0); - } - - - @Test - public void searchNoMatch() throws Exception { - List results = imdb.search("i will not find anything for this query string"); - - assertTrue(results.isEmpty()); - } - - - @Test - public void searchResultPageRedirect() throws Exception { - List results = imdb.search("my name is earl"); - - // exactly one search result - assertEquals(1, results.size(), 0); - - Movie movie = (Movie) results.get(0); - - assertEquals("My Name Is Earl", movie.getName()); - assertEquals(460091, movie.getImdbId(), 0); - } - - - @Test - public void getEpisodeList() throws Exception { - List list = imdb.getEpisodeList(new Movie("Buffy", 1997, 118276)); - - assertEquals(145, list.size()); - - Episode first = list.get(0); - - assertEquals("Buffy the Vampire Slayer", first.getSeriesName()); - assertEquals("1997-00-00", first.getSeriesStartDate().toString()); - assertEquals("Unaired Pilot", first.getTitle()); - assertEquals("0", first.getEpisode().toString()); - assertEquals("1", first.getSeason().toString()); - assertEquals(null, first.airdate()); - - Episode last = list.get(144); - - assertEquals("Buffy the Vampire Slayer", last.getSeriesName()); - assertEquals("1997-00-00", first.getSeriesStartDate().toString()); - assertEquals("Chosen", last.getTitle()); - assertEquals("22", last.getEpisode().toString()); - assertEquals("7", last.getSeason().toString()); - assertEquals("2003-05-20", last.airdate().toString()); - } - - - @Test - public void getEpisodeListWithUnknownSeason() throws Exception { - List list = imdb.getEpisodeList(new Movie("Mushishi", 2005, 807832)); - - assertEquals(26, list.size()); - - Episode first = list.get(0); - - assertEquals("Mushi-Shi", first.getSeriesName()); - assertEquals("2005-00-00", first.getSeriesStartDate().toString()); - assertEquals("Midori no za", first.getTitle()); - assertEquals("1", first.getEpisode().toString()); - assertEquals("1", first.getSeason().toString()); - } - - @Test public void searchMovie() throws Exception { List results = imdb.searchMovie("Avatar", null); @@ -139,10 +49,4 @@ public class IMDbClientTest { assertEquals(499549, movie.getImdbId(), 0); } - - @Test - public void getEpisodeListLink() throws Exception { - assertEquals("http://www.imdb.com/title/tt0407362/episodes", imdb.getEpisodeListLink(new Movie("Battlestar Galactica", 2004, 407362)).toString()); - } - }