From e1dea3b51449b114355e1b4cf95a6e875c51f720 Mon Sep 17 00:00:00 2001 From: Reinhard Pointner Date: Sat, 7 Jan 2012 16:42:12 +0000 Subject: [PATCH] * try to make imdb scraper more robust --- .../sourceforge/filebot/web/IMDbClient.java | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/source/net/sourceforge/filebot/web/IMDbClient.java b/source/net/sourceforge/filebot/web/IMDbClient.java index d3bd78ea..971d6b04 100644 --- a/source/net/sourceforge/filebot/web/IMDbClient.java +++ b/source/net/sourceforge/filebot/web/IMDbClient.java @@ -15,6 +15,8 @@ import java.util.ArrayList; import java.util.List; import java.util.Locale; import java.util.Scanner; +import java.util.logging.Level; +import java.util.logging.Logger; import java.util.regex.Matcher; import java.util.regex.Pattern; @@ -145,18 +147,22 @@ public class IMDbClient extends AbstractEpisodeListProvider implements MovieIden Document dom = parsePage(new URL("http", host, "/find?s=tt&q=" + encode(query))); // select movie links followed by year in parenthesis - List nodes = selectNodes("//TABLE//A[string-length(substring-after(substring-before(following::text(),')'),'(')) = 4 and count(following-sibling::SMALL) = 0]", dom); + List nodes = selectNodes("//TABLE//A[substring-after(substring-before(following::text(),')'),'(')]", dom); List results = new ArrayList(nodes.size()); for (Node node : nodes) { - String name = node.getTextContent().trim(); - String year = node.getNextSibling().getTextContent().trim().replaceAll("[\\p{Punct}\\p{Space}]+", ""); // remove non-number characters - String href = getAttribute("href", node); - try { + String name = node.getTextContent().trim(); + if (name.startsWith("\"")) + continue; + + String year = node.getNextSibling().getTextContent().replaceAll("[\\p{Punct}\\p{Space}]+", "").trim(); // remove non-number characters + String href = getAttribute("href", node); + results.add(new Movie(name, Integer.parseInt(year), getImdbId(href))); - } catch (NumberFormatException e) { + } catch (Exception e) { // ignore illegal movies (TV Shows, Videos, Video Games, etc) + Logger.getLogger(getClass().getName()).log(Level.FINEST, e.getClass().getName() + ": " + e.getMessage()); } }