From 6217589f82fe29db5488a20127204d2555e572ae Mon Sep 17 00:00:00 2001 From: Reinhard Pointner Date: Wed, 24 Jul 2013 04:59:13 +0000 Subject: [PATCH] * imdb page scraper helper for people that really need it --- .../sourceforge/filebot/format/ExpressionFormat.lib.groovy | 2 +- source/net/sourceforge/filebot/web/IMDbClient.java | 5 +++++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/source/net/sourceforge/filebot/format/ExpressionFormat.lib.groovy b/source/net/sourceforge/filebot/format/ExpressionFormat.lib.groovy index e1dfc656..d43ecb36 100644 --- a/source/net/sourceforge/filebot/format/ExpressionFormat.lib.groovy +++ b/source/net/sourceforge/filebot/format/ExpressionFormat.lib.groovy @@ -47,7 +47,7 @@ String.metaClass.pad = Number.metaClass.pad = { length = 2, padding = "0" -> del * Return a substring matching the given pattern or break. */ String.metaClass.match = { String pattern, matchGroup = null -> - def matcher = Pattern.compile(pattern, Pattern.CASE_INSENSITIVE | Pattern.UNICODE_CASE | Pattern.MULTILINE).matcher(delegate) + def matcher = Pattern.compile(pattern, Pattern.CASE_INSENSITIVE | Pattern.UNICODE_CASE | Pattern.MULTILINE | Pattern.DOTALL).matcher(delegate) if (matcher.find()) return matcher.groupCount() > 0 && matchGroup == null ? matcher.group(1) : matcher.group(matchGroup ?: 0) else diff --git a/source/net/sourceforge/filebot/web/IMDbClient.java b/source/net/sourceforge/filebot/web/IMDbClient.java index 7cb0b49c..4a06700c 100644 --- a/source/net/sourceforge/filebot/web/IMDbClient.java +++ b/source/net/sourceforge/filebot/web/IMDbClient.java @@ -155,6 +155,11 @@ public class IMDbClient implements MovieIdentificationService { } + public String scrape(String imdbid, String xpath) throws IOException, SAXException { + return selectString(xpath, parsePage(getMoviePageLink(getImdbId(imdbid)).toURL())); // helper for scraping data in user scripts + } + + public URI getMoviePageLink(int imdbId) { return URI.create(String.format("http://www.imdb.com/title/tt%07d/", imdbId)); }