From 8fa867ae494ea78f37489b7ab3081ff9964a1c2f Mon Sep 17 00:00:00 2001 From: Reinhard Pointner Date: Mon, 16 Jul 2012 10:06:40 +0000 Subject: [PATCH] * update website scraper to subscene v3 --- .../filebot/web/SubsceneSubtitleClient.java | 116 ++++++++---------- .../web/SubsceneSubtitleDescriptor.java | 27 ++-- .../web/SubsceneSubtitleClientTest.java | 22 ++-- 3 files changed, 69 insertions(+), 96 deletions(-) diff --git a/source/net/sourceforge/filebot/web/SubsceneSubtitleClient.java b/source/net/sourceforge/filebot/web/SubsceneSubtitleClient.java index 7761e336..44a3e74d 100644 --- a/source/net/sourceforge/filebot/web/SubsceneSubtitleClient.java +++ b/source/net/sourceforge/filebot/web/SubsceneSubtitleClient.java @@ -55,10 +55,10 @@ public class SubsceneSubtitleClient implements SubtitleProvider { @Override public List search(String query) throws IOException, SAXException { - URL searchUrl = new URL("http", host, "/filmsearch.aspx?q=" + encode(query)); + URL searchUrl = new URL("http", host, "/subtitles/title.aspx?q=" + encode(query)); Document dom = getHtmlDocument(searchUrl); - List nodes = selectNodes("id('filmSearch')/A", dom); + List nodes = selectNodes("//H2[text()='Close']//following::DIV[@class='title']//A", dom); List searchResults = new ArrayList(nodes.size()); Pattern titleSuffixPattern = Pattern.compile("\\s-\\s([^-]+)[(](\\d{4})[)]$"); @@ -77,23 +77,6 @@ public class SubsceneSubtitleClient implements SubtitleProvider { } } - // we might have been redirected to the subtitle list - if (searchResults.isEmpty()) { - try { - // get name of current search result - String name = selectString("id('leftWrapperWide')//H1/text()", dom); - - // get current location - String file = selectString("id('aspnetForm')/@action", dom); - - if (!name.isEmpty() && !file.isEmpty()) { - searchResults.add(new HyperLink(name, new URL("http", host, file))); - } - } catch (Exception e) { - Logger.getLogger(getClass().getName()).log(Level.WARNING, "Cannot parse subtitle page: " + searchUrl, e); - } - } - return searchResults; } @@ -102,32 +85,21 @@ public class SubsceneSubtitleClient implements SubtitleProvider { public List getSubtitleList(SearchResult searchResult, String languageName) throws Exception { URL subtitleListUrl = getSubtitleListLink(searchResult, languageName).toURL(); - String languageFilter = getLanguageFilter(languageName); - Document subtitleListDocument = getSubtitleListDocument(subtitleListUrl, languageFilter); + String filter = getLanguageFilter(languageName); + Document dom = getSubtitleListDocument(subtitleListUrl, filter); - // let's update language filters if they are not known yet - if (languageName != null && languageFilter == null) { - updateLanguageFilterMap(subtitleListDocument); - } - - return getSubtitleList(subtitleListUrl, languageName, subtitleListDocument); - } - - - private List getSubtitleList(URL subtitleListUrl, String languageName, Document subtitleListDocument) { - List nodes = selectNodes("//TABLE[@class='filmSubtitleList']//A[@class='a1']", subtitleListDocument); - List subtitles = new ArrayList(nodes.size()); - - for (Node node : nodes) { + List rows = selectNodes("//TD[@class='a1']", dom); + List subtitles = new ArrayList(); + for (Node row : rows) { try { - String lang = getTextContent(getChildren("SPAN", node).get(0)); + List fields = selectNodes(".//SPAN", row); + String language = getTextContent(fields.get(0)); - if (languageName == null || languageName.equalsIgnoreCase(lang)) { - String name = getTextContent(getChildren("SPAN", node).get(1)); - String href = getAttribute("href", node); + if (languageName == null || language.equalsIgnoreCase(languageName)) { + String name = getTextContent(fields.get(1)); + String href = selectString(".//A/@href", row); URL subtitlePage = new URL(subtitleListUrl.getProtocol(), subtitleListUrl.getHost(), href); - - subtitles.add(new SubsceneSubtitleDescriptor(name, lang, subtitlePage)); + subtitles.add(new SubsceneSubtitleDescriptor(name, language, subtitlePage)); } } catch (Exception e) { Logger.getLogger(getClass().getName()).log(Level.WARNING, "Cannot parse subtitle node", e); @@ -142,55 +114,63 @@ public class SubsceneSubtitleClient implements SubtitleProvider { URLConnection connection = subtitleListUrl.openConnection(); if (languageFilter != null) { - connection.addRequestProperty("Cookie", "subscene_sLanguageIds=" + languageFilter); + connection.addRequestProperty("Cookie", "Filter=" + languageFilter); } return getHtmlDocument(connection); } - protected String getLanguageFilter(String languageName) { + protected String getLanguageFilter(String languageName) throws IOException, SAXException { if (languageName == null || languageName.isEmpty()) { return null; } + // try cache first Cache cache = CacheManager.getInstance().getCache("web-persistent-datasource"); String cacheKey = getClass().getName() + ".languageFilter"; - Element element = cache.get(cacheKey); - if (element == null) { - return null; - } - - return (String) ((Map) element.getValue()).get(languageName.toLowerCase()); - } - - - protected Map updateLanguageFilterMap(Document subtitleListDocument) { - Map filters = new HashMap(50); - List nodes = selectNodes("//DIV[@class='languageList']/DIV", subtitleListDocument); - - for (Node node : nodes) { - // select INPUT/@onclick, then ditch non-number-characters - String filter = getAttribute("onclick", getChild("INPUT", node)).replaceAll("\\D+", ""); - - if (filter != null) { - // select LABEL/text() - String name = getTextContent("LABEL", node); - - filters.put(name.toLowerCase(), filter); + try { + Element element = cache.get(cacheKey); + if (element != null) { + return (String) ((Map) element.getValue()).get(languageName.toLowerCase()); } + } catch (Exception e) { + Logger.getLogger(getClass().getName()).log(Level.WARNING, e.getMessage()); } + // fetch new language filter data + Map filters = getLanguageFilterMap(); + // update cache after sanity check if (filters.size() > 42) { - Cache cache = CacheManager.getInstance().getCache("web-persistent-datasource"); - String cacheKey = getClass().getName() + ".languageFilter"; - cache.put(new Element(cacheKey, filters)); + try { + cache.put(new Element(cacheKey, filters)); + } catch (Exception e) { + Logger.getLogger(getClass().getName()).log(Level.WARNING, e.getMessage()); + } } else { Logger.getLogger(getClass().getName()).log(Level.WARNING, "Failed to scrape language filters: " + filters); } + return filters.get(languageName.toLowerCase()); + } + + + protected Map getLanguageFilterMap() throws IOException, SAXException { + Map filters = new HashMap(50); + + Document dom = getHtmlDocument(new URL("http://subscene.com/filter")); + List checkboxes = selectNodes("//INPUT[@type='checkbox']", dom); + + for (Node checkbox : checkboxes) { + String filter = getAttribute("value", checkbox); + if (filter != null) { + String name = selectString("./following::LABEL", checkbox); + filters.put(name.toLowerCase(), filter); + } + } + return filters; } diff --git a/source/net/sourceforge/filebot/web/SubsceneSubtitleDescriptor.java b/source/net/sourceforge/filebot/web/SubsceneSubtitleDescriptor.java index e36be5a0..6f0c97ee 100644 --- a/source/net/sourceforge/filebot/web/SubsceneSubtitleDescriptor.java +++ b/source/net/sourceforge/filebot/web/SubsceneSubtitleDescriptor.java @@ -12,8 +12,7 @@ import java.util.HashMap; import java.util.Map; import org.w3c.dom.Document; - -import net.sourceforge.tuned.FileUtilities; +import org.w3c.dom.Node; public class SubsceneSubtitleDescriptor implements SubtitleDescriptor { @@ -52,42 +51,36 @@ public class SubsceneSubtitleDescriptor implements SubtitleDescriptor { @Override public ByteBuffer fetch() throws Exception { - // e.g. http://subscene.com/english/Firefly-The-Complete-Series/subtitle-40003-dlpath-20008/rar.zipx - String subtitlePagePath = FileUtilities.getNameWithoutExtension(subtitlePage.getFile()); - String path = String.format("%s-dlpath-%s/%s.zipx", subtitlePagePath, getSubtitleInfo().get("filmId"), getSubtitleInfo().get("typeId")); + URL downloadLink = new URL(subtitlePage.getProtocol(), subtitlePage.getHost(), "/subtitle/download"); - URL downloadLocator = new URL(subtitlePage.getProtocol(), subtitlePage.getHost(), path); - Map downloadPostData = subtitleInfo; - - HttpURLConnection connection = (HttpURLConnection) downloadLocator.openConnection(); + HttpURLConnection connection = (HttpURLConnection) downloadLink.openConnection(); connection.addRequestProperty("Referer", subtitlePage.toString()); - return WebRequest.post(connection, downloadPostData); + return WebRequest.post(connection, getSubtitleInfo()); } private synchronized Map getSubtitleInfo() { // extract subtitle information from subtitle page if necessary if (subtitleInfo == null) { + subtitleInfo = new HashMap(); try { Document dom = getHtmlDocument(subtitlePage); - - subtitleInfo = new HashMap(); - subtitleInfo.put("subtitleId", selectString("//INPUT[@name='subtitleId']/@value", dom)); - subtitleInfo.put("typeId", selectString("//INPUT[@name='typeId']/@value", dom)); - subtitleInfo.put("filmId", selectString("//INPUT[@name='filmId']/@value", dom)); + for (Node input : selectNodes("id('dl')//INPUT[@name]", dom)) { + subtitleInfo.put(getAttribute("name", input), getAttribute("value", input)); + } } catch (Exception e) { + e.printStackTrace(); throw new RuntimeException("Failed to extract subtitle info", e); } } - return subtitleInfo; } @Override public String getPath() { - return String.format("%s.%s", getName(), subtitleInfo == null ? null : subtitleInfo.get("typeId")); + return getName(); } diff --git a/test/net/sourceforge/filebot/web/SubsceneSubtitleClientTest.java b/test/net/sourceforge/filebot/web/SubsceneSubtitleClientTest.java index 8886aadc..fc2fa89c 100644 --- a/test/net/sourceforge/filebot/web/SubsceneSubtitleClientTest.java +++ b/test/net/sourceforge/filebot/web/SubsceneSubtitleClientTest.java @@ -30,8 +30,8 @@ public class SubsceneSubtitleClientTest { @BeforeClass public static void setUpBeforeClass() throws Exception { - twinpeaksSearchResult = new SubsceneSearchResult("Twin Peaks", "Twin Peaks - First Season (1990)", new URL("http://subscene.com/Twin-Peaks-First-Season/subtitles-32482.aspx")); - lostSearchResult = new SubsceneSearchResult("Lost", "Lost - Fourth Season (2008)", new URL("http://subscene.com/Lost-Fourth-Season/subtitles-70963.aspx")); + twinpeaksSearchResult = new SubsceneSearchResult("Twin Peaks", "Twin Peaks - First Season (1990)", new URL("http://subscene.com/subtitles/twin-peaks-first-season")); + lostSearchResult = new SubsceneSearchResult("Lost", "Lost - Fourth Season (2008)", new URL("http://subscene.com/subtitles/lost-fourth-season")); } @@ -42,7 +42,7 @@ public class SubsceneSubtitleClientTest { public void search() throws Exception { List results = subscene.search("twin peaks"); - SubsceneSearchResult result = (SubsceneSearchResult) results.get(1); + SubsceneSearchResult result = (SubsceneSearchResult) results.get(0); assertEquals(twinpeaksSearchResult.toString(), result.toString()); assertEquals(twinpeaksSearchResult.getURL().toString(), result.getURL().toString()); assertEquals(twinpeaksSearchResult.getName(), result.getName()); @@ -50,14 +50,14 @@ public class SubsceneSubtitleClientTest { @Test - public void searchResultPageRedirect() throws Exception { - List results = subscene.search("firefly"); - assertEquals(2, results.size()); + public void search2() throws Exception { + List results = subscene.search("Avatar 2009"); + assertEquals(3, results.size()); SubsceneSearchResult result = (SubsceneSearchResult) results.get(0); assertEquals("Firefly - The Complete Series (2002)", result.toString()); assertEquals("Firefly", result.getName()); - assertEquals("http://subscene.com/Firefly-The-Complete-Series/subtitles-20008.aspx", result.getURL().toString()); + assertEquals("http://subscene.com/subtitles/firefly-the-complete-series", result.getURL().toString()); } @@ -67,7 +67,7 @@ public class SubsceneSubtitleClientTest { assertEquals(10, subtitleList.size()); SubtitleDescriptor subtitle = subtitleList.get(0); - assertEquals("Twin Peaks - First Season", subtitle.getName()); + assertEquals("Twin-Peaks-S01E00-Pilot-eAlternate-ita sub by IScrew [www.ITALIANSHARE.net]", subtitle.getName()); assertEquals("Italian", subtitle.getLanguageName()); } @@ -83,7 +83,7 @@ public class SubsceneSubtitleClientTest { @Test public void getLanguageFilterMap() throws Exception { - Map filters = subscene.updateLanguageFilterMap(subscene.getSubtitleListDocument(new URL("http://subscene.com/none/subtitles-0.aspx"), null)); + Map filters = subscene.getLanguageFilterMap(); assertEquals("1", filters.get("albanian")); assertEquals("13", filters.get("english")); @@ -101,8 +101,8 @@ public class SubsceneSubtitleClientTest { @Test public void downloadSubtitleArchive() throws Exception { SearchResult selectedResult = subscene.search("firefly").get(0); - SubtitleDescriptor subtitleDescriptor = subscene.getSubtitleList(selectedResult, "English").get(1); - assertEquals("Firefly - The Complete Series", subtitleDescriptor.getName()); + SubtitleDescriptor subtitleDescriptor = subscene.getSubtitleList(selectedResult, "English").get(0); + assertEquals("Firefly.S01E00-13.DVDRip-Rogue.eng-RETAIL", subtitleDescriptor.getName()); ByteBuffer archive = subtitleDescriptor.fetch(); assertEquals(254549, archive.remaining());