1
0
mirror of https://github.com/mitb-archive/filebot synced 2024-08-13 17:03:45 -04:00

* update website scraper to subscene v3

This commit is contained in:
Reinhard Pointner 2012-07-16 10:06:40 +00:00
parent bbe5e27339
commit 8fa867ae49
3 changed files with 69 additions and 96 deletions

View File

@ -55,10 +55,10 @@ public class SubsceneSubtitleClient implements SubtitleProvider {
@Override @Override
public List<SearchResult> search(String query) throws IOException, SAXException { public List<SearchResult> search(String query) throws IOException, SAXException {
URL searchUrl = new URL("http", host, "/filmsearch.aspx?q=" + encode(query)); URL searchUrl = new URL("http", host, "/subtitles/title.aspx?q=" + encode(query));
Document dom = getHtmlDocument(searchUrl); Document dom = getHtmlDocument(searchUrl);
List<Node> nodes = selectNodes("id('filmSearch')/A", dom); List<Node> nodes = selectNodes("//H2[text()='Close']//following::DIV[@class='title']//A", dom);
List<SearchResult> searchResults = new ArrayList<SearchResult>(nodes.size()); List<SearchResult> searchResults = new ArrayList<SearchResult>(nodes.size());
Pattern titleSuffixPattern = Pattern.compile("\\s-\\s([^-]+)[(](\\d{4})[)]$"); Pattern titleSuffixPattern = Pattern.compile("\\s-\\s([^-]+)[(](\\d{4})[)]$");
@ -77,23 +77,6 @@ public class SubsceneSubtitleClient implements SubtitleProvider {
} }
} }
// we might have been redirected to the subtitle list
if (searchResults.isEmpty()) {
try {
// get name of current search result
String name = selectString("id('leftWrapperWide')//H1/text()", dom);
// get current location
String file = selectString("id('aspnetForm')/@action", dom);
if (!name.isEmpty() && !file.isEmpty()) {
searchResults.add(new HyperLink(name, new URL("http", host, file)));
}
} catch (Exception e) {
Logger.getLogger(getClass().getName()).log(Level.WARNING, "Cannot parse subtitle page: " + searchUrl, e);
}
}
return searchResults; return searchResults;
} }
@ -102,32 +85,21 @@ public class SubsceneSubtitleClient implements SubtitleProvider {
public List<SubtitleDescriptor> getSubtitleList(SearchResult searchResult, String languageName) throws Exception { public List<SubtitleDescriptor> getSubtitleList(SearchResult searchResult, String languageName) throws Exception {
URL subtitleListUrl = getSubtitleListLink(searchResult, languageName).toURL(); URL subtitleListUrl = getSubtitleListLink(searchResult, languageName).toURL();
String languageFilter = getLanguageFilter(languageName); String filter = getLanguageFilter(languageName);
Document subtitleListDocument = getSubtitleListDocument(subtitleListUrl, languageFilter); Document dom = getSubtitleListDocument(subtitleListUrl, filter);
// let's update language filters if they are not known yet List<Node> rows = selectNodes("//TD[@class='a1']", dom);
if (languageName != null && languageFilter == null) { List<SubtitleDescriptor> subtitles = new ArrayList<SubtitleDescriptor>();
updateLanguageFilterMap(subtitleListDocument); for (Node row : rows) {
}
return getSubtitleList(subtitleListUrl, languageName, subtitleListDocument);
}
private List<SubtitleDescriptor> getSubtitleList(URL subtitleListUrl, String languageName, Document subtitleListDocument) {
List<Node> nodes = selectNodes("//TABLE[@class='filmSubtitleList']//A[@class='a1']", subtitleListDocument);
List<SubtitleDescriptor> subtitles = new ArrayList<SubtitleDescriptor>(nodes.size());
for (Node node : nodes) {
try { try {
String lang = getTextContent(getChildren("SPAN", node).get(0)); List<Node> fields = selectNodes(".//SPAN", row);
String language = getTextContent(fields.get(0));
if (languageName == null || languageName.equalsIgnoreCase(lang)) { if (languageName == null || language.equalsIgnoreCase(languageName)) {
String name = getTextContent(getChildren("SPAN", node).get(1)); String name = getTextContent(fields.get(1));
String href = getAttribute("href", node); String href = selectString(".//A/@href", row);
URL subtitlePage = new URL(subtitleListUrl.getProtocol(), subtitleListUrl.getHost(), href); URL subtitlePage = new URL(subtitleListUrl.getProtocol(), subtitleListUrl.getHost(), href);
subtitles.add(new SubsceneSubtitleDescriptor(name, language, subtitlePage));
subtitles.add(new SubsceneSubtitleDescriptor(name, lang, subtitlePage));
} }
} catch (Exception e) { } catch (Exception e) {
Logger.getLogger(getClass().getName()).log(Level.WARNING, "Cannot parse subtitle node", e); Logger.getLogger(getClass().getName()).log(Level.WARNING, "Cannot parse subtitle node", e);
@ -142,55 +114,63 @@ public class SubsceneSubtitleClient implements SubtitleProvider {
URLConnection connection = subtitleListUrl.openConnection(); URLConnection connection = subtitleListUrl.openConnection();
if (languageFilter != null) { if (languageFilter != null) {
connection.addRequestProperty("Cookie", "subscene_sLanguageIds=" + languageFilter); connection.addRequestProperty("Cookie", "Filter=" + languageFilter);
} }
return getHtmlDocument(connection); return getHtmlDocument(connection);
} }
protected String getLanguageFilter(String languageName) { protected String getLanguageFilter(String languageName) throws IOException, SAXException {
if (languageName == null || languageName.isEmpty()) { if (languageName == null || languageName.isEmpty()) {
return null; return null;
} }
// try cache first
Cache cache = CacheManager.getInstance().getCache("web-persistent-datasource"); Cache cache = CacheManager.getInstance().getCache("web-persistent-datasource");
String cacheKey = getClass().getName() + ".languageFilter"; String cacheKey = getClass().getName() + ".languageFilter";
try {
Element element = cache.get(cacheKey); Element element = cache.get(cacheKey);
if (element == null) { if (element != null) {
return null;
}
return (String) ((Map<?, ?>) element.getValue()).get(languageName.toLowerCase()); return (String) ((Map<?, ?>) element.getValue()).get(languageName.toLowerCase());
} }
} catch (Exception e) {
Logger.getLogger(getClass().getName()).log(Level.WARNING, e.getMessage());
protected Map<String, String> updateLanguageFilterMap(Document subtitleListDocument) {
Map<String, String> filters = new HashMap<String, String>(50);
List<Node> nodes = selectNodes("//DIV[@class='languageList']/DIV", subtitleListDocument);
for (Node node : nodes) {
// select INPUT/@onclick, then ditch non-number-characters
String filter = getAttribute("onclick", getChild("INPUT", node)).replaceAll("\\D+", "");
if (filter != null) {
// select LABEL/text()
String name = getTextContent("LABEL", node);
filters.put(name.toLowerCase(), filter);
}
} }
// fetch new language filter data
Map<String, String> filters = getLanguageFilterMap();
// update cache after sanity check // update cache after sanity check
if (filters.size() > 42) { if (filters.size() > 42) {
Cache cache = CacheManager.getInstance().getCache("web-persistent-datasource"); try {
String cacheKey = getClass().getName() + ".languageFilter";
cache.put(new Element(cacheKey, filters)); cache.put(new Element(cacheKey, filters));
} catch (Exception e) {
Logger.getLogger(getClass().getName()).log(Level.WARNING, e.getMessage());
}
} else { } else {
Logger.getLogger(getClass().getName()).log(Level.WARNING, "Failed to scrape language filters: " + filters); Logger.getLogger(getClass().getName()).log(Level.WARNING, "Failed to scrape language filters: " + filters);
} }
return filters.get(languageName.toLowerCase());
}
protected Map<String, String> getLanguageFilterMap() throws IOException, SAXException {
Map<String, String> filters = new HashMap<String, String>(50);
Document dom = getHtmlDocument(new URL("http://subscene.com/filter"));
List<Node> checkboxes = selectNodes("//INPUT[@type='checkbox']", dom);
for (Node checkbox : checkboxes) {
String filter = getAttribute("value", checkbox);
if (filter != null) {
String name = selectString("./following::LABEL", checkbox);
filters.put(name.toLowerCase(), filter);
}
}
return filters; return filters;
} }

View File

@ -12,8 +12,7 @@ import java.util.HashMap;
import java.util.Map; import java.util.Map;
import org.w3c.dom.Document; import org.w3c.dom.Document;
import org.w3c.dom.Node;
import net.sourceforge.tuned.FileUtilities;
public class SubsceneSubtitleDescriptor implements SubtitleDescriptor { public class SubsceneSubtitleDescriptor implements SubtitleDescriptor {
@ -52,42 +51,36 @@ public class SubsceneSubtitleDescriptor implements SubtitleDescriptor {
@Override @Override
public ByteBuffer fetch() throws Exception { public ByteBuffer fetch() throws Exception {
// e.g. http://subscene.com/english/Firefly-The-Complete-Series/subtitle-40003-dlpath-20008/rar.zipx URL downloadLink = new URL(subtitlePage.getProtocol(), subtitlePage.getHost(), "/subtitle/download");
String subtitlePagePath = FileUtilities.getNameWithoutExtension(subtitlePage.getFile());
String path = String.format("%s-dlpath-%s/%s.zipx", subtitlePagePath, getSubtitleInfo().get("filmId"), getSubtitleInfo().get("typeId"));
URL downloadLocator = new URL(subtitlePage.getProtocol(), subtitlePage.getHost(), path); HttpURLConnection connection = (HttpURLConnection) downloadLink.openConnection();
Map<String, String> downloadPostData = subtitleInfo;
HttpURLConnection connection = (HttpURLConnection) downloadLocator.openConnection();
connection.addRequestProperty("Referer", subtitlePage.toString()); connection.addRequestProperty("Referer", subtitlePage.toString());
return WebRequest.post(connection, downloadPostData); return WebRequest.post(connection, getSubtitleInfo());
} }
private synchronized Map<String, String> getSubtitleInfo() { private synchronized Map<String, String> getSubtitleInfo() {
// extract subtitle information from subtitle page if necessary // extract subtitle information from subtitle page if necessary
if (subtitleInfo == null) { if (subtitleInfo == null) {
subtitleInfo = new HashMap<String, String>();
try { try {
Document dom = getHtmlDocument(subtitlePage); Document dom = getHtmlDocument(subtitlePage);
for (Node input : selectNodes("id('dl')//INPUT[@name]", dom)) {
subtitleInfo = new HashMap<String, String>(); subtitleInfo.put(getAttribute("name", input), getAttribute("value", input));
subtitleInfo.put("subtitleId", selectString("//INPUT[@name='subtitleId']/@value", dom)); }
subtitleInfo.put("typeId", selectString("//INPUT[@name='typeId']/@value", dom));
subtitleInfo.put("filmId", selectString("//INPUT[@name='filmId']/@value", dom));
} catch (Exception e) { } catch (Exception e) {
e.printStackTrace();
throw new RuntimeException("Failed to extract subtitle info", e); throw new RuntimeException("Failed to extract subtitle info", e);
} }
} }
return subtitleInfo; return subtitleInfo;
} }
@Override @Override
public String getPath() { public String getPath() {
return String.format("%s.%s", getName(), subtitleInfo == null ? null : subtitleInfo.get("typeId")); return getName();
} }

View File

@ -30,8 +30,8 @@ public class SubsceneSubtitleClientTest {
@BeforeClass @BeforeClass
public static void setUpBeforeClass() throws Exception { public static void setUpBeforeClass() throws Exception {
twinpeaksSearchResult = new SubsceneSearchResult("Twin Peaks", "Twin Peaks - First Season (1990)", new URL("http://subscene.com/Twin-Peaks-First-Season/subtitles-32482.aspx")); twinpeaksSearchResult = new SubsceneSearchResult("Twin Peaks", "Twin Peaks - First Season (1990)", new URL("http://subscene.com/subtitles/twin-peaks-first-season"));
lostSearchResult = new SubsceneSearchResult("Lost", "Lost - Fourth Season (2008)", new URL("http://subscene.com/Lost-Fourth-Season/subtitles-70963.aspx")); lostSearchResult = new SubsceneSearchResult("Lost", "Lost - Fourth Season (2008)", new URL("http://subscene.com/subtitles/lost-fourth-season"));
} }
@ -42,7 +42,7 @@ public class SubsceneSubtitleClientTest {
public void search() throws Exception { public void search() throws Exception {
List<SearchResult> results = subscene.search("twin peaks"); List<SearchResult> results = subscene.search("twin peaks");
SubsceneSearchResult result = (SubsceneSearchResult) results.get(1); SubsceneSearchResult result = (SubsceneSearchResult) results.get(0);
assertEquals(twinpeaksSearchResult.toString(), result.toString()); assertEquals(twinpeaksSearchResult.toString(), result.toString());
assertEquals(twinpeaksSearchResult.getURL().toString(), result.getURL().toString()); assertEquals(twinpeaksSearchResult.getURL().toString(), result.getURL().toString());
assertEquals(twinpeaksSearchResult.getName(), result.getName()); assertEquals(twinpeaksSearchResult.getName(), result.getName());
@ -50,14 +50,14 @@ public class SubsceneSubtitleClientTest {
@Test @Test
public void searchResultPageRedirect() throws Exception { public void search2() throws Exception {
List<SearchResult> results = subscene.search("firefly"); List<SearchResult> results = subscene.search("Avatar 2009");
assertEquals(2, results.size()); assertEquals(3, results.size());
SubsceneSearchResult result = (SubsceneSearchResult) results.get(0); SubsceneSearchResult result = (SubsceneSearchResult) results.get(0);
assertEquals("Firefly - The Complete Series (2002)", result.toString()); assertEquals("Firefly - The Complete Series (2002)", result.toString());
assertEquals("Firefly", result.getName()); assertEquals("Firefly", result.getName());
assertEquals("http://subscene.com/Firefly-The-Complete-Series/subtitles-20008.aspx", result.getURL().toString()); assertEquals("http://subscene.com/subtitles/firefly-the-complete-series", result.getURL().toString());
} }
@ -67,7 +67,7 @@ public class SubsceneSubtitleClientTest {
assertEquals(10, subtitleList.size()); assertEquals(10, subtitleList.size());
SubtitleDescriptor subtitle = subtitleList.get(0); SubtitleDescriptor subtitle = subtitleList.get(0);
assertEquals("Twin Peaks - First Season", subtitle.getName()); assertEquals("Twin-Peaks-S01E00-Pilot-eAlternate-ita sub by IScrew [www.ITALIANSHARE.net]", subtitle.getName());
assertEquals("Italian", subtitle.getLanguageName()); assertEquals("Italian", subtitle.getLanguageName());
} }
@ -83,7 +83,7 @@ public class SubsceneSubtitleClientTest {
@Test @Test
public void getLanguageFilterMap() throws Exception { public void getLanguageFilterMap() throws Exception {
Map<String, String> filters = subscene.updateLanguageFilterMap(subscene.getSubtitleListDocument(new URL("http://subscene.com/none/subtitles-0.aspx"), null)); Map<String, String> filters = subscene.getLanguageFilterMap();
assertEquals("1", filters.get("albanian")); assertEquals("1", filters.get("albanian"));
assertEquals("13", filters.get("english")); assertEquals("13", filters.get("english"));
@ -101,8 +101,8 @@ public class SubsceneSubtitleClientTest {
@Test @Test
public void downloadSubtitleArchive() throws Exception { public void downloadSubtitleArchive() throws Exception {
SearchResult selectedResult = subscene.search("firefly").get(0); SearchResult selectedResult = subscene.search("firefly").get(0);
SubtitleDescriptor subtitleDescriptor = subscene.getSubtitleList(selectedResult, "English").get(1); SubtitleDescriptor subtitleDescriptor = subscene.getSubtitleList(selectedResult, "English").get(0);
assertEquals("Firefly - The Complete Series", subtitleDescriptor.getName()); assertEquals("Firefly.S01E00-13.DVDRip-Rogue.eng-RETAIL", subtitleDescriptor.getName());
ByteBuffer archive = subtitleDescriptor.fetch(); ByteBuffer archive = subtitleDescriptor.fetch();
assertEquals(254549, archive.remaining()); assertEquals(254549, archive.remaining());