mirror of
https://github.com/mitb-archive/filebot
synced 2024-08-13 17:03:45 -04:00
* update website scraper to subscene v3
This commit is contained in:
parent
bbe5e27339
commit
8fa867ae49
@ -55,10 +55,10 @@ public class SubsceneSubtitleClient implements SubtitleProvider {
|
||||
|
||||
@Override
|
||||
public List<SearchResult> search(String query) throws IOException, SAXException {
|
||||
URL searchUrl = new URL("http", host, "/filmsearch.aspx?q=" + encode(query));
|
||||
URL searchUrl = new URL("http", host, "/subtitles/title.aspx?q=" + encode(query));
|
||||
Document dom = getHtmlDocument(searchUrl);
|
||||
|
||||
List<Node> nodes = selectNodes("id('filmSearch')/A", dom);
|
||||
List<Node> nodes = selectNodes("//H2[text()='Close']//following::DIV[@class='title']//A", dom);
|
||||
List<SearchResult> searchResults = new ArrayList<SearchResult>(nodes.size());
|
||||
|
||||
Pattern titleSuffixPattern = Pattern.compile("\\s-\\s([^-]+)[(](\\d{4})[)]$");
|
||||
@ -77,23 +77,6 @@ public class SubsceneSubtitleClient implements SubtitleProvider {
|
||||
}
|
||||
}
|
||||
|
||||
// we might have been redirected to the subtitle list
|
||||
if (searchResults.isEmpty()) {
|
||||
try {
|
||||
// get name of current search result
|
||||
String name = selectString("id('leftWrapperWide')//H1/text()", dom);
|
||||
|
||||
// get current location
|
||||
String file = selectString("id('aspnetForm')/@action", dom);
|
||||
|
||||
if (!name.isEmpty() && !file.isEmpty()) {
|
||||
searchResults.add(new HyperLink(name, new URL("http", host, file)));
|
||||
}
|
||||
} catch (Exception e) {
|
||||
Logger.getLogger(getClass().getName()).log(Level.WARNING, "Cannot parse subtitle page: " + searchUrl, e);
|
||||
}
|
||||
}
|
||||
|
||||
return searchResults;
|
||||
}
|
||||
|
||||
@ -102,32 +85,21 @@ public class SubsceneSubtitleClient implements SubtitleProvider {
|
||||
public List<SubtitleDescriptor> getSubtitleList(SearchResult searchResult, String languageName) throws Exception {
|
||||
URL subtitleListUrl = getSubtitleListLink(searchResult, languageName).toURL();
|
||||
|
||||
String languageFilter = getLanguageFilter(languageName);
|
||||
Document subtitleListDocument = getSubtitleListDocument(subtitleListUrl, languageFilter);
|
||||
String filter = getLanguageFilter(languageName);
|
||||
Document dom = getSubtitleListDocument(subtitleListUrl, filter);
|
||||
|
||||
// let's update language filters if they are not known yet
|
||||
if (languageName != null && languageFilter == null) {
|
||||
updateLanguageFilterMap(subtitleListDocument);
|
||||
}
|
||||
|
||||
return getSubtitleList(subtitleListUrl, languageName, subtitleListDocument);
|
||||
}
|
||||
|
||||
|
||||
private List<SubtitleDescriptor> getSubtitleList(URL subtitleListUrl, String languageName, Document subtitleListDocument) {
|
||||
List<Node> nodes = selectNodes("//TABLE[@class='filmSubtitleList']//A[@class='a1']", subtitleListDocument);
|
||||
List<SubtitleDescriptor> subtitles = new ArrayList<SubtitleDescriptor>(nodes.size());
|
||||
|
||||
for (Node node : nodes) {
|
||||
List<Node> rows = selectNodes("//TD[@class='a1']", dom);
|
||||
List<SubtitleDescriptor> subtitles = new ArrayList<SubtitleDescriptor>();
|
||||
for (Node row : rows) {
|
||||
try {
|
||||
String lang = getTextContent(getChildren("SPAN", node).get(0));
|
||||
List<Node> fields = selectNodes(".//SPAN", row);
|
||||
String language = getTextContent(fields.get(0));
|
||||
|
||||
if (languageName == null || languageName.equalsIgnoreCase(lang)) {
|
||||
String name = getTextContent(getChildren("SPAN", node).get(1));
|
||||
String href = getAttribute("href", node);
|
||||
if (languageName == null || language.equalsIgnoreCase(languageName)) {
|
||||
String name = getTextContent(fields.get(1));
|
||||
String href = selectString(".//A/@href", row);
|
||||
URL subtitlePage = new URL(subtitleListUrl.getProtocol(), subtitleListUrl.getHost(), href);
|
||||
|
||||
subtitles.add(new SubsceneSubtitleDescriptor(name, lang, subtitlePage));
|
||||
subtitles.add(new SubsceneSubtitleDescriptor(name, language, subtitlePage));
|
||||
}
|
||||
} catch (Exception e) {
|
||||
Logger.getLogger(getClass().getName()).log(Level.WARNING, "Cannot parse subtitle node", e);
|
||||
@ -142,55 +114,63 @@ public class SubsceneSubtitleClient implements SubtitleProvider {
|
||||
URLConnection connection = subtitleListUrl.openConnection();
|
||||
|
||||
if (languageFilter != null) {
|
||||
connection.addRequestProperty("Cookie", "subscene_sLanguageIds=" + languageFilter);
|
||||
connection.addRequestProperty("Cookie", "Filter=" + languageFilter);
|
||||
}
|
||||
|
||||
return getHtmlDocument(connection);
|
||||
}
|
||||
|
||||
|
||||
protected String getLanguageFilter(String languageName) {
|
||||
protected String getLanguageFilter(String languageName) throws IOException, SAXException {
|
||||
if (languageName == null || languageName.isEmpty()) {
|
||||
return null;
|
||||
}
|
||||
|
||||
// try cache first
|
||||
Cache cache = CacheManager.getInstance().getCache("web-persistent-datasource");
|
||||
String cacheKey = getClass().getName() + ".languageFilter";
|
||||
|
||||
Element element = cache.get(cacheKey);
|
||||
if (element == null) {
|
||||
return null;
|
||||
}
|
||||
|
||||
return (String) ((Map<?, ?>) element.getValue()).get(languageName.toLowerCase());
|
||||
}
|
||||
|
||||
|
||||
protected Map<String, String> updateLanguageFilterMap(Document subtitleListDocument) {
|
||||
Map<String, String> filters = new HashMap<String, String>(50);
|
||||
List<Node> nodes = selectNodes("//DIV[@class='languageList']/DIV", subtitleListDocument);
|
||||
|
||||
for (Node node : nodes) {
|
||||
// select INPUT/@onclick, then ditch non-number-characters
|
||||
String filter = getAttribute("onclick", getChild("INPUT", node)).replaceAll("\\D+", "");
|
||||
|
||||
if (filter != null) {
|
||||
// select LABEL/text()
|
||||
String name = getTextContent("LABEL", node);
|
||||
|
||||
filters.put(name.toLowerCase(), filter);
|
||||
try {
|
||||
Element element = cache.get(cacheKey);
|
||||
if (element != null) {
|
||||
return (String) ((Map<?, ?>) element.getValue()).get(languageName.toLowerCase());
|
||||
}
|
||||
} catch (Exception e) {
|
||||
Logger.getLogger(getClass().getName()).log(Level.WARNING, e.getMessage());
|
||||
}
|
||||
|
||||
// fetch new language filter data
|
||||
Map<String, String> filters = getLanguageFilterMap();
|
||||
|
||||
// update cache after sanity check
|
||||
if (filters.size() > 42) {
|
||||
Cache cache = CacheManager.getInstance().getCache("web-persistent-datasource");
|
||||
String cacheKey = getClass().getName() + ".languageFilter";
|
||||
cache.put(new Element(cacheKey, filters));
|
||||
try {
|
||||
cache.put(new Element(cacheKey, filters));
|
||||
} catch (Exception e) {
|
||||
Logger.getLogger(getClass().getName()).log(Level.WARNING, e.getMessage());
|
||||
}
|
||||
} else {
|
||||
Logger.getLogger(getClass().getName()).log(Level.WARNING, "Failed to scrape language filters: " + filters);
|
||||
}
|
||||
|
||||
return filters.get(languageName.toLowerCase());
|
||||
}
|
||||
|
||||
|
||||
protected Map<String, String> getLanguageFilterMap() throws IOException, SAXException {
|
||||
Map<String, String> filters = new HashMap<String, String>(50);
|
||||
|
||||
Document dom = getHtmlDocument(new URL("http://subscene.com/filter"));
|
||||
List<Node> checkboxes = selectNodes("//INPUT[@type='checkbox']", dom);
|
||||
|
||||
for (Node checkbox : checkboxes) {
|
||||
String filter = getAttribute("value", checkbox);
|
||||
if (filter != null) {
|
||||
String name = selectString("./following::LABEL", checkbox);
|
||||
filters.put(name.toLowerCase(), filter);
|
||||
}
|
||||
}
|
||||
|
||||
return filters;
|
||||
}
|
||||
|
||||
|
@ -12,8 +12,7 @@ import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
|
||||
import org.w3c.dom.Document;
|
||||
|
||||
import net.sourceforge.tuned.FileUtilities;
|
||||
import org.w3c.dom.Node;
|
||||
|
||||
|
||||
public class SubsceneSubtitleDescriptor implements SubtitleDescriptor {
|
||||
@ -52,42 +51,36 @@ public class SubsceneSubtitleDescriptor implements SubtitleDescriptor {
|
||||
|
||||
@Override
|
||||
public ByteBuffer fetch() throws Exception {
|
||||
// e.g. http://subscene.com/english/Firefly-The-Complete-Series/subtitle-40003-dlpath-20008/rar.zipx
|
||||
String subtitlePagePath = FileUtilities.getNameWithoutExtension(subtitlePage.getFile());
|
||||
String path = String.format("%s-dlpath-%s/%s.zipx", subtitlePagePath, getSubtitleInfo().get("filmId"), getSubtitleInfo().get("typeId"));
|
||||
URL downloadLink = new URL(subtitlePage.getProtocol(), subtitlePage.getHost(), "/subtitle/download");
|
||||
|
||||
URL downloadLocator = new URL(subtitlePage.getProtocol(), subtitlePage.getHost(), path);
|
||||
Map<String, String> downloadPostData = subtitleInfo;
|
||||
|
||||
HttpURLConnection connection = (HttpURLConnection) downloadLocator.openConnection();
|
||||
HttpURLConnection connection = (HttpURLConnection) downloadLink.openConnection();
|
||||
connection.addRequestProperty("Referer", subtitlePage.toString());
|
||||
|
||||
return WebRequest.post(connection, downloadPostData);
|
||||
return WebRequest.post(connection, getSubtitleInfo());
|
||||
}
|
||||
|
||||
|
||||
private synchronized Map<String, String> getSubtitleInfo() {
|
||||
// extract subtitle information from subtitle page if necessary
|
||||
if (subtitleInfo == null) {
|
||||
subtitleInfo = new HashMap<String, String>();
|
||||
try {
|
||||
Document dom = getHtmlDocument(subtitlePage);
|
||||
|
||||
subtitleInfo = new HashMap<String, String>();
|
||||
subtitleInfo.put("subtitleId", selectString("//INPUT[@name='subtitleId']/@value", dom));
|
||||
subtitleInfo.put("typeId", selectString("//INPUT[@name='typeId']/@value", dom));
|
||||
subtitleInfo.put("filmId", selectString("//INPUT[@name='filmId']/@value", dom));
|
||||
for (Node input : selectNodes("id('dl')//INPUT[@name]", dom)) {
|
||||
subtitleInfo.put(getAttribute("name", input), getAttribute("value", input));
|
||||
}
|
||||
} catch (Exception e) {
|
||||
e.printStackTrace();
|
||||
throw new RuntimeException("Failed to extract subtitle info", e);
|
||||
}
|
||||
}
|
||||
|
||||
return subtitleInfo;
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public String getPath() {
|
||||
return String.format("%s.%s", getName(), subtitleInfo == null ? null : subtitleInfo.get("typeId"));
|
||||
return getName();
|
||||
}
|
||||
|
||||
|
||||
|
@ -30,8 +30,8 @@ public class SubsceneSubtitleClientTest {
|
||||
|
||||
@BeforeClass
|
||||
public static void setUpBeforeClass() throws Exception {
|
||||
twinpeaksSearchResult = new SubsceneSearchResult("Twin Peaks", "Twin Peaks - First Season (1990)", new URL("http://subscene.com/Twin-Peaks-First-Season/subtitles-32482.aspx"));
|
||||
lostSearchResult = new SubsceneSearchResult("Lost", "Lost - Fourth Season (2008)", new URL("http://subscene.com/Lost-Fourth-Season/subtitles-70963.aspx"));
|
||||
twinpeaksSearchResult = new SubsceneSearchResult("Twin Peaks", "Twin Peaks - First Season (1990)", new URL("http://subscene.com/subtitles/twin-peaks-first-season"));
|
||||
lostSearchResult = new SubsceneSearchResult("Lost", "Lost - Fourth Season (2008)", new URL("http://subscene.com/subtitles/lost-fourth-season"));
|
||||
}
|
||||
|
||||
|
||||
@ -42,7 +42,7 @@ public class SubsceneSubtitleClientTest {
|
||||
public void search() throws Exception {
|
||||
List<SearchResult> results = subscene.search("twin peaks");
|
||||
|
||||
SubsceneSearchResult result = (SubsceneSearchResult) results.get(1);
|
||||
SubsceneSearchResult result = (SubsceneSearchResult) results.get(0);
|
||||
assertEquals(twinpeaksSearchResult.toString(), result.toString());
|
||||
assertEquals(twinpeaksSearchResult.getURL().toString(), result.getURL().toString());
|
||||
assertEquals(twinpeaksSearchResult.getName(), result.getName());
|
||||
@ -50,14 +50,14 @@ public class SubsceneSubtitleClientTest {
|
||||
|
||||
|
||||
@Test
|
||||
public void searchResultPageRedirect() throws Exception {
|
||||
List<SearchResult> results = subscene.search("firefly");
|
||||
assertEquals(2, results.size());
|
||||
public void search2() throws Exception {
|
||||
List<SearchResult> results = subscene.search("Avatar 2009");
|
||||
assertEquals(3, results.size());
|
||||
|
||||
SubsceneSearchResult result = (SubsceneSearchResult) results.get(0);
|
||||
assertEquals("Firefly - The Complete Series (2002)", result.toString());
|
||||
assertEquals("Firefly", result.getName());
|
||||
assertEquals("http://subscene.com/Firefly-The-Complete-Series/subtitles-20008.aspx", result.getURL().toString());
|
||||
assertEquals("http://subscene.com/subtitles/firefly-the-complete-series", result.getURL().toString());
|
||||
}
|
||||
|
||||
|
||||
@ -67,7 +67,7 @@ public class SubsceneSubtitleClientTest {
|
||||
assertEquals(10, subtitleList.size());
|
||||
|
||||
SubtitleDescriptor subtitle = subtitleList.get(0);
|
||||
assertEquals("Twin Peaks - First Season", subtitle.getName());
|
||||
assertEquals("Twin-Peaks-S01E00-Pilot-eAlternate-ita sub by IScrew [www.ITALIANSHARE.net]", subtitle.getName());
|
||||
assertEquals("Italian", subtitle.getLanguageName());
|
||||
}
|
||||
|
||||
@ -83,7 +83,7 @@ public class SubsceneSubtitleClientTest {
|
||||
|
||||
@Test
|
||||
public void getLanguageFilterMap() throws Exception {
|
||||
Map<String, String> filters = subscene.updateLanguageFilterMap(subscene.getSubtitleListDocument(new URL("http://subscene.com/none/subtitles-0.aspx"), null));
|
||||
Map<String, String> filters = subscene.getLanguageFilterMap();
|
||||
|
||||
assertEquals("1", filters.get("albanian"));
|
||||
assertEquals("13", filters.get("english"));
|
||||
@ -101,8 +101,8 @@ public class SubsceneSubtitleClientTest {
|
||||
@Test
|
||||
public void downloadSubtitleArchive() throws Exception {
|
||||
SearchResult selectedResult = subscene.search("firefly").get(0);
|
||||
SubtitleDescriptor subtitleDescriptor = subscene.getSubtitleList(selectedResult, "English").get(1);
|
||||
assertEquals("Firefly - The Complete Series", subtitleDescriptor.getName());
|
||||
SubtitleDescriptor subtitleDescriptor = subscene.getSubtitleList(selectedResult, "English").get(0);
|
||||
assertEquals("Firefly.S01E00-13.DVDRip-Rogue.eng-RETAIL", subtitleDescriptor.getName());
|
||||
|
||||
ByteBuffer archive = subtitleDescriptor.fetch();
|
||||
assertEquals(254549, archive.remaining());
|
||||
|
Loading…
Reference in New Issue
Block a user