mirror of
https://github.com/mitb-archive/filebot
synced 2024-08-13 17:03:45 -04:00
* update website scraper to subscene v3
This commit is contained in:
parent
bbe5e27339
commit
8fa867ae49
@ -55,10 +55,10 @@ public class SubsceneSubtitleClient implements SubtitleProvider {
|
|||||||
|
|
||||||
@Override
|
@Override
|
||||||
public List<SearchResult> search(String query) throws IOException, SAXException {
|
public List<SearchResult> search(String query) throws IOException, SAXException {
|
||||||
URL searchUrl = new URL("http", host, "/filmsearch.aspx?q=" + encode(query));
|
URL searchUrl = new URL("http", host, "/subtitles/title.aspx?q=" + encode(query));
|
||||||
Document dom = getHtmlDocument(searchUrl);
|
Document dom = getHtmlDocument(searchUrl);
|
||||||
|
|
||||||
List<Node> nodes = selectNodes("id('filmSearch')/A", dom);
|
List<Node> nodes = selectNodes("//H2[text()='Close']//following::DIV[@class='title']//A", dom);
|
||||||
List<SearchResult> searchResults = new ArrayList<SearchResult>(nodes.size());
|
List<SearchResult> searchResults = new ArrayList<SearchResult>(nodes.size());
|
||||||
|
|
||||||
Pattern titleSuffixPattern = Pattern.compile("\\s-\\s([^-]+)[(](\\d{4})[)]$");
|
Pattern titleSuffixPattern = Pattern.compile("\\s-\\s([^-]+)[(](\\d{4})[)]$");
|
||||||
@ -77,23 +77,6 @@ public class SubsceneSubtitleClient implements SubtitleProvider {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// we might have been redirected to the subtitle list
|
|
||||||
if (searchResults.isEmpty()) {
|
|
||||||
try {
|
|
||||||
// get name of current search result
|
|
||||||
String name = selectString("id('leftWrapperWide')//H1/text()", dom);
|
|
||||||
|
|
||||||
// get current location
|
|
||||||
String file = selectString("id('aspnetForm')/@action", dom);
|
|
||||||
|
|
||||||
if (!name.isEmpty() && !file.isEmpty()) {
|
|
||||||
searchResults.add(new HyperLink(name, new URL("http", host, file)));
|
|
||||||
}
|
|
||||||
} catch (Exception e) {
|
|
||||||
Logger.getLogger(getClass().getName()).log(Level.WARNING, "Cannot parse subtitle page: " + searchUrl, e);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return searchResults;
|
return searchResults;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -102,32 +85,21 @@ public class SubsceneSubtitleClient implements SubtitleProvider {
|
|||||||
public List<SubtitleDescriptor> getSubtitleList(SearchResult searchResult, String languageName) throws Exception {
|
public List<SubtitleDescriptor> getSubtitleList(SearchResult searchResult, String languageName) throws Exception {
|
||||||
URL subtitleListUrl = getSubtitleListLink(searchResult, languageName).toURL();
|
URL subtitleListUrl = getSubtitleListLink(searchResult, languageName).toURL();
|
||||||
|
|
||||||
String languageFilter = getLanguageFilter(languageName);
|
String filter = getLanguageFilter(languageName);
|
||||||
Document subtitleListDocument = getSubtitleListDocument(subtitleListUrl, languageFilter);
|
Document dom = getSubtitleListDocument(subtitleListUrl, filter);
|
||||||
|
|
||||||
// let's update language filters if they are not known yet
|
List<Node> rows = selectNodes("//TD[@class='a1']", dom);
|
||||||
if (languageName != null && languageFilter == null) {
|
List<SubtitleDescriptor> subtitles = new ArrayList<SubtitleDescriptor>();
|
||||||
updateLanguageFilterMap(subtitleListDocument);
|
for (Node row : rows) {
|
||||||
}
|
|
||||||
|
|
||||||
return getSubtitleList(subtitleListUrl, languageName, subtitleListDocument);
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
private List<SubtitleDescriptor> getSubtitleList(URL subtitleListUrl, String languageName, Document subtitleListDocument) {
|
|
||||||
List<Node> nodes = selectNodes("//TABLE[@class='filmSubtitleList']//A[@class='a1']", subtitleListDocument);
|
|
||||||
List<SubtitleDescriptor> subtitles = new ArrayList<SubtitleDescriptor>(nodes.size());
|
|
||||||
|
|
||||||
for (Node node : nodes) {
|
|
||||||
try {
|
try {
|
||||||
String lang = getTextContent(getChildren("SPAN", node).get(0));
|
List<Node> fields = selectNodes(".//SPAN", row);
|
||||||
|
String language = getTextContent(fields.get(0));
|
||||||
|
|
||||||
if (languageName == null || languageName.equalsIgnoreCase(lang)) {
|
if (languageName == null || language.equalsIgnoreCase(languageName)) {
|
||||||
String name = getTextContent(getChildren("SPAN", node).get(1));
|
String name = getTextContent(fields.get(1));
|
||||||
String href = getAttribute("href", node);
|
String href = selectString(".//A/@href", row);
|
||||||
URL subtitlePage = new URL(subtitleListUrl.getProtocol(), subtitleListUrl.getHost(), href);
|
URL subtitlePage = new URL(subtitleListUrl.getProtocol(), subtitleListUrl.getHost(), href);
|
||||||
|
subtitles.add(new SubsceneSubtitleDescriptor(name, language, subtitlePage));
|
||||||
subtitles.add(new SubsceneSubtitleDescriptor(name, lang, subtitlePage));
|
|
||||||
}
|
}
|
||||||
} catch (Exception e) {
|
} catch (Exception e) {
|
||||||
Logger.getLogger(getClass().getName()).log(Level.WARNING, "Cannot parse subtitle node", e);
|
Logger.getLogger(getClass().getName()).log(Level.WARNING, "Cannot parse subtitle node", e);
|
||||||
@ -142,55 +114,63 @@ public class SubsceneSubtitleClient implements SubtitleProvider {
|
|||||||
URLConnection connection = subtitleListUrl.openConnection();
|
URLConnection connection = subtitleListUrl.openConnection();
|
||||||
|
|
||||||
if (languageFilter != null) {
|
if (languageFilter != null) {
|
||||||
connection.addRequestProperty("Cookie", "subscene_sLanguageIds=" + languageFilter);
|
connection.addRequestProperty("Cookie", "Filter=" + languageFilter);
|
||||||
}
|
}
|
||||||
|
|
||||||
return getHtmlDocument(connection);
|
return getHtmlDocument(connection);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
protected String getLanguageFilter(String languageName) {
|
protected String getLanguageFilter(String languageName) throws IOException, SAXException {
|
||||||
if (languageName == null || languageName.isEmpty()) {
|
if (languageName == null || languageName.isEmpty()) {
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// try cache first
|
||||||
Cache cache = CacheManager.getInstance().getCache("web-persistent-datasource");
|
Cache cache = CacheManager.getInstance().getCache("web-persistent-datasource");
|
||||||
String cacheKey = getClass().getName() + ".languageFilter";
|
String cacheKey = getClass().getName() + ".languageFilter";
|
||||||
|
|
||||||
|
try {
|
||||||
Element element = cache.get(cacheKey);
|
Element element = cache.get(cacheKey);
|
||||||
if (element == null) {
|
if (element != null) {
|
||||||
return null;
|
|
||||||
}
|
|
||||||
|
|
||||||
return (String) ((Map<?, ?>) element.getValue()).get(languageName.toLowerCase());
|
return (String) ((Map<?, ?>) element.getValue()).get(languageName.toLowerCase());
|
||||||
}
|
}
|
||||||
|
} catch (Exception e) {
|
||||||
|
Logger.getLogger(getClass().getName()).log(Level.WARNING, e.getMessage());
|
||||||
protected Map<String, String> updateLanguageFilterMap(Document subtitleListDocument) {
|
|
||||||
Map<String, String> filters = new HashMap<String, String>(50);
|
|
||||||
List<Node> nodes = selectNodes("//DIV[@class='languageList']/DIV", subtitleListDocument);
|
|
||||||
|
|
||||||
for (Node node : nodes) {
|
|
||||||
// select INPUT/@onclick, then ditch non-number-characters
|
|
||||||
String filter = getAttribute("onclick", getChild("INPUT", node)).replaceAll("\\D+", "");
|
|
||||||
|
|
||||||
if (filter != null) {
|
|
||||||
// select LABEL/text()
|
|
||||||
String name = getTextContent("LABEL", node);
|
|
||||||
|
|
||||||
filters.put(name.toLowerCase(), filter);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// fetch new language filter data
|
||||||
|
Map<String, String> filters = getLanguageFilterMap();
|
||||||
|
|
||||||
// update cache after sanity check
|
// update cache after sanity check
|
||||||
if (filters.size() > 42) {
|
if (filters.size() > 42) {
|
||||||
Cache cache = CacheManager.getInstance().getCache("web-persistent-datasource");
|
try {
|
||||||
String cacheKey = getClass().getName() + ".languageFilter";
|
|
||||||
cache.put(new Element(cacheKey, filters));
|
cache.put(new Element(cacheKey, filters));
|
||||||
|
} catch (Exception e) {
|
||||||
|
Logger.getLogger(getClass().getName()).log(Level.WARNING, e.getMessage());
|
||||||
|
}
|
||||||
} else {
|
} else {
|
||||||
Logger.getLogger(getClass().getName()).log(Level.WARNING, "Failed to scrape language filters: " + filters);
|
Logger.getLogger(getClass().getName()).log(Level.WARNING, "Failed to scrape language filters: " + filters);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
return filters.get(languageName.toLowerCase());
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
protected Map<String, String> getLanguageFilterMap() throws IOException, SAXException {
|
||||||
|
Map<String, String> filters = new HashMap<String, String>(50);
|
||||||
|
|
||||||
|
Document dom = getHtmlDocument(new URL("http://subscene.com/filter"));
|
||||||
|
List<Node> checkboxes = selectNodes("//INPUT[@type='checkbox']", dom);
|
||||||
|
|
||||||
|
for (Node checkbox : checkboxes) {
|
||||||
|
String filter = getAttribute("value", checkbox);
|
||||||
|
if (filter != null) {
|
||||||
|
String name = selectString("./following::LABEL", checkbox);
|
||||||
|
filters.put(name.toLowerCase(), filter);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
return filters;
|
return filters;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -12,8 +12,7 @@ import java.util.HashMap;
|
|||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
|
|
||||||
import org.w3c.dom.Document;
|
import org.w3c.dom.Document;
|
||||||
|
import org.w3c.dom.Node;
|
||||||
import net.sourceforge.tuned.FileUtilities;
|
|
||||||
|
|
||||||
|
|
||||||
public class SubsceneSubtitleDescriptor implements SubtitleDescriptor {
|
public class SubsceneSubtitleDescriptor implements SubtitleDescriptor {
|
||||||
@ -52,42 +51,36 @@ public class SubsceneSubtitleDescriptor implements SubtitleDescriptor {
|
|||||||
|
|
||||||
@Override
|
@Override
|
||||||
public ByteBuffer fetch() throws Exception {
|
public ByteBuffer fetch() throws Exception {
|
||||||
// e.g. http://subscene.com/english/Firefly-The-Complete-Series/subtitle-40003-dlpath-20008/rar.zipx
|
URL downloadLink = new URL(subtitlePage.getProtocol(), subtitlePage.getHost(), "/subtitle/download");
|
||||||
String subtitlePagePath = FileUtilities.getNameWithoutExtension(subtitlePage.getFile());
|
|
||||||
String path = String.format("%s-dlpath-%s/%s.zipx", subtitlePagePath, getSubtitleInfo().get("filmId"), getSubtitleInfo().get("typeId"));
|
|
||||||
|
|
||||||
URL downloadLocator = new URL(subtitlePage.getProtocol(), subtitlePage.getHost(), path);
|
HttpURLConnection connection = (HttpURLConnection) downloadLink.openConnection();
|
||||||
Map<String, String> downloadPostData = subtitleInfo;
|
|
||||||
|
|
||||||
HttpURLConnection connection = (HttpURLConnection) downloadLocator.openConnection();
|
|
||||||
connection.addRequestProperty("Referer", subtitlePage.toString());
|
connection.addRequestProperty("Referer", subtitlePage.toString());
|
||||||
|
|
||||||
return WebRequest.post(connection, downloadPostData);
|
return WebRequest.post(connection, getSubtitleInfo());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
private synchronized Map<String, String> getSubtitleInfo() {
|
private synchronized Map<String, String> getSubtitleInfo() {
|
||||||
// extract subtitle information from subtitle page if necessary
|
// extract subtitle information from subtitle page if necessary
|
||||||
if (subtitleInfo == null) {
|
if (subtitleInfo == null) {
|
||||||
|
subtitleInfo = new HashMap<String, String>();
|
||||||
try {
|
try {
|
||||||
Document dom = getHtmlDocument(subtitlePage);
|
Document dom = getHtmlDocument(subtitlePage);
|
||||||
|
for (Node input : selectNodes("id('dl')//INPUT[@name]", dom)) {
|
||||||
subtitleInfo = new HashMap<String, String>();
|
subtitleInfo.put(getAttribute("name", input), getAttribute("value", input));
|
||||||
subtitleInfo.put("subtitleId", selectString("//INPUT[@name='subtitleId']/@value", dom));
|
}
|
||||||
subtitleInfo.put("typeId", selectString("//INPUT[@name='typeId']/@value", dom));
|
|
||||||
subtitleInfo.put("filmId", selectString("//INPUT[@name='filmId']/@value", dom));
|
|
||||||
} catch (Exception e) {
|
} catch (Exception e) {
|
||||||
|
e.printStackTrace();
|
||||||
throw new RuntimeException("Failed to extract subtitle info", e);
|
throw new RuntimeException("Failed to extract subtitle info", e);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
return subtitleInfo;
|
return subtitleInfo;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public String getPath() {
|
public String getPath() {
|
||||||
return String.format("%s.%s", getName(), subtitleInfo == null ? null : subtitleInfo.get("typeId"));
|
return getName();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@ -30,8 +30,8 @@ public class SubsceneSubtitleClientTest {
|
|||||||
|
|
||||||
@BeforeClass
|
@BeforeClass
|
||||||
public static void setUpBeforeClass() throws Exception {
|
public static void setUpBeforeClass() throws Exception {
|
||||||
twinpeaksSearchResult = new SubsceneSearchResult("Twin Peaks", "Twin Peaks - First Season (1990)", new URL("http://subscene.com/Twin-Peaks-First-Season/subtitles-32482.aspx"));
|
twinpeaksSearchResult = new SubsceneSearchResult("Twin Peaks", "Twin Peaks - First Season (1990)", new URL("http://subscene.com/subtitles/twin-peaks-first-season"));
|
||||||
lostSearchResult = new SubsceneSearchResult("Lost", "Lost - Fourth Season (2008)", new URL("http://subscene.com/Lost-Fourth-Season/subtitles-70963.aspx"));
|
lostSearchResult = new SubsceneSearchResult("Lost", "Lost - Fourth Season (2008)", new URL("http://subscene.com/subtitles/lost-fourth-season"));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@ -42,7 +42,7 @@ public class SubsceneSubtitleClientTest {
|
|||||||
public void search() throws Exception {
|
public void search() throws Exception {
|
||||||
List<SearchResult> results = subscene.search("twin peaks");
|
List<SearchResult> results = subscene.search("twin peaks");
|
||||||
|
|
||||||
SubsceneSearchResult result = (SubsceneSearchResult) results.get(1);
|
SubsceneSearchResult result = (SubsceneSearchResult) results.get(0);
|
||||||
assertEquals(twinpeaksSearchResult.toString(), result.toString());
|
assertEquals(twinpeaksSearchResult.toString(), result.toString());
|
||||||
assertEquals(twinpeaksSearchResult.getURL().toString(), result.getURL().toString());
|
assertEquals(twinpeaksSearchResult.getURL().toString(), result.getURL().toString());
|
||||||
assertEquals(twinpeaksSearchResult.getName(), result.getName());
|
assertEquals(twinpeaksSearchResult.getName(), result.getName());
|
||||||
@ -50,14 +50,14 @@ public class SubsceneSubtitleClientTest {
|
|||||||
|
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void searchResultPageRedirect() throws Exception {
|
public void search2() throws Exception {
|
||||||
List<SearchResult> results = subscene.search("firefly");
|
List<SearchResult> results = subscene.search("Avatar 2009");
|
||||||
assertEquals(2, results.size());
|
assertEquals(3, results.size());
|
||||||
|
|
||||||
SubsceneSearchResult result = (SubsceneSearchResult) results.get(0);
|
SubsceneSearchResult result = (SubsceneSearchResult) results.get(0);
|
||||||
assertEquals("Firefly - The Complete Series (2002)", result.toString());
|
assertEquals("Firefly - The Complete Series (2002)", result.toString());
|
||||||
assertEquals("Firefly", result.getName());
|
assertEquals("Firefly", result.getName());
|
||||||
assertEquals("http://subscene.com/Firefly-The-Complete-Series/subtitles-20008.aspx", result.getURL().toString());
|
assertEquals("http://subscene.com/subtitles/firefly-the-complete-series", result.getURL().toString());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@ -67,7 +67,7 @@ public class SubsceneSubtitleClientTest {
|
|||||||
assertEquals(10, subtitleList.size());
|
assertEquals(10, subtitleList.size());
|
||||||
|
|
||||||
SubtitleDescriptor subtitle = subtitleList.get(0);
|
SubtitleDescriptor subtitle = subtitleList.get(0);
|
||||||
assertEquals("Twin Peaks - First Season", subtitle.getName());
|
assertEquals("Twin-Peaks-S01E00-Pilot-eAlternate-ita sub by IScrew [www.ITALIANSHARE.net]", subtitle.getName());
|
||||||
assertEquals("Italian", subtitle.getLanguageName());
|
assertEquals("Italian", subtitle.getLanguageName());
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -83,7 +83,7 @@ public class SubsceneSubtitleClientTest {
|
|||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void getLanguageFilterMap() throws Exception {
|
public void getLanguageFilterMap() throws Exception {
|
||||||
Map<String, String> filters = subscene.updateLanguageFilterMap(subscene.getSubtitleListDocument(new URL("http://subscene.com/none/subtitles-0.aspx"), null));
|
Map<String, String> filters = subscene.getLanguageFilterMap();
|
||||||
|
|
||||||
assertEquals("1", filters.get("albanian"));
|
assertEquals("1", filters.get("albanian"));
|
||||||
assertEquals("13", filters.get("english"));
|
assertEquals("13", filters.get("english"));
|
||||||
@ -101,8 +101,8 @@ public class SubsceneSubtitleClientTest {
|
|||||||
@Test
|
@Test
|
||||||
public void downloadSubtitleArchive() throws Exception {
|
public void downloadSubtitleArchive() throws Exception {
|
||||||
SearchResult selectedResult = subscene.search("firefly").get(0);
|
SearchResult selectedResult = subscene.search("firefly").get(0);
|
||||||
SubtitleDescriptor subtitleDescriptor = subscene.getSubtitleList(selectedResult, "English").get(1);
|
SubtitleDescriptor subtitleDescriptor = subscene.getSubtitleList(selectedResult, "English").get(0);
|
||||||
assertEquals("Firefly - The Complete Series", subtitleDescriptor.getName());
|
assertEquals("Firefly.S01E00-13.DVDRip-Rogue.eng-RETAIL", subtitleDescriptor.getName());
|
||||||
|
|
||||||
ByteBuffer archive = subtitleDescriptor.fetch();
|
ByteBuffer archive = subtitleDescriptor.fetch();
|
||||||
assertEquals(254549, archive.remaining());
|
assertEquals(254549, archive.remaining());
|
||||||
|
Loading…
Reference in New Issue
Block a user