* update website scraper to subscene v3

This commit is contained in:
Reinhard Pointner 2012-07-16 10:06:40 +00:00
parent bbe5e27339
commit 8fa867ae49
3 changed files with 69 additions and 96 deletions

View File

@ -55,10 +55,10 @@ public class SubsceneSubtitleClient implements SubtitleProvider {
@Override
public List<SearchResult> search(String query) throws IOException, SAXException {
URL searchUrl = new URL("http", host, "/filmsearch.aspx?q=" + encode(query));
URL searchUrl = new URL("http", host, "/subtitles/title.aspx?q=" + encode(query));
Document dom = getHtmlDocument(searchUrl);
List<Node> nodes = selectNodes("id('filmSearch')/A", dom);
List<Node> nodes = selectNodes("//H2[text()='Close']//following::DIV[@class='title']//A", dom);
List<SearchResult> searchResults = new ArrayList<SearchResult>(nodes.size());
Pattern titleSuffixPattern = Pattern.compile("\\s-\\s([^-]+)[(](\\d{4})[)]$");
@ -77,23 +77,6 @@ public class SubsceneSubtitleClient implements SubtitleProvider {
}
}
// we might have been redirected to the subtitle list
if (searchResults.isEmpty()) {
try {
// get name of current search result
String name = selectString("id('leftWrapperWide')//H1/text()", dom);
// get current location
String file = selectString("id('aspnetForm')/@action", dom);
if (!name.isEmpty() && !file.isEmpty()) {
searchResults.add(new HyperLink(name, new URL("http", host, file)));
}
} catch (Exception e) {
Logger.getLogger(getClass().getName()).log(Level.WARNING, "Cannot parse subtitle page: " + searchUrl, e);
}
}
return searchResults;
}
@ -102,32 +85,21 @@ public class SubsceneSubtitleClient implements SubtitleProvider {
public List<SubtitleDescriptor> getSubtitleList(SearchResult searchResult, String languageName) throws Exception {
URL subtitleListUrl = getSubtitleListLink(searchResult, languageName).toURL();
String languageFilter = getLanguageFilter(languageName);
Document subtitleListDocument = getSubtitleListDocument(subtitleListUrl, languageFilter);
String filter = getLanguageFilter(languageName);
Document dom = getSubtitleListDocument(subtitleListUrl, filter);
// let's update language filters if they are not known yet
if (languageName != null && languageFilter == null) {
updateLanguageFilterMap(subtitleListDocument);
}
return getSubtitleList(subtitleListUrl, languageName, subtitleListDocument);
}
private List<SubtitleDescriptor> getSubtitleList(URL subtitleListUrl, String languageName, Document subtitleListDocument) {
List<Node> nodes = selectNodes("//TABLE[@class='filmSubtitleList']//A[@class='a1']", subtitleListDocument);
List<SubtitleDescriptor> subtitles = new ArrayList<SubtitleDescriptor>(nodes.size());
for (Node node : nodes) {
List<Node> rows = selectNodes("//TD[@class='a1']", dom);
List<SubtitleDescriptor> subtitles = new ArrayList<SubtitleDescriptor>();
for (Node row : rows) {
try {
String lang = getTextContent(getChildren("SPAN", node).get(0));
List<Node> fields = selectNodes(".//SPAN", row);
String language = getTextContent(fields.get(0));
if (languageName == null || languageName.equalsIgnoreCase(lang)) {
String name = getTextContent(getChildren("SPAN", node).get(1));
String href = getAttribute("href", node);
if (languageName == null || language.equalsIgnoreCase(languageName)) {
String name = getTextContent(fields.get(1));
String href = selectString(".//A/@href", row);
URL subtitlePage = new URL(subtitleListUrl.getProtocol(), subtitleListUrl.getHost(), href);
subtitles.add(new SubsceneSubtitleDescriptor(name, lang, subtitlePage));
subtitles.add(new SubsceneSubtitleDescriptor(name, language, subtitlePage));
}
} catch (Exception e) {
Logger.getLogger(getClass().getName()).log(Level.WARNING, "Cannot parse subtitle node", e);
@ -142,55 +114,63 @@ public class SubsceneSubtitleClient implements SubtitleProvider {
URLConnection connection = subtitleListUrl.openConnection();
if (languageFilter != null) {
connection.addRequestProperty("Cookie", "subscene_sLanguageIds=" + languageFilter);
connection.addRequestProperty("Cookie", "Filter=" + languageFilter);
}
return getHtmlDocument(connection);
}
protected String getLanguageFilter(String languageName) {
protected String getLanguageFilter(String languageName) throws IOException, SAXException {
if (languageName == null || languageName.isEmpty()) {
return null;
}
// try cache first
Cache cache = CacheManager.getInstance().getCache("web-persistent-datasource");
String cacheKey = getClass().getName() + ".languageFilter";
Element element = cache.get(cacheKey);
if (element == null) {
return null;
}
return (String) ((Map<?, ?>) element.getValue()).get(languageName.toLowerCase());
}
protected Map<String, String> updateLanguageFilterMap(Document subtitleListDocument) {
Map<String, String> filters = new HashMap<String, String>(50);
List<Node> nodes = selectNodes("//DIV[@class='languageList']/DIV", subtitleListDocument);
for (Node node : nodes) {
// select INPUT/@onclick, then ditch non-number-characters
String filter = getAttribute("onclick", getChild("INPUT", node)).replaceAll("\\D+", "");
if (filter != null) {
// select LABEL/text()
String name = getTextContent("LABEL", node);
filters.put(name.toLowerCase(), filter);
try {
Element element = cache.get(cacheKey);
if (element != null) {
return (String) ((Map<?, ?>) element.getValue()).get(languageName.toLowerCase());
}
} catch (Exception e) {
Logger.getLogger(getClass().getName()).log(Level.WARNING, e.getMessage());
}
// fetch new language filter data
Map<String, String> filters = getLanguageFilterMap();
// update cache after sanity check
if (filters.size() > 42) {
Cache cache = CacheManager.getInstance().getCache("web-persistent-datasource");
String cacheKey = getClass().getName() + ".languageFilter";
cache.put(new Element(cacheKey, filters));
try {
cache.put(new Element(cacheKey, filters));
} catch (Exception e) {
Logger.getLogger(getClass().getName()).log(Level.WARNING, e.getMessage());
}
} else {
Logger.getLogger(getClass().getName()).log(Level.WARNING, "Failed to scrape language filters: " + filters);
}
return filters.get(languageName.toLowerCase());
}
/**
 * Downloads the filter page and builds a lookup table of language filters.
 *
 * Every INPUT element of type checkbox on the page contributes one entry:
 * the key is the lower-cased string selected by "./following::LABEL"
 * relative to that checkbox, the value is the checkbox's "value" attribute.
 * Checkboxes for which no "value" attribute is reported (null) are skipped.
 *
 * @return mutable map from lower-cased label text to filter value
 * @throws IOException propagated from fetching the page
 * @throws SAXException propagated from parsing the page
 */
protected Map<String, String> getLanguageFilterMap() throws IOException, SAXException {
	Document filterPage = getHtmlDocument(new URL("http://subscene.com/filter"));
	Map<String, String> filterByLanguage = new HashMap<String, String>(50);

	for (Node input : selectNodes("//INPUT[@type='checkbox']", filterPage)) {
		String value = getAttribute("value", input);
		if (value == null) {
			// nothing to map this checkbox to
			continue;
		}

		// the label is looked up relative to the checkbox, not from the document root
		String label = selectString("./following::LABEL", input);
		filterByLanguage.put(label.toLowerCase(), value);
	}

	return filterByLanguage;
}

View File

@ -12,8 +12,7 @@ import java.util.HashMap;
import java.util.Map;
import org.w3c.dom.Document;
import net.sourceforge.tuned.FileUtilities;
import org.w3c.dom.Node;
public class SubsceneSubtitleDescriptor implements SubtitleDescriptor {
@ -52,42 +51,36 @@ public class SubsceneSubtitleDescriptor implements SubtitleDescriptor {
@Override
public ByteBuffer fetch() throws Exception {
// e.g. http://subscene.com/english/Firefly-The-Complete-Series/subtitle-40003-dlpath-20008/rar.zipx
String subtitlePagePath = FileUtilities.getNameWithoutExtension(subtitlePage.getFile());
String path = String.format("%s-dlpath-%s/%s.zipx", subtitlePagePath, getSubtitleInfo().get("filmId"), getSubtitleInfo().get("typeId"));
URL downloadLink = new URL(subtitlePage.getProtocol(), subtitlePage.getHost(), "/subtitle/download");
URL downloadLocator = new URL(subtitlePage.getProtocol(), subtitlePage.getHost(), path);
Map<String, String> downloadPostData = subtitleInfo;
HttpURLConnection connection = (HttpURLConnection) downloadLocator.openConnection();
HttpURLConnection connection = (HttpURLConnection) downloadLink.openConnection();
connection.addRequestProperty("Referer", subtitlePage.toString());
return WebRequest.post(connection, downloadPostData);
return WebRequest.post(connection, getSubtitleInfo());
}
private synchronized Map<String, String> getSubtitleInfo() {
// extract subtitle information from subtitle page if necessary
if (subtitleInfo == null) {
subtitleInfo = new HashMap<String, String>();
try {
Document dom = getHtmlDocument(subtitlePage);
subtitleInfo = new HashMap<String, String>();
subtitleInfo.put("subtitleId", selectString("//INPUT[@name='subtitleId']/@value", dom));
subtitleInfo.put("typeId", selectString("//INPUT[@name='typeId']/@value", dom));
subtitleInfo.put("filmId", selectString("//INPUT[@name='filmId']/@value", dom));
for (Node input : selectNodes("id('dl')//INPUT[@name]", dom)) {
subtitleInfo.put(getAttribute("name", input), getAttribute("value", input));
}
} catch (Exception e) {
e.printStackTrace();
throw new RuntimeException("Failed to extract subtitle info", e);
}
}
return subtitleInfo;
}
@Override
public String getPath() {
return String.format("%s.%s", getName(), subtitleInfo == null ? null : subtitleInfo.get("typeId"));
return getName();
}

View File

@ -30,8 +30,8 @@ public class SubsceneSubtitleClientTest {
@BeforeClass
public static void setUpBeforeClass() throws Exception {
twinpeaksSearchResult = new SubsceneSearchResult("Twin Peaks", "Twin Peaks - First Season (1990)", new URL("http://subscene.com/Twin-Peaks-First-Season/subtitles-32482.aspx"));
lostSearchResult = new SubsceneSearchResult("Lost", "Lost - Fourth Season (2008)", new URL("http://subscene.com/Lost-Fourth-Season/subtitles-70963.aspx"));
twinpeaksSearchResult = new SubsceneSearchResult("Twin Peaks", "Twin Peaks - First Season (1990)", new URL("http://subscene.com/subtitles/twin-peaks-first-season"));
lostSearchResult = new SubsceneSearchResult("Lost", "Lost - Fourth Season (2008)", new URL("http://subscene.com/subtitles/lost-fourth-season"));
}
@ -42,7 +42,7 @@ public class SubsceneSubtitleClientTest {
public void search() throws Exception {
List<SearchResult> results = subscene.search("twin peaks");
SubsceneSearchResult result = (SubsceneSearchResult) results.get(1);
SubsceneSearchResult result = (SubsceneSearchResult) results.get(0);
assertEquals(twinpeaksSearchResult.toString(), result.toString());
assertEquals(twinpeaksSearchResult.getURL().toString(), result.getURL().toString());
assertEquals(twinpeaksSearchResult.getName(), result.getName());
@ -50,14 +50,14 @@ public class SubsceneSubtitleClientTest {
@Test
public void searchResultPageRedirect() throws Exception {
List<SearchResult> results = subscene.search("firefly");
assertEquals(2, results.size());
public void search2() throws Exception {
List<SearchResult> results = subscene.search("Avatar 2009");
assertEquals(3, results.size());
SubsceneSearchResult result = (SubsceneSearchResult) results.get(0);
assertEquals("Firefly - The Complete Series (2002)", result.toString());
assertEquals("Firefly", result.getName());
assertEquals("http://subscene.com/Firefly-The-Complete-Series/subtitles-20008.aspx", result.getURL().toString());
assertEquals("http://subscene.com/subtitles/firefly-the-complete-series", result.getURL().toString());
}
@ -67,7 +67,7 @@ public class SubsceneSubtitleClientTest {
assertEquals(10, subtitleList.size());
SubtitleDescriptor subtitle = subtitleList.get(0);
assertEquals("Twin Peaks - First Season", subtitle.getName());
assertEquals("Twin-Peaks-S01E00-Pilot-eAlternate-ita sub by IScrew [www.ITALIANSHARE.net]", subtitle.getName());
assertEquals("Italian", subtitle.getLanguageName());
}
@ -83,7 +83,7 @@ public class SubsceneSubtitleClientTest {
@Test
public void getLanguageFilterMap() throws Exception {
Map<String, String> filters = subscene.updateLanguageFilterMap(subscene.getSubtitleListDocument(new URL("http://subscene.com/none/subtitles-0.aspx"), null));
Map<String, String> filters = subscene.getLanguageFilterMap();
assertEquals("1", filters.get("albanian"));
assertEquals("13", filters.get("english"));
@ -101,8 +101,8 @@ public class SubsceneSubtitleClientTest {
@Test
public void downloadSubtitleArchive() throws Exception {
SearchResult selectedResult = subscene.search("firefly").get(0);
SubtitleDescriptor subtitleDescriptor = subscene.getSubtitleList(selectedResult, "English").get(1);
assertEquals("Firefly - The Complete Series", subtitleDescriptor.getName());
SubtitleDescriptor subtitleDescriptor = subscene.getSubtitleList(selectedResult, "English").get(0);
assertEquals("Firefly.S01E00-13.DVDRip-Rogue.eng-RETAIL", subtitleDescriptor.getName());
ByteBuffer archive = subtitleDescriptor.fetch();
assertEquals(254549, archive.remaining());