* heavily improved Subscene support (up to 35x faster)

This commit is contained in:
Reinhard Pointner 2008-07-06 18:31:04 +00:00
parent a94cedd601
commit 9eb74e8038
8 changed files with 248 additions and 89 deletions

View File

@ -6,7 +6,6 @@ import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.net.MalformedURLException;
import java.net.URI;
import java.net.URISyntaxException;
import java.net.URL;
import java.net.URLEncoder;
import java.text.NumberFormat;
@ -60,10 +59,8 @@ public class AnidbClient extends EpisodeListClient {
String path = "/perl-bin/" + href;
try {
URI animeUrl = new URI("http", host, path, null);
searchResults.add(new HyperLink(title, animeUrl));
} catch (URISyntaxException e) {
searchResults.add(new HyperLink(title, new URL("http", host, path)));
} catch (MalformedURLException e) {
Logger.getLogger(Logger.GLOBAL_LOGGER_NAME).log(Level.WARNING, "Invalid href: " + href);
}
}
@ -76,7 +73,7 @@ public class AnidbClient extends EpisodeListClient {
String header = XPathUtil.selectString("id('layout-content')//H1[1]", dom);
String title = header.replaceFirst("Anime:\\s*", "");
searchResults.add(new HyperLink(title, URI.create(getSearchUrl(searchterm).toString())));
searchResults.add(new HyperLink(title, getSearchUrl(searchterm)));
}
}
@ -123,7 +120,7 @@ public class AnidbClient extends EpisodeListClient {
@Override
public URI getEpisodeListLink(SearchResult searchResult) {
return ((HyperLink) searchResult).getURI();
return ((HyperLink) searchResult).toURI();
}

View File

@ -10,6 +10,7 @@ import java.net.URI;
import java.net.URL;
import java.net.URLConnection;
import java.nio.charset.Charset;
import java.util.Map;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.regex.Matcher;
@ -56,6 +57,17 @@ public class HtmlUtil {
}
/**
 * Opens a connection to {@code url}, adds every entry of {@code requestHeaders} as a
 * request property and hands the prepared connection to
 * {@link #getHtmlDocument(URLConnection)}.
 *
 * @param url location of the document to fetch
 * @param requestHeaders header names mapped to the values to send with the request
 * @return the document produced by {@link #getHtmlDocument(URLConnection)}
 * @throws IOException if the connection cannot be opened, or if the delegate throws it
 * @throws SAXException if the delegate throws it
 */
public static Document getHtmlDocument(URL url, Map<String, String> requestHeaders) throws IOException, SAXException {
	URLConnection connection = url.openConnection();

	// walk the entries directly instead of looking every key up a second time
	for (Map.Entry<String, String> header : requestHeaders.entrySet()) {
		connection.addRequestProperty(header.getKey(), header.getValue());
	}

	return getHtmlDocument(connection);
}
public static Document getHtmlDocument(URLConnection connection) throws IOException, SAXException {
Charset charset = getCharset(connection.getContentType());
String encoding = connection.getContentEncoding();

View File

@ -4,26 +4,31 @@ package net.sourceforge.filebot.web;
import java.net.URI;
import java.net.URISyntaxException;
import java.net.URL;
public class HyperLink extends SearchResult {
private final URI uri;
private final URL url;
public HyperLink(String name, URI uri) {
public HyperLink(String name, URL url) {
super(name);
this.uri = uri;
this.url = url;
}
public HyperLink(String name, String uri) throws URISyntaxException {
this(name, new URI(uri));
public URL getURL() {
return url;
}
public URI getURI() {
return uri;
public URI toURI() {
try {
return url.toURI();
} catch (URISyntaxException e) {
throw new RuntimeException(e);
}
}
}

View File

@ -6,15 +6,16 @@ import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.net.MalformedURLException;
import java.net.URI;
import java.net.URISyntaxException;
import java.net.URL;
import java.net.URLConnection;
import java.net.URLEncoder;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Scanner;
import java.util.concurrent.ConcurrentHashMap;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.regex.Matcher;
@ -33,6 +34,8 @@ public class SubsceneSubtitleClient extends SubtitleClient {
private final SearchResultCache cache = new SearchResultCache();
private final Map<String, Integer> languageFilterMap = new ConcurrentHashMap<String, Integer>(50);
private final String host = "subscene.com";
@ -56,13 +59,14 @@ public class SubsceneSubtitleClient extends SubtitleClient {
for (Node node : nodes) {
String title = XPathUtil.selectString("text()", node);
String href = XPathUtil.selectString("@href", node);
String count = XPathUtil.selectString("./DFN", node).replaceAll("\\D+", "");
try {
//TODO which exception?
URI url = new URI("http", host, href);
URL subtitleListUrl = new URL("http", host, href);
int subtitleCount = Integer.parseInt(count);
searchResults.add(new HyperLink(title, url));
} catch (URISyntaxException e) {
searchResults.add(new SubsceneSearchResult(title, subtitleListUrl, subtitleCount));
} catch (MalformedURLException e) {
Logger.getLogger(Logger.GLOBAL_LOGGER_NAME).log(Level.WARNING, "Invalid href: " + href, e);
}
}
@ -72,85 +76,127 @@ public class SubsceneSubtitleClient extends SubtitleClient {
return searchResults;
}
HashMap<String, String> languageIdCache;
/**
 * Fills {@code languageFilterMap} from the language list of a subtitle list document.
 * For every {@code DIV} below the {@code languageList} element, the first run of digits
 * in the {@code onclick} attribute of its {@code INPUT} is stored under the lower-cased
 * text of its {@code LABEL}. Entries without any digits are ignored.
 *
 * @param subtitleListDocument document that contains the language list
 */
private void updateLanguageFilterMap(Document subtitleListDocument) {
	for (Node languageNode : XPathUtil.selectNodes("//DIV[@class='languageList']/DIV", subtitleListDocument)) {
		String onClick = XPathUtil.selectString("./INPUT/@onclick", languageNode);
		String filterValue = new Scanner(onClick).findInLine("\\d+");

		if (filterValue == null) {
			// no number in the onclick handler, so there is nothing to register
			continue;
		}

		String label = XPathUtil.selectString("./LABEL/text()", languageNode);
		languageFilterMap.put(label.toLowerCase(), Integer.valueOf(filterValue));
	}
}
/**
 * Looks up the filter value stored in {@code languageFilterMap} for a language name.
 * The name is lower-cased before the lookup.
 *
 * @param languageName language name, may be {@code null}
 * @return the stored filter value, or {@code null} if {@code languageName} is
 *         {@code null} or the map has no entry for it
 */
private Integer getLanguageFilter(String languageName) {
	return (languageName != null) ? languageFilterMap.get(languageName.toLowerCase()) : null;
}
public String getLanguageID(Locale language) {
return languageIdCache.get(language.getDisplayLanguage(Locale.ENGLISH).toLowerCase());
/**
 * Returns the English display name of the language of the given locale.
 *
 * @param language locale to name, may be {@code null}
 * @return the English language name, or {@code null} if {@code language} is {@code null}
 *         or equal to {@link Locale#ROOT}
 */
private String getLanguageName(Locale language) {
	// compare with equals(): a root locale created elsewhere (e.g. new Locale(""))
	// is equal to Locale.ROOT without being the same instance
	if (language == null || Locale.ROOT.equals(language)) {
		return null;
	}

	return language.getDisplayLanguage(Locale.ENGLISH);
}
@Override
public List<SubtitleDescriptor> getSubtitleList(SearchResult searchResult, Locale language) throws Exception {
URL url = getSubtitleListLink(searchResult).toURL();
URL subtitleListUrl = getSubtitleListLink(searchResult).toURL();
String languageName = getLanguageName(language);
Integer languageFilter = getLanguageFilter(languageName);
Document dom = null;
boolean reloadFilteredDocument = (languageFilter == null && useFilteredDocument(searchResult));
boolean forceReload = false;
if (languageIdCache != null) {
URLConnection connection = url.openConnection();
if (reloadFilteredDocument && languageFilterMap.isEmpty()) {
// we don't know the filter values yet, so we request a document with an invalid filter,
// that will return a subtitle document very fast
languageFilter = -1;
forceReload = true;
}
Document subtitleListDocument = getSubtitleListDocument(subtitleListUrl, languageFilter);
if (languageFilterMap.isEmpty()) {
updateLanguageFilterMap(subtitleListDocument);
}
// check if document is already filtered and if requesting a filtered document
// will result in a performance gain (Note: XPath can be very slow)
if (reloadFilteredDocument) {
languageFilter = getLanguageFilter(languageName);
if (language != null && language != Locale.ROOT) {
System.out.println(getLanguageID(language));
connection.addRequestProperty("Cookie", "subscene_sLanguageIds=" + getLanguageID(language));
}
dom = HtmlUtil.getHtmlDocument(connection);
} else {
URLConnection connection = url.openConnection();
dom = HtmlUtil.getHtmlDocument(connection);
List<Node> nodes = XPathUtil.selectNodes("//DIV[@class='languageList']/DIV", dom);
Pattern onClickPattern = Pattern.compile("selectLanguage\\((\\d+)\\);");
languageIdCache = new HashMap<String, String>();
for (Node node : nodes) {
Matcher matcher = onClickPattern.matcher(XPathUtil.selectString("./INPUT/@onclick", node));
if (matcher.matches()) {
String name = XPathUtil.selectString("./LABEL/text()", node);
String id = matcher.group(1);
//TODO sysout
System.out.println(name + " = " + id);
languageIdCache.put(name.toLowerCase(), id);
}
// if language filter has become available, request a filtered document, or if first request was a dummy request
if (languageFilter != null || forceReload) {
subtitleListDocument = getSubtitleListDocument(subtitleListUrl, languageFilter);
}
}
List<Node> nodes = XPathUtil.selectNodes("//TABLE[@class='filmSubtitleList']//A[@id]//ancestor::TR", dom);
return getSubtitleList(subtitleListUrl, languageName, subtitleListDocument);
}
/**
 * Decides whether a language-filtered document should be requested for a search result,
 * based on the subtitle count the result carries.
 *
 * @param searchResult search result; it is cast to {@code SubsceneSearchResult}
 * @return {@code true} if the result reports more than 100 subtitles
 */
private boolean useFilteredDocument(SearchResult searchResult) {
	int subtitleCount = ((SubsceneSearchResult) searchResult).getSubtitleCount();

	return subtitleCount > 100;
}
private Document getSubtitleListDocument(URL subtitleListUrl, Integer languageFilter) throws IOException, SAXException {
Map<String, String> requestHeaders = new HashMap<String, String>(1);
Pattern hrefPattern = Pattern.compile("javascript:Subtitle\\((\\d+), '(\\w+)', '\\d+', '(\\d+)'\\);");
if (languageFilter != null) {
requestHeaders.put("Cookie", "subscene_sLanguageIds=" + languageFilter);
}
ArrayList<SubtitleDescriptor> subtitles = new ArrayList<SubtitleDescriptor>(nodes.size());
return HtmlUtil.getHtmlDocument(subtitleListUrl, requestHeaders);
}
private List<SubtitleDescriptor> getSubtitleList(URL subtitleListUrl, String languageName, Document subtitleListDocument) {
List<Node> nodes = XPathUtil.selectNodes("//TABLE[@class='filmSubtitleList']//A[@id]//ancestor::TR", subtitleListDocument);
Pattern hrefPattern = Pattern.compile("javascript:Subtitle\\((\\d+), '(\\w+)', .*");
List<SubtitleDescriptor> subtitles = new ArrayList<SubtitleDescriptor>(nodes.size());
for (Node node : nodes) {
try {
Node linkNode = XPathUtil.selectFirstNode("./TD[1]/A", node);
String lang = XPathUtil.selectString("./SPAN[1]", linkNode);
String href = XPathUtil.selectString("@href", linkNode);
String name = XPathUtil.selectString("./SPAN[2]", linkNode);
String author = XPathUtil.selectString("./TD[4]", node);
Matcher matcher = hrefPattern.matcher(href);
if (!matcher.matches())
throw new IllegalArgumentException("Cannot extract download parameters: " + href);
String subtitleId = matcher.group(1);
String typeId = matcher.group(2);
URL downloadUrl = getDownloadUrl(url, subtitleId, typeId);
subtitles.add(new SubsceneSubtitleDescriptor(name, lang, author, typeId, downloadUrl, url));
if (languageName == null || languageName.equalsIgnoreCase(lang)) {
String href = XPathUtil.selectString("@href", linkNode);
String name = XPathUtil.selectString("./SPAN[2]", linkNode);
String author = XPathUtil.selectString("./TD[4]", node);
Matcher matcher = hrefPattern.matcher(href);
if (!matcher.matches())
throw new IllegalArgumentException("Cannot extract download parameters: " + href);
String subtitleId = matcher.group(1);
String typeId = matcher.group(2);
URL downloadUrl = getDownloadUrl(subtitleListUrl, subtitleId, typeId);
subtitles.add(new SubsceneSubtitleDescriptor(name, lang, author, typeId, downloadUrl, subtitleListUrl));
}
} catch (Exception e) {
Logger.getLogger(Logger.GLOBAL_LOGGER_NAME).log(Level.WARNING, "Cannot parse subtitle node", e);
}
@ -170,7 +216,7 @@ public class SubsceneSubtitleClient extends SubtitleClient {
@Override
public URI getSubtitleListLink(SearchResult searchResult) {
return ((HyperLink) searchResult).getURI();
return ((HyperLink) searchResult).toURI();
}
@ -180,4 +226,22 @@ public class SubsceneSubtitleClient extends SubtitleClient {
return new URL("http", host, file);
}
/**
 * A {@link HyperLink} search result that additionally carries a subtitle count.
 */
protected static class SubsceneSearchResult extends HyperLink {

	/** Subtitle count given at construction time. */
	private final int subtitleCount;


	/**
	 * @param name name of the search result, passed on to {@link HyperLink}
	 * @param url link of the search result, passed on to {@link HyperLink}
	 * @param subtitleCount subtitle count to report for this result
	 */
	public SubsceneSearchResult(String name, URL url, int subtitleCount) {
		super(name, url);

		this.subtitleCount = subtitleCount;
	}


	/**
	 * @return the subtitle count given at construction time
	 */
	public int getSubtitleCount() {
		return this.subtitleCount;
	}

}
}

View File

@ -6,7 +6,6 @@ import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.net.MalformedURLException;
import java.net.URI;
import java.net.URISyntaxException;
import java.net.URL;
import java.net.URLEncoder;
import java.text.NumberFormat;
@ -65,10 +64,10 @@ public class TVDotComClient extends EpisodeListClient {
String href = XPathUtil.selectString("@href", node);
try {
String episodeListingUrl = href.replaceFirst(Pattern.quote("summary.html?") + ".*", "episode_listings.html");
URL episodeListingUrl = new URL(href.replaceFirst(Pattern.quote("summary.html?") + ".*", "episode_listings.html"));
searchResults.add(new HyperLink(title, episodeListingUrl));
} catch (URISyntaxException e) {
} catch (MalformedURLException e) {
Logger.getLogger(Logger.GLOBAL_LOGGER_NAME).log(Level.WARNING, "Invalid href: " + href, e);
}
}
@ -169,7 +168,7 @@ public class TVDotComClient extends EpisodeListClient {
@Override
public URI getEpisodeListLink(SearchResult searchResult, int season) {
String episodeListingUrl = ((HyperLink) searchResult).getURI().toString();
URL episodeListingUrl = ((HyperLink) searchResult).getURL();
return URI.create(episodeListingUrl + "?season=" + season);
}

View File

@ -0,0 +1,72 @@
package net.sourceforge.filebot.web;
import static org.junit.Assert.assertEquals;
import java.net.URL;
import java.util.List;
import java.util.Locale;
import net.sourceforge.filebot.ui.panel.subtitle.LanguageResolver;
import net.sourceforge.filebot.web.SubsceneSubtitleClient.SubsceneSearchResult;
import org.junit.BeforeClass;
import org.junit.Test;
/**
 * Tests for {@link SubsceneSubtitleClient}, run against two fixed search results.
 * NOTE(review): the client calls presumably hit the live subscene.com site, so the
 * expected values depend on its current content; confirm before relying on them.
 */
public class SubsceneSubtitleClientTest {

	/** Search result with a small subtitle count (17). */
	private static SubsceneSearchResult testResult;

	/** Search result with a large subtitle count (420). */
	private static SubsceneSearchResult manySubtitlesTestResult;

	private SubsceneSubtitleClient client = new SubsceneSubtitleClient();


	@BeforeClass
	public static void setUpBeforeClass() throws Exception {
		URL twinPeaksUrl = new URL("http://subscene.com/twin-peaks--first-season/subtitles-32482.aspx");
		testResult = new SubsceneSearchResult("Twin Peaks - First Season (1990)", twinPeaksUrl, 17);

		URL lostUrl = new URL("http://subscene.com/Lost-Fourth-Season/subtitles-70963.aspx");
		manySubtitlesTestResult = new SubsceneSearchResult("Lost - Fourth Season (2008)", lostUrl, 420);
	}


	/** The second hit for "twin peaks" must match the fixed test result. */
	@Test
	public void search() throws Exception {
		List<SearchResult> hits = client.search("twin peaks");

		SubsceneSearchResult secondHit = (SubsceneSearchResult) hits.get(1);

		assertEquals(testResult.getName(), secondHit.getName());
		assertEquals(testResult.getURL().toString(), secondHit.getURL().toString());
		assertEquals(testResult.getSubtitleCount(), secondHit.getSubtitleCount());
	}


	/** Requesting Italian subtitles for the small result yields exactly one descriptor. */
	@Test
	public void getSubtitleListSearchResult() throws Exception {
		List<SubtitleDescriptor> descriptors = client.getSubtitleList(testResult, Locale.ITALIAN);

		assertEquals(1, descriptors.size());

		SubtitleDescriptor first = descriptors.get(0);

		assertEquals("Twin Peaks - First Season", first.getName());
		assertEquals("Italian", first.getLanguageName());
		assertEquals("zip", first.getArchiveType());
	}


	/** Requesting Vietnamese subtitles for the large result yields exactly one descriptor. */
	@Test
	public void getSubtitleListSearchResultMany() throws Exception {
		Locale vietnamese = LanguageResolver.getDefault().getLocale("Vietnamese");

		List<SubtitleDescriptor> descriptors = client.getSubtitleList(manySubtitlesTestResult, vietnamese);

		assertEquals(1, descriptors.size());
	}


	/** The subtitle list link of a search result is the URL it was created with. */
	@Test
	public void getSubtitleListLink() throws Exception {
		String expected = testResult.getURL().toString();
		String actual = client.getSubtitleListLink(testResult).toURL().toString();

		assertEquals(expected, actual);
	}

}

View File

@ -4,21 +4,30 @@ package net.sourceforge.filebot.web;
import static org.junit.Assert.assertEquals;
import java.net.URI;
import java.net.URL;
import java.util.List;
import org.junit.BeforeClass;
import org.junit.Test;
public class TVDotComClientTest {
private static TVDotComClient tvdotcom = new TVDotComClient();
private static HyperLink testResult;
private static HyperLink singleSeasonTestResult;
private static HyperLink manySeasonsTestResult;
private static HyperLink testResult = new HyperLink("Buffy the Vampire Slayer", URI.create("http://www.tv.com/buffy-the-vampire-slayer/show/10/episode_listings.html"));
private static HyperLink singleSeasonTestResult = new HyperLink("Firefly", URI.create("http://www.tv.com/firefly/show/7097/episode_listings.html"));
private static HyperLink manySeasonsTestResult = new HyperLink("Doctor Who", URI.create("http://www.tv.com/doctor-who/show/355/episode_listings.html"));
private TVDotComClient tvdotcom = new TVDotComClient();
@BeforeClass
public static void setUpBeforeClass() throws Exception {
testResult = new HyperLink("Buffy the Vampire Slayer", new URL("http://www.tv.com/buffy-the-vampire-slayer/show/10/episode_listings.html"));
singleSeasonTestResult = new HyperLink("Firefly", new URL("http://www.tv.com/firefly/show/7097/episode_listings.html"));
manySeasonsTestResult = new HyperLink("Doctor Who", new URL("http://www.tv.com/doctor-who/show/355/episode_listings.html"));
}
@Test
public void search() throws Exception {
List<SearchResult> results = tvdotcom.search("Buffy");
@ -26,7 +35,7 @@ public class TVDotComClientTest {
HyperLink result = (HyperLink) results.get(0);
assertEquals(testResult.getName(), result.getName());
assertEquals(testResult.getURI(), result.getURI());
assertEquals(testResult.getURL().toString(), result.getURL().toString());
}

View File

@ -13,9 +13,10 @@ import org.junit.Test;
public class TVRageClientTest {
private static TVRageClient tvrage = new TVRageClient();
private static TVRageSearchResult testResult = new TVRageSearchResult("Buffy the Vampire Slayer", 2930, "http://www.tvrage.com/Buffy_The_Vampire_Slayer");
private TVRageClient tvrage = new TVRageClient();
@Test
public void search() throws Exception {