1
0
mirror of https://github.com/mitb-archive/filebot synced 2024-11-15 22:05:00 -05:00

* use xml anime page to get episode information

This commit is contained in:
Reinhard Pointner 2009-07-13 12:40:27 +00:00
parent 02fc6180ab
commit cdf2487f2c
3 changed files with 80 additions and 36 deletions

View File

@ -14,6 +14,8 @@ import java.util.ArrayList;
import java.util.List;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import javax.swing.Icon;
@ -102,20 +104,45 @@ public class AnidbClient implements EpisodeListProvider {
}
protected String selectOfficialTitle(Document animePage, String languageName) {
// create xpath query for official title of the given language
// e.g. //*[@class='data']//*[contains(@class, 'official') and .//*[contains(@title, 'english')]]//LABEL
@Override
public List<Episode> getEpisodeList(SearchResult searchResult) throws IOException, SAXException {
int aid = getAnimeID(getEpisodeListLink(searchResult));
String condition = String.format(".//*[contains(@title, '%s')]", languageName.toLowerCase());
String xpath = String.format("//*[@class='data']//*[contains(@class, 'official') and %s]//LABEL", condition);
// get anime page as xml
Document dom = getDocument(new URL("http", host, "/perl-bin/animedb.pl?show=xml&t=anime&aid=" + aid));
return selectString(xpath, animePage);
// select main title
String animeTitle = selectString("//anime/titles/title[@type='main']/text()", dom);
List<Episode> episodes = new ArrayList<Episode>(25);
for (Node node : selectNodes("//anime/eps/ep", dom)) {
String flags = getTextContent("flags", node);
// allow only normal and recap episodes
if (flags == null || flags.equals("2")) {
String number = getTextContent("epno", node);
String title = selectString(".//title[@lang='en']", node);
// no seasons for anime
episodes.add(new Episode(animeTitle, null, number, title));
}
}
// sanity check
if (episodes.isEmpty()) {
// anime page xml doesn't work sometimes
Logger.getLogger(getClass().getName()).warning(String.format("Failed to parse episode data from xml: %s (%d)", searchResult, aid));
// fall back to good old page scraper
return scrapeEpisodeList(searchResult);
}
return episodes;
}
@Override
public List<Episode> getEpisodeList(SearchResult searchResult) throws IOException, SAXException {
protected List<Episode> scrapeEpisodeList(SearchResult searchResult) throws IOException, SAXException {
Document dom = getHtmlDocument(getEpisodeListLink(searchResult).toURL());
// use title from anime page
@ -142,6 +169,30 @@ public class AnidbClient implements EpisodeListProvider {
}
/**
 * Extract the numeric anime id (aid) from an anime page link.
 * <p>
 * Two link styles are recognized: an <code>aid</code> query parameter
 * (e.g. <code>animedb.pl?show=anime&amp;aid=26</code>) and the short path form
 * (e.g. <code>/a26</code>). The query parameter takes precedence over the path.
 * 
 * @param uri link to inspect
 * @return the anime id found in the given link
 * @throws IllegalArgumentException if neither the query nor the path contains an aid
 *             (this includes the NumberFormatException raised by Integer.parseInt
 *             when the digits do not fit into an int)
 */
protected int getAnimeID(URI uri) {
	// e.g. http://anidb.net/perl-bin/animedb.pl?show=anime&aid=26
	String queryPart = uri.getQuery();
	
	if (queryPart != null) {
		// require "aid" to be a parameter name of its own, i.e. located at the start of the
		// query or right after a separator, so that names merely ending in "aid"
		// (e.g. "raid=5") are not mistaken for the anime id
		Matcher query = Pattern.compile("(?:^|[&;])aid=(\\d+)").matcher(queryPart);
		
		if (query.find()) {
			return Integer.parseInt(query.group(1));
		}
	}
	
	// e.g. http://anidb.net/a26
	String pathPart = uri.getPath();
	
	if (pathPart != null) {
		Matcher path = Pattern.compile("/a(\\d+)$").matcher(pathPart);
		
		if (path.find()) {
			return Integer.parseInt(path.group(1));
		}
	}
	
	// no aid found
	throw new IllegalArgumentException("URI does not contain an aid: " + uri);
}
@Override
public URI getEpisodeListLink(SearchResult searchResult) {
return ((HyperLink) searchResult).toURI();

View File

@ -75,18 +75,23 @@ public final class WebRequest {
}
public static Document getDocument(URL url) throws SAXException, IOException, ParserConfigurationException {
return getDocument(url.toString());
}
public static Document getDocument(String url) throws SAXException, IOException, ParserConfigurationException {
return DocumentBuilderFactory.newInstance().newDocumentBuilder().parse(url);
public static Document getDocument(URL url) throws IOException, SAXException {
return getDocument(new InputSource(getReader(url.openConnection())));
}
public static Document getDocument(InputStream inputStream) throws SAXException, IOException, ParserConfigurationException {
return DocumentBuilderFactory.newInstance().newDocumentBuilder().parse(inputStream);
return getDocument(new InputSource(inputStream));
}
/**
 * Parse the given input source into a DOM document, using a document builder
 * obtained from a freshly created, default-configured factory.
 * 
 * @param source xml input to parse
 * @return the parsed document
 * @throws IOException if reading from the source fails
 * @throws SAXException if the input cannot be parsed
 */
public static Document getDocument(InputSource source) throws IOException, SAXException {
	final DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
	
	try {
		return factory.newDocumentBuilder().parse(source);
	} catch (ParserConfigurationException configurationFailure) {
		// not expected for a factory that has not been customized,
		// so it is rethrown unchecked instead of burdening every caller
		throw new RuntimeException(configurationFailure);
	}
}
@ -129,15 +134,12 @@ public final class WebRequest {
private static Charset getCharset(String contentType) {
if (contentType != null) {
// e.g. Content-Type: text/html; charset=iso-8859-1
Pattern pattern = Pattern.compile(".*;\\s*charset=(\\S+).*", Pattern.CASE_INSENSITIVE);
Matcher matcher = pattern.matcher(contentType);
Matcher matcher = Pattern.compile("charset=(\\p{Graph}+)").matcher(contentType);
if (matcher.matches()) {
String charsetName = matcher.group(1);
if (matcher.find()) {
try {
return Charset.forName(charsetName);
} catch (Exception e) {
return Charset.forName(matcher.group(1));
} catch (IllegalArgumentException e) {
Logger.getLogger(WebRequest.class.getName()).log(Level.WARNING, e.getMessage());
}
}
@ -148,6 +150,9 @@ public final class WebRequest {
}
/**
 * Dummy constructor to prevent instantiation.
 * <p>
 * Being private, it cannot be called from outside this class; should it be
 * invoked anyway, it fails immediately instead of creating an instance.
 * 
 * @throws UnsupportedOperationException always
 */
private WebRequest() {
throw new UnsupportedOperationException();
}

View File

@ -129,18 +129,6 @@ public class AnidbClientTest {
}
@Test
public void selectEnglishTitle() throws Exception {
assertEquals("Banner of the Stars", anidb.selectOfficialTitle(getHtmlDocument(new URL("http://anidb.net/a4")), "English"));
}
@Test
public void selectJapaneseTitle() throws Exception {
assertEquals("十二国記", anidb.selectOfficialTitle(getHtmlDocument(twelvekingdomsSearchResult.getURL()), "Japanese"));
}
@Test
public void getEpisodeListLink() throws Exception {
assertEquals(monsterSearchResult.getURL().toString(), anidb.getEpisodeListLink(monsterSearchResult).toURL().toString());