From cdf2487f2c19508998e6297b34e94549dedd0be4 Mon Sep 17 00:00:00 2001 From: Reinhard Pointner Date: Mon, 13 Jul 2009 12:40:27 +0000 Subject: [PATCH] * use xml anime page to get episode information --- .../sourceforge/filebot/web/AnidbClient.java | 69 ++++++++++++++++--- .../sourceforge/filebot/web/WebRequest.java | 35 ++++++---- .../filebot/web/AnidbClientTest.java | 12 ---- 3 files changed, 80 insertions(+), 36 deletions(-) diff --git a/source/net/sourceforge/filebot/web/AnidbClient.java b/source/net/sourceforge/filebot/web/AnidbClient.java index ae72f769..86563474 100644 --- a/source/net/sourceforge/filebot/web/AnidbClient.java +++ b/source/net/sourceforge/filebot/web/AnidbClient.java @@ -14,6 +14,8 @@ import java.util.ArrayList; import java.util.List; import java.util.logging.Level; import java.util.logging.Logger; +import java.util.regex.Matcher; +import java.util.regex.Pattern; import javax.swing.Icon; @@ -102,20 +104,45 @@ public class AnidbClient implements EpisodeListProvider { } - protected String selectOfficialTitle(Document animePage, String languageName) { - // create xpath query for official title of the given language - // e.g. //*[@class='data']//*[contains(@class, 'official') and .//*[contains(@title, 'english')]]//LABEL + @Override + public List getEpisodeList(SearchResult searchResult) throws IOException, SAXException { + int aid = getAnimeID(getEpisodeListLink(searchResult)); - String condition = String.format(".//*[contains(@title, '%s')]", languageName.toLowerCase()); - String xpath = String.format("//*[@class='data']//*[contains(@class, 'official') and %s]//LABEL", condition); + // get anime page as xml + Document dom = getDocument(new URL("http", host, "/perl-bin/animedb.pl?show=xml&t=anime&aid=" + aid)); - return selectString(xpath, animePage); + // select main title + String animeTitle = selectString("//anime/titles/title[@type='main']/text()", dom); + + List episodes = new ArrayList(25); + + for (Node node : selectNodes("//anime/eps/ep", dom)) { + String flags = getTextContent("flags", node); + + // allow only normal and recap episodes + if (flags == null || flags.equals("2")) { + String number = getTextContent("epno", node); + String title = selectString(".//title[@lang='en']", node); + + // no seasons for anime + episodes.add(new Episode(animeTitle, null, number, title)); + } + } + + // sanity check + if (episodes.isEmpty()) { + // anime page xml doesn't work sometimes + Logger.getLogger(getClass().getName()).warning(String.format("Failed to parse episode data from xml: %s (%d)", searchResult, aid)); + + // fall back to good old page scraper + return scrapeEpisodeList(searchResult); + } + + return episodes; } - @Override - public List getEpisodeList(SearchResult searchResult) throws IOException, SAXException { - + protected List scrapeEpisodeList(SearchResult searchResult) throws IOException, SAXException { Document dom = getHtmlDocument(getEpisodeListLink(searchResult).toURL()); // use title from anime page @@ -142,6 +169,30 @@ public class AnidbClient implements EpisodeListProvider { } + protected int getAnimeID(URI uri) { + // e.g. http://anidb.net/perl-bin/animedb.pl?show=anime&aid=26 + if (uri.getQuery() != null) { + Matcher query = Pattern.compile("aid=(\\d+)").matcher(uri.getQuery()); + + if (query.find()) { + return Integer.parseInt(query.group(1)); + } + } + + // e.g. http://anidb.net/a26 + if (uri.getPath() != null) { + Matcher path = Pattern.compile("/a(\\d+)$").matcher(uri.getPath()); + + if (path.find()) { + return Integer.parseInt(path.group(1)); + } + } + + // no aid found + throw new IllegalArgumentException("URI does not contain an aid: " + uri); + } + + @Override public URI getEpisodeListLink(SearchResult searchResult) { return ((HyperLink) searchResult).toURI(); diff --git a/source/net/sourceforge/filebot/web/WebRequest.java b/source/net/sourceforge/filebot/web/WebRequest.java index c8d0e5d1..ff5ae6e2 100644 --- a/source/net/sourceforge/filebot/web/WebRequest.java +++ b/source/net/sourceforge/filebot/web/WebRequest.java @@ -75,18 +75,23 @@ public final class WebRequest { } - public static Document getDocument(URL url) throws SAXException, IOException, ParserConfigurationException { - return getDocument(url.toString()); - } - - - public static Document getDocument(String url) throws SAXException, IOException, ParserConfigurationException { - return DocumentBuilderFactory.newInstance().newDocumentBuilder().parse(url); + public static Document getDocument(URL url) throws IOException, SAXException { + return getDocument(new InputSource(getReader(url.openConnection()))); } public static Document getDocument(InputStream inputStream) throws SAXException, IOException, ParserConfigurationException { - return DocumentBuilderFactory.newInstance().newDocumentBuilder().parse(inputStream); + return getDocument(new InputSource(inputStream)); + } + + + public static Document getDocument(InputSource source) throws IOException, SAXException { + try { + return DocumentBuilderFactory.newInstance().newDocumentBuilder().parse(source); + } catch (ParserConfigurationException e) { + // will never happen + throw new RuntimeException(e); + } } @@ -129,15 +134,12 @@ public final class WebRequest { private static Charset getCharset(String contentType) { if (contentType != null) { // e.g. Content-Type: text/html; charset=iso-8859-1 - Pattern pattern = Pattern.compile(".*;\\s*charset=(\\S+).*", Pattern.CASE_INSENSITIVE); - Matcher matcher = pattern.matcher(contentType); + Matcher matcher = Pattern.compile("charset=(\\p{Graph}+)").matcher(contentType); - if (matcher.matches()) { - String charsetName = matcher.group(1); - + if (matcher.find()) { try { - return Charset.forName(charsetName); - } catch (Exception e) { + return Charset.forName(matcher.group(1)); + } catch (IllegalArgumentException e) { Logger.getLogger(WebRequest.class.getName()).log(Level.WARNING, e.getMessage()); } } @@ -148,6 +150,9 @@ public final class WebRequest { } + /** + * Dummy constructor to prevent instantiation. + */ private WebRequest() { throw new UnsupportedOperationException(); } diff --git a/test/net/sourceforge/filebot/web/AnidbClientTest.java b/test/net/sourceforge/filebot/web/AnidbClientTest.java index 65216cd3..6bbcfd12 100644 --- a/test/net/sourceforge/filebot/web/AnidbClientTest.java +++ b/test/net/sourceforge/filebot/web/AnidbClientTest.java @@ -129,18 +129,6 @@ public class AnidbClientTest { } - @Test - public void selectEnglishTitle() throws Exception { - assertEquals("Banner of the Stars", anidb.selectOfficialTitle(getHtmlDocument(new URL("http://anidb.net/a4")), "English")); - } - - - @Test - public void selectJapaneseTitle() throws Exception { - assertEquals("十二国記", anidb.selectOfficialTitle(getHtmlDocument(twelvekingdomsSearchResult.getURL()), "Japanese")); - } - - @Test public void getEpisodeListLink() throws Exception { assertEquals(monsterSearchResult.getURL().toString(), anidb.getEpisodeListLink(monsterSearchResult).toURL().toString());