* use xml anime page to get episode information

2024-12-23 16:28:51 -05:00 · 2009-07-13 12:40:27 +00:00 · 2009-07-13 12:40:27 +00:00 · cdf2487f2c
commit cdf2487f2c
parent 02fc6180ab
3 changed files with 80 additions and 36 deletions
--- a/source/net/sourceforge/filebot/web/AnidbClient.java
+++ b/source/net/sourceforge/filebot/web/AnidbClient.java
@ -14,6 +14,8 @@ import java.util.ArrayList;
 import java.util.List;
 import java.util.logging.Level;
 import java.util.logging.Logger;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;

 import javax.swing.Icon;

@ -102,20 +104,45 @@ public class AnidbClient implements EpisodeListProvider {
 	}
 	

-	protected String selectOfficialTitle(Document animePage, String languageName) {
-		// create xpath query for official title of the given language
-		// e.g. //*[@class='data']//*[contains(@class, 'official') and .//*[contains(@title, 'english')]]//LABEL
+	/**
+	 * Builds the episode list for an anime from its XML anime page.
+	 * <p>
+	 * The anime id is extracted from the episode list link of the given search result and
+	 * used to request the page with <code>show=xml&amp;t=anime</code>. Only <code>ep</code>
+	 * nodes whose <code>flags</code> value is <code>null</code> or <code>"2"</code> are kept.
+	 * If no episode passes that filter, a warning is logged and the result of
+	 * <code>scrapeEpisodeList(searchResult)</code> is returned instead.
+	 * 
+	 * @param searchResult search result identifying the anime
+	 * @return episodes built from the main anime title, a <code>null</code> season, the
+	 *         <code>epno</code> value and the <code>lang='en'</code> episode title
+	 * @throws IOException declared; presumably raised when the page cannot be fetched
+	 * @throws SAXException declared; presumably raised when the page cannot be parsed
+	 */
+	@Override
+	public List<Episode> getEpisodeList(SearchResult searchResult) throws IOException, SAXException {
+		int aid = getAnimeID(getEpisodeListLink(searchResult));
 		
-		String condition = String.format(".//*[contains(@title, '%s')]", languageName.toLowerCase());
-		String xpath = String.format("//*[@class='data']//*[contains(@class, 'official') and %s]//LABEL", condition);
+		// get anime page as xml
+		Document dom = getDocument(new URL("http", host, "/perl-bin/animedb.pl?show=xml&t=anime&aid=" + aid));
 		
-		return selectString(xpath, animePage);
+		// select main title
+		String animeTitle = selectString("//anime/titles/title[@type='main']/text()", dom);
+		
+		// 25 is only the initial capacity of the list, not a limit on the episode count
+		List<Episode> episodes = new ArrayList<Episode>(25);
+		
+		for (Node node : selectNodes("//anime/eps/ep", dom)) {
+			String flags = getTextContent("flags", node);
+			
+			// allow only normal and recap episodes
+			// (presumably null means the ep node has no flags element -- confirm against getTextContent)
+			if (flags == null || flags.equals("2")) {
+				String number = getTextContent("epno", node);
+				String title = selectString(".//title[@lang='en']", node);
+				
+				// no seasons for anime
+				episodes.add(new Episode(animeTitle, null, number, title));
+			}
+		}
+		
+		// sanity check 
+		if (episodes.isEmpty()) {
+			// anime page xml doesn't work sometimes
+			Logger.getLogger(getClass().getName()).warning(String.format("Failed to parse episode data from xml: %s (%d)", searchResult, aid));
+			
+			// fall back to good old page scraper
+			return scrapeEpisodeList(searchResult);
+		}
+		
+		return episodes;
 	}
 	

-	@Override
-	public List<Episode> getEpisodeList(SearchResult searchResult) throws IOException, SAXException {
-		
+	protected List<Episode> scrapeEpisodeList(SearchResult searchResult) throws IOException, SAXException {
 		Document dom = getHtmlDocument(getEpisodeListLink(searchResult).toURL());
 		
 		// use title from anime page
@ -142,6 +169,30 @@ public class AnidbClient implements EpisodeListProvider {
 	}
 	

+	/**
+	 * Extracts the numeric anime id (aid) from an anime page URI.
+	 * <p>
+	 * The query string is checked first for <code>aid=&lt;digits&gt;</code>. If that yields
+	 * nothing, the path is checked for a trailing <code>/a&lt;digits&gt;</code> segment.
+	 * 
+	 * @param uri anime page URI, in either the long or the short form shown below
+	 * @return the anime id parsed from the first matching location
+	 * @throws IllegalArgumentException if neither the query nor the path contains an aid;
+	 *         digits that do not fit into an <code>int</code> make
+	 *         <code>Integer.parseInt</code> throw a <code>NumberFormatException</code>,
+	 *         which is a subclass of this exception
+	 */
+	protected int getAnimeID(URI uri) {
+		// e.g. http://anidb.net/perl-bin/animedb.pl?show=anime&aid=26
+		if (uri.getQuery() != null) {
+			// NOTE(review): find() is unanchored, so "aid=" also matches at the end of a
+			// longer parameter name (e.g. "said=1") -- confirm such queries cannot occur
+			Matcher query = Pattern.compile("aid=(\\d+)").matcher(uri.getQuery());
+			
+			if (query.find()) {
+				return Integer.parseInt(query.group(1));
+			}
+		}
+		
+		// e.g. http://anidb.net/a26
+		if (uri.getPath() != null) {
+			// the '$' anchor requires the id to be the last thing in the path
+			Matcher path = Pattern.compile("/a(\\d+)$").matcher(uri.getPath());
+			
+			if (path.find()) {
+				return Integer.parseInt(path.group(1));
+			}
+		}
+		
+		// no aid found
+		throw new IllegalArgumentException("URI does not contain an aid: " + uri);
+	}
+	
+
 	@Override
 	public URI getEpisodeListLink(SearchResult searchResult) {
 		return ((HyperLink) searchResult).toURI();
--- a/source/net/sourceforge/filebot/web/WebRequest.java
+++ b/source/net/sourceforge/filebot/web/WebRequest.java
@ -75,18 +75,23 @@ public final class WebRequest {
 	}
 	

-	public static Document getDocument(URL url) throws SAXException, IOException, ParserConfigurationException {
-		return getDocument(url.toString());
-	}
-	
-
-	public static Document getDocument(String url) throws SAXException, IOException, ParserConfigurationException {
-		return DocumentBuilderFactory.newInstance().newDocumentBuilder().parse(url);
+	/**
+	 * Fetches the resource at the given URL and parses it as an XML document.
+	 * <p>
+	 * The connection is read through <code>getReader</code> rather than handing the URL
+	 * string to the parser (presumably so the charset handling of <code>getReader</code>
+	 * applies -- confirm against its implementation).
+	 * 
+	 * @param url location of the XML resource
+	 * @return the parsed document
+	 * @throws IOException declared; raised by opening or reading the connection
+	 * @throws SAXException declared; raised by the parser
+	 */
+	public static Document getDocument(URL url) throws IOException, SAXException {
+		// NOTE(review): the reader is not closed explicitly in this method; whether the
+		// parser closes it (also on failure) is not visible here -- confirm
+		return getDocument(new InputSource(getReader(url.openConnection())));
 	}
 	

+	/**
+	 * Parses the given stream as an XML document by wrapping it in an
+	 * <code>InputSource</code> and delegating to <code>getDocument(InputSource)</code>.
+	 * <p>
+	 * NOTE(review): <code>ParserConfigurationException</code> is still declared here,
+	 * although the delegate declares only <code>IOException</code> and
+	 * <code>SAXException</code> and rethrows configuration errors unchecked.
+	 * 
+	 * @param inputStream stream containing the XML data
+	 * @return the parsed document
+	 */
 	public static Document getDocument(InputStream inputStream) throws SAXException, IOException, ParserConfigurationException {
-		return DocumentBuilderFactory.newInstance().newDocumentBuilder().parse(inputStream);
+		return getDocument(new InputSource(inputStream));
+	}
+	
+
+	/**
+	 * Parses the given input source as an XML document, using a freshly created
+	 * <code>DocumentBuilderFactory</code> with its default settings.
+	 * <p>
+	 * NOTE(review): no parser features are set here, so DTD and external entity handling
+	 * is whatever the default factory provides -- confirm this is acceptable for content
+	 * fetched from remote hosts.
+	 * 
+	 * @param source input source to parse
+	 * @return the parsed document
+	 * @throws IOException declared; raised by the parser on read errors
+	 * @throws SAXException declared; raised by the parser on malformed input
+	 */
+	public static Document getDocument(InputSource source) throws IOException, SAXException {
+		try {
+			return DocumentBuilderFactory.newInstance().newDocumentBuilder().parse(source);
+		} catch (ParserConfigurationException e) {
+			// not expected, since the factory is left unconfigured; rethrown unchecked so
+			// that callers need not declare ParserConfigurationException
+			throw new RuntimeException(e);
+		}
 	}
 	

@ -129,15 +134,12 @@ public final class WebRequest {
 	private static Charset getCharset(String contentType) {
 		if (contentType != null) {
 			// e.g. Content-Type: text/html; charset=iso-8859-1
-			Pattern pattern = Pattern.compile(".*;\\s*charset=(\\S+).*", Pattern.CASE_INSENSITIVE);
-			Matcher matcher = pattern.matcher(contentType);
-			
-			if (matcher.matches()) {
-				String charsetName = matcher.group(1);
+			Matcher matcher = Pattern.compile("charset=(\\p{Graph}+)").matcher(contentType);
 			
+			if (matcher.find()) {
 				try {
-					return Charset.forName(charsetName);
-				} catch (Exception e) {
+					return Charset.forName(matcher.group(1));
+				} catch (IllegalArgumentException e) {
 					Logger.getLogger(WebRequest.class.getName()).log(Level.WARNING, e.getMessage());
 				}
 			}
@ -148,6 +150,9 @@ public final class WebRequest {
 	}
 	

+	/**
+	 * Dummy constructor to prevent instantiation. It is private and always throws
+	 * <code>UnsupportedOperationException</code> when invoked.
+	 */
 	private WebRequest() {
 		throw new UnsupportedOperationException();
 	}
--- a/test/net/sourceforge/filebot/web/AnidbClientTest.java
+++ b/test/net/sourceforge/filebot/web/AnidbClientTest.java
@ -129,18 +129,6 @@ public class AnidbClientTest {
 	}
 	

-	@Test
-	public void selectEnglishTitle() throws Exception {
-		assertEquals("Banner of the Stars", anidb.selectOfficialTitle(getHtmlDocument(new URL("http://anidb.net/a4")), "English"));
-	}
-	
-
-	@Test
-	public void selectJapaneseTitle() throws Exception {
-		assertEquals("十二国記", anidb.selectOfficialTitle(getHtmlDocument(twelvekingdomsSearchResult.getURL()), "Japanese"));
-	}
-	
-
 	@Test
 	public void getEpisodeListLink() throws Exception {
 		assertEquals(monsterSearchResult.getURL().toString(), anidb.getEpisodeListLink(monsterSearchResult).toURL().toString());