* fix scraper issues with (I), (II), etc. tags

This commit is contained in:
Reinhard Pointner 2013-01-27 12:02:28 +00:00
parent cc57b89840
commit 3f2499fbea
2 changed files with 11 additions and 1 deletions

View File

@ -115,7 +115,7 @@ public class IMDbClient implements MovieIdentificationService {
return null;
String name = selectString("//H1/text()", dom).replaceAll("\\s+", " ").trim();
String year = new Scanner(selectNode("//H1/SPAN", dom).getTextContent()).useDelimiter("\\D+").next();
String year = new Scanner(selectNode("//H1/SPAN[@class='nobr']", dom).getTextContent()).useDelimiter("\\D+").next();
int imdbid = getImdbId(selectString("//LINK[@rel='canonical']/@href", dom));
return new Movie(name, Pattern.matches("\\d{4}", year) ? Integer.parseInt(year) : -1, imdbid, -1);

View File

@ -123,6 +123,16 @@ public class IMDbClientTest {
}
/**
 * Requests the movie descriptor for IMDb id 369702 with a {@code null} locale
 * and checks the name, year and id reported by the returned {@code Movie}.
 */
@Test
public void getMovieDescriptor4() throws Exception {
	final int expectedImdbId = 369702;

	Movie descriptor = imdb.getMovieDescriptor(expectedImdbId, null);

	// name must come back as the plain title
	assertEquals("The Sea Inside", descriptor.getName());
	// year must be parsed as a four-digit number
	assertEquals(2004, descriptor.getYear());
	// id must round-trip unchanged (compared with a zero delta, as elsewhere in this hunk)
	assertEquals(expectedImdbId, descriptor.getImdbId(), 0);
}
@Test
public void getAkaMovieDescriptor() throws Exception {
Movie movie = imdb.getMovieDescriptor(106559, Locale.ENGLISH);