mirror of
https://github.com/mitb-archive/filebot
synced 2025-03-10 06:20:27 -04:00
* try to make IMDb scraper more robust
This commit is contained in:
parent
e7d8e8bb05
commit
e1dea3b514
@ -15,6 +15,8 @@ import java.util.ArrayList;
|
|||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.Locale;
|
import java.util.Locale;
|
||||||
import java.util.Scanner;
|
import java.util.Scanner;
|
||||||
|
import java.util.logging.Level;
|
||||||
|
import java.util.logging.Logger;
|
||||||
import java.util.regex.Matcher;
|
import java.util.regex.Matcher;
|
||||||
import java.util.regex.Pattern;
|
import java.util.regex.Pattern;
|
||||||
|
|
||||||
@ -145,18 +147,22 @@ public class IMDbClient extends AbstractEpisodeListProvider implements MovieIden
|
|||||||
Document dom = parsePage(new URL("http", host, "/find?s=tt&q=" + encode(query)));
|
Document dom = parsePage(new URL("http", host, "/find?s=tt&q=" + encode(query)));
|
||||||
|
|
||||||
// select movie links followed by year in parenthesis
|
// select movie links followed by year in parenthesis
|
||||||
List<Node> nodes = selectNodes("//TABLE//A[string-length(substring-after(substring-before(following::text(),')'),'(')) = 4 and count(following-sibling::SMALL) = 0]", dom);
|
List<Node> nodes = selectNodes("//TABLE//A[substring-after(substring-before(following::text(),')'),'(')]", dom);
|
||||||
List<Movie> results = new ArrayList<Movie>(nodes.size());
|
List<Movie> results = new ArrayList<Movie>(nodes.size());
|
||||||
|
|
||||||
for (Node node : nodes) {
|
for (Node node : nodes) {
|
||||||
String name = node.getTextContent().trim();
|
|
||||||
String year = node.getNextSibling().getTextContent().trim().replaceAll("[\\p{Punct}\\p{Space}]+", ""); // remove non-number characters
|
|
||||||
String href = getAttribute("href", node);
|
|
||||||
|
|
||||||
try {
|
try {
|
||||||
|
String name = node.getTextContent().trim();
|
||||||
|
if (name.startsWith("\""))
|
||||||
|
continue;
|
||||||
|
|
||||||
|
String year = node.getNextSibling().getTextContent().replaceAll("[\\p{Punct}\\p{Space}]+", "").trim(); // remove non-number characters
|
||||||
|
String href = getAttribute("href", node);
|
||||||
|
|
||||||
results.add(new Movie(name, Integer.parseInt(year), getImdbId(href)));
|
results.add(new Movie(name, Integer.parseInt(year), getImdbId(href)));
|
||||||
} catch (NumberFormatException e) {
|
} catch (Exception e) {
|
||||||
// ignore illegal movies (TV Shows, Videos, Video Games, etc)
|
// ignore illegal movies (TV Shows, Videos, Video Games, etc)
|
||||||
|
Logger.getLogger(getClass().getName()).log(Level.FINEST, e.getClass().getName() + ": " + e.getMessage());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user