disabled namespaces in nekohtml parser

This commit is contained in:
Reinhard Pointner 2008-02-07 22:05:59 +00:00
parent 319a528542
commit 3c0296d11e
6 changed files with 38 additions and 122 deletions

View File

@ -28,11 +28,11 @@ public class Torrent {
public Torrent(File torrent) throws IOException { public Torrent(File torrent) throws IOException {
FileInputStream in = new FileInputStream(torrent); BufferedInputStream in = new BufferedInputStream(new FileInputStream(torrent));
Map<?, ?> torrentMap = null; Map<?, ?> torrentMap = null;
try { try {
torrentMap = BDecoder.decode(new BufferedInputStream(in)); torrentMap = BDecoder.decode(in);
} finally { } finally {
in.close(); in.close();
} }

View File

@ -16,6 +16,7 @@ import java.util.Map;
import java.util.TreeMap; import java.util.TreeMap;
import net.sourceforge.filebot.resources.ResourceManager; import net.sourceforge.filebot.resources.ResourceManager;
import net.sourceforge.tuned.XPathUtil;
import org.w3c.dom.Document; import org.w3c.dom.Document;
import org.w3c.dom.Node; import org.w3c.dom.Node;
@ -42,19 +43,19 @@ public class AnidbSearchEngine extends SearchEngine {
Document dom = HtmlUtil.getHtmlDocument(getSearchUrl(searchterm)); Document dom = HtmlUtil.getHtmlDocument(getSearchUrl(searchterm));
List<Node> nodes = HtmlUtil.selectNodes("//TABLE[@class='anime_list']//TR//TD//ancestor::TR", dom); List<Node> nodes = XPathUtil.selectNodes("//TABLE[@class='anime_list']//TR//TD//ancestor::TR", dom);
ArrayList<String> shows = new ArrayList<String>(nodes.size()); ArrayList<String> shows = new ArrayList<String>(nodes.size());
if (!nodes.isEmpty()) if (!nodes.isEmpty())
for (Node node : nodes) { for (Node node : nodes) {
String type = HtmlUtil.selectString("./TD[2]/text()", node); String type = XPathUtil.selectString("./TD[2]/text()", node);
// we only want shows // we only want shows
if (type.equalsIgnoreCase("tv series")) { if (type.equalsIgnoreCase("tv series")) {
Node titleNode = HtmlUtil.selectNode("./TD[1]/A", node); Node titleNode = XPathUtil.selectNode("./TD[1]/A", node);
String title = HtmlUtil.selectString("text()", titleNode); String title = XPathUtil.selectString("text()", titleNode);
String href = HtmlUtil.selectString("@href", titleNode); String href = XPathUtil.selectString("@href", titleNode);
String file = "/perl-bin/" + href; String file = "/perl-bin/" + href;
@ -70,11 +71,11 @@ public class AnidbSearchEngine extends SearchEngine {
} }
else { else {
// we might have been redirected to the episode list page directly // we might have been redirected to the episode list page directly
List<Node> results = HtmlUtil.selectNodes("//TABLE[@class='eplist']", dom); List<Node> results = XPathUtil.selectNodes("//TABLE[@class='eplist']", dom);
if (!results.isEmpty()) { if (!results.isEmpty()) {
// get show's name from the document // get show's name from the document
String header = HtmlUtil.selectString("//DIV[@id='layout-content']//H1[1]/text()", dom); String header = XPathUtil.selectString("//DIV[@id='layout-content']//H1[1]/text()", dom);
String title = header.replaceFirst("Anime:\\s*", ""); String title = header.replaceFirst("Anime:\\s*", "");
cache.put(title, getSearchUrl(searchterm)); cache.put(title, getSearchUrl(searchterm));
@ -92,7 +93,7 @@ public class AnidbSearchEngine extends SearchEngine {
Document dom = HtmlUtil.getHtmlDocument(getEpisodeListUrl(showname, season)); Document dom = HtmlUtil.getHtmlDocument(getEpisodeListUrl(showname, season));
List<Node> nodes = HtmlUtil.selectNodes("//TABLE[@id='eplist']//TR/TD/SPAN/ancestor::TR", dom); List<Node> nodes = XPathUtil.selectNodes("//TABLE[@id='eplist']//TR/TD/SPAN/ancestor::TR", dom);
LinkedList<Episode> list = new LinkedList<Episode>(); LinkedList<Episode> list = new LinkedList<Episode>();
@ -101,8 +102,8 @@ public class AnidbSearchEngine extends SearchEngine {
f.setGroupingUsed(false); f.setGroupingUsed(false);
for (Node node : nodes) { for (Node node : nodes) {
String number = HtmlUtil.selectString("./TD[1]/A/text()", node); String number = XPathUtil.selectString("./TD[1]/A/text()", node);
String title = HtmlUtil.selectString("./TD[2]/SPAN/text()", node); String title = XPathUtil.selectString("./TD[2]/SPAN/text()", node);
if (title.startsWith("recap")) if (title.startsWith("recap"))
title = title.replaceFirst("recap", ""); title = title.replaceFirst("recap", "");

View File

@ -9,21 +9,17 @@ import java.io.Reader;
import java.net.URL; import java.net.URL;
import java.net.URLConnection; import java.net.URLConnection;
import java.nio.charset.Charset; import java.nio.charset.Charset;
import java.util.List;
import java.util.regex.Matcher; import java.util.regex.Matcher;
import java.util.regex.Pattern; import java.util.regex.Pattern;
import java.util.zip.GZIPInputStream; import java.util.zip.GZIPInputStream;
import net.sourceforge.tuned.XPathUtil;
import org.cyberneko.html.parsers.DOMParser; import org.cyberneko.html.parsers.DOMParser;
import org.w3c.dom.Document; import org.w3c.dom.Document;
import org.w3c.dom.Node;
import org.xml.sax.InputSource; import org.xml.sax.InputSource;
import org.xml.sax.SAXException; import org.xml.sax.SAXException;
class HtmlUtil { public class HtmlUtil {
private static Charset getCharset(String contentType) { private static Charset getCharset(String contentType) {
if (contentType != null) { if (contentType != null) {
@ -58,34 +54,10 @@ class HtmlUtil {
// Parses the HTML supplied by reader with a NekoHTML DOMParser and returns the resulting DOM document.
// Each line of this side-by-side view shows the old code followed by the new code.
public static Document getHtmlDocument(Reader reader) throws SAXException, IOException { public static Document getHtmlDocument(Reader reader) throws SAXException, IOException {
DOMParser parser = new DOMParser(); DOMParser parser = new DOMParser();
// New in this commit: switch off the SAX "namespaces" feature before parsing.
// NOTE(review): presumably this is what lets callers drop the html: prefix from their XPath expressions - confirm.
parser.setFeature("http://xml.org/sax/features/namespaces", false);
parser.parse(new InputSource(reader)); parser.parse(new InputSource(reader));
return parser.getDocument(); return parser.getDocument();
} }
/**
 * Evaluates {@code xpath} relative to {@code node} and returns the string result with
 * leading and trailing whitespace removed. The prefix {@code "html"} is bound to the
 * namespace that {@code getNameSpace(node)} reports.
 */
public static String selectString(String xpath, Node node) {
    String namespace = getNameSpace(node);
    String value = XPathUtil.selectString(xpath, node, "html", namespace);
    return value.trim();
}
/**
 * Evaluates {@code xpath} relative to {@code node} and returns every matching node.
 * The prefix {@code "html"} is bound to the namespace that {@code getNameSpace(node)} reports.
 */
public static List<Node> selectNodes(String xpath, Node node) {
    String namespace = getNameSpace(node);
    List<Node> matches = XPathUtil.selectNodes(xpath, node, "html", namespace);
    return matches;
}
/**
 * Evaluates {@code xpath} relative to {@code node} and returns the single node result.
 * The prefix {@code "html"} is bound to the namespace that {@code getNameSpace(node)} reports.
 */
public static Node selectNode(String xpath, Node node) {
    String namespace = getNameSpace(node);
    Node match = XPathUtil.selectNode(xpath, node, "html", namespace);
    return match;
}
/**
 * Returns the namespace URI that applies to {@code node}.
 *
 * For a {@link Document} the namespace of its root element is used; for any other node
 * the node's own namespace URI is returned.
 *
 * @param node the document or node to inspect
 * @return the namespace URI, or {@code null} if the node has none or the document has
 *         no root element
 */
private static String getNameSpace(Node node) {
    if (node instanceof Document) {
        // Ask the DOM for the root element directly instead of evaluating "/*";
        // a document without a root element yields null rather than a NullPointerException.
        Node root = ((Document) node).getDocumentElement();
        return (root != null) ? root.getNamespaceURI() : null;
    }

    return node.getNamespaceURI();
}
} }

View File

@ -17,6 +17,7 @@ import java.util.regex.Matcher;
import java.util.regex.Pattern; import java.util.regex.Pattern;
import net.sourceforge.filebot.resources.ResourceManager; import net.sourceforge.filebot.resources.ResourceManager;
import net.sourceforge.tuned.XPathUtil;
import org.w3c.dom.Document; import org.w3c.dom.Document;
import org.w3c.dom.Node; import org.w3c.dom.Node;
@ -43,13 +44,13 @@ public class TVRageSearchEngine extends SearchEngine {
Document dom = HtmlUtil.getHtmlDocument(getSearchUrl(searchterm)); Document dom = HtmlUtil.getHtmlDocument(getSearchUrl(searchterm));
List<Node> nodes = HtmlUtil.selectNodes("//DIV[@id='search_begin']//TABLE[1]//TR/TD/A[1]", dom); List<Node> nodes = XPathUtil.selectNodes("//DIV[@id='search_begin']//TABLE[1]/*/TR/TD/A[1]", dom);
ArrayList<String> shows = new ArrayList<String>(nodes.size()); ArrayList<String> shows = new ArrayList<String>(nodes.size());
for (Node node : nodes) { for (Node node : nodes) {
String href = HtmlUtil.selectString("@href", node); String href = XPathUtil.selectString("@href", node);
String title = HtmlUtil.selectString("text()", node); String title = XPathUtil.selectString("text()", node);
try { try {
URL url = new URL(href); URL url = new URL(href);
@ -69,18 +70,18 @@ public class TVRageSearchEngine extends SearchEngine {
Document dom = HtmlUtil.getHtmlDocument(getEpisodeListUrl(showname, season)); Document dom = HtmlUtil.getHtmlDocument(getEpisodeListUrl(showname, season));
List<Node> nodes = HtmlUtil.selectNodes("//TABLE[@class='b']//TR[@id='brow']", dom); List<Node> nodes = XPathUtil.selectNodes("//TABLE[@class='b']//TR[@id='brow']", dom);
ArrayList<Episode> episodes = new ArrayList<Episode>(); ArrayList<Episode> episodes = new ArrayList<Episode>();
for (Node node : nodes) { for (Node node : nodes) {
String seasonAndEpisodeNumber = HtmlUtil.selectString("./TD[2]/A/text()", node); String seasonAndEpisodeNumber = XPathUtil.selectString("./TD[2]/A/text()", node);
String title = HtmlUtil.selectString("./TD[4]/A/text()", node); String title = XPathUtil.selectString("./TD[4]/A/text()", node);
List<Node> precedings = HtmlUtil.selectNodes("../preceding-sibling::TABLE", node); List<Node> precedings = XPathUtil.selectNodes("../preceding-sibling::TABLE", node);
Node previousTable = precedings.get(precedings.size() - 1); Node previousTable = precedings.get(precedings.size() - 1);
String seasonHeader = HtmlUtil.selectString("./TR/TD/FONT/text()", previousTable); String seasonHeader = XPathUtil.selectString("./TR/TD/FONT/text()", previousTable);
Matcher seasonMatcher = Pattern.compile("Season (\\d+)").matcher(seasonHeader); Matcher seasonMatcher = Pattern.compile("Season (\\d+)").matcher(seasonHeader);

View File

@ -16,6 +16,7 @@ import java.util.Map;
import java.util.TreeMap; import java.util.TreeMap;
import net.sourceforge.filebot.resources.ResourceManager; import net.sourceforge.filebot.resources.ResourceManager;
import net.sourceforge.tuned.XPathUtil;
import org.w3c.dom.Document; import org.w3c.dom.Document;
import org.w3c.dom.Node; import org.w3c.dom.Node;
@ -42,7 +43,7 @@ public class TvdotcomSearchEngine extends SearchEngine {
Document dom = HtmlUtil.getHtmlDocument(getSearchUrl(searchterm)); Document dom = HtmlUtil.getHtmlDocument(getSearchUrl(searchterm));
List<Node> nodes = HtmlUtil.selectNodes("//html:TABLE[@id='search-results']//html:SPAN/html:A", dom); List<Node> nodes = XPathUtil.selectNodes("//TABLE[@id='search-results']//SPAN/A", dom);
ArrayList<String> shows = new ArrayList<String>(nodes.size()); ArrayList<String> shows = new ArrayList<String>(nodes.size());
@ -52,7 +53,7 @@ public class TvdotcomSearchEngine extends SearchEngine {
// we only want search results that are shows // we only want search results that are shows
if (category.toLowerCase().startsWith("show")) { if (category.toLowerCase().startsWith("show")) {
String title = node.getTextContent(); String title = node.getTextContent();
String href = HtmlUtil.selectString("@href", node); String href = XPathUtil.selectString("@href", node);
try { try {
URL url = new URL(href); URL url = new URL(href);
@ -74,7 +75,7 @@ public class TvdotcomSearchEngine extends SearchEngine {
Document dom = HtmlUtil.getHtmlDocument(getEpisodeListUrl(showname, season)); Document dom = HtmlUtil.getHtmlDocument(getEpisodeListUrl(showname, season));
List<Node> nodes = HtmlUtil.selectNodes("//html:DIV[@id='episode-listing']/html:DIV/html:TABLE/html:TR/html:TD/ancestor::html:TR", dom); List<Node> nodes = XPathUtil.selectNodes("//DIV[@id='episode-listing']/DIV/TABLE/TR/TD/ancestor::TR", dom);
String seasonString = null; String seasonString = null;
@ -93,8 +94,8 @@ public class TvdotcomSearchEngine extends SearchEngine {
episodeOffset = 0; episodeOffset = 0;
for (Node node : nodes) { for (Node node : nodes) {
String episodeNumber = HtmlUtil.selectString("./html:TD[1]/text()", node); String episodeNumber = XPathUtil.selectString("./TD[1]/text()", node);
String title = HtmlUtil.selectString("./html:TD[2]/html:A/text()", node); String title = XPathUtil.selectString("./TD[2]/A/text()", node);
try { try {
// format number of episode // format number of episode
@ -105,7 +106,7 @@ public class TvdotcomSearchEngine extends SearchEngine {
episodeNumber = numberFormat.format(n - episodeOffset); episodeNumber = numberFormat.format(n - episodeOffset);
} catch (NumberFormatException e) { } catch (NumberFormatException e) {
// episode number can be "Pilot" or "Special" // episode number may be "Pilot", "Special", etc.
} }
episodes.add(new Episode(showname, seasonString, episodeNumber, title)); episodes.add(new Episode(showname, seasonString, episodeNumber, title));

View File

@ -3,11 +3,8 @@ package net.sourceforge.tuned;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.Iterator;
import java.util.List; import java.util.List;
import javax.xml.XMLConstants;
import javax.xml.namespace.NamespaceContext;
import javax.xml.xpath.XPath; import javax.xml.xpath.XPath;
import javax.xml.xpath.XPathConstants; import javax.xml.xpath.XPathConstants;
import javax.xml.xpath.XPathFactory; import javax.xml.xpath.XPathFactory;
@ -18,9 +15,9 @@ import org.w3c.dom.NodeList;
public class XPathUtil { public class XPathUtil {
public static Node selectNode(String xpath, Object node, String namespacePrefix, String namespace) { public static Node selectNode(String xpath, Object node) {
try { try {
XPath xp = createXPath(namespacePrefix, namespace); XPath xp = XPathFactory.newInstance().newXPath();
return (Node) xp.evaluate(xpath, node, XPathConstants.NODE); return (Node) xp.evaluate(xpath, node, XPathConstants.NODE);
} catch (Exception e) { } catch (Exception e) {
@ -29,9 +26,9 @@ public class XPathUtil {
} }
public static List<Node> selectNodes(String xpath, Object node, String namespacePrefix, String namespace) { public static List<Node> selectNodes(String xpath, Object node) {
try { try {
XPath xp = createXPath(namespacePrefix, namespace); XPath xp = XPathFactory.newInstance().newXPath();
NodeList nodeList = (NodeList) xp.evaluate(xpath, node, XPathConstants.NODESET); NodeList nodeList = (NodeList) xp.evaluate(xpath, node, XPathConstants.NODESET);
@ -48,69 +45,13 @@ public class XPathUtil {
} }
// Evaluates xpath against node and returns the result as a string; any exception raised
// while doing so is rethrown wrapped in a RuntimeException.
// Each line of this side-by-side view shows the old code followed by the new code:
// the old version takes a namespace prefix/URI pair, the new version uses a plain XPath and trims the result.
public static String selectString(String xpath, Object node, String namespacePrefix, String namespace) { public static String selectString(String xpath, Object node) {
try { try {
XPath xp = createXPath(namespacePrefix, namespace); XPath xp = XPathFactory.newInstance().newXPath();
return (String) xp.evaluate(xpath, node, XPathConstants.STRING); return ((String) xp.evaluate(xpath, node, XPathConstants.STRING)).trim();
} catch (Exception e) { } catch (Exception e) {
// the original exception is kept as the cause
throw new RuntimeException(e); throw new RuntimeException(e);
} }
} }
/**
 * Creates a fresh {@link XPath}. When both {@code namespacePrefix} and {@code namespace}
 * are given, the prefix is bound to that namespace for expressions evaluated with the
 * returned object; otherwise no namespace context is installed.
 */
private static XPath createXPath(String namespacePrefix, String namespace) {
    XPath xpath = XPathFactory.newInstance().newXPath();

    // nothing to bind unless both halves of the mapping are present
    if (namespacePrefix == null || namespace == null) {
        return xpath;
    }

    xpath.setNamespaceContext(new NamespaceContextProvider(namespacePrefix, namespace));
    return xpath;
}
/**
 * Namespace context that binds one caller-supplied prefix to one namespace URI, in
 * addition to the predefined {@code xml} and {@code xmlns} bindings.
 */
private static class NamespaceContextProvider implements NamespaceContext {

    private final String boundPrefix;
    private final String boundURI;


    /**
     * @param prefix the prefix to bind
     * @param URI the namespace URI the prefix stands for
     */
    NamespaceContextProvider(String prefix, String URI) {
        boundPrefix = prefix;
        boundURI = URI;
    }


    /**
     * Returns the namespace URI bound to {@code prefix}.
     *
     * @return the bound URI, the predefined URI for {@code xml} / {@code xmlns}, or
     *         {@link XMLConstants#NULL_NS_URI} for any other prefix
     * @throws IllegalArgumentException if {@code prefix} is {@code null}
     */
    public String getNamespaceURI(String prefix) {
        if (prefix == null) {
            // required by the NamespaceContext contract (previously a NullPointerException)
            throw new IllegalArgumentException("prefix must not be null");
        }

        if (prefix.equals(boundPrefix)) {
            return boundURI;
        } else if (prefix.equals(XMLConstants.XML_NS_PREFIX)) {
            return XMLConstants.XML_NS_URI;
        } else if (prefix.equals(XMLConstants.XMLNS_ATTRIBUTE)) {
            return XMLConstants.XMLNS_ATTRIBUTE_NS_URI;
        } else {
            return XMLConstants.NULL_NS_URI;
        }
    }


    /**
     * Returns the prefix bound to {@code namespaceURI}.
     *
     * @return the bound prefix, the predefined {@code xml} / {@code xmlns} prefix, or
     *         {@code null} if the URI is not bound
     * @throws IllegalArgumentException if {@code namespaceURI} is {@code null}
     */
    public String getPrefix(String namespaceURI) {
        if (namespaceURI == null) {
            // required by the NamespaceContext contract (previously a NullPointerException)
            throw new IllegalArgumentException("namespaceURI must not be null");
        }

        if (namespaceURI.equals(boundURI)) {
            return boundPrefix;
        } else if (namespaceURI.equals(XMLConstants.XML_NS_URI)) {
            return XMLConstants.XML_NS_PREFIX;
        } else if (namespaceURI.equals(XMLConstants.XMLNS_ATTRIBUTE_NS_URI)) {
            return XMLConstants.XMLNS_ATTRIBUTE;
        } else {
            return null;
        }
    }


    /**
     * Returns all prefixes bound to {@code namespaceURI}. This context never binds more
     * than one prefix per URI, so the iterator yields at most one element.
     *
     * @return a read-only iterator, empty if the URI is not bound (never {@code null})
     * @throws IllegalArgumentException if {@code namespaceURI} is {@code null}
     */
    public Iterator<String> getPrefixes(String namespaceURI) {
        String prefix = getPrefix(namespaceURI);

        if (prefix == null) {
            return java.util.Collections.<String> emptyList().iterator();
        }

        return java.util.Collections.singletonList(prefix).iterator();
    }
}
} }