disabled namespaces in nekohtml parser

This commit is contained in:
Reinhard Pointner 2008-02-07 22:05:59 +00:00
parent 319a528542
commit 3c0296d11e
6 changed files with 38 additions and 122 deletions

View File

@@ -28,11 +28,11 @@ public class Torrent {
public Torrent(File torrent) throws IOException {
FileInputStream in = new FileInputStream(torrent);
BufferedInputStream in = new BufferedInputStream(new FileInputStream(torrent));
Map<?, ?> torrentMap = null;
try {
torrentMap = BDecoder.decode(new BufferedInputStream(in));
torrentMap = BDecoder.decode(in);
} finally {
in.close();
}

View File

@@ -16,6 +16,7 @@ import java.util.Map;
import java.util.TreeMap;
import net.sourceforge.filebot.resources.ResourceManager;
import net.sourceforge.tuned.XPathUtil;
import org.w3c.dom.Document;
import org.w3c.dom.Node;
@@ -42,19 +43,19 @@ public class AnidbSearchEngine extends SearchEngine {
Document dom = HtmlUtil.getHtmlDocument(getSearchUrl(searchterm));
List<Node> nodes = HtmlUtil.selectNodes("//TABLE[@class='anime_list']//TR//TD//ancestor::TR", dom);
List<Node> nodes = XPathUtil.selectNodes("//TABLE[@class='anime_list']//TR//TD//ancestor::TR", dom);
ArrayList<String> shows = new ArrayList<String>(nodes.size());
if (!nodes.isEmpty())
for (Node node : nodes) {
String type = HtmlUtil.selectString("./TD[2]/text()", node);
String type = XPathUtil.selectString("./TD[2]/text()", node);
// we only want shows
if (type.equalsIgnoreCase("tv series")) {
Node titleNode = HtmlUtil.selectNode("./TD[1]/A", node);
Node titleNode = XPathUtil.selectNode("./TD[1]/A", node);
String title = HtmlUtil.selectString("text()", titleNode);
String href = HtmlUtil.selectString("@href", titleNode);
String title = XPathUtil.selectString("text()", titleNode);
String href = XPathUtil.selectString("@href", titleNode);
String file = "/perl-bin/" + href;
@@ -70,11 +71,11 @@ public class AnidbSearchEngine extends SearchEngine {
}
else {
// we might have been redirected to the episode list page directly
List<Node> results = HtmlUtil.selectNodes("//TABLE[@class='eplist']", dom);
List<Node> results = XPathUtil.selectNodes("//TABLE[@class='eplist']", dom);
if (!results.isEmpty()) {
// get show's name from the document
String header = HtmlUtil.selectString("//DIV[@id='layout-content']//H1[1]/text()", dom);
String header = XPathUtil.selectString("//DIV[@id='layout-content']//H1[1]/text()", dom);
String title = header.replaceFirst("Anime:\\s*", "");
cache.put(title, getSearchUrl(searchterm));
@@ -92,7 +93,7 @@ public class AnidbSearchEngine extends SearchEngine {
Document dom = HtmlUtil.getHtmlDocument(getEpisodeListUrl(showname, season));
List<Node> nodes = HtmlUtil.selectNodes("//TABLE[@id='eplist']//TR/TD/SPAN/ancestor::TR", dom);
List<Node> nodes = XPathUtil.selectNodes("//TABLE[@id='eplist']//TR/TD/SPAN/ancestor::TR", dom);
LinkedList<Episode> list = new LinkedList<Episode>();
@@ -101,8 +102,8 @@ public class AnidbSearchEngine extends SearchEngine {
f.setGroupingUsed(false);
for (Node node : nodes) {
String number = HtmlUtil.selectString("./TD[1]/A/text()", node);
String title = HtmlUtil.selectString("./TD[2]/SPAN/text()", node);
String number = XPathUtil.selectString("./TD[1]/A/text()", node);
String title = XPathUtil.selectString("./TD[2]/SPAN/text()", node);
if (title.startsWith("recap"))
title = title.replaceFirst("recap", "");

View File

@@ -9,21 +9,17 @@ import java.io.Reader;
import java.net.URL;
import java.net.URLConnection;
import java.nio.charset.Charset;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.zip.GZIPInputStream;
import net.sourceforge.tuned.XPathUtil;
import org.cyberneko.html.parsers.DOMParser;
import org.w3c.dom.Document;
import org.w3c.dom.Node;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
class HtmlUtil {
public class HtmlUtil {
private static Charset getCharset(String contentType) {
if (contentType != null) {
@@ -58,34 +54,10 @@ class HtmlUtil {
public static Document getHtmlDocument(Reader reader) throws SAXException, IOException {
DOMParser parser = new DOMParser();
parser.setFeature("http://xml.org/sax/features/namespaces", false);
parser.parse(new InputSource(reader));
return parser.getDocument();
}
public static String selectString(String xpath, Node node) {
return XPathUtil.selectString(xpath, node, "html", getNameSpace(node)).trim();
}
public static List<Node> selectNodes(String xpath, Node node) {
return XPathUtil.selectNodes(xpath, node, "html", getNameSpace(node));
}
public static Node selectNode(String xpath, Node node) {
return XPathUtil.selectNode(xpath, node, "html", getNameSpace(node));
}
private static String getNameSpace(Node node) {
if (node instanceof Document) {
// select root element
return XPathUtil.selectNode("/*", node, null, null).getNamespaceURI();
}
return node.getNamespaceURI();
}
}

View File

@@ -17,6 +17,7 @@ import java.util.regex.Matcher;
import java.util.regex.Pattern;
import net.sourceforge.filebot.resources.ResourceManager;
import net.sourceforge.tuned.XPathUtil;
import org.w3c.dom.Document;
import org.w3c.dom.Node;
@@ -43,13 +44,13 @@ public class TVRageSearchEngine extends SearchEngine {
Document dom = HtmlUtil.getHtmlDocument(getSearchUrl(searchterm));
List<Node> nodes = HtmlUtil.selectNodes("//DIV[@id='search_begin']//TABLE[1]//TR/TD/A[1]", dom);
List<Node> nodes = XPathUtil.selectNodes("//DIV[@id='search_begin']//TABLE[1]/*/TR/TD/A[1]", dom);
ArrayList<String> shows = new ArrayList<String>(nodes.size());
for (Node node : nodes) {
String href = HtmlUtil.selectString("@href", node);
String title = HtmlUtil.selectString("text()", node);
String href = XPathUtil.selectString("@href", node);
String title = XPathUtil.selectString("text()", node);
try {
URL url = new URL(href);
@@ -69,18 +70,18 @@ public class TVRageSearchEngine extends SearchEngine {
Document dom = HtmlUtil.getHtmlDocument(getEpisodeListUrl(showname, season));
List<Node> nodes = HtmlUtil.selectNodes("//TABLE[@class='b']//TR[@id='brow']", dom);
List<Node> nodes = XPathUtil.selectNodes("//TABLE[@class='b']//TR[@id='brow']", dom);
ArrayList<Episode> episodes = new ArrayList<Episode>();
for (Node node : nodes) {
String seasonAndEpisodeNumber = HtmlUtil.selectString("./TD[2]/A/text()", node);
String title = HtmlUtil.selectString("./TD[4]/A/text()", node);
String seasonAndEpisodeNumber = XPathUtil.selectString("./TD[2]/A/text()", node);
String title = XPathUtil.selectString("./TD[4]/A/text()", node);
List<Node> precedings = HtmlUtil.selectNodes("../preceding-sibling::TABLE", node);
List<Node> precedings = XPathUtil.selectNodes("../preceding-sibling::TABLE", node);
Node previousTable = precedings.get(precedings.size() - 1);
String seasonHeader = HtmlUtil.selectString("./TR/TD/FONT/text()", previousTable);
String seasonHeader = XPathUtil.selectString("./TR/TD/FONT/text()", previousTable);
Matcher seasonMatcher = Pattern.compile("Season (\\d+)").matcher(seasonHeader);

View File

@@ -16,6 +16,7 @@ import java.util.Map;
import java.util.TreeMap;
import net.sourceforge.filebot.resources.ResourceManager;
import net.sourceforge.tuned.XPathUtil;
import org.w3c.dom.Document;
import org.w3c.dom.Node;
@@ -42,7 +43,7 @@ public class TvdotcomSearchEngine extends SearchEngine {
Document dom = HtmlUtil.getHtmlDocument(getSearchUrl(searchterm));
List<Node> nodes = HtmlUtil.selectNodes("//html:TABLE[@id='search-results']//html:SPAN/html:A", dom);
List<Node> nodes = XPathUtil.selectNodes("//TABLE[@id='search-results']//SPAN/A", dom);
ArrayList<String> shows = new ArrayList<String>(nodes.size());
@@ -52,7 +53,7 @@ public class TvdotcomSearchEngine extends SearchEngine {
// we only want search results that are shows
if (category.toLowerCase().startsWith("show")) {
String title = node.getTextContent();
String href = HtmlUtil.selectString("@href", node);
String href = XPathUtil.selectString("@href", node);
try {
URL url = new URL(href);
@@ -74,7 +75,7 @@ public class TvdotcomSearchEngine extends SearchEngine {
Document dom = HtmlUtil.getHtmlDocument(getEpisodeListUrl(showname, season));
List<Node> nodes = HtmlUtil.selectNodes("//html:DIV[@id='episode-listing']/html:DIV/html:TABLE/html:TR/html:TD/ancestor::html:TR", dom);
List<Node> nodes = XPathUtil.selectNodes("//DIV[@id='episode-listing']/DIV/TABLE/TR/TD/ancestor::TR", dom);
String seasonString = null;
@@ -93,8 +94,8 @@ public class TvdotcomSearchEngine extends SearchEngine {
episodeOffset = 0;
for (Node node : nodes) {
String episodeNumber = HtmlUtil.selectString("./html:TD[1]/text()", node);
String title = HtmlUtil.selectString("./html:TD[2]/html:A/text()", node);
String episodeNumber = XPathUtil.selectString("./TD[1]/text()", node);
String title = XPathUtil.selectString("./TD[2]/A/text()", node);
try {
// format number of episode
@@ -105,7 +106,7 @@ public class TvdotcomSearchEngine extends SearchEngine {
episodeNumber = numberFormat.format(n - episodeOffset);
} catch (NumberFormatException e) {
// episode number can be "Pilot" or "Special"
// episode number may be "Pilot", "Special", etc.
}
episodes.add(new Episode(showname, seasonString, episodeNumber, title));

View File

@@ -3,11 +3,8 @@ package net.sourceforge.tuned;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import javax.xml.XMLConstants;
import javax.xml.namespace.NamespaceContext;
import javax.xml.xpath.XPath;
import javax.xml.xpath.XPathConstants;
import javax.xml.xpath.XPathFactory;
@@ -18,9 +15,9 @@ import org.w3c.dom.NodeList;
public class XPathUtil {
public static Node selectNode(String xpath, Object node, String namespacePrefix, String namespace) {
public static Node selectNode(String xpath, Object node) {
try {
XPath xp = createXPath(namespacePrefix, namespace);
XPath xp = XPathFactory.newInstance().newXPath();
return (Node) xp.evaluate(xpath, node, XPathConstants.NODE);
} catch (Exception e) {
@@ -29,9 +26,9 @@ public class XPathUtil {
}
public static List<Node> selectNodes(String xpath, Object node, String namespacePrefix, String namespace) {
public static List<Node> selectNodes(String xpath, Object node) {
try {
XPath xp = createXPath(namespacePrefix, namespace);
XPath xp = XPathFactory.newInstance().newXPath();
NodeList nodeList = (NodeList) xp.evaluate(xpath, node, XPathConstants.NODESET);
@@ -48,69 +45,13 @@ public class XPathUtil {
}
public static String selectString(String xpath, Object node, String namespacePrefix, String namespace) {
public static String selectString(String xpath, Object node) {
try {
XPath xp = createXPath(namespacePrefix, namespace);
return (String) xp.evaluate(xpath, node, XPathConstants.STRING);
XPath xp = XPathFactory.newInstance().newXPath();
return ((String) xp.evaluate(xpath, node, XPathConstants.STRING)).trim();
} catch (Exception e) {
throw new RuntimeException(e);
}
}
private static XPath createXPath(String namespacePrefix, String namespace) {
XPath xp = XPathFactory.newInstance().newXPath();
if (namespacePrefix != null && namespace != null) {
xp.setNamespaceContext(new NamespaceContextProvider(namespacePrefix, namespace));
}
return xp;
}
private static class NamespaceContextProvider implements NamespaceContext {
String boundPrefix;
String boundURI;
NamespaceContextProvider(String prefix, String URI) {
boundPrefix = prefix;
boundURI = URI;
}
public String getNamespaceURI(String prefix) {
if (prefix.equals(boundPrefix)) {
return boundURI;
} else if (prefix.equals(XMLConstants.XML_NS_PREFIX)) {
return XMLConstants.XML_NS_URI;
} else if (prefix.equals(XMLConstants.XMLNS_ATTRIBUTE)) {
return XMLConstants.XMLNS_ATTRIBUTE_NS_URI;
} else {
return XMLConstants.NULL_NS_URI;
}
}
public String getPrefix(String namespaceURI) {
if (namespaceURI.equals(boundURI)) {
return boundPrefix;
} else if (namespaceURI.equals(XMLConstants.XML_NS_URI)) {
return XMLConstants.XML_NS_PREFIX;
} else if (namespaceURI.equals(XMLConstants.XMLNS_ATTRIBUTE_NS_URI)) {
return XMLConstants.XMLNS_ATTRIBUTE;
} else {
return null;
}
}
@SuppressWarnings("unchecked")
public Iterator getPrefixes(String namespaceURI) {
return null;
}
}
}