disabled namespaces in nekohtml parser

2008-02-07 22:05:59 +00:00 · 2008-02-07 22:05:59 +00:00 · 3c0296d11e
parent 319a528542
commit 3c0296d11e
6 changed files with 38 additions and 122 deletions
--- a/source/net/sourceforge/filebot/torrent/Torrent.java
+++ b/source/net/sourceforge/filebot/torrent/Torrent.java
@@ -28,11 +28,11 @@ public class Torrent {
 	public Torrent(File torrent) throws IOException {
-		FileInputStream in = new FileInputStream(torrent);
+		BufferedInputStream in = new BufferedInputStream(new FileInputStream(torrent));
 		Map<?, ?> torrentMap = null;
 		try {
-			torrentMap = BDecoder.decode(new BufferedInputStream(in));
+			torrentMap = BDecoder.decode(in);
 		} finally {
 			in.close();
 		}
--- a/source/net/sourceforge/filebot/web/AnidbSearchEngine.java
+++ b/source/net/sourceforge/filebot/web/AnidbSearchEngine.java
@@ -16,6 +16,7 @@ import java.util.Map;
 import java.util.TreeMap;
 import net.sourceforge.filebot.resources.ResourceManager;
 import net.sourceforge.tuned.XPathUtil;
 import org.w3c.dom.Document;
 import org.w3c.dom.Node;
@@ -42,19 +43,19 @@ public class AnidbSearchEngine extends SearchEngine {
 		Document dom = HtmlUtil.getHtmlDocument(getSearchUrl(searchterm));
-		List<Node> nodes = HtmlUtil.selectNodes("//TABLE[@class='anime_list']//TR//TD//ancestor::TR", dom);
+		List<Node> nodes = XPathUtil.selectNodes("//TABLE[@class='anime_list']//TR//TD//ancestor::TR", dom);
 		ArrayList<String> shows = new ArrayList<String>(nodes.size());
 		if (!nodes.isEmpty())
 			for (Node node : nodes) {
-				String type = HtmlUtil.selectString("./TD[2]/text()", node);
+				String type = XPathUtil.selectString("./TD[2]/text()", node);
 				// we only want shows
 				if (type.equalsIgnoreCase("tv series")) {
-					Node titleNode = HtmlUtil.selectNode("./TD[1]/A", node);
+					Node titleNode = XPathUtil.selectNode("./TD[1]/A", node);
-					String title = HtmlUtil.selectString("text()", titleNode);
+					String title = XPathUtil.selectString("text()", titleNode);
-					String href = HtmlUtil.selectString("@href", titleNode);
+					String href = XPathUtil.selectString("@href", titleNode);
 					String file = "/perl-bin/" + href;
@@ -70,11 +71,11 @@ public class AnidbSearchEngine extends SearchEngine {
 			}
 		else {
 			// we might have been redirected to the episode list page directly
-			List<Node> results = HtmlUtil.selectNodes("//TABLE[@class='eplist']", dom);
+			List<Node> results = XPathUtil.selectNodes("//TABLE[@class='eplist']", dom);
 			if (!results.isEmpty()) {
 				// get show's name from the document
-				String header = HtmlUtil.selectString("//DIV[@id='layout-content']//H1[1]/text()", dom);
+				String header = XPathUtil.selectString("//DIV[@id='layout-content']//H1[1]/text()", dom);
 				String title = header.replaceFirst("Anime:\\s*", "");
 				cache.put(title, getSearchUrl(searchterm));
@@ -92,7 +93,7 @@ public class AnidbSearchEngine extends SearchEngine {
 		Document dom = HtmlUtil.getHtmlDocument(getEpisodeListUrl(showname, season));
-		List<Node> nodes = HtmlUtil.selectNodes("//TABLE[@id='eplist']//TR/TD/SPAN/ancestor::TR", dom);
+		List<Node> nodes = XPathUtil.selectNodes("//TABLE[@id='eplist']//TR/TD/SPAN/ancestor::TR", dom);
 		LinkedList<Episode> list = new LinkedList<Episode>();
@@ -101,8 +102,8 @@ public class AnidbSearchEngine extends SearchEngine {
 		f.setGroupingUsed(false);
 		for (Node node : nodes) {
-			String number = HtmlUtil.selectString("./TD[1]/A/text()", node);
+			String number = XPathUtil.selectString("./TD[1]/A/text()", node);
-			String title = HtmlUtil.selectString("./TD[2]/SPAN/text()", node);
+			String title = XPathUtil.selectString("./TD[2]/SPAN/text()", node);
 			if (title.startsWith("recap"))
 				title = title.replaceFirst("recap", "");
--- a/source/net/sourceforge/filebot/web/HtmlUtil.java
+++ b/source/net/sourceforge/filebot/web/HtmlUtil.java
@@ -9,21 +9,17 @@ import java.io.Reader;
 import java.net.URL;
 import java.net.URLConnection;
 import java.nio.charset.Charset;
 import java.util.List;
 import java.util.regex.Matcher;
 import java.util.regex.Pattern;
 import java.util.zip.GZIPInputStream;
 import net.sourceforge.tuned.XPathUtil;
 import org.cyberneko.html.parsers.DOMParser;
 import org.w3c.dom.Document;
 import org.w3c.dom.Node;
 import org.xml.sax.InputSource;
 import org.xml.sax.SAXException;
-class HtmlUtil {
+public class HtmlUtil {
 	private static Charset getCharset(String contentType) {
 		if (contentType != null) {
@@ -58,34 +54,10 @@ class HtmlUtil {
 	public static Document getHtmlDocument(Reader reader) throws SAXException, IOException {
 		DOMParser parser = new DOMParser();
 		parser.setFeature("http://xml.org/sax/features/namespaces", false);
 		parser.parse(new InputSource(reader));
 		return parser.getDocument();
 	}
 	public static String selectString(String xpath, Node node) {
 		return XPathUtil.selectString(xpath, node, "html", getNameSpace(node)).trim();
 	}
 	public static List<Node> selectNodes(String xpath, Node node) {
 		return XPathUtil.selectNodes(xpath, node, "html", getNameSpace(node));
 	}
 	public static Node selectNode(String xpath, Node node) {
 		return XPathUtil.selectNode(xpath, node, "html", getNameSpace(node));
 	}
 	private static String getNameSpace(Node node) {
 		if (node instanceof Document) {
 			// select root element
 			return XPathUtil.selectNode("/*", node, null, null).getNamespaceURI();
 		}
 		return node.getNamespaceURI();
 	}
 }
--- a/source/net/sourceforge/filebot/web/TVRageSearchEngine.java
+++ b/source/net/sourceforge/filebot/web/TVRageSearchEngine.java
@@ -17,6 +17,7 @@ import java.util.regex.Matcher;
 import java.util.regex.Pattern;
 import net.sourceforge.filebot.resources.ResourceManager;
 import net.sourceforge.tuned.XPathUtil;
 import org.w3c.dom.Document;
 import org.w3c.dom.Node;
@@ -43,13 +44,13 @@ public class TVRageSearchEngine extends SearchEngine {
 		Document dom = HtmlUtil.getHtmlDocument(getSearchUrl(searchterm));
-		List<Node> nodes = HtmlUtil.selectNodes("//DIV[@id='search_begin']//TABLE[1]//TR/TD/A[1]", dom);
+		List<Node> nodes = XPathUtil.selectNodes("//DIV[@id='search_begin']//TABLE[1]/*/TR/TD/A[1]", dom);
 		ArrayList<String> shows = new ArrayList<String>(nodes.size());
 		for (Node node : nodes) {
-			String href = HtmlUtil.selectString("@href", node);
+			String href = XPathUtil.selectString("@href", node);
-			String title = HtmlUtil.selectString("text()", node);
+			String title = XPathUtil.selectString("text()", node);
 			try {
 				URL url = new URL(href);
@@ -69,18 +70,18 @@ public class TVRageSearchEngine extends SearchEngine {
 		Document dom = HtmlUtil.getHtmlDocument(getEpisodeListUrl(showname, season));
-		List<Node> nodes = HtmlUtil.selectNodes("//TABLE[@class='b']//TR[@id='brow']", dom);
+		List<Node> nodes = XPathUtil.selectNodes("//TABLE[@class='b']//TR[@id='brow']", dom);
 		ArrayList<Episode> episodes = new ArrayList<Episode>();
 		for (Node node : nodes) {
-			String seasonAndEpisodeNumber = HtmlUtil.selectString("./TD[2]/A/text()", node);
+			String seasonAndEpisodeNumber = XPathUtil.selectString("./TD[2]/A/text()", node);
-			String title = HtmlUtil.selectString("./TD[4]/A/text()", node);
+			String title = XPathUtil.selectString("./TD[4]/A/text()", node);
-			List<Node> precedings = HtmlUtil.selectNodes("../preceding-sibling::TABLE", node);
+			List<Node> precedings = XPathUtil.selectNodes("../preceding-sibling::TABLE", node);
 			Node previousTable = precedings.get(precedings.size() - 1);
-			String seasonHeader = HtmlUtil.selectString("./TR/TD/FONT/text()", previousTable);
+			String seasonHeader = XPathUtil.selectString("./TR/TD/FONT/text()", previousTable);
 			Matcher seasonMatcher = Pattern.compile("Season (\\d+)").matcher(seasonHeader);
--- a/source/net/sourceforge/filebot/web/TvdotcomSearchEngine.java
+++ b/source/net/sourceforge/filebot/web/TvdotcomSearchEngine.java
@@ -16,6 +16,7 @@ import java.util.Map;
 import java.util.TreeMap;
 import net.sourceforge.filebot.resources.ResourceManager;
 import net.sourceforge.tuned.XPathUtil;
 import org.w3c.dom.Document;
 import org.w3c.dom.Node;
@@ -42,7 +43,7 @@ public class TvdotcomSearchEngine extends SearchEngine {
 		Document dom = HtmlUtil.getHtmlDocument(getSearchUrl(searchterm));
-		List<Node> nodes = HtmlUtil.selectNodes("//html:TABLE[@id='search-results']//html:SPAN/html:A", dom);
+		List<Node> nodes = XPathUtil.selectNodes("//TABLE[@id='search-results']//SPAN/A", dom);
 		ArrayList<String> shows = new ArrayList<String>(nodes.size());
@@ -52,7 +53,7 @@ public class TvdotcomSearchEngine extends SearchEngine {
 			// we only want search results that are shows
 			if (category.toLowerCase().startsWith("show")) {
 				String title = node.getTextContent();
-				String href = HtmlUtil.selectString("@href", node);
+				String href = XPathUtil.selectString("@href", node);
 				try {
 					URL url = new URL(href);
@@ -74,7 +75,7 @@ public class TvdotcomSearchEngine extends SearchEngine {
 		Document dom = HtmlUtil.getHtmlDocument(getEpisodeListUrl(showname, season));
-		List<Node> nodes = HtmlUtil.selectNodes("//html:DIV[@id='episode-listing']/html:DIV/html:TABLE/html:TR/html:TD/ancestor::html:TR", dom);
+		List<Node> nodes = XPathUtil.selectNodes("//DIV[@id='episode-listing']/DIV/TABLE/TR/TD/ancestor::TR", dom);
 		String seasonString = null;
@@ -93,8 +94,8 @@ public class TvdotcomSearchEngine extends SearchEngine {
 			episodeOffset = 0;
 		for (Node node : nodes) {
-			String episodeNumber = HtmlUtil.selectString("./html:TD[1]/text()", node);
+			String episodeNumber = XPathUtil.selectString("./TD[1]/text()", node);
-			String title = HtmlUtil.selectString("./html:TD[2]/html:A/text()", node);
+			String title = XPathUtil.selectString("./TD[2]/A/text()", node);
 			try {
 				// format number of episode
@@ -105,7 +106,7 @@ public class TvdotcomSearchEngine extends SearchEngine {
 				episodeNumber = numberFormat.format(n - episodeOffset);
 			} catch (NumberFormatException e) {
-				// episode number can be "Pilot" or "Special"
+				// episode number may be "Pilot", "Special", etc.
 			}
 			episodes.add(new Episode(showname, seasonString, episodeNumber, title));
--- a/source/net/sourceforge/tuned/XPathUtil.java
+++ b/source/net/sourceforge/tuned/XPathUtil.java
@@ -3,11 +3,8 @@ package net.sourceforge.tuned;
 import java.util.ArrayList;
 import java.util.Iterator;
 import java.util.List;
 import javax.xml.XMLConstants;
 import javax.xml.namespace.NamespaceContext;
 import javax.xml.xpath.XPath;
 import javax.xml.xpath.XPathConstants;
 import javax.xml.xpath.XPathFactory;
@@ -18,9 +15,9 @@ import org.w3c.dom.NodeList;
 public class XPathUtil {
-	public static Node selectNode(String xpath, Object node, String namespacePrefix, String namespace) {
+	public static Node selectNode(String xpath, Object node) {
 		try {
-			XPath xp = createXPath(namespacePrefix, namespace);
+			XPath xp = XPathFactory.newInstance().newXPath();
 			return (Node) xp.evaluate(xpath, node, XPathConstants.NODE);
 		} catch (Exception e) {
@@ -29,9 +26,9 @@ public class XPathUtil {
 	}
-	public static List<Node> selectNodes(String xpath, Object node, String namespacePrefix, String namespace) {
+	public static List<Node> selectNodes(String xpath, Object node) {
 		try {
-			XPath xp = createXPath(namespacePrefix, namespace);
+			XPath xp = XPathFactory.newInstance().newXPath();
 			NodeList nodeList = (NodeList) xp.evaluate(xpath, node, XPathConstants.NODESET);
@@ -48,69 +45,13 @@ public class XPathUtil {
 	}
-	public static String selectString(String xpath, Object node, String namespacePrefix, String namespace) {
+	public static String selectString(String xpath, Object node) {
 		try {
-			XPath xp = createXPath(namespacePrefix, namespace);
+			XPath xp = XPathFactory.newInstance().newXPath();
-			return (String) xp.evaluate(xpath, node, XPathConstants.STRING);
+			return ((String) xp.evaluate(xpath, node, XPathConstants.STRING)).trim();
 		} catch (Exception e) {
 			throw new RuntimeException(e);
 		}
 	}
 	private static XPath createXPath(String namespacePrefix, String namespace) {
 		XPath xp = XPathFactory.newInstance().newXPath();
 		if (namespacePrefix != null && namespace != null) {
 			xp.setNamespaceContext(new NamespaceContextProvider(namespacePrefix, namespace));
 		}
 		return xp;
 	}
 	private static class NamespaceContextProvider implements NamespaceContext {
 		String boundPrefix;
 		String boundURI;
 		NamespaceContextProvider(String prefix, String URI) {
 			boundPrefix = prefix;
 			boundURI = URI;
 		}
 		public String getNamespaceURI(String prefix) {
 			if (prefix.equals(boundPrefix)) {
 				return boundURI;
 			} else if (prefix.equals(XMLConstants.XML_NS_PREFIX)) {
 				return XMLConstants.XML_NS_URI;
 			} else if (prefix.equals(XMLConstants.XMLNS_ATTRIBUTE)) {
 				return XMLConstants.XMLNS_ATTRIBUTE_NS_URI;
 			} else {
 				return XMLConstants.NULL_NS_URI;
 			}
 		}
 		public String getPrefix(String namespaceURI) {
 			if (namespaceURI.equals(boundURI)) {
 				return boundPrefix;
 			} else if (namespaceURI.equals(XMLConstants.XML_NS_URI)) {
 				return XMLConstants.XML_NS_PREFIX;
 			} else if (namespaceURI.equals(XMLConstants.XMLNS_ATTRIBUTE_NS_URI)) {
 				return XMLConstants.XMLNS_ATTRIBUTE;
 			} else {
 				return null;
 			}
 		}
 		@SuppressWarnings("unchecked")
 		public Iterator getPrefixes(String namespaceURI) {
 			return null;
 		}
 	}
 }