filebot/source/net/sourceforge/filebot/web/HtmlUtil.java

64 lines
1.7 KiB
Java

package net.sourceforge.filebot.web;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.net.URL;
import java.net.URLConnection;
import java.nio.charset.Charset;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.zip.GZIPInputStream;
import org.cyberneko.html.parsers.DOMParser;
import org.w3c.dom.Document;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
/**
 * Static helpers for fetching HTML and parsing it into a DOM {@link Document}
 * using the NekoHTML {@link DOMParser}.
 */
public class HtmlUtil {

    /**
     * Locates the {@code charset} parameter in a Content-Type header value.
     * Group 1 is the bare charset name: optional surrounding quotes are skipped and
     * the name ends at whitespace, a quote or the {@code ;} that starts the next parameter.
     */
    private static final Pattern CHARSET_PATTERN = Pattern.compile(";\\s*charset\\s*=\\s*[\"']?([^\\s;\"']+)", Pattern.CASE_INSENSITIVE);

    /** Charset used whenever the response does not declare a usable one. */
    private static final Charset DEFAULT_CHARSET = Charset.forName("UTF-8");

    /**
     * Determines the charset declared in a Content-Type header value.
     *
     * @param contentType the header value, e.g. {@code text/html; charset=iso-8859-1}; may be {@code null}
     * @return the declared charset, or UTF-8 if the value is {@code null}, declares no charset,
     *         or declares a charset name that is malformed or not supported by this JVM
     */
    private static Charset getCharset(String contentType) {
        if (contentType != null) {
            // e.g. Content-Type: text/html; charset=iso-8859-1
            Matcher matcher = CHARSET_PATTERN.matcher(contentType);

            if (matcher.find()) {
                try {
                    return Charset.forName(matcher.group(1));
                } catch (IllegalArgumentException e) {
                    // IllegalCharsetNameException and UnsupportedCharsetException are both
                    // IllegalArgumentExceptions; a bogus server header should not abort the
                    // download, so fall through to the default instead
                }
            }
        }

        // use UTF-8 if charset cannot be determined
        return DEFAULT_CHARSET;
    }

    /**
     * Opens a connection to the given URL, decodes the response body with the charset
     * taken from the Content-Type header (UTF-8 if none can be determined), transparently
     * unpacks it if the Content-Encoding header is {@code gzip}, and parses it as HTML.
     * The response stream is closed before this method returns.
     *
     * @param url location of the HTML page
     * @return the parsed document
     * @throws IOException if connecting, reading or closing the stream fails
     * @throws SAXException if the parser reports an error
     */
    public static Document getHtmlDocument(URL url) throws IOException, SAXException {
        URLConnection connection = url.openConnection();

        Charset charset = getCharset(connection.getContentType());
        String encoding = connection.getContentEncoding();

        InputStream inputStream = connection.getInputStream();

        try {
            if (encoding != null && encoding.equalsIgnoreCase("gzip")) {
                // if the gzip header is invalid the constructor throws and the raw
                // stream is still the one that gets closed in the finally block
                inputStream = new GZIPInputStream(inputStream);
            }

            return getHtmlDocument(new InputStreamReader(inputStream, charset));
        } finally {
            // closing the outermost stream also closes the connection's stream it wraps
            inputStream.close();
        }
    }

    /**
     * Parses HTML from the given reader into a DOM document. The SAX namespaces
     * feature is switched off on the parser before parsing. This method does not
     * explicitly close the reader.
     *
     * @param reader source of the HTML text
     * @return the parsed document
     * @throws SAXException if the parser reports an error or rejects the feature setting
     * @throws IOException if reading from the reader fails
     */
    public static Document getHtmlDocument(Reader reader) throws SAXException, IOException {
        DOMParser parser = new DOMParser();
        // NOTE(review): presumably disabled so elements can be addressed by plain
        // tag name without namespace handling -- confirm against callers
        parser.setFeature("http://xml.org/sax/features/namespaces", false);

        parser.parse(new InputSource(reader));

        return parser.getDocument();
    }
}