64 lines
1.7 KiB
Java
64 lines
1.7 KiB
Java
|
|
package net.sourceforge.filebot.web;
|
|
|
|
|
|
import java.io.IOException;
|
|
import java.io.InputStream;
|
|
import java.io.InputStreamReader;
|
|
import java.io.Reader;
|
|
import java.net.URL;
|
|
import java.net.URLConnection;
|
|
import java.nio.charset.Charset;
|
|
import java.util.regex.Matcher;
|
|
import java.util.regex.Pattern;
|
|
import java.util.zip.GZIPInputStream;
|
|
|
|
import org.cyberneko.html.parsers.DOMParser;
|
|
import org.w3c.dom.Document;
|
|
import org.xml.sax.InputSource;
|
|
import org.xml.sax.SAXException;
|
|
|
|
|
|
public class HtmlUtil {
|
|
|
|
private static Charset getCharset(String contentType) {
|
|
if (contentType != null) {
|
|
// e.g. Content-Type: text/html; charset=iso-8859-1
|
|
Pattern pattern = Pattern.compile(".*;\\s*charset=(\\S+).*", Pattern.CASE_INSENSITIVE);
|
|
Matcher matcher = pattern.matcher(contentType);
|
|
|
|
if (matcher.matches()) {
|
|
String charsetName = matcher.group(1);
|
|
return Charset.forName(charsetName);
|
|
}
|
|
}
|
|
|
|
// use UTF-8 if charset cannot be determined
|
|
return Charset.forName("UTF-8");
|
|
}
|
|
|
|
|
|
public static Document getHtmlDocument(URL url) throws IOException, SAXException {
|
|
URLConnection connection = url.openConnection();
|
|
|
|
Charset charset = getCharset(connection.getContentType());
|
|
String encoding = connection.getContentEncoding();
|
|
InputStream inputStream = connection.getInputStream();
|
|
|
|
if (encoding != null && encoding.equalsIgnoreCase("gzip"))
|
|
inputStream = new GZIPInputStream(inputStream);
|
|
|
|
return getHtmlDocument(new InputStreamReader(inputStream, charset));
|
|
}
|
|
|
|
|
|
public static Document getHtmlDocument(Reader reader) throws SAXException, IOException {
|
|
DOMParser parser = new DOMParser();
|
|
parser.setFeature("http://xml.org/sax/features/namespaces", false);
|
|
parser.parse(new InputSource(reader));
|
|
|
|
return parser.getDocument();
|
|
}
|
|
|
|
}
|