filebot/source/net/sourceforge/filebot/web/WebRequest.java

113 lines
3.3 KiB
Java

package net.sourceforge.filebot.web;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.net.URL;
import java.net.URLConnection;
import java.nio.charset.Charset;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.zip.GZIPInputStream;
import java.util.zip.Inflater;
import java.util.zip.InflaterInputStream;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import org.cyberneko.html.parsers.DOMParser;
import org.w3c.dom.Document;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
/**
 * Static helpers for fetching web resources and parsing them into DOM documents.
 * <p>
 * HTML is parsed with the NekoHTML {@link DOMParser}; XML is parsed with the default JAXP
 * {@link DocumentBuilderFactory}. This class cannot be instantiated.
 */
public final class WebRequest {

    /**
     * Matches the <code>charset</code> parameter of a Content-Type header value, e.g.
     * <code>text/html; charset=iso-8859-1</code> or <code>text/html; charset="utf-8"; q=1</code>.
     * Group 1 is the bare charset name, without surrounding quotes or trailing parameters.
     */
    private static final Pattern CHARSET_PARAMETER = Pattern.compile("(?:^|;)\\s*charset\\s*=\\s*[\"']?([^\\s;\"']+)", Pattern.CASE_INSENSITIVE);


    /**
     * Opens a connection to the given URL and parses the response as HTML.
     *
     * @param url location of the HTML page
     * @return the parsed document
     * @throws IOException if the connection cannot be opened or read
     * @throws SAXException if the HTML parser reports an error
     */
    public static Document getHtmlDocument(URL url) throws IOException, SAXException {
        return getHtmlDocument(url.openConnection());
    }


    /**
     * Reads the response of the given connection and parses it as HTML. The reader created for
     * the connection is closed before this method returns.
     *
     * @param connection connection to read the HTML page from
     * @return the parsed document
     * @throws IOException if the response cannot be read
     * @throws SAXException if the HTML parser reports an error
     */
    public static Document getHtmlDocument(URLConnection connection) throws IOException, SAXException {
        Reader reader = getReader(connection);

        try {
            return getHtmlDocument(reader);
        } finally {
            // the reader was created here, so it is released here as well
            reader.close();
        }
    }


    /**
     * Creates a reader for the response body of the given connection. Compressed transfer
     * (gzip, deflate) is requested and transparently decoded according to the Content-Encoding
     * header; the character set is taken from the Content-Type header, defaulting to UTF-8.
     * <p>
     * The caller is responsible for closing the returned reader.
     *
     * @param connection connection to read from
     * @return reader over the decoded response body
     * @throws IOException if the response cannot be opened or the compressed stream is invalid
     */
    public static Reader getReader(URLConnection connection) throws IOException {
        try {
            connection.addRequestProperty("Accept-Encoding", "gzip,deflate");
        } catch (IllegalStateException e) {
            // already connected, so request headers can no longer be changed;
            // continue without asking for a compressed response
        }

        Charset charset = getCharset(connection.getContentType());
        String encoding = connection.getContentEncoding();

        InputStream rawStream = connection.getInputStream();
        boolean decoded = false;

        try {
            Reader reader = new InputStreamReader(decode(rawStream, encoding), charset);
            decoded = true;
            return reader;
        } finally {
            if (!decoded) {
                // wrapping failed (e.g. invalid gzip header): don't leak the connection stream
                try {
                    rawStream.close();
                } catch (IOException ignored) {
                    // keep the original failure
                }
            }
        }
    }


    /**
     * Wraps the given stream in a decompressing stream matching the Content-Encoding value.
     *
     * @param inputStream raw response stream
     * @param encoding Content-Encoding header value, may be <code>null</code>
     * @return decompressing stream for gzip or deflate, otherwise the given stream itself
     * @throws IOException if the gzip header cannot be read
     */
    private static InputStream decode(InputStream inputStream, String encoding) throws IOException {
        if ("gzip".equalsIgnoreCase(encoding)) {
            return new GZIPInputStream(inputStream);
        }

        if ("deflate".equalsIgnoreCase(encoding)) {
            // nowrap = true: data is read as raw deflate, without ZLIB header and checksum
            return new InflaterInputStream(inputStream, new Inflater(true)) {

                @Override
                public void close() throws IOException {
                    try {
                        super.close();
                    } finally {
                        // an explicitly supplied Inflater is not ended by the stream itself
                        inf.end();
                    }
                }
            };
        }

        return inputStream;
    }


    /**
     * Parses HTML from the given reader. The reader is not explicitly closed by this method.
     *
     * @param reader source of the HTML text
     * @return the parsed document
     * @throws SAXException if the HTML parser reports an error
     * @throws IOException if reading fails
     */
    public static Document getHtmlDocument(Reader reader) throws SAXException, IOException {
        DOMParser parser = new DOMParser();
        // turn off SAX namespace processing for the HTML parser
        parser.setFeature("http://xml.org/sax/features/namespaces", false);

        parser.parse(new InputSource(reader));

        return parser.getDocument();
    }


    /**
     * Parses the XML document at the given URL.
     *
     * @param url location of the XML document
     * @return the parsed document
     * @throws SAXException if the document is not well-formed
     * @throws IOException if the document cannot be read
     * @throws ParserConfigurationException if no document builder can be created
     */
    public static Document getDocument(URL url) throws SAXException, IOException, ParserConfigurationException {
        return getDocument(url.toString());
    }


    /**
     * Parses the XML document at the given URI.
     *
     * @param url URI of the XML document, as accepted by the document builder
     * @return the parsed document
     * @throws SAXException if the document is not well-formed
     * @throws IOException if the document cannot be read
     * @throws ParserConfigurationException if no document builder can be created
     */
    public static Document getDocument(String url) throws SAXException, IOException, ParserConfigurationException {
        // NOTE(review): uses the default factory configuration; if documents may come from
        // untrusted servers, consider restricting DTDs / external entities -- confirm that no
        // consumed feed relies on a DOCTYPE before changing this
        return DocumentBuilderFactory.newInstance().newDocumentBuilder().parse(url);
    }


    /**
     * Parses an XML document from the given stream. The stream is not explicitly closed by this
     * method.
     *
     * @param inputStream source of the XML data
     * @return the parsed document
     * @throws SAXException if the document is not well-formed
     * @throws IOException if the stream cannot be read
     * @throws ParserConfigurationException if no document builder can be created
     */
    public static Document getDocument(InputStream inputStream) throws SAXException, IOException, ParserConfigurationException {
        // NOTE(review): default factory configuration, see getDocument(String)
        return DocumentBuilderFactory.newInstance().newDocumentBuilder().parse(inputStream);
    }


    /**
     * Determines the character set declared in a Content-Type header value.
     *
     * @param contentType header value, e.g. <code>text/html; charset=iso-8859-1</code>, may be
     *            <code>null</code>
     * @return the declared charset, or UTF-8 if none is declared or the declared one is illegal
     *         or not supported
     */
    private static Charset getCharset(String contentType) {
        if (contentType != null) {
            Matcher matcher = CHARSET_PARAMETER.matcher(contentType);

            if (matcher.find()) {
                try {
                    return Charset.forName(matcher.group(1));
                } catch (IllegalArgumentException e) {
                    // illegal or unsupported charset name: report it and fall back to the default
                    Logger.getLogger(WebRequest.class.getName()).log(Level.WARNING, e.getMessage());
                }
            }
        }

        // use UTF-8 if charset cannot be determined
        return Charset.forName("UTF-8");
    }


    /**
     * Not instantiable; this class only provides static helpers.
     */
    private WebRequest() {
        throw new UnsupportedOperationException();
    }

}