Sanitize HTML to remove meta refresh

Using
  <meta http-equiv="Refresh" content="1; URL=http://example.com/">
in an HTML message causes WebView to load the URL in the default browser.
Overriding WebViewClient.shouldOverrideUrlLoading() allows us to cancel
loading this URL. Sadly, I found no way to find out whether the method was
called because of a meta refresh or because the user clicked on a link.

So now we're using HtmlCleaner to parse the HTML and remove all "meta" elements
containing an "http-equiv" attribute with a value of "refresh".
This commit is contained in:
cketti 2015-01-09 16:16:00 +01:00
parent c7229e4724
commit 63abf05776
3 changed files with 153 additions and 1 deletions

View File

@ -0,0 +1,54 @@
package com.fsck.k9.helper;
import org.htmlcleaner.CleanerProperties;
import org.htmlcleaner.HtmlCleaner;
import org.htmlcleaner.HtmlSerializer;
import org.htmlcleaner.SimpleHtmlSerializer;
import org.htmlcleaner.TagNode;
/**
 * Static helper that rewrites HTML before it is displayed.
 *
 * <p>The input is parsed with HtmlCleaner, every {@code meta} element whose {@code http-equiv}
 * attribute equals "refresh" (ignoring case and surrounding whitespace) is dropped from the tree,
 * and the remaining tree is serialized back into a string.</p>
 */
public class HtmlSanitizer {
    private static final HtmlCleaner HTML_CLEANER;
    private static final HtmlSerializer HTML_SERIALIZER;

    static {
        // The parser and the serializer are configured from the same properties object.
        CleanerProperties sharedProperties = createCleanerProperties();
        HTML_CLEANER = new HtmlCleaner(sharedProperties);
        HTML_SERIALIZER = new SimpleHtmlSerializer(sharedProperties);
    }

    /** Not instantiable; all functionality is exposed through static methods. */
    private HtmlSanitizer() {}

    /**
     * Parses the supplied HTML, strips all meta refresh elements and serializes the result.
     *
     * @param html the HTML markup to sanitize
     * @return the serialized markup without meta refresh elements
     */
    public static String sanitize(String html) {
        TagNode document = HTML_CLEANER.clean(html);
        removeMetaRefresh(document);
        return HTML_SERIALIZER.getAsString(document, "UTF8");
    }

    /**
     * Builds the configuration shared by the cleaner and the serializer.
     *
     * @return the configured properties
     */
    private static CleanerProperties createCleanerProperties() {
        CleanerProperties config = new CleanerProperties();

        // See http://htmlcleaner.sourceforge.net/parameters.php for descriptions
        config.setNamespacesAware(false);
        config.setAdvancedXmlEscape(false);
        config.setOmitXmlDeclaration(true);
        config.setOmitDoctypeDeclaration(false);
        config.setTranslateSpecialEntities(false);
        config.setRecognizeUnicodeChars(false);

        return config;
    }

    /**
     * Detaches every meta refresh element found below the given node.
     *
     * @param rootNode the root of the parsed document; modified in place
     */
    private static void removeMetaRefresh(TagNode rootNode) {
        // "true" requests a recursive lookup, so "meta" elements anywhere in the tree are visited.
        for (TagNode metaElement : rootNode.getElementListByName("meta", true)) {
            if (isMetaRefresh(metaElement)) {
                metaElement.removeFromTree();
            }
        }
    }

    /**
     * Tells whether the element carries an {@code http-equiv} attribute whose trimmed value is
     * "refresh", compared case-insensitively.
     */
    private static boolean isMetaRefresh(TagNode metaElement) {
        String httpEquiv = metaElement.getAttributeByName("http-equiv");
        if (httpEquiv == null) {
            return false;
        }
        return "refresh".equalsIgnoreCase(httpEquiv.trim());
    }
}

View File

@ -11,6 +11,8 @@ import android.widget.Toast;
import com.fsck.k9.K9;
import com.fsck.k9.R;
import com.fsck.k9.helper.HtmlConverter;
import com.fsck.k9.helper.HtmlSanitizer;
public class MessageWebView extends RigidWebView {
@ -123,7 +125,9 @@ public class MessageWebView extends RigidWebView {
}
content += HtmlConverter.cssStylePre();
content += "</head><body>" + text + "</body></html>";
loadDataWithBaseURL("http://", content, "text/html", "utf-8", null);
String sanitizedContent = HtmlSanitizer.sanitize(content);
loadDataWithBaseURL("http://", sanitizedContent, "text/html", "utf-8", null);
resumeTimers();
}

View File

@ -0,0 +1,94 @@
package com.fsck.k9.helper;
import org.junit.Test;
import static org.junit.Assert.assertEquals;
/**
 * Checks that {@link HtmlSanitizer#sanitize(String)} removes meta refresh elements regardless of
 * where they appear in the document or how the {@code http-equiv} value is written, while leaving
 * other meta elements in place.
 */
public class HtmlSanitizerTest {
    /** Expected serialization of a document once every meta refresh element is gone. */
    private static final String DOCUMENT_WITHOUT_META = "<html><head></head><body>Message</body></html>";

    /** Meta refresh element with a quoted, lower-case attribute value. */
    private static final String META_REFRESH = metaWithHttpEquiv("\"refresh\"");

    @Test
    public void shouldRemoveMetaRefreshInHead() {
        assertSanitized(DOCUMENT_WITHOUT_META, document(META_REFRESH, "", "Message"));
    }

    @Test
    public void shouldRemoveMetaRefreshBetweenHeadAndBody() {
        assertSanitized(DOCUMENT_WITHOUT_META, document("", META_REFRESH, "Message"));
    }

    @Test
    public void shouldRemoveMetaRefreshInBody() {
        assertSanitized(DOCUMENT_WITHOUT_META, document("", "", META_REFRESH + "Message"));
    }

    @Test
    public void shouldRemoveMetaRefreshWithUpperCaseAttributeValue() {
        assertSanitized(DOCUMENT_WITHOUT_META, document(metaWithHttpEquiv("\"REFRESH\""), "", "Message"));
    }

    @Test
    public void shouldRemoveMetaRefreshWithMixedCaseAttributeValue() {
        assertSanitized(DOCUMENT_WITHOUT_META, document(metaWithHttpEquiv("\"Refresh\""), "", "Message"));
    }

    @Test
    public void shouldRemoveMetaRefreshWithoutQuotesAroundAttributeValue() {
        assertSanitized(DOCUMENT_WITHOUT_META, document(metaWithHttpEquiv("refresh"), "", "Message"));
    }

    @Test
    public void shouldRemoveMetaRefreshWithSpacesInAttributeValue() {
        assertSanitized(DOCUMENT_WITHOUT_META, document(metaWithHttpEquiv("\"refresh \""), "", "Message"));
    }

    @Test
    public void shouldRemoveMultipleMetaRefreshTags() {
        assertSanitized(DOCUMENT_WITHOUT_META, document(META_REFRESH, "", META_REFRESH + "Message"));
    }

    @Test
    public void shouldRemoveMetaRefreshButKeepOtherMetaTags() {
        String contentTypeMeta = "<meta http-equiv=\"content-type\" content=\"text/html; charset=UTF-8\">";
        String expected =
                "<html><head><meta http-equiv=\"content-type\" content=\"text/html; charset=UTF-8\" /></head>" +
                "<body>Message</body></html>";

        assertSanitized(expected, document(contentTypeMeta + META_REFRESH, "", "Message"));
    }

    /** Sanitizes {@code html} and compares the outcome with {@code expected}. */
    private static void assertSanitized(String expected, String html) {
        assertEquals(expected, HtmlSanitizer.sanitize(html));
    }

    /**
     * Assembles a test document.
     *
     * @param headContent markup placed inside the {@code head} element
     * @param afterHead markup placed between the closing {@code head} and the opening {@code body}
     * @param bodyContent markup placed inside the {@code body} element
     */
    private static String document(String headContent, String afterHead, String bodyContent) {
        return "<html>" +
                "<head>" + headContent + "</head>" + afterHead +
                "<body>" + bodyContent + "</body>" +
                "</html>";
    }

    /**
     * Creates a meta element redirecting to example.com.
     *
     * @param rawHttpEquivValue the {@code http-equiv} value exactly as it should appear in the
     *         markup, including any surrounding quotes
     */
    private static String metaWithHttpEquiv(String rawHttpEquivValue) {
        return "<meta http-equiv=" + rawHttpEquivValue + " content=\"1; URL=http://example.com/\">";
    }
}