From ead632aac695cdd413faf036fe17a4ab9af89d1b Mon Sep 17 00:00:00 2001 From: Andrew Chen Date: Wed, 5 Jan 2011 04:26:33 +0000 Subject: [PATCH] Refactor html/text conversions into its own class. Add more tags to be ignored during HTML to plain text conversion. Remove UTF-8 non-breaking spaces from generated text. --- src/com/fsck/k9/helper/HtmlConverter.java | 251 ++++++++++++++++++ .../fsck/k9/helper/HtmlToTextTagHandler.java | 85 ------ src/com/fsck/k9/mail/store/LocalStore.java | 131 +-------- 3 files changed, 255 insertions(+), 212 deletions(-) create mode 100644 src/com/fsck/k9/helper/HtmlConverter.java delete mode 100644 src/com/fsck/k9/helper/HtmlToTextTagHandler.java diff --git a/src/com/fsck/k9/helper/HtmlConverter.java b/src/com/fsck/k9/helper/HtmlConverter.java new file mode 100644 index 000000000..c0975e03e --- /dev/null +++ b/src/com/fsck/k9/helper/HtmlConverter.java @@ -0,0 +1,251 @@ +package com.fsck.k9.helper; + +import android.text.Annotation; +import android.text.Editable; +import android.text.Html; +import android.text.Spannable; +import android.util.Log; +import com.fsck.k9.K9; +import org.xml.sax.XMLReader; + +import java.io.IOException; +import java.io.StringReader; +import java.util.Collections; +import java.util.HashSet; +import java.util.List; +import java.util.Set; +import java.util.regex.Matcher; + +/** + * Contains common routines to convert html to text and vice versa. + */ +public class HtmlConverter +{ + /** + * When generating previews, Spannable objects that can't be converted into a String are + * represented as 0xfffc. When displayed, these show up as undisplayed squares. These constants + * define the object character and the replacement character. + */ + private static final char PREVIEW_OBJECT_CHARACTER = (char)0xfffc; + private static final char PREVIEW_OBJECT_REPLACEMENT = (char)0x20; // space + + /** + * toHtml() converts non-breaking spaces into the UTF-8 non-breaking space, which doesn't get + * rendered properly in some clients. Replace it with a simple space. + */ + private static final char NBSP_CHARACTER = (char)0x00a0; // utf-8 non-breaking space + private static final char NBSP_REPLACEMENT = (char)0x20; // space + + /** + * Convert an HTML string to a plain text string. + * @param html HTML string to convert. + * @return Plain text result. + */ + public static String htmlToText(final String html) + { + return Html.fromHtml(html, null, new HtmlToTextTagHandler()).toString() + .replace(PREVIEW_OBJECT_CHARACTER, PREVIEW_OBJECT_REPLACEMENT) + .replace(NBSP_CHARACTER, NBSP_REPLACEMENT); + } + + /** + * Custom tag handler to use when converting HTML messages to text. It currently handles text + * representations of HTML tags that Android's built-in parser doesn't understand and hides code + * contained in STYLE and SCRIPT blocks. + */ + private static class HtmlToTextTagHandler implements Html.TagHandler + { + // List of tags whose content should be ignored. + private static final Set TAGS_WITH_IGNORED_CONTENT = Collections.unmodifiableSet(new HashSet() {{ + add("style"); + add("script"); + add("title"); + add("!"); // comments + }}); + + @Override + public void handleTag(boolean opening, String tag, Editable output, XMLReader xmlReader) + { + tag = tag.toLowerCase(); + if (tag.equals("hr") && opening) + { + // In the case of an
, replace it with a bunch of underscores. This is roughly + // the behaviour of Outlook in Rich Text mode. + output.append("_____________________________________________\n"); + } + else if (TAGS_WITH_IGNORED_CONTENT.contains(tag)) + { + handleIgnoredTag(opening, output); + } + } + + private static final String IGNORED_ANNOTATION_KEY = "K9_ANNOTATION"; + private static final String IGNORED_ANNOTATION_VALUE = "hiddenSpan"; + + /** + * When we come upon an ignored tag, we mark it with an Annotation object with a specific key + * and value as above. We don't really need to be checking these values since Html.fromHtml() + * doesn't use Annotation spans, but we should do it now to be safe in case they do start using + * it in the future. + * @param opening If this is an opening tag or not. + * @param output Spannable string that we're working with. + */ + private void handleIgnoredTag(boolean opening, Editable output) + { + int len = output.length(); + if (opening) + { + output.setSpan(new Annotation(IGNORED_ANNOTATION_KEY, IGNORED_ANNOTATION_VALUE), len, + len, Spannable.SPAN_MARK_MARK); + } + else + { + Object start = getOpeningAnnotation(output); + if (start != null) + { + int where = output.getSpanStart(start); + // Remove the temporary Annotation span. + output.removeSpan(start); + // Delete everything between the start of the Annotation and the end of the string + // (what we've generated so far). + output.delete(where, len); + } + } + } + + /** + * Fetch the matching opening Annotation object and verify that it's the one added by K9. + * @param output Spannable string we're working with. + * @return Starting Annotation object. + */ + private Object getOpeningAnnotation(Editable output) + { + Object[] objs = output.getSpans(0, output.length(), Annotation.class); + for (int i = objs.length - 1; i >= 0; i--) + { + Annotation span = (Annotation) objs[i]; + if (output.getSpanFlags(objs[i]) == Spannable.SPAN_MARK_MARK + && span.getKey().equals(IGNORED_ANNOTATION_KEY) + && span.getValue().equals(IGNORED_ANNOTATION_VALUE)) + { + return objs[i]; + } + } + return null; + } + } + + private static final int MAX_SMART_HTMLIFY_MESSAGE_LENGTH = 1024 * 256 ; + + /** + * Convert a text string into an HTML document. Attempts to do smart replacement for large + * documents to prevent OOM errors. + * @param text Plain text string. + * @return HTML string. + */ + public static String textToHtml(String text) + { + // Our HTMLification code is somewhat memory intensive + // and was causing lots of OOM errors on the market + // if the message is big and plain text, just do + // a trivial htmlification + if (text.length() > MAX_SMART_HTMLIFY_MESSAGE_LENGTH) + { + return "" + + htmlifyMessageHeader() + + text + + htmlifyMessageFooter() + + ""; + } + StringReader reader = new StringReader(text); + StringBuilder buff = new StringBuilder(text.length() + 512); + int c; + try + { + while ((c = reader.read()) != -1) + { + switch (c) + { + case '&': + buff.append("&"); + break; + case '<': + buff.append("<"); + break; + case '>': + buff.append(">"); + break; + case '\r': + break; + default: + buff.append((char)c); + }//switch + } + } + catch (IOException e) + { + //Should never happen + Log.e(K9.LOG_TAG, "Could not read string to convert text to HTML:", e); + } + text = buff.toString(); + text = text.replaceAll("\\s*([-=_]{30,}+)\\s*","
"); + text = text.replaceAll("(?m)^([^\r\n]{4,}[\\s\\w,:;+/])(?:\r\n|\n|\r)(?=[a-z]\\S{0,10}[\\s\\n\\r])","$1 "); + text = text.replaceAll("(?m)(\r\n|\n|\r){4,}","\n\n"); + + Matcher m = Regex.WEB_URL_PATTERN.matcher(text); + StringBuffer sb = new StringBuffer(text.length() + 512); + sb.append(""); + sb.append(htmlifyMessageHeader()); + while (m.find()) + { + int start = m.start(); + if (start == 0 || (start != 0 && text.charAt(start - 1) != '@')) + { + if (m.group().indexOf(':') > 0) // With no URI-schema we may get "http:/" links with the second / missing + { + m.appendReplacement(sb, "$0"); + } + else + { + m.appendReplacement(sb, "$0"); + } + } + else + { + m.appendReplacement(sb, "$0"); + } + } + + m.appendTail(sb); + sb.append(htmlifyMessageFooter()); + sb.append(""); + text = sb.toString(); + + return text; + } + + private static String htmlifyMessageHeader() + { + if (K9.messageViewFixedWidthFont()) + { + return "
";
+        }
+        else
+        {
+            return "
"; + } + } + + private static String htmlifyMessageFooter() + { + if (K9.messageViewFixedWidthFont()) + { + return "
"; + } + else + { + return ""; + } + } + +} diff --git a/src/com/fsck/k9/helper/HtmlToTextTagHandler.java b/src/com/fsck/k9/helper/HtmlToTextTagHandler.java deleted file mode 100644 index 725bbc5a0..000000000 --- a/src/com/fsck/k9/helper/HtmlToTextTagHandler.java +++ /dev/null @@ -1,85 +0,0 @@ -package com.fsck.k9.helper; - -import android.text.Annotation; -import android.text.Editable; -import android.text.Html; -import android.text.Spannable; -import org.xml.sax.XMLReader; - -/** - * Custom tag handler to use when converting HTML messages to text. It currently handles text - * representations of HTML tags that Android's built-in parser doesn't understand and hides code - * contained in STYLE and SCRIPT blocks. - */ -public class HtmlToTextTagHandler implements Html.TagHandler -{ - @Override - public void handleTag(boolean opening, String tag, Editable output, XMLReader xmlReader) - { - if (tag.equalsIgnoreCase("hr") && opening) - { - // In the case of an
, replace it with a bunch of underscores. This is roughly the behaviour of - // Outlook in Rich Text mode. - output.append("_____________________________________________\n"); - } - else if (tag.equalsIgnoreCase("style") || tag.equalsIgnoreCase("script")) - { - handleIgnoredTag(opening, output); - } - } - - private static final String IGNORED_ANNOTATION_KEY = "K9_ANNOTATION"; - private static final String IGNORED_ANNOTATION_VALUE = "hiddenSpan"; - - /** - * When we come upon an ignored tag, we mark it with an Annotation object with a specific key - * and value as above. We don't really need to be checking these values since Html.fromHtml() - * doesn't use Annotation spans, but we should do it now to be safe in case they do start using - * it in the future. - * @param opening If this is an opening tag or not. - * @param output Spannable string that we're working with. - */ - private void handleIgnoredTag(boolean opening, Editable output) - { - int len = output.length(); - if (opening) - { - output.setSpan(new Annotation(IGNORED_ANNOTATION_KEY, IGNORED_ANNOTATION_VALUE), len, - len, Spannable.SPAN_MARK_MARK); - } - else - { - Object start = getOpeningAnnotation(output); - if (start != null) - { - int where = output.getSpanStart(start); - // Remove the temporary Annotation span. - output.removeSpan(start); - // Delete everything between the start of the Annotation and the end of the string - // (what we've generated so far). - output.delete(where, len); - } - } - } - - /** - * Fetch the matching opening Annotation object and verify that it's the one added by K9. - * @param output Spannable string we're working with. - * @return Starting Annotation object. - */ - private Object getOpeningAnnotation(Editable output) - { - Object[] objs = output.getSpans(0, output.length(), Annotation.class); - for (int i = objs.length - 1; i >= 0; i--) - { - Annotation span = (Annotation) objs[i]; - if (output.getSpanFlags(objs[i]) == Spannable.SPAN_MARK_MARK - && span.getKey().equals(IGNORED_ANNOTATION_KEY) - && span.getValue().equals(IGNORED_ANNOTATION_VALUE)) - { - return objs[i]; - } - } - return null; - } -} diff --git a/src/com/fsck/k9/mail/store/LocalStore.java b/src/com/fsck/k9/mail/store/LocalStore.java index 990b9a40f..b5d0ea586 100644 --- a/src/com/fsck/k9/mail/store/LocalStore.java +++ b/src/com/fsck/k9/mail/store/LocalStore.java @@ -13,9 +13,8 @@ import java.util.List; import java.util.Map; import java.util.Set; import java.util.UUID; -import java.util.regex.Matcher; -import com.fsck.k9.helper.HtmlToTextTagHandler; +import com.fsck.k9.helper.HtmlConverter; import org.apache.commons.io.IOUtils; import android.app.Application; @@ -26,7 +25,6 @@ import android.database.Cursor; import android.database.sqlite.SQLiteDatabase; import android.database.sqlite.SQLiteException; import android.net.Uri; -import android.text.Html; import android.util.Log; import com.fsck.k9.Account; @@ -34,7 +32,6 @@ import com.fsck.k9.K9; import com.fsck.k9.Preferences; import com.fsck.k9.controller.MessageRemovalListener; import com.fsck.k9.controller.MessageRetrievalListener; -import com.fsck.k9.helper.Regex; import com.fsck.k9.helper.Utility; import com.fsck.k9.mail.Address; import com.fsck.k9.mail.Body; @@ -76,8 +73,6 @@ public class LocalStore extends Store implements Serializable private static final Flag[] PERMANENT_FLAGS = { Flag.DELETED, Flag.X_DESTROYED, Flag.SEEN, Flag.FLAGGED }; - private static final int MAX_SMART_HTMLIFY_MESSAGE_LENGTH = 1024 * 256 ; - private static Set HEADERS_TO_SAVE = new HashSet(); static { @@ -99,14 +94,6 @@ public class LocalStore extends Store implements Serializable "subject, sender_list, date, uid, flags, id, to_list, cc_list, " + "bcc_list, reply_to_list, attachment_count, internal_date, message_id, folder_id, preview "; - /** - * When generating previews, Spannable objects that can't be converted into a String are - * represented as 0xfffc. When displayed, these show up as undisplayed squares. These constants - * define the object character and the replacement character. - */ - private static final char PREVIEW_OBJECT_CHARACTER = (char)0xfffc; - private static final char PREVIEW_OBJECT_REPLACEMENT = (char)0x20; // space - protected static final int DB_VERSION = 39; protected String uUid = null; @@ -2370,7 +2357,7 @@ public class LocalStore extends Store implements Serializable // If we couldn't generate a reasonable preview from the text part, try doing it with the HTML part. if (preview == null || preview.length() == 0) { - preview = calculateContentPreview(Html.fromHtml(html, null, new HtmlToTextTagHandler()).toString().replace(PREVIEW_OBJECT_CHARACTER, PREVIEW_OBJECT_REPLACEMENT)); + preview = calculateContentPreview(HtmlConverter.htmlToText(html)); } try @@ -2498,7 +2485,7 @@ public class LocalStore extends Store implements Serializable // If we couldn't generate a reasonable preview from the text part, try doing it with the HTML part. if (preview == null || preview.length() == 0) { - preview = calculateContentPreview(Html.fromHtml(html, null, new HtmlToTextTagHandler()).toString().replace(PREVIEW_OBJECT_CHARACTER, PREVIEW_OBJECT_REPLACEMENT)); + preview = calculateContentPreview(HtmlConverter.htmlToText(html)); } try { @@ -3140,7 +3127,7 @@ public class LocalStore extends Store implements Serializable { if (text.length() > 0 && html.length() == 0) { - html = htmlifyString(text); + html = HtmlConverter.textToHtml(text); } html = convertEmoji2Img(html); @@ -3148,116 +3135,6 @@ public class LocalStore extends Store implements Serializable return html; } - public String htmlifyString(String text) - { - // Our HTMLification code is somewhat memory intensive - // and was causing lots of OOM errors on the market - // if the message is big and plain text, just do - // a trivial htmlification - if (text.length() > MAX_SMART_HTMLIFY_MESSAGE_LENGTH) - { - return "" + - htmlifyMessageHeader() + - text + - htmlifyMessageFooter() + - ""; - } - StringReader reader = new StringReader(text); - StringBuilder buff = new StringBuilder(text.length() + 512); - int c; - try - { - while ((c = reader.read()) != -1) - { - switch (c) - { - case '&': - buff.append("&"); - break; - case '<': - buff.append("<"); - break; - case '>': - buff.append(">"); - break; - case '\r': - break; - default: - buff.append((char)c); - }//switch - } - } - catch (IOException e) - { - //Should never happen - Log.e(K9.LOG_TAG, null, e); - } - text = buff.toString(); - text = text.replaceAll("\\s*([-=_]{30,}+)\\s*","
"); - text = text.replaceAll("(?m)^([^\r\n]{4,}[\\s\\w,:;+/])(?:\r\n|\n|\r)(?=[a-z]\\S{0,10}[\\s\\n\\r])","$1 "); - text = text.replaceAll("(?m)(\r\n|\n|\r){4,}","\n\n"); - - - Matcher m = Regex.WEB_URL_PATTERN.matcher(text); - StringBuffer sb = new StringBuffer(text.length() + 512); - sb.append(""); - sb.append(htmlifyMessageHeader()); - while (m.find()) - { - int start = m.start(); - if (start == 0 || (start != 0 && text.charAt(start - 1) != '@')) - { - if (m.group().indexOf(':') > 0) // With no URI-schema we may get "http:/" links with the second / missing - { - m.appendReplacement(sb, "$0"); - } - else - { - m.appendReplacement(sb, "$0"); - } - } - else - { - m.appendReplacement(sb, "$0"); - } - } - - - - - m.appendTail(sb); - sb.append(htmlifyMessageFooter()); - sb.append(""); - text = sb.toString(); - - return text; - } - - private String htmlifyMessageHeader() - { - if (K9.messageViewFixedWidthFont()) - { - return "
";
-            }
-            else
-            {
-                return "
"; - } - } - - - private String htmlifyMessageFooter() - { - if (K9.messageViewFixedWidthFont()) - { - return "
"; - } - else - { - return ""; - } - } - public String convertEmoji2Img(String html) { StringBuilder buff = new StringBuilder(html.length() + 512);