Refactor html/text conversions into its own class.

Add more tags to be ignored during HTML to plain text conversion. Remove UTF-8 non-breaking spaces from generated text.
2025-03-03 01:51:49 -05:00 · 2011-01-05 04:26:33 +00:00 · 2011-01-05 04:26:33 +00:00 · ead632aac6
commit ead632aac6
parent f8f916edde
3 changed files with 255 additions and 212 deletions
--- a/src/com/fsck/k9/helper/HtmlConverter.java
+++ b/src/com/fsck/k9/helper/HtmlConverter.java
@ -0,0 +1,251 @@
+package com.fsck.k9.helper;
+
+import android.text.Annotation;
+import android.text.Editable;
+import android.text.Html;
+import android.text.Spannable;
+import android.util.Log;
+import com.fsck.k9.K9;
+import org.xml.sax.XMLReader;
+
+import java.io.IOException;
+import java.io.StringReader;
+import java.util.Collections;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Set;
+import java.util.regex.Matcher;
+
+/**
+ * Contains common routines to convert html to text and vice versa.
+ */
+public class HtmlConverter
+{
+    /**
+     * When generating previews, Spannable objects that can't be converted into a String are
+     * represented as 0xfffc. When displayed, these show up as undisplayed squares. These constants
+     * define the object character and the replacement character.
+     */
+    private static final char PREVIEW_OBJECT_CHARACTER = (char)0xfffc;
+    private static final char PREVIEW_OBJECT_REPLACEMENT = (char)0x20;  // space
+
+    /**
+     * toHtml() converts non-breaking spaces into the UTF-8 non-breaking space, which doesn't get
+     * rendered properly in some clients. Replace it with a simple space.
+     */
+    private static final char NBSP_CHARACTER = (char)0x00a0;    // utf-8 non-breaking space
+    private static final char NBSP_REPLACEMENT = (char)0x20;    // space
+
+    /**
+     * Convert an HTML string to a plain text string.
+     * @param html HTML string to convert.
+     * @return Plain text result.
+     */
+    public static String htmlToText(final String html)
+    {
+        return Html.fromHtml(html, null, new HtmlToTextTagHandler()).toString()
+            .replace(PREVIEW_OBJECT_CHARACTER, PREVIEW_OBJECT_REPLACEMENT)
+            .replace(NBSP_CHARACTER, NBSP_REPLACEMENT);
+    }
+
+    /**
+     * Custom tag handler to use when converting HTML messages to text. It currently handles text
+     * representations of HTML tags that Android's built-in parser doesn't understand and hides code
+     * contained in STYLE and SCRIPT blocks.
+     */
+    private static class HtmlToTextTagHandler implements Html.TagHandler
+    {
+        // List of tags whose content should be ignored.
+        private static final Set<String> TAGS_WITH_IGNORED_CONTENT = Collections.unmodifiableSet(new HashSet<String>() {{
+            add("style");
+            add("script");
+            add("title");
+            add("!");   // comments
+        }});
+
+        @Override
+        public void handleTag(boolean opening, String tag, Editable output, XMLReader xmlReader)
+        {
+            tag = tag.toLowerCase();
+            if (tag.equals("hr") && opening)
+            {
+                // In the case of an <hr>, replace it with a bunch of underscores. This is roughly
+                // the behaviour of Outlook in Rich Text mode.
+                output.append("_____________________________________________\n");
+            }
+            else if (TAGS_WITH_IGNORED_CONTENT.contains(tag))
+            {
+                handleIgnoredTag(opening, output);
+            }
+        }
+
+        private static final String IGNORED_ANNOTATION_KEY = "K9_ANNOTATION";
+        private static final String IGNORED_ANNOTATION_VALUE = "hiddenSpan";
+
+        /**
+         * When we come upon an ignored tag, we mark it with an Annotation object with a specific key
+         * and value as above. We don't really need to be checking these values since Html.fromHtml()
+         * doesn't use Annotation spans, but we should do it now to be safe in case they do start using
+         * it in the future.
+         * @param opening If this is an opening tag or not.
+         * @param output Spannable string that we're working with.
+         */
+        private void handleIgnoredTag(boolean opening, Editable output)
+        {
+            int len = output.length();
+            if (opening)
+            {
+                output.setSpan(new Annotation(IGNORED_ANNOTATION_KEY, IGNORED_ANNOTATION_VALUE), len,
+                        len, Spannable.SPAN_MARK_MARK);
+            }
+            else
+            {
+                Object start = getOpeningAnnotation(output);
+                if (start != null)
+                {
+                    int where = output.getSpanStart(start);
+                    // Remove the temporary Annotation span.
+                    output.removeSpan(start);
+                    // Delete everything between the start of the Annotation and the end of the string
+                    // (what we've generated so far).
+                    output.delete(where, len);
+                }
+            }
+        }
+
+        /**
+         * Fetch the matching opening Annotation object and verify that it's the one added by K9.
+         * @param output Spannable string we're working with.
+         * @return Starting Annotation object.
+         */
+        private Object getOpeningAnnotation(Editable output)
+        {
+            Object[] objs = output.getSpans(0, output.length(), Annotation.class);
+            for (int i = objs.length - 1; i >= 0; i--)
+            {
+                Annotation span = (Annotation) objs[i];
+                if (output.getSpanFlags(objs[i]) == Spannable.SPAN_MARK_MARK
+                        && span.getKey().equals(IGNORED_ANNOTATION_KEY)
+                        && span.getValue().equals(IGNORED_ANNOTATION_VALUE))
+                {
+                    return objs[i];
+                }
+            }
+            return null;
+        }
+    }
+
+    private static final int MAX_SMART_HTMLIFY_MESSAGE_LENGTH = 1024 * 256 ;
+
+    /**
+     * Convert a text string into an HTML document. Attempts to do smart replacement for large
+     * documents to prevent OOM errors.
+     * @param text Plain text string.
+     * @return HTML string.
+     */
+    public static String textToHtml(String text)
+    {
+        // Our HTMLification code is somewhat memory intensive
+        // and was causing lots of OOM errors on the market
+        // if the message is big and plain text, just do
+        // a trivial htmlification
+        if (text.length() > MAX_SMART_HTMLIFY_MESSAGE_LENGTH)
+        {
+            return "<html><head/><body>" +
+                   htmlifyMessageHeader() +
+                   text +
+                   htmlifyMessageFooter() +
+                   "</body></html>";
+        }
+        StringReader reader = new StringReader(text);
+        StringBuilder buff = new StringBuilder(text.length() + 512);
+        int c;
+        try
+        {
+            while ((c = reader.read()) != -1)
+            {
+                switch (c)
+                {
+                    case '&':
+                        buff.append("&amp;");
+                        break;
+                    case '<':
+                        buff.append("&lt;");
+                        break;
+                    case '>':
+                        buff.append("&gt;");
+                        break;
+                    case '\r':
+                        break;
+                    default:
+                        buff.append((char)c);
+                }//switch
+            }
+        }
+        catch (IOException e)
+        {
+            //Should never happen
+            Log.e(K9.LOG_TAG, "Could not read string to convert text to HTML:", e);
+        }
+        text = buff.toString();
+        text = text.replaceAll("\\s*([-=_]{30,}+)\\s*","<hr />");
+        text = text.replaceAll("(?m)^([^\r\n]{4,}[\\s\\w,:;+/])(?:\r\n|\n|\r)(?=[a-z]\\S{0,10}[\\s\\n\\r])","$1 ");
+        text = text.replaceAll("(?m)(\r\n|\n|\r){4,}","\n\n");
+
+        Matcher m = Regex.WEB_URL_PATTERN.matcher(text);
+        StringBuffer sb = new StringBuffer(text.length() + 512);
+        sb.append("<html><head></head><body>");
+        sb.append(htmlifyMessageHeader());
+        while (m.find())
+        {
+            int start = m.start();
+            if (start == 0 || (start != 0 && text.charAt(start - 1) != '@'))
+            {
+                if (m.group().indexOf(':') > 0)   // With no URI-schema we may get "http:/" links with the second / missing
+                {
+                    m.appendReplacement(sb, "<a href=\"$0\">$0</a>");
+                }
+                else
+                {
+                    m.appendReplacement(sb, "<a href=\"http://$0\">$0</a>");
+                }
+            }
+            else
+            {
+                m.appendReplacement(sb, "$0");
+            }
+        }
+
+        m.appendTail(sb);
+        sb.append(htmlifyMessageFooter());
+        sb.append("</body></html>");
+        text = sb.toString();
+
+        return text;
+    }
+
+    private static String htmlifyMessageHeader()
+    {
+        if (K9.messageViewFixedWidthFont())
+        {
+            return "<pre style=\"white-space: pre-wrap; word-wrap:break-word; \">";
+        }
+        else
+        {
+            return "<div style=\"white-space: pre-wrap; word-wrap:break-word; \">";
+        }
+    }
+
+    private static String htmlifyMessageFooter()
+    {
+        if (K9.messageViewFixedWidthFont())
+        {
+            return "</pre>";
+        }
+        else
+        {
+            return "</div>";
+        }
+    }
+
+}
--- a/src/com/fsck/k9/helper/HtmlToTextTagHandler.java
+++ b/src/com/fsck/k9/helper/HtmlToTextTagHandler.java
@ -1,85 +0,0 @@
-package com.fsck.k9.helper;
-
-import android.text.Annotation;
-import android.text.Editable;
-import android.text.Html;
-import android.text.Spannable;
-import org.xml.sax.XMLReader;
-
-/**
- * Custom tag handler to use when converting HTML messages to text. It currently handles text
- * representations of HTML tags that Android's built-in parser doesn't understand and hides code
- * contained in STYLE and SCRIPT blocks.
- */
-public class HtmlToTextTagHandler implements Html.TagHandler
-{
-    @Override
-    public void handleTag(boolean opening, String tag, Editable output, XMLReader xmlReader)
-    {
-        if (tag.equalsIgnoreCase("hr") && opening)
-        {
-            // In the case of an <hr>, replace it with a bunch of underscores.  This is roughly the behaviour of
-            // Outlook in Rich Text mode.
-            output.append("_____________________________________________\n");
-        }
-        else if (tag.equalsIgnoreCase("style") || tag.equalsIgnoreCase("script"))
-        {
-            handleIgnoredTag(opening, output);
-        }
-    }
-
-    private static final String IGNORED_ANNOTATION_KEY = "K9_ANNOTATION";
-    private static final String IGNORED_ANNOTATION_VALUE = "hiddenSpan";
-
-    /**
-     * When we come upon an ignored tag, we mark it with an Annotation object with a specific key
-     * and value as above. We don't really need to be checking these values since Html.fromHtml()
-     * doesn't use Annotation spans, but we should do it now to be safe in case they do start using
-     * it in the future.
-     * @param opening If this is an opening tag or not.
-     * @param output Spannable string that we're working with.
-     */
-    private void handleIgnoredTag(boolean opening, Editable output)
-    {
-        int len = output.length();
-        if (opening)
-        {
-            output.setSpan(new Annotation(IGNORED_ANNOTATION_KEY, IGNORED_ANNOTATION_VALUE), len,
-                    len, Spannable.SPAN_MARK_MARK);
-        }
-        else
-        {
-            Object start = getOpeningAnnotation(output);
-            if (start != null)
-            {
-                int where = output.getSpanStart(start);
-                // Remove the temporary Annotation span.
-                output.removeSpan(start);
-                // Delete everything between the start of the Annotation and the end of the string
-                // (what we've generated so far).
-                output.delete(where, len);
-            }
-        }
-    }
-
-    /**
-     * Fetch the matching opening Annotation object and verify that it's the one added by K9.
-     * @param output Spannable string we're working with.
-     * @return Starting Annotation object.
-     */
-    private Object getOpeningAnnotation(Editable output)
-    {
-        Object[] objs = output.getSpans(0, output.length(), Annotation.class);
-        for (int i = objs.length - 1; i >= 0; i--)
-        {
-            Annotation span = (Annotation) objs[i];
-            if (output.getSpanFlags(objs[i]) == Spannable.SPAN_MARK_MARK
-                    && span.getKey().equals(IGNORED_ANNOTATION_KEY)
-                    && span.getValue().equals(IGNORED_ANNOTATION_VALUE))
-            {
-                return objs[i];
-            }
-        }
-        return null;
-    }
-}
--- a/src/com/fsck/k9/mail/store/LocalStore.java
+++ b/src/com/fsck/k9/mail/store/LocalStore.java
@ -13,9 +13,8 @@ import java.util.List;
 import java.util.Map;
 import java.util.Set;
 import java.util.UUID;
-import java.util.regex.Matcher;

-import com.fsck.k9.helper.HtmlToTextTagHandler;
+import com.fsck.k9.helper.HtmlConverter;
 import org.apache.commons.io.IOUtils;

 import android.app.Application;
@ -26,7 +25,6 @@ import android.database.Cursor;
 import android.database.sqlite.SQLiteDatabase;
 import android.database.sqlite.SQLiteException;
 import android.net.Uri;
-import android.text.Html;
 import android.util.Log;

 import com.fsck.k9.Account;
@ -34,7 +32,6 @@ import com.fsck.k9.K9;
 import com.fsck.k9.Preferences;
 import com.fsck.k9.controller.MessageRemovalListener;
 import com.fsck.k9.controller.MessageRetrievalListener;
-import com.fsck.k9.helper.Regex;
 import com.fsck.k9.helper.Utility;
 import com.fsck.k9.mail.Address;
 import com.fsck.k9.mail.Body;
@ -76,8 +73,6 @@ public class LocalStore extends Store implements Serializable

    private static final Flag[] PERMANENT_FLAGS = { Flag.DELETED, Flag.X_DESTROYED, Flag.SEEN, Flag.FLAGGED };

-    private static final int MAX_SMART_HTMLIFY_MESSAGE_LENGTH = 1024 * 256 ;
-
    private static Set<String> HEADERS_TO_SAVE = new HashSet<String>();
    static
    {
@ -99,14 +94,6 @@ public class LocalStore extends Store implements Serializable
        "subject, sender_list, date, uid, flags, id, to_list, cc_list, "
        + "bcc_list, reply_to_list, attachment_count, internal_date, message_id, folder_id, preview ";

-    /**
-     * When generating previews, Spannable objects that can't be converted into a String are
-     * represented as 0xfffc. When displayed, these show up as undisplayed squares. These constants
-     * define the object character and the replacement character.
-     */
-    private static final char PREVIEW_OBJECT_CHARACTER = (char)0xfffc;
-    private static final char PREVIEW_OBJECT_REPLACEMENT = (char)0x20;  // space
-
    protected static final int DB_VERSION = 39;

    protected String uUid = null;
@ -2370,7 +2357,7 @@ public class LocalStore extends Store implements Serializable
                                // If we couldn't generate a reasonable preview from the text part, try doing it with the HTML part.
                                if (preview == null || preview.length() == 0)
                                {
-                                    preview = calculateContentPreview(Html.fromHtml(html, null, new HtmlToTextTagHandler()).toString().replace(PREVIEW_OBJECT_CHARACTER, PREVIEW_OBJECT_REPLACEMENT));
+                                    preview = calculateContentPreview(HtmlConverter.htmlToText(html));
                                }

                                try
@ -2498,7 +2485,7 @@ public class LocalStore extends Store implements Serializable
                            // If we couldn't generate a reasonable preview from the text part, try doing it with the HTML part.
                            if (preview == null || preview.length() == 0)
                            {
-                                preview = calculateContentPreview(Html.fromHtml(html, null, new HtmlToTextTagHandler()).toString().replace(PREVIEW_OBJECT_CHARACTER, PREVIEW_OBJECT_REPLACEMENT));
+                                preview = calculateContentPreview(HtmlConverter.htmlToText(html));
                            }
                            try
                            {
@ -3140,7 +3127,7 @@ public class LocalStore extends Store implements Serializable
        {
            if (text.length() > 0 && html.length() == 0)
            {
-                html = htmlifyString(text);
+                html = HtmlConverter.textToHtml(text);
            }

            html = convertEmoji2Img(html);
@ -3148,116 +3135,6 @@ public class LocalStore extends Store implements Serializable
            return html;
        }

-        public String htmlifyString(String text)
-        {
-            // Our HTMLification code is somewhat memory intensive
-            // and was causing lots of OOM errors on the market
-            // if the message is big and plain text, just do
-            // a trivial htmlification
-            if (text.length() > MAX_SMART_HTMLIFY_MESSAGE_LENGTH)
-            {
-                return "<html><head/><body>" +
-                       htmlifyMessageHeader() +
-                       text +
-                       htmlifyMessageFooter() +
-                       "</body></html>";
-            }
-            StringReader reader = new StringReader(text);
-            StringBuilder buff = new StringBuilder(text.length() + 512);
-            int c;
-            try
-            {
-                while ((c = reader.read()) != -1)
-                {
-                    switch (c)
-                    {
-                        case '&':
-                            buff.append("&amp;");
-                            break;
-                        case '<':
-                            buff.append("&lt;");
-                            break;
-                        case '>':
-                            buff.append("&gt;");
-                            break;
-                        case '\r':
-                            break;
-                        default:
-                            buff.append((char)c);
-                    }//switch
-                }
-            }
-            catch (IOException e)
-            {
-                //Should never happen
-                Log.e(K9.LOG_TAG, null, e);
-            }
-            text = buff.toString();
-            text = text.replaceAll("\\s*([-=_]{30,}+)\\s*","<hr />");
-            text = text.replaceAll("(?m)^([^\r\n]{4,}[\\s\\w,:;+/])(?:\r\n|\n|\r)(?=[a-z]\\S{0,10}[\\s\\n\\r])","$1 ");
-            text = text.replaceAll("(?m)(\r\n|\n|\r){4,}","\n\n");
-
-
-            Matcher m = Regex.WEB_URL_PATTERN.matcher(text);
-            StringBuffer sb = new StringBuffer(text.length() + 512);
-            sb.append("<html><head></head><body>");
-            sb.append(htmlifyMessageHeader());
-            while (m.find())
-            {
-                int start = m.start();
-                if (start == 0 || (start != 0 && text.charAt(start - 1) != '@'))
-                {
-                    if (m.group().indexOf(':') > 0)   // With no URI-schema we may get "http:/" links with the second / missing
-                    {
-                        m.appendReplacement(sb, "<a href=\"$0\">$0</a>");
-                    }
-                    else
-                    {
-                        m.appendReplacement(sb, "<a href=\"http://$0\">$0</a>");
-                    }
-                }
-                else
-                {
-                    m.appendReplacement(sb, "$0");
-                }
-            }
-
-
-
-
-            m.appendTail(sb);
-            sb.append(htmlifyMessageFooter());
-            sb.append("</body></html>");
-            text = sb.toString();
-
-            return text;
-        }
-
-        private String htmlifyMessageHeader()
-        {
-            if (K9.messageViewFixedWidthFont())
-            {
-                return "<pre style=\"white-space: pre-wrap; word-wrap:break-word; \">";
-            }
-            else
-            {
-                return "<div style=\"white-space: pre-wrap; word-wrap:break-word; \">";
-            }
-        }
-
-
-        private String htmlifyMessageFooter()
-        {
-            if (K9.messageViewFixedWidthFont())
-            {
-                return "</pre>";
-            }
-            else
-            {
-                return "</div>";
-            }
-        }
-
        public String convertEmoji2Img(String html)
        {
            StringBuilder buff = new StringBuilder(html.length() + 512);