Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit b1cfa302 authored by cketti's avatar cketti
Browse files

Use different method to convert HTML to plain text

Html.fromHtml() exhibited some serious performance issues with certain
inputs.

See issue #3624
parent 300076c5
Loading
Loading
Loading
Loading
+4 −85
Original line number Diff line number Diff line
package com.fsck.k9.message.html;


import java.util.Collections;
import java.util.HashSet;
import java.util.Locale;
import java.util.Set;

import android.text.Annotation;
import android.text.Editable;
import android.text.Html;
import android.text.Html.TagHandler;
import android.text.Spannable;
import android.text.Spanned;

import com.fsck.k9.K9;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.xml.sax.XMLReader;

/**
@@ -42,88 +37,12 @@ public class HtmlConverter {
     * @return Plain text result.
     */
    public static String htmlToText(final String html) {
        // NOTE(review): this listing looks like a rendered diff with the +/- markers stripped.
        // The next line appears to be the previous Html.fromHtml()-based implementation; it has
        // no terminator of its own and shares the .replace(...) chain below — confirm against
        // the real file which of the two return statements is current.
        return Html.fromHtml(html, null, new HtmlToTextTagHandler()).toString()
        // Parse the HTML with Jsoup and render the document body as plain text.
        Document document = Jsoup.parse(html);
        return HtmlToPlainText.toPlainText(document.body())
               // Swap the preview-object and NBSP characters for their replacement values.
               .replace(PREVIEW_OBJECT_CHARACTER, PREVIEW_OBJECT_REPLACEMENT)
               .replace(NBSP_CHARACTER, NBSP_REPLACEMENT);
    }

    /**
     * Tag handler used while converting HTML messages to plain text.
     *
     * <p>It is invoked through {@link Html.TagHandler#handleTag}: an opening {@code <hr>} is
     * rendered as a line of underscores, and everything generated between the opening and closing
     * tag of STYLE, SCRIPT, TITLE and comment ("!") elements is removed from the output.</p>
     */
    private static class HtmlToTextTagHandler implements Html.TagHandler {
        // Key/value identifying the marker Annotation spans created by this handler.
        private static final String IGNORED_ANNOTATION_KEY = "K9_ANNOTATION";
        private static final String IGNORED_ANNOTATION_VALUE = "hiddenSpan";

        // Lower-case names of the tags whose content is dropped from the output.
        private static final Set<String> TAGS_WITH_IGNORED_CONTENT;
        static {
            Set<String> ignoredTags = new HashSet<>();
            // "!" stands for comments.
            Collections.addAll(ignoredTags, "style", "script", "title", "!");
            TAGS_WITH_IGNORED_CONTENT = Collections.unmodifiableSet(ignoredTags);
        }

        @Override
        public void handleTag(boolean opening, String tag, Editable output, XMLReader xmlReader) {
            String normalizedTag = tag.toLowerCase(Locale.US);

            if (opening && normalizedTag.equals("hr")) {
                // Render <hr> as a run of underscores, roughly what Outlook does in Rich Text mode.
                output.append("_____________________________________________\r\n");
                return;
            }

            if (TAGS_WITH_IGNORED_CONTENT.contains(normalizedTag)) {
                handleIgnoredTag(opening, output);
            }
        }

        /**
         * Handle the opening or closing of a tag whose content must not reach the output.
         *
         * <p>On opening, a zero-length marker Annotation is placed at the current end of the
         * output. On closing, the most recent marker is looked up and everything from its position
         * to the current end of the output is deleted. The key/value pair on the marker guards
         * against confusing it with Annotation spans from any other origin.</p>
         *
         * @param opening {@code true} for an opening tag, {@code false} for a closing tag.
         * @param output Spannable string that is being built.
         */
        private void handleIgnoredTag(boolean opening, Editable output) {
            int end = output.length();

            if (opening) {
                Annotation marker = new Annotation(IGNORED_ANNOTATION_KEY, IGNORED_ANNOTATION_VALUE);
                output.setSpan(marker, end, end, Spannable.SPAN_MARK_MARK);
                return;
            }

            Object openingMarker = getOpeningAnnotation(output);
            if (openingMarker == null) {
                return;
            }

            int begin = output.getSpanStart(openingMarker);
            // The marker is temporary; drop it before cutting the text it was tracking.
            output.removeSpan(openingMarker);
            // Remove everything produced since the marker was set.
            output.delete(begin, end);
        }

        /**
         * Find the last marker Annotation in the output that was added by this handler.
         *
         * @param output Spannable string that is being built.
         * @return The matching Annotation span, or {@code null} if there is none.
         */
        private Object getOpeningAnnotation(Editable output) {
            Annotation[] annotations = output.getSpans(0, output.length(), Annotation.class);

            // Walk backwards through the returned array so the last matching entry wins.
            for (int index = annotations.length - 1; index >= 0; index--) {
                Annotation candidate = annotations[index];
                boolean isMarker = output.getSpanFlags(candidate) == Spannable.SPAN_MARK_MARK
                        && candidate.getKey().equals(IGNORED_ANNOTATION_KEY)
                        && candidate.getValue().equals(IGNORED_ANNOTATION_VALUE);
                if (isMarker) {
                    return candidate;
                }
            }

            return null;
        }
    }

    /**
     * Convert a text string into an HTML document.
     *
+124 −0
Original line number Diff line number Diff line
package com.fsck.k9.message.html

import org.jsoup.nodes.Element
import org.jsoup.nodes.Node
import org.jsoup.nodes.TextNode
import org.jsoup.select.NodeTraversor
import org.jsoup.select.NodeVisitor

/**
 * Converts an HTML element into its plain-text representation.
 *
 * Derived from the HtmlToPlainText example that ships with Jsoup.
 */
object HtmlToPlainText {
    /**
     * Traverses [element] with a fresh [FormattingVisitor] and returns the text it collected.
     */
    @JvmStatic
    fun toPlainText(element: Element): String {
        val visitor = FormattingVisitor().also { NodeTraversor.traverse(it, element) }
        return visitor.toString()
    }
}

/**
 * [NodeVisitor] that accumulates a plain-text rendering of the nodes it is given.
 *
 * Text nodes are appended and wrapped at [MAX_WIDTH] columns, list items are prefixed with
 * `"* "`, block elements are separated by line breaks, and links are followed by their `href`
 * in angle brackets. The collected text is available through [toString].
 */
private class FormattingVisitor : NodeVisitor {
    /** Number of characters already written on the current (last) output line. */
    private var width = 0
    private val output = StringBuilder()

    /** Handles the start of [node]: emits text, list bullets and leading line breaks. */
    override fun head(node: Node, depth: Int) {
        val name = node.nodeName()
        when {
            node is TextNode -> append(node.text())
            name == "li" -> {
                startNewLine()
                append("* ")
            }
            node is Element && node.isBlock -> startNewLine()
        }
    }

    /** Handles the end of [node]: emits trailing line breaks and link targets. */
    override fun tail(node: Node, depth: Int) {
        val name = node.nodeName()
        when {
            name == "li" -> append("\n")
            node is Element && node.isBlock -> {
                // Only blocks that actually contain text are followed by an empty line.
                if (node.hasText()) {
                    addEmptyLine()
                }
            }
            name == "a" -> {
                // The link is only printed when absUrl() yields a value, but the text that is
                // printed is the href attribute exactly as written in the document.
                if (node.absUrl("href").isNotEmpty()) {
                    append(" <${node.attr("href")}>")
                }
            }
        }
    }

    /**
     * Appends [text] to the output, wrapping at word boundaries so that lines stay within
     * [MAX_WIDTH] columns where possible.
     *
     * A lone space is dropped when the output is empty or already ends in a space or newline.
     */
    private fun append(text: String) {
        val lastChar = output.lastOrNull()
        if (text == " " && (lastChar == null || lastChar == ' ' || lastChar == '\n')) {
            return
        }

        if (text.startsWith("\n")) {
            width = 0
        }

        if (text.length + width > MAX_WIDTH) {
            val words = text.split(WHITESPACE)
            words.forEachIndexed { index, rawWord ->
                // Every word but the last gets its separating space back.
                val word = if (index < words.lastIndex) "$rawWord " else rawWord

                // Break the line only if it already has content: a word that is wider than
                // MAX_WIDTH cannot be made to fit by moving it, and breaking in front of it
                // would just leave a blank line behind.
                if (width > 0 && word.length + width > MAX_WIDTH) {
                    output.append('\n')
                    width = 0
                }

                output.append(word)
                width += word.length
            }
        } else {
            output.append(text)

            // Characters up to and including a newline belong to previous lines and must not
            // count towards the width of the current one.
            val lastNewline = text.lastIndexOf('\n')
            width = if (lastNewline >= 0) text.length - lastNewline - 1 else width + text.length
        }
    }

    /** Moves to a new line unless the output is empty or already at the start of a line. */
    private fun startNewLine() {
        if (output.isNotEmpty() && output.last() != '\n') {
            append("\n")
        }
    }

    /** Ensures non-empty output ends with an empty line (two consecutive newlines). */
    private fun addEmptyLine() {
        if (output.isEmpty() || output.endsWith("\n\n")) {
            return
        }

        startNewLine()
        append("\n")
    }

    /** Returns the collected text with all trailing newlines removed. */
    override fun toString(): String = output.trimEnd('\n').toString()

    companion object {
        /** Column at which text is wrapped. */
        private const val MAX_WIDTH = 76

        /** Matches runs of whitespace; used to split text into words when wrapping. */
        private val WHITESPACE = Regex("\\s+")
    }
}