Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Unverified commit 7b105d7c, authored by cketti and committed by GitHub
Browse files

Merge pull request #3639 from k9mail/html_to_text

Use Jsoup to convert HTML to plain text
parents 300076c5 b1cfa302
Loading
Loading
Loading
Loading
+4 −85
Original line number Diff line number Diff line
package com.fsck.k9.message.html;


import java.util.Collections;
import java.util.HashSet;
import java.util.Locale;
import java.util.Set;

import android.text.Annotation;
import android.text.Editable;
import android.text.Html;
import android.text.Html.TagHandler;
import android.text.Spannable;
import android.text.Spanned;

import com.fsck.k9.K9;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.xml.sax.XMLReader;

/**
@@ -42,88 +37,12 @@ public class HtmlConverter {
     * @return Plain text result.
     */
    public static String htmlToText(final String html) {
        // NOTE(review): the following line appears to be the pre-change statement as rendered by
        // the diff view (conversion via Html.fromHtml() with HtmlToTextTagHandler). It has no
        // terminating semicolon and cannot coexist with the Jsoup-based statements below --
        // confirm which version is intended before compiling this text as-is.
        return Html.fromHtml(html, null, new HtmlToTextTagHandler()).toString()
        // Parse the HTML with Jsoup, convert the document's <body> element to plain text via
        // HtmlToPlainText, then apply the PREVIEW_OBJECT_* and NBSP_* replacements (those
        // constants are declared outside this view).
        Document document = Jsoup.parse(html);
        return HtmlToPlainText.toPlainText(document.body())
               .replace(PREVIEW_OBJECT_CHARACTER, PREVIEW_OBJECT_REPLACEMENT)
               .replace(NBSP_CHARACTER, NBSP_REPLACEMENT);
    }

    /**
     * Tag handler passed to {@link Html#fromHtml} when flattening an HTML message to text.
     *
     * <p>It renders an opening {@code <hr>} as a line of underscores and drops everything that was
     * emitted between the opening and closing tag of STYLE, SCRIPT, TITLE and comment ("!") tags.
     */
    private static class HtmlToTextTagHandler implements Html.TagHandler {
        // Key/value pair identifying the marker spans created by this handler.
        private static final String IGNORED_ANNOTATION_KEY = "K9_ANNOTATION";
        private static final String IGNORED_ANNOTATION_VALUE = "hiddenSpan";

        // Lower-case names of the tags whose content is removed from the output.
        private static final Set<String> TAGS_WITH_IGNORED_CONTENT = buildIgnoredTags();

        private static Set<String> buildIgnoredTags() {
            Set<String> ignored = new HashSet<>();
            ignored.add("style");
            ignored.add("script");
            ignored.add("title");
            ignored.add("!");   // comments
            return Collections.unmodifiableSet(ignored);
        }

        @Override
        public void handleTag(boolean opening, String tag, Editable output, XMLReader xmlReader) {
            String normalizedTag = tag.toLowerCase(Locale.US);

            if (opening && normalizedTag.equals("hr")) {
                // Horizontal rule: emit a row of underscores, roughly what Outlook does in
                // Rich Text mode.
                output.append("_____________________________________________\r\n");
                return;
            }

            if (TAGS_WITH_IGNORED_CONTENT.contains(normalizedTag)) {
                handleIgnoredTag(opening, output);
            }
        }

        /**
         * Marks the start of an ignored tag, or removes its content once the tag is closed.
         *
         * <p>On the opening tag a zero-length {@link Annotation} span carrying our own key/value is
         * placed at the current end of the output. On the closing tag the most recent such span is
         * looked up, removed, and all text from its start position up to the current end of the
         * output is deleted. The key/value check guards against picking up Annotation spans that
         * did not originate here.
         *
         * @param opening {@code true} for the opening tag, {@code false} for the closing tag.
         * @param output The text generated so far.
         */
        private void handleIgnoredTag(boolean opening, Editable output) {
            int end = output.length();

            if (opening) {
                Annotation marker = new Annotation(IGNORED_ANNOTATION_KEY, IGNORED_ANNOTATION_VALUE);
                output.setSpan(marker, end, end, Spannable.SPAN_MARK_MARK);
                return;
            }

            Object marker = getOpeningAnnotation(output);
            if (marker == null) {
                return;
            }

            int start = output.getSpanStart(marker);
            // The marker has served its purpose; drop it before cutting the text.
            output.removeSpan(marker);
            // Cut everything generated since the marker was placed.
            output.delete(start, end);
        }

        /**
         * Finds the last Annotation span in the output that was created by this handler.
         *
         * @param output The text generated so far.
         * @return The matching span, or {@code null} if there is none.
         */
        private Object getOpeningAnnotation(Editable output) {
            Object[] spans = output.getSpans(0, output.length(), Annotation.class);

            int index = spans.length;
            while (--index >= 0) {
                Object candidate = spans[index];
                Annotation annotation = (Annotation) candidate;

                boolean isOurMarker = output.getSpanFlags(candidate) == Spannable.SPAN_MARK_MARK
                        && annotation.getKey().equals(IGNORED_ANNOTATION_KEY)
                        && annotation.getValue().equals(IGNORED_ANNOTATION_VALUE);
                if (isOurMarker) {
                    return candidate;
                }
            }

            return null;
        }
    }

    /**
     * Convert a text string into an HTML document.
     *
+124 −0
Original line number Diff line number Diff line
package com.fsck.k9.message.html

import org.jsoup.nodes.Element
import org.jsoup.nodes.Node
import org.jsoup.nodes.TextNode
import org.jsoup.select.NodeTraversor
import org.jsoup.select.NodeVisitor

/**
 * Renders an HTML element as plain text.
 *
 * Adapted from the HtmlToPlainText example that ships with Jsoup.
 */
object HtmlToPlainText {
    /**
     * Walks [element] and its descendants with a fresh [FormattingVisitor] and returns the text
     * the visitor collected.
     */
    @JvmStatic
    fun toPlainText(element: Element): String =
        FormattingVisitor()
            .also { visitor -> NodeTraversor.traverse(visitor, element) }
            .toString()
}

/**
 * [NodeVisitor] that builds a plain-text rendering of the nodes it is shown.
 *
 * Text nodes are appended as-is, `li` elements become "* " bullet lines, block elements start on
 * a new line and (when they contain text) are followed by an empty line, and `a` elements whose
 * `href` resolves to an absolute URL get the `href` appended in angle brackets. Text that does
 * not fit on the current line is wrapped at [MAX_WIDTH] characters.
 */
private class FormattingVisitor : NodeVisitor {
    /** Number of characters already written to the current (last) line of [output]. */
    private var width = 0
    private val output = StringBuilder()

    override fun head(node: Node, depth: Int) {
        val name = node.nodeName()
        when {
            node is TextNode -> append(node.text())
            name == "li" -> {
                startNewLine()
                append("* ")
            }
            node is Element && node.isBlock -> startNewLine()
        }
    }

    override fun tail(node: Node, depth: Int) {
        val name = node.nodeName()
        when {
            name == "li" -> append("\n")
            node is Element && node.isBlock -> {
                if (node.hasText()) {
                    addEmptyLine()
                }
            }
            name == "a" -> {
                // absUrl() is only used as the "is this an absolute link" check; the href is
                // emitted exactly as written in the document.
                if (node.absUrl("href").isNotEmpty()) {
                    append(" <${node.attr("href")}>")
                }
            }
        }
    }

    /**
     * Appends [text] to the output, wrapping at word boundaries when it does not fit on the
     * current line. A lone space is dropped when it would start the output or follow another
     * space or a line break.
     */
    private fun append(text: String) {
        if (text.startsWith("\n")) {
            width = 0
        }

        if (text == " " && (output.isEmpty() || output.last() == ' ' || output.last() == '\n')) {
            return
        }

        if (text.length + width <= MAX_WIDTH) {
            output.append(text)
            // Only the characters after the last line break count towards the current line, so
            // a plain "\n" leaves the width at 0 instead of 1.
            val lastBreak = text.lastIndexOf('\n')
            width = if (lastBreak >= 0) text.length - lastBreak - 1 else width + text.length
            return
        }

        val words = text.split(WHITESPACE)
        words.forEachIndexed { index, rawWord ->
            // Every word except the last one is followed by a single space.
            val word = if (index < words.lastIndex) "$rawWord " else rawWord

            // Break the line only if something is already on it; an overlong word at the start
            // of a line cannot be helped by a break and would otherwise gain a blank line.
            if (width > 0 && word.length + width > MAX_WIDTH) {
                output.append('\n')
                width = 0
            }

            output.append(word)
            width += word.length
        }
    }

    /** Ends the current line unless the output is empty or already at the start of a line. */
    private fun startNewLine() {
        if (output.isEmpty() || output.last() == '\n') {
            return
        }

        append("\n")
    }

    /** Makes sure non-empty output ends with an empty line (two consecutive line breaks). */
    private fun addEmptyLine() {
        if (output.isEmpty() || output.endsWith("\n\n")) {
            return
        }

        startNewLine()
        append("\n")
    }

    /** Returns the collected text with all trailing line breaks removed. */
    override fun toString(): String = output.trimEnd('\n').toString()

    companion object {
        private const val MAX_WIDTH = 76

        /** Word separator used when wrapping; compiled once instead of on every append. */
        private val WHITESPACE = Regex("\\s+")
    }
}