Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Unverified Commit 8dd8881a authored by cketti's avatar cketti Committed by GitHub
Browse files

Merge pull request #4967 from k9mail/improve_preview_extraction

Improve preview text extraction
parents 908f7758 a1e9a521
Loading
Loading
Loading
Loading
+0 −69
Original line number Diff line number Diff line
package com.fsck.k9.message.extractors;


import androidx.annotation.NonNull;

import com.fsck.k9.message.html.HtmlConverter;
import com.fsck.k9.mail.Part;
import com.fsck.k9.mail.internet.MessageExtractor;

import static com.fsck.k9.mail.internet.MimeUtility.isSameMimeType;


/**
 * Builds the short, single-line preview text for a text part of a message.
 *
 * The text of the part is converted from HTML if necessary, then signatures, quoted text, quote
 * headers, horizontal rules and URLs are removed or replaced before the result is cut to
 * {@link #MAX_PREVIEW_LENGTH} characters.
 */
class PreviewTextExtractor {
    private static final int MAX_PREVIEW_LENGTH = 512;
    private static final int MAX_CHARACTERS_CHECKED_FOR_PREVIEW = 8192;

    // The patterns are compiled once instead of on every call (String.replaceAll() recompiles its
    // regular expression each time it is invoked).

    /** A signature that is correctly delimited by "-- " on a line of its own, up to the end of the text. */
    private static final java.util.regex.Pattern SIGNATURE =
            java.util.regex.Pattern.compile("(?ms)^-- [\\r\\n]+.*");
    /** A line starting with at least four dashes. */
    private static final java.util.regex.Pattern DASH_LINE =
            java.util.regex.Pattern.compile("(?m)^----.*?$");
    /** A line of quoted text, i.e. a line starting with '#' or '>'. */
    private static final java.util.regex.Pattern QUOTED_LINE =
            java.util.regex.Pattern.compile("(?m)^[#>].*$");
    /** The common "On ... wrote" quote header. */
    private static final java.util.regex.Pattern COMMON_QUOTE_HEADER =
            java.util.regex.Pattern.compile("(?m)^On .*wrote.?$");
    /** A more generic quote header: any line ending with a word character followed by a colon. */
    private static final java.util.regex.Pattern GENERIC_QUOTE_HEADER =
            java.util.regex.Pattern.compile("(?m)^.*\\w+:$");
    /** A horizontal rule made of 30 or more '-', '=' or '_' characters, including surrounding whitespace. */
    private static final java.util.regex.Pattern HORIZONTAL_RULE =
            java.util.regex.Pattern.compile("\\s*([-=_]{30,}+)\\s*");
    /** A http or https URL. */
    private static final java.util.regex.Pattern URL =
            java.util.regex.Pattern.compile("https?://\\S+");
    /** A run of whitespace; this includes line breaks. */
    private static final java.util.regex.Pattern WHITESPACE =
            java.util.regex.Pattern.compile("\\s+");


    /**
     * Extracts the preview text from the supplied text part.
     *
     * @param textPart the part to read the text from; its MIME type decides whether the text is
     *         converted from HTML first
     * @return the preview text, never {@code null}
     * @throws PreviewExtractionException if no text could be read from {@code textPart}
     */
    @NonNull
    public String extractPreview(@NonNull Part textPart) throws PreviewExtractionException {
        String text = MessageExtractor.getTextFromPart(textPart, MAX_CHARACTERS_CHECKED_FOR_PREVIEW);
        if (text == null) {
            throw new PreviewExtractionException("Couldn't get text from part");
        }

        String plainText = convertFromHtmlIfNecessary(textPart, text);

        return stripTextForPreview(plainText);
    }

    /** Returns {@code text} unchanged unless the part is "text/html", in which case it is converted to plain text. */
    private String convertFromHtmlIfNecessary(Part textPart, String text) {
        String mimeType = textPart.getMimeType();
        if (!isSameMimeType(mimeType, "text/html")) {
            return text;
        }

        return HtmlConverter.htmlToText(text);
    }

    /**
     * Removes everything that should not show up in a preview and shortens the result.
     *
     * @param text the plain text to clean up; {@code null} yields an empty preview
     * @return the cleaned up text on a single line, at most {@link #MAX_PREVIEW_LENGTH} characters long
     */
    private String stripTextForPreview(String text) {
        if (text == null) {
            return "";
        }

        String preview = text;
        preview = SIGNATURE.matcher(preview).replaceAll("");
        preview = DASH_LINE.matcher(preview).replaceAll("");
        preview = QUOTED_LINE.matcher(preview).replaceAll("");
        preview = COMMON_QUOTE_HEADER.matcher(preview).replaceAll("");
        preview = GENERIC_QUOTE_HEADER.matcher(preview).replaceAll("");
        preview = HORIZONTAL_RULE.matcher(preview).replaceAll(" ");

        // URLs in the preview should just be shown as "..." - They're not
        // clickable and they usually overwhelm the preview
        preview = URL.matcher(preview).replaceAll("...");
        // Collapse whitespace in the preview. "\s" also matches '\r' and '\n', so this step turns
        // line breaks into single spaces as well; no separate newline pass is needed.
        preview = WHITESPACE.matcher(preview).replaceAll(" ");
        // Remove any whitespace at the beginning and end of the string.
        preview = preview.trim();

        return truncate(preview);
    }

    /**
     * Cuts {@code preview} so that the result, including the appended ellipsis, is not longer than
     * {@link #MAX_PREVIEW_LENGTH} characters. Shorter text is returned unchanged.
     */
    private String truncate(String preview) {
        if (preview.length() <= MAX_PREVIEW_LENGTH) {
            return preview;
        }

        int endIndex = MAX_PREVIEW_LENGTH - 1;
        // Don't cut a surrogate pair in half; a lone high surrogate is not a valid character.
        if (Character.isHighSurrogate(preview.charAt(endIndex - 1))) {
            endIndex--;
        }

        return preview.substring(0, endIndex) + "…";
    }
}
+126 −0
Original line number Diff line number Diff line
package com.fsck.k9.message.extractors

import com.fsck.k9.mail.Part
import com.fsck.k9.mail.internet.MessageExtractor
import com.fsck.k9.mail.internet.MimeUtility.isSameMimeType
import com.fsck.k9.message.html.EmailSection
import com.fsck.k9.message.html.EmailSectionExtractor
import com.fsck.k9.message.html.HtmlConverter

/**
 * Builds the short, single-line preview text for a text part of a message.
 *
 * The text of the part is converted from HTML if necessary. The signature, quoted sections and a
 * trailing quote header are removed, horizontal rules and URLs are replaced, whitespace is
 * collapsed and the result is cut to [MAX_PREVIEW_LENGTH] characters.
 */
internal class PreviewTextExtractor {
    /**
     * Extracts the preview text from [textPart].
     *
     * @throws PreviewExtractionException if no text could be read from [textPart].
     */
    @Throws(PreviewExtractionException::class)
    fun extractPreview(textPart: Part): String {
        val text = MessageExtractor.getTextFromPart(textPart, MAX_CHARACTERS_CHECKED_FOR_PREVIEW)
            ?: throw PreviewExtractionException("Couldn't get text from part")

        val plainText = convertFromHtmlIfNecessary(textPart, text)
        return stripTextForPreview(plainText)
    }

    /** Converts [text] to plain text if [textPart] is a "text/html" part, otherwise returns it unchanged. */
    private fun convertFromHtmlIfNecessary(textPart: Part, text: String): String {
        return if (isSameMimeType(textPart.mimeType, "text/html")) {
            HtmlConverter.htmlToText(text)
        } else {
            text
        }
    }

    /** Removes everything that should not show up in a preview and shortens the result. */
    private fun stripTextForPreview(text: String): String {
        val unquotedText = extractUnquotedText(stripSignature(normalizeLineBreaks(text)))

        val preview = unquotedText
            // try to remove lines of dashes in the preview
            .replace(REGEX_DASH_LINE, "")
            // Remove horizontal rules.
            .replace(REGEX_HORIZONTAL_RULE, " ")
            // URLs in the preview should just be shown as "..." - They're not
            // clickable and they usually overwhelm the preview
            .replace(REGEX_URL, "...")
            // Collapse whitespace in the preview. "\s" also matches '\n', so line breaks end up
            // as single spaces without a separate replacement step.
            .replace(REGEX_WHITESPACE, " ")
            // Remove any whitespace at the beginning and end of the string.
            .trim()

        return truncate(preview)
    }

    /**
     * Cuts [preview] so that the result, including the appended ellipsis, is not longer than
     * [MAX_PREVIEW_LENGTH] characters. Shorter text is returned unchanged.
     */
    private fun truncate(preview: String): String {
        if (preview.length <= MAX_PREVIEW_LENGTH) return preview

        var endIndex = MAX_PREVIEW_LENGTH - 1
        // Don't cut a surrogate pair in half; a lone high surrogate is not a valid character.
        if (preview[endIndex - 1].isHighSurrogate()) endIndex--

        return preview.substring(0, endIndex) + "…"
    }

    /** Replaces "\r\n" and lone "\r" with "\n" so the remaining steps only have to deal with "\n". */
    private fun normalizeLineBreaks(text: String) = text.replace(REGEX_CRLF, "\n")

    /**
     * Removes the signature, i.e. everything from the first line consisting of "-- " onwards.
     * Expects line breaks to be normalized to "\n".
     */
    private fun stripSignature(text: String): String {
        return if (text.startsWith("-- \n")) {
            ""
        } else {
            text.substringBefore("\n-- \n")
        }
    }

    /**
     * Returns the parts of [text] that belong to sections with a quote depth of 0, joined by " […] ".
     *
     * A quote header at the end of the first section is removed. Blank sections are left out so
     * they can't produce a separator without any text next to it.
     */
    private fun extractUnquotedText(text: String): String {
        val emailSections = EmailSectionExtractor.extract(text)
        val firstEmailSection = emailSections.firstOrNull() ?: return ""

        val replySections = if (firstEmailSection.quoteDepth == 0) {
            val replyEmailSections = emailSections.drop(1).filter { it.quoteDepth == 0 && it.isNotBlank() }
            if (firstEmailSection.isQuoteHeaderOnly()) {
                replyEmailSections
            } else {
                val firstSectionTextWithoutQuoteHeader = stripQuoteHeader(firstEmailSection)
                if (firstSectionTextWithoutQuoteHeader.isBlank()) {
                    // Nothing but whitespace in front of the first quote; don't emit a leading separator.
                    replyEmailSections
                } else {
                    listOf(firstSectionTextWithoutQuoteHeader) + replyEmailSections
                }
            }
        } else {
            emailSections.filter { it.quoteDepth == 0 && it.isNotBlank() }
        }

        return replySections.joinToString(separator = " […] ")
    }

    /** Returns the text of [emailSection] without the quote header at its end, if there is one. */
    private fun stripQuoteHeader(emailSection: EmailSection): String {
        val quoteHeaderIndex = emailSection.quoteHeaderIndex
        if (quoteHeaderIndex == -1) return emailSection.toString()
        return emailSection.substring(startIndex = 0, endIndex = quoteHeaderIndex)
    }

    /** `true` if the quote header starts at the very beginning of the section. */
    private fun EmailSection.isQuoteHeaderOnly(): Boolean {
        return quoteHeaderIndex == 0
    }

    /**
     * The index at which the quote header at the end of this section starts, or `-1` if the
     * section doesn't end with one.
     *
     * A section is considered to end with a quote header if its last character, ignoring trailing
     * line breaks, is a colon. The header may span multiple lines. It extends backwards to the
     * closest run of two or more consecutive line breaks, or to the start of the section when
     * there is no such run.
     */
    private val EmailSection.quoteHeaderIndex: Int
        get() {
            var quoteHeaderIndex = lastIndex
            // An empty section can't contain a quote header (and must not be indexed).
            if (quoteHeaderIndex < 0) return -1

            // Ignore trailing line breaks.
            while (quoteHeaderIndex > 0 && this[quoteHeaderIndex] == '\n') {
                quoteHeaderIndex--
            }
            if (this[quoteHeaderIndex] != ':') return -1

            // Walk backwards until the first character in front of a run of 2+ line breaks.
            var newlineCount = 0
            while (quoteHeaderIndex > 0) {
                when {
                    this[quoteHeaderIndex] == '\n' -> newlineCount++
                    // The header starts at the first line break of the run that was just passed.
                    newlineCount > 1 -> return quoteHeaderIndex + 1
                    else -> newlineCount = 0
                }
                quoteHeaderIndex--
            }

            return 0
        }

    companion object {
        /** Maximum length of the preview text, including the ellipsis added when text is cut off. */
        private const val MAX_PREVIEW_LENGTH = 512

        /** Passed to [MessageExtractor.getTextFromPart]; presumably limits how much text is read. */
        private const val MAX_CHARACTERS_CHECKED_FOR_PREVIEW = 8192L

        // Compiled once; these are used for every preview that is created.
        private val REGEX_CRLF = "(\\r\\n|\\r)".toRegex()
        private val REGEX_DASH_LINE = "(?m)^----.*?$".toRegex()
        private val REGEX_HORIZONTAL_RULE = "\\s*([-=_]{30,}+)\\s*".toRegex()
        private val REGEX_URL = "https?://\\S+".toRegex()
        private val REGEX_WHITESPACE = "\\s+".toRegex()
    }
}
+0 −155
Original line number Diff line number Diff line
package com.fsck.k9.message.extractors;


import com.fsck.k9.RobolectricTest;
import com.fsck.k9.mail.Part;
import com.fsck.k9.mail.internet.MimeBodyPart;
import org.junit.Before;
import org.junit.Test;

import static com.fsck.k9.message.MessageCreationHelper.createTextPart;
import static org.junit.Assert.assertEquals;


/**
 * Tests for {@link PreviewTextExtractor}: each test feeds one text part to the extractor and
 * checks the resulting preview text.
 */
public class PreviewTextExtractorTest extends RobolectricTest {
    private static final String TEXT_PLAIN = "text/plain";
    private static final String TEXT_HTML = "text/html";

    private PreviewTextExtractor previewTextExtractor;


    @Before
    public void setUp() throws Exception {
        previewTextExtractor = new PreviewTextExtractor();
    }

    @Test(expected = PreviewExtractionException.class)
    public void extractPreview_withEmptyBody_shouldThrow() throws Exception {
        Part partWithoutBody = new MimeBodyPart(null, "text/plain");

        previewTextExtractor.extractPreview(partWithoutBody);
    }

    @Test
    public void extractPreview_withSimpleTextPlain() throws Exception {
        String body = "The quick brown fox jumps over the lazy dog";

        assertEquals(body, previewOf(TEXT_PLAIN, body));
    }

    @Test
    public void extractPreview_withSimpleTextHtml() throws Exception {
        String body = "<b>The quick brown fox jumps over the lazy dog</b>";

        assertEquals("The quick brown fox jumps over the lazy dog", previewOf(TEXT_HTML, body));
    }

    @Test
    public void extractPreview_withLongTextPlain() throws Exception {
        // 520 characters, i.e. longer than the maximum preview length.
        String body =
                "10--------20--------30--------40--------50--------" +
                "60--------70--------80--------90--------100-------" +
                "110-------120-------130-------140-------150-------" +
                "160-------170-------180-------190-------200-------" +
                "210-------220-------230-------240-------250-------" +
                "260-------270-------280-------290-------300-------" +
                "310-------320-------330-------340-------350-------" +
                "360-------370-------380-------390-------400-------" +
                "410-------420-------430-------440-------450-------" +
                "460-------470-------480-------490-------500-------" +
                "510-------520-------";
        String expected = body.substring(0, 511) + "…";

        assertEquals(expected, previewOf(TEXT_PLAIN, body));
    }

    @Test
    public void extractPreview_shouldStripSignature() throws Exception {
        String body =
                "Some text\r\n" +
                "-- \r\n" +
                "Signature";

        assertEquals("Some text", previewOf(TEXT_PLAIN, body));
    }

    @Test
    public void extractPreview_shouldStripHorizontalLine() throws Exception {
        String body =
                "line 1\r\n" +
                "----\r\n" +
                "line 2";

        assertEquals("line 1 line 2", previewOf(TEXT_PLAIN, body));
    }

    @Test
    public void extractPreview_shouldStripQuoteHeaderAndQuotedText() throws Exception {
        String body =
                "some text\r\n" +
                "On 01/02/03 someone wrote\r\n" +
                "> some quoted text\r\n" +
                "# some other quoted text\r\n";

        assertEquals("some text", previewOf(TEXT_PLAIN, body));
    }

    @Test
    public void extractPreview_shouldStripGenericQuoteHeader() throws Exception {
        String body =
                "Am 13.12.2015 um 23:42 schrieb Hans:\r\n" +
                "> hallo\r\n" +
                "hi there\r\n";

        assertEquals("hi there", previewOf(TEXT_PLAIN, body));
    }

    @Test
    public void extractPreview_shouldStripHorizontalRules() throws Exception {
        String body =
                "line 1" +
                "------------------------------\r\n" +
                "line 2";

        assertEquals("line 1 line 2", previewOf(TEXT_PLAIN, body));
    }

    @Test
    public void extractPreview_shouldReplaceUrl() throws Exception {
        String body = "some url: https://k9mail.org/";

        assertEquals("some url: ...", previewOf(TEXT_PLAIN, body));
    }

    @Test
    public void extractPreview_shouldCollapseAndTrimWhitespace() throws Exception {
        String body = " whitespace     is\t\tfun  ";

        assertEquals("whitespace is fun", previewOf(TEXT_PLAIN, body));
    }

    /** Wraps {@code body} in a text part with the given MIME type and returns the preview extracted from it. */
    private String previewOf(String mimeType, String body) throws Exception {
        Part part = createTextPart(mimeType, body);
        return previewTextExtractor.extractPreview(part);
    }
}
+200 −0
Original line number Diff line number Diff line
package com.fsck.k9.message.extractors

import com.fsck.k9.mail.internet.MimeBodyPart
import com.fsck.k9.message.MessageCreationHelper
import com.google.common.truth.Truth.assertThat
import org.junit.Test

/**
 * Tests for [PreviewTextExtractor]: each test feeds one text part to the extractor and checks the
 * resulting preview text.
 */
class PreviewTextExtractorTest {
    private val previewTextExtractor = PreviewTextExtractor()

    @Test(expected = PreviewExtractionException::class)
    fun extractPreview_withEmptyBody_shouldThrow() {
        val part = MimeBodyPart(null, "text/plain")

        previewTextExtractor.extractPreview(part)
    }

    @Test
    fun extractPreview_withSimpleTextPlain() {
        val text = "The quick brown fox jumps over the lazy dog"

        val preview = previewOf("text/plain", text)

        assertThat(preview).isEqualTo(text)
    }

    @Test
    fun extractPreview_withSimpleTextHtml() {
        val text = "<b>The quick brown fox jumps over the lazy dog</b>"

        val preview = previewOf("text/html", text)

        assertThat(preview).isEqualTo("The quick brown fox jumps over the lazy dog")
    }

    @Test
    fun extractPreview_withLongTextPlain() {
        // 520 characters, i.e. longer than the maximum preview length.
        val text = "" +
            "10--------20--------30--------40--------50--------" +
            "60--------70--------80--------90--------100-------" +
            "110-------120-------130-------140-------150-------" +
            "160-------170-------180-------190-------200-------" +
            "210-------220-------230-------240-------250-------" +
            "260-------270-------280-------290-------300-------" +
            "310-------320-------330-------340-------350-------" +
            "360-------370-------380-------390-------400-------" +
            "410-------420-------430-------440-------450-------" +
            "460-------470-------480-------490-------500-------" +
            "510-------520-------"

        val preview = previewOf("text/plain", text)

        assertThat(preview).isEqualTo(text.substring(0, 511) + "…")
    }

    @Test
    fun extractPreview_shouldStripSignature() {
        // The signature delimiter is "-- " including the trailing space. It is spelled out with
        // escapes because trailing whitespace inside a raw string is invisible and is removed by
        // many editors and formatters, which would change what this test checks.
        val text = "Some text\n-- \nSignature"

        val preview = previewOf("text/plain", text)

        assertThat(preview).isEqualTo("Some text")
    }

    @Test
    fun extractPreview_shouldStripHorizontalLine() {
        val text = """
            line 1
            ----
            line 2
            """.trimIndent()

        val preview = previewOf("text/plain", text)

        assertThat(preview).isEqualTo("line 1 line 2")
    }

    @Test
    fun extractPreview_shouldStripQuoteHeaderAndQuotedText() {
        val text = """
            some text

            On 01/02/03 someone wrote:
            > some quoted text
            > some other quoted text
            """.trimIndent()

        val preview = previewOf("text/plain", text)

        assertThat(preview).isEqualTo("some text")
    }

    @Test
    fun extractPreview_shouldStripGenericQuoteHeader() {
        // The empty line in front of the closing quotes makes the text end with a line break.
        val text = """
            Am 13.12.2015 um 23:42 schrieb Hans:
            > hallo
            hi there

            """.trimIndent()

        val preview = previewOf("text/plain", text)

        assertThat(preview).isEqualTo("hi there")
    }

    @Test
    fun extractPreview_shouldStripHorizontalRules() {
        val text = """
            line 1------------------------------
            line 2
            """.trimIndent()

        val preview = previewOf("text/plain", text)

        assertThat(preview).isEqualTo("line 1 line 2")
    }

    @Test
    fun extractPreview_shouldReplaceUrl() {
        val text = "some url: https://k9mail.org/"

        val preview = previewOf("text/plain", text)

        assertThat(preview).isEqualTo("some url: ...")
    }

    @Test
    fun extractPreview_shouldCollapseAndTrimWhitespace() {
        val text = " whitespace     is\t\tfun  "

        val preview = previewOf("text/plain", text)

        assertThat(preview).isEqualTo("whitespace is fun")
    }

    @Test
    fun extractPreview_lineEndingWithColon() {
        val text = """
            Here's a list:
            - item 1
            - item 2
            """.trimIndent()

        val preview = previewOf("text/plain", text)

        assertThat(preview).isEqualTo("Here's a list: - item 1 - item 2")
    }

    @Test
    fun extractPreview_inlineReplies() {
        val text = """
            On 2020-09-30 at 03:12 Bob wrote:
            > Hi Alice
            Hi Bob

            > How are you?
            I'm fine. Thanks for asking.

            > Bye
            See you tomorrow
            """.trimIndent()

        val preview = previewOf("text/plain", text)

        assertThat(preview).isEqualTo("Hi Bob […] I'm fine. Thanks for asking. […] See you tomorrow")
    }

    @Test
    fun extractPreview_quoteHeaderContainingLineBreak() {
        val text = """
            Reply text

            On 2020-09-30 at 03:12
            Bob wrote:
            > Quoted text
            """.trimIndent()

        val preview = previewOf("text/plain", text)

        assertThat(preview).isEqualTo("Reply text")
    }

    /** Wraps [text] in a text part with the given MIME type and returns the preview extracted from it. */
    private fun previewOf(mimeType: String, text: String): String {
        val part = MessageCreationHelper.createTextPart(mimeType, text)
        return previewTextExtractor.extractPreview(part)
    }
}