Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Unverified Commit 65b57387 authored by cketti's avatar cketti Committed by GitHub
Browse files

Merge pull request #6280 from thundernest/fix_HtmlSignatureRemover

Fix `HtmlSignatureRemover`
parents a5f0572c 2fe28987
Loading
Loading
Loading
Loading
+1 −1
Original line number Diff line number Diff line
@@ -125,7 +125,7 @@ public class AdvancedNodeTraversor {
            Node prev = node;
            node = node.nextSibling();

            if (headResult == HeadFilterDecision.REMOVE) {
            if (headResult == HeadFilterDecision.REMOVE || tailResult == TailFilterDecision.REMOVE) {
                prev.remove();
            }

+0 −106
Original line number Diff line number Diff line
package com.fsck.k9.message.signature;


import java.util.regex.Pattern;

import androidx.annotation.NonNull;
import com.fsck.k9.helper.jsoup.AdvancedNodeTraversor;
import com.fsck.k9.helper.jsoup.NodeFilter;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.nodes.Node;
import org.jsoup.nodes.TextNode;
import org.jsoup.parser.Tag;


public class HtmlSignatureRemover {
    public static String stripSignature(String content) {
        return new HtmlSignatureRemover().stripSignatureInternal(content);
    }

    private String stripSignatureInternal(String content) {
        Document document = Jsoup.parse(content);

        AdvancedNodeTraversor nodeTraversor = new AdvancedNodeTraversor(new StripSignatureFilter());
        nodeTraversor.filter(document.body());

        return toCompactString(document);
    }

    private String toCompactString(Document document) {
        document.outputSettings()
                .prettyPrint(false)
                .indentAmount(0);

        return document.html();
    }


    static class StripSignatureFilter implements NodeFilter {
        private static final Pattern DASH_SIGNATURE_HTML = Pattern.compile("\\s*-- \\s*", Pattern.CASE_INSENSITIVE);
        private static final Tag BLOCKQUOTE = Tag.valueOf("blockquote");
        private static final Tag BR = Tag.valueOf("br");
        private static final Tag P = Tag.valueOf("p");


        private boolean signatureFound = false;
        private boolean lastElementCausedLineBreak = false;
        private Element brElementPrecedingDashes;


        @NonNull
        @Override
        public HeadFilterDecision head(Node node, int depth) {
            if (signatureFound) {
                return HeadFilterDecision.REMOVE;
            }

            if (node instanceof Element) {
                lastElementCausedLineBreak = false;

                Element element = (Element) node;
                if (element.tag().equals(BLOCKQUOTE)) {
                    return HeadFilterDecision.SKIP_ENTIRELY;
                }
            } else if (node instanceof TextNode) {
                TextNode textNode = (TextNode) node;
                if (lastElementCausedLineBreak && DASH_SIGNATURE_HTML.matcher(textNode.getWholeText()).matches()) {
                    Node nextNode = node.nextSibling();
                    if (nextNode instanceof Element && ((Element) nextNode).tag().equals(BR)) {
                        signatureFound = true;
                        if (brElementPrecedingDashes != null) {
                            brElementPrecedingDashes.remove();
                            brElementPrecedingDashes = null;
                        }

                        return HeadFilterDecision.REMOVE;
                    }
                }
            }

            return HeadFilterDecision.CONTINUE;
        }

        @NonNull
        @Override
        public TailFilterDecision tail(Node node, int depth) {
            if (signatureFound) {
                return TailFilterDecision.CONTINUE;
            }

            if (node instanceof Element) {
                Element element = (Element) node;
                boolean elementIsBr = element.tag().equals(BR);
                if (elementIsBr || element.tag().equals(P)) {
                    lastElementCausedLineBreak = true;
                    brElementPrecedingDashes = elementIsBr ? element : null;
                    return TailFilterDecision.CONTINUE;
                }
            }

            lastElementCausedLineBreak = false;
            return TailFilterDecision.CONTINUE;
        }
    }
}
+142 −0
Original line number Diff line number Diff line
package com.fsck.k9.message.signature

import com.fsck.k9.helper.jsoup.AdvancedNodeTraversor
import com.fsck.k9.helper.jsoup.NodeFilter
import com.fsck.k9.helper.jsoup.NodeFilter.HeadFilterDecision
import com.fsck.k9.helper.jsoup.NodeFilter.TailFilterDecision
import java.util.Stack
import java.util.regex.Pattern
import org.jsoup.Jsoup
import org.jsoup.nodes.Document
import org.jsoup.nodes.Element
import org.jsoup.nodes.Node
import org.jsoup.nodes.TextNode
import org.jsoup.parser.Tag

/**
 * Strips a signature that is introduced by a `"-- "` delimiter line from an HTML message body.
 *
 * Everything from the delimiter onwards is removed; `<blockquote>` elements encountered before the
 * delimiter are skipped, so delimiters inside quoted text are left alone.
 */
class HtmlSignatureRemover {
    private fun stripSignatureInternal(content: String): String {
        val document = Jsoup.parse(content)

        val nodeTraversor = AdvancedNodeTraversor(StripSignatureFilter())
        nodeTraversor.filter(document.body())

        return toCompactString(document)
    }

    /** Serializes [document] with pretty printing switched off and an indent amount of zero. */
    private fun toCompactString(document: Document): String {
        document.outputSettings()
            .prettyPrint(false)
            .indentAmount(0)

        return document.html()
    }

    /**
     * Filter that finds the signature delimiter and marks it and every node visited after it for removal.
     *
     * Instances are stateful and meant for a single traversal.
     */
    private class StripSignatureFilter : NodeFilter {
        private var signatureFound = false

        // The node whose tail() decides whether the now (possibly) empty container is removed too.
        private var signatureParentNode: Node? = null

        override fun head(node: Node, depth: Int): HeadFilterDecision {
            if (signatureFound) return HeadFilterDecision.REMOVE

            if (node.isBlockquote()) {
                return HeadFilterDecision.SKIP_ENTIRELY
            }

            if (node.isSignatureDelimiter()) {
                // The delimiter has to be on a line of its own: a line break before and after it.
                val precedingLineBreak = node.findPrecedingLineBreak()
                if (precedingLineBreak != null && node.isFollowedByLineBreak()) {
                    signatureFound = true
                    signatureParentNode = node.parent()

                    // Only a <br> is removed; a block element acting as line break is a container and stays.
                    precedingLineBreak.takeIf { it.isBR() }?.remove()

                    return HeadFilterDecision.REMOVE
                }
            }

            return HeadFilterDecision.CONTINUE
        }

        override fun tail(node: Node, depth: Int): TailFilterDecision {
            val containerOfSignature = signatureParentNode
            // Identity comparison on purpose: only the exact node that contained the signature is of interest.
            if (!signatureFound || containerOfSignature == null || node !== containerOfSignature) {
                return TailFilterDecision.CONTINUE
            }

            return if (containerOfSignature.isEmpty()) {
                // The container held nothing but the signature; remove it and re-check its parent next.
                signatureParentNode = containerOfSignature.parent()
                TailFilterDecision.REMOVE
            } else {
                // NOTE(review): STOP presumably ends the traversal; confirm against AdvancedNodeTraversor.
                TailFilterDecision.STOP
            }
        }

        private fun Node.isBlockquote(): Boolean {
            return this is Element && tag() == BLOCKQUOTE
        }

        private fun Node.isSignatureDelimiter(): Boolean {
            return this is TextNode && DASH_SIGNATURE_HTML.matcher(wholeText).matches()
        }

        /**
         * Returns the node that acts as line break directly before this node, or `null` if there is none.
         *
         * That is either the previous sibling (a `<br>` or a block element) or, when this node is the
         * first child, the nearest enclosing block element.
         */
        private fun Node.findPrecedingLineBreak(): Node? {
            return findAdjacentLineBreak { it.previousSibling() }
        }

        /** Mirror image of [findPrecedingLineBreak], looking at the following sibling instead. */
        private fun Node.isFollowedByLineBreak(): Boolean {
            return findAdjacentLineBreak { it.nextSibling() } != null
        }

        /**
         * Walks from this node towards the root. At each level [adjacentSibling] selects the neighbour to
         * inspect:
         * - a neighbour exists: the result is that neighbour if it is a line break, otherwise `null`;
         * - no neighbour and the parent is a block element: the result is the parent;
         * - no neighbour and the parent is not a block element: continue one level up.
         *
         * Returns `null` when the root is reached without finding a line break.
         */
        private fun Node.findAdjacentLineBreak(adjacentSibling: (Node) -> Node?): Node? {
            var current: Node = this
            while (true) {
                val neighbour = adjacentSibling(current)
                if (neighbour != null) {
                    return neighbour.takeIf { it.isLineBreak() }
                }

                val parent = current.parent() ?: return null
                if (parent is Element && parent.isBlock) {
                    return parent
                }

                current = parent
            }
        }

        private fun Node?.isBR() = this is Element && tag() == BR

        private fun Node?.isLineBreak() = isBR() || (this is Element && this.isBlock)

        private fun Node.isEmpty(): Boolean = childNodeSize() == 0
    }

    companion object {
        // Matches a text node that contains nothing but "-- " and optional surrounding whitespace.
        private val DASH_SIGNATURE_HTML = Pattern.compile("\\s*-- \\s*")
        private val BLOCKQUOTE = Tag.valueOf("blockquote")
        private val BR = Tag.valueOf("br")

        /**
         * Parses [content] as HTML, removes the signature (if one is detected) and returns the
         * re-serialized document.
         */
        @JvmStatic
        fun stripSignature(content: String): String {
            return HtmlSignatureRemover().stripSignatureInternal(content)
        }
    }
}
+0 −141
Original line number Diff line number Diff line
package com.fsck.k9.message.signature;


import org.junit.Test;

import static com.fsck.k9.message.html.HtmlHelper.extractText;
import static org.junit.Assert.assertEquals;


/**
 * Tests for {@link HtmlSignatureRemover#stripSignature(String)}.
 */
public class HtmlSignatureRemoverTest {
    private static final String K9_SIGNATURE =
            "Sent from my Android device with K-9 Mail. Please excuse my brevity.";


    @Test
    public void shouldStripSignatureFromK9StyleHtml() throws Exception {
        String html = "This is the body text<br>-- <br>" + K9_SIGNATURE;

        String withoutSignature = HtmlSignatureRemover.stripSignature(html);

        // Only the visible text is compared, not the markup.
        assertEquals("This is the body text", extractText(withoutSignature));
    }

    @Test
    public void shouldStripSignatureFromThunderbirdStyleHtml() throws Exception {
        String html = "<html>\r\n" +
                "  <head>\r\n" +
                "    <meta http-equiv=\"content-type\" content=\"text/html; charset=utf-8\">\r\n" +
                "  </head>\r\n" +
                "  <body bgcolor=\"#FFFFFF\" text=\"#000000\">\r\n" +
                "    <p>This is the body text<br>\r\n" +
                "    </p>\r\n" +
                "    -- <br>\r\n" +
                "    <div class=\"moz-signature\">" + K9_SIGNATURE + "</div>\r\n" +
                "  </body>\r\n" +
                "</html>";

        String withoutSignature = HtmlSignatureRemover.stripSignature(html);

        // Only the visible text is compared, not the markup.
        assertEquals("This is the body text", extractText(withoutSignature));
    }

    @Test
    public void shouldStripSignatureBeforeBlockquoteTag() throws Exception {
        String html = document(
                "<div>This is the body text<br>-- <br><blockquote>" + K9_SIGNATURE + "</blockquote></div>");

        String withoutSignature = HtmlSignatureRemover.stripSignature(html);

        assertEquals(document("<div>This is the body text</div>"), withoutSignature);
    }

    @Test
    public void shouldNotStripSignatureInsideBlockquoteTags() throws Exception {
        String html = document(
                "<blockquote>This is some quoted text<br>-- <br>Inner signature</blockquote>" +
                "<div>This is the body text</div>");

        String withoutSignature = HtmlSignatureRemover.stripSignature(html);

        // The document is expected to come back unchanged.
        assertEquals(html, withoutSignature);
    }

    @Test
    public void shouldStripSignatureBetweenBlockquoteTags() throws Exception {
        String html = document(
                "<blockquote>Some quote</blockquote>" +
                "<div>This is the body text<br>-- <br>" +
                "<blockquote>" + K9_SIGNATURE + "</blockquote>" +
                "<br>-- <br>Signature inside signature</div>");

        String withoutSignature = HtmlSignatureRemover.stripSignature(html);

        assertEquals(
                document("<blockquote>Some quote</blockquote><div>This is the body text</div>"),
                withoutSignature);
    }

    @Test
    public void shouldStripSignatureAfterLastBlockquoteTags() throws Exception {
        String html = document(
                "This is the body text<br><blockquote>Some quote</blockquote><br>-- <br>" + K9_SIGNATURE);

        String withoutSignature = HtmlSignatureRemover.stripSignature(html);

        assertEquals(
                document("This is the body text<br><blockquote>Some quote</blockquote>"),
                withoutSignature);
    }

    /** Wraps {@code body} in the minimal html/head/body skeleton used by the fixtures. */
    private static String document(String body) {
        return "<html><head></head><body>" + body + "</body></html>";
    }
}
+184 −0
Original line number Diff line number Diff line
package com.fsck.k9.message.signature

import com.fsck.k9.message.html.HtmlHelper.extractText
import com.fsck.k9.message.signature.HtmlSignatureRemover.Companion.stripSignature
import com.fsck.k9.testing.removeNewlines
import com.google.common.truth.Truth.assertThat
import org.junit.Test

/**
 * Tests for [HtmlSignatureRemover.stripSignature].
 *
 * Most fixtures are written as indented raw strings and flattened with `trimIndent().removeNewlines()`.
 * NOTE(review): `removeNewlines` comes from `com.fsck.k9.testing`; presumably it strips all line
 * breaks so the fixture becomes a single line of HTML — confirm against that helper.
 */
class HtmlSignatureRemoverTest {
    /** Delimiter between two `<br>` elements with no surrounding markup; only the extracted text is compared. */
    @Test
    fun `old K-9 Mail signature format`() {
        val html =
            """This is the body text<br>-- <br>Sent from my Android device with K-9 Mail. Please excuse my brevity."""

        val withoutSignature = stripSignature(html)

        assertThat(extractText(withoutSignature)).isEqualTo("This is the body text")
    }

    /**
     * Delimiter that follows a closed `<p>` element and precedes a `moz-signature` div. The fixture keeps
     * its newlines and indentation; only the extracted text is compared.
     */
    @Test
    fun `old Thunderbird signature format`() {
        val html =
            """
            <html>
              <head>
                <meta http-equiv="content-type" content="text/html; charset=utf-8">
              </head>
              <body bgcolor="#FFFFFF" text="#000000">
                <p>This is the body text<br>
                </p>
                -- <br>
                <div class="moz-signature">Sent from my Android device with K-9 Mail. Please excuse my brevity.</div>
              </body>
            </html>
            """.trimIndent()

        val withoutSignature = stripSignature(html)

        assertThat(extractText(withoutSignature)).isEqualTo("This is the body text")
    }

    /** A blockquote that comes after the delimiter is part of the signature and is removed with it. */
    @Test
    fun `signature before blockquote tag`() {
        val html =
            """
            <html>
            <head></head>
            <body>
            <div>
            This is the body text<br>
            -- <br>
            <blockquote>Sent from my Android device with K-9 Mail. Please excuse my brevity.</blockquote>
            </div>
            </body>
            </html>
            """.trimIndent().removeNewlines()

        val withoutSignature = stripSignature(html)

        assertThat(withoutSignature).isEqualTo(
            """<html><head></head><body><div>This is the body text</div></body></html>"""
        )
    }

    /** A delimiter inside quoted text must be left alone: the output is expected to equal the input. */
    @Test
    fun `should not strip signature inside blockquote tag`() {
        val html =
            """
            <html>
            <head></head>
            <body>
            <blockquote>
            This is some quoted text<br>
            -- <br>
            Inner signature
            </blockquote>
            <div>
            This is the body text
            </div>
            </body>
            </html>
            """.trimIndent().removeNewlines()

        val withoutSignature = stripSignature(html)

        assertThat(withoutSignature).isEqualTo(html)
    }

    /**
     * A leading blockquote is kept, while everything from the first delimiter onwards (including a second
     * delimiter and a later blockquote) is removed.
     */
    @Test
    fun `signature between blockquote tags`() {
        val html =
            """
            <html>
            <head></head>
            <body>
            <blockquote>Some quote</blockquote>
            <div>This is the body text<br>
            -- <br>
            <blockquote>Sent from my Android device with K-9 Mail. Please excuse my brevity.</blockquote>
            <br>-- <br>Signature inside signature
            </div>
            </body>
            </html>
            """.trimIndent().removeNewlines()

        val withoutSignature = stripSignature(html)

        assertThat(withoutSignature).isEqualTo(
            """
            <html>
            <head></head>
            <body>
            <blockquote>Some quote</blockquote>
            <div>This is the body text</div>
            </body>
            </html>
            """.trimIndent().removeNewlines()
        )
    }

    /**
     * Delimiter after a blockquote: the blockquote and the `<br>` before it stay, the `<br>` directly
     * before the delimiter is removed together with the signature.
     */
    @Test
    fun `signature after last blockquote tag`() {
        val html =
            """
            <html>
            <head></head>
            <body>
            This is the body text<br>
            <blockquote>Some quote</blockquote>
            <br>
            -- <br>
            Sent from my Android device with K-9 Mail. Please excuse my brevity.
            </body>
            </html>
            """.trimIndent().removeNewlines()

        val withoutSignature = stripSignature(html)

        assertThat(withoutSignature).isEqualTo(
            """
            <html>
            <head></head>
            <body>
            This is the body text<br>
            <blockquote>Some quote</blockquote>
            </body>
            </html>
            """.trimIndent().removeNewlines()
        )
    }

    /**
     * Delimiter as the first child of a `k9mail-signature` div: the whole div is removed, while both `<br>`
     * elements before the div are kept.
     *
     * The expected output also differs from the input in the doctype casing and an added empty `<head>`;
     * NOTE(review): presumably a result of jsoup parsing and re-serializing the document.
     */
    @Test
    fun `K-9 Mail signature format`() {
        val html =
            """
            <!DOCTYPE html>
            <html>
            <body>
            This is the body text.<br>
            <br>
            <div class='k9mail-signature'>
            -- <br>
            And this is the signature text.
            </div>
            </body>
            </html>
            """.trimIndent().removeNewlines()

        val withoutSignature = stripSignature(html)

        assertThat(withoutSignature).isEqualTo(
            """
            <!doctype html>
            <html>
            <head></head>
            <body>
            This is the body text.<br>
            <br>
            </body>
            </html>
            """.trimIndent().removeNewlines()
        )
    }
}
Loading