Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit 6d06b332 authored by cketti's avatar cketti
Browse files

Use jsoup in HtmlSignatureRemover

parent 268189c1
Loading
Loading
Loading
Loading
+139 −0
Original line number Diff line number Diff line
/*
 * The MIT License
 *
 * © 2009-2017, Jonathan Hedley <jonathan@hedley.net>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */
package com.fsck.k9.helper.jsoup;


import com.fsck.k9.helper.jsoup.NodeFilter.HeadFilterDecision;
import com.fsck.k9.helper.jsoup.NodeFilter.TailFilterDecision;
import org.jsoup.nodes.Node;
import org.jsoup.select.NodeTraversor;


/**
 * Depth-first node traversor.
 * <p>
 * Based on {@link NodeTraversor}, but supports skipping sub trees, removing nodes, and stopping the traversal at any
 * point.
 * </p><p>
 * This is an enhancement of the <a href="https://github.com/jhy/jsoup/pull/849">jsoup pull request 'Improved node
 * traversal'</a> by <a href="https://github.com/kno10">Erich Schubert</a>.
 * </p>
 */
public class AdvancedNodeTraversor {
    /**
     * Overall outcome of a call to {@link AdvancedNodeTraversor#filter(Node)}.
     */
    public enum FilterResult {
        /**
         * Processing the tree was completed.
         */
        ENDED,
        /**
         * Processing was stopped because the filter returned a {@code STOP} decision.
         */
        STOPPED,
        /**
         * Processing the tree was completed and the root node was removed.
         */
        ROOT_REMOVED
    }

    private final NodeFilter filter;

    /**
     * Create a new traversor.
     *
     * @param filter
     *         a class implementing the {@link NodeFilter} interface, to be called when visiting each node.
     */
    public AdvancedNodeTraversor(NodeFilter filter) {
        this.filter = filter;
    }

    /**
     * Start a depth-first filtering of the root and all of its descendants.
     * <p>
     * {@link NodeFilter#head(Node, int)} is called when a node is first reached. {@link NodeFilter#tail(Node, int)} is
     * called once the node's children have been processed, unless the head decision was
     * {@link HeadFilterDecision#SKIP_ENTIRELY} or {@link HeadFilterDecision#REMOVE}. A node is removed when either
     * its head or its tail decision is {@code REMOVE}.
     * </p>
     *
     * @param root
     *         the node at which the traversal starts. Siblings of this node are not visited.
     *
     * @return {@link FilterResult#STOPPED} if the filter returned a {@code STOP} decision,
     *         {@link FilterResult#ROOT_REMOVED} if the traversal completed and the filter asked for {@code root} to be
     *         removed, {@link FilterResult#ENDED} otherwise.
     */
    public FilterResult filter(Node root) {
        Node node = root;
        int depth = 0;

        while (node != null) {
            HeadFilterDecision headResult = filter.head(node, depth);
            if (headResult == HeadFilterDecision.STOP) {
                return FilterResult.STOPPED;
            }

            // Descend into the first child; the tail callback for this node happens on the way back up.
            if (headResult == HeadFilterDecision.CONTINUE && node.childNodeSize() > 0) {
                node = node.childNode(0);
                ++depth;
                continue;
            }

            // The node is a last child: finish it and climb until an ancestor with a next sibling (or the root level)
            // is reached.
            TailFilterDecision tailResult = TailFilterDecision.CONTINUE;
            while (node.nextSibling() == null && depth > 0) {
                if (headResult == HeadFilterDecision.CONTINUE || headResult == HeadFilterDecision.SKIP_CHILDREN) {
                    tailResult = filter.tail(node, depth);
                    if (tailResult == TailFilterDecision.STOP) {
                        return FilterResult.STOPPED;
                    }
                }

                // Move to the parent before removing, because the parent link is needed to continue the traversal.
                Node prev = node;
                node = node.parentNode();
                depth--;

                if (headResult == HeadFilterDecision.REMOVE || tailResult == TailFilterDecision.REMOVE) {
                    prev.remove();
                }

                // Ancestors were entered with CONTINUE (otherwise their children would not have been visited), so
                // their tail callback still has to run.
                headResult = HeadFilterDecision.CONTINUE;
            }

            if (headResult == HeadFilterDecision.CONTINUE || headResult == HeadFilterDecision.SKIP_CHILDREN) {
                tailResult = filter.tail(node, depth);
                if (tailResult == TailFilterDecision.STOP) {
                    return FilterResult.STOPPED;
                }
            }

            // At this point tailResult is either this node's own tail decision or the initial CONTINUE. Honor a
            // REMOVE from either callback so that removal does not depend on the node being a last child.
            boolean removeNode = headResult == HeadFilterDecision.REMOVE || tailResult == TailFilterDecision.REMOVE;

            // Fetch the next sibling first; it is looked up through the parent, which is gone after removal.
            Node prev = node;
            node = node.nextSibling();

            if (removeNode) {
                prev.remove();
            }

            if (prev == root) {
                return removeNode ? FilterResult.ROOT_REMOVED : FilterResult.ENDED;
            }
        }

        return FilterResult.ENDED;
    }
}
+111 −0
Original line number Diff line number Diff line
package com.fsck.k9.helper.jsoup;


import android.support.annotation.NonNull;

import org.jsoup.nodes.Node;


/**
 * Node filter interface. Provide an implementing class to {@link AdvancedNodeTraversor} to iterate through
 * nodes.
 * <p>
 * This interface provides two methods, {@code head} and {@code tail}. The head method is called when the node is first
 * seen, and the tail method when all of the node's children have been visited. As an example, head can be used to
 * create a start tag for a node, and tail to create the end tag.
 * </p>
 * <p>
 * For every node, the filter has to decide in {@link NodeFilter#head(Node, int)} whether to
 * </p>
 * <ul>
 * <li>continue ({@link HeadFilterDecision#CONTINUE}),</li>
 * <li>skip all children ({@link HeadFilterDecision#SKIP_CHILDREN}),</li>
 * <li>skip node entirely ({@link HeadFilterDecision#SKIP_ENTIRELY}),</li>
 * <li>remove the subtree ({@link HeadFilterDecision#REMOVE}),</li>
 * <li>interrupt the iteration and return ({@link HeadFilterDecision#STOP}).</li>
 * </ul>
 * <p>
 * The difference between {@link HeadFilterDecision#SKIP_CHILDREN} and {@link HeadFilterDecision#SKIP_ENTIRELY} is that
 * the first will invoke {@link NodeFilter#tail(Node, int)} on the node, while the latter will not.
 * </p>
 * <p>
 * When {@link NodeFilter#tail(Node, int)} is called the filter has to decide whether to
 * </p>
 * <ul>
 * <li>continue ({@link TailFilterDecision#CONTINUE}),</li>
 * <li>remove the subtree ({@link TailFilterDecision#REMOVE}),</li>
 * <li>interrupt the iteration and return ({@link TailFilterDecision#STOP}).</li>
 * </ul>
 */
public interface NodeFilter {
    /**
     * Filter decision for {@link NodeFilter#head(Node, int)}.
     */
    enum HeadFilterDecision {
        /**
         * Continue processing the tree: visit the node's children (if any) and call
         * {@link NodeFilter#tail(Node, int)} for the node afterwards.
         */
        CONTINUE,
        /**
         * Skip the child nodes, but do call {@link NodeFilter#tail(Node, int)} next.
         */
        SKIP_CHILDREN,
        /**
         * Skip the subtree, and do not call {@link NodeFilter#tail(Node, int)}.
         */
        SKIP_ENTIRELY,
        /**
         * Remove the node and its children, and do not call {@link NodeFilter#tail(Node, int)}.
         */
        REMOVE,
        /**
         * Stop processing. No further callbacks are made.
         */
        STOP
    }

    /**
     * Filter decision for {@link NodeFilter#tail(Node, int)}.
     */
    enum TailFilterDecision {
        /**
         * Continue processing the tree.
         */
        CONTINUE,
        /**
         * Remove the node and its children.
         */
        REMOVE,
        /**
         * Stop processing. No further callbacks are made.
         */
        STOP
    }

    /**
     * Callback for when a node is first visited, before any of its children.
     *
     * @param node
     *         the node being visited.
     * @param depth
     *         the depth of the node, relative to the root node. E.g., the root node has depth 0, and a child node
     *         of that will have depth 1.
     *
     * @return the decision on how the traversal should proceed from this node; never {@code null}.
     */
    @NonNull
    HeadFilterDecision head(Node node, int depth);

    /**
     * Callback for when a node is last visited, after all of its descendants have been visited.
     * <p>
     * Not called for nodes whose head decision was {@link HeadFilterDecision#SKIP_ENTIRELY} or
     * {@link HeadFilterDecision#REMOVE}.
     * </p>
     *
     * @param node
     *         the node being visited.
     * @param depth
     *         the depth of the node, relative to the root node. E.g., the root node has depth 0, and a child node
     *         of that will have depth 1.
     *
     * @return the decision on how the traversal should proceed after this node; never {@code null}.
     */
    @NonNull
    TailFilterDecision tail(Node node, int depth);
}
+1 −1
Original line number Diff line number Diff line
@@ -30,7 +30,7 @@ public class HtmlProcessor {
                HtmlConverter.cssStylePre());
    }

    static String toCompactString(Document document) {
    public static String toCompactString(Document document) {
        document.outputSettings()
                .prettyPrint(false)
                .indentAmount(0);
+81 −71
Original line number Diff line number Diff line
package com.fsck.k9.message.signature;


import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import timber.log.Timber;
import android.support.annotation.NonNull;

import com.fsck.k9.K9;
import org.htmlcleaner.CleanerProperties;
import org.htmlcleaner.HtmlCleaner;
import org.htmlcleaner.SimpleHtmlSerializer;
import org.htmlcleaner.TagNode;
import com.fsck.k9.helper.jsoup.AdvancedNodeTraversor;
import com.fsck.k9.helper.jsoup.NodeFilter;
import com.fsck.k9.message.html.HtmlProcessor;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.nodes.Node;
import org.jsoup.nodes.TextNode;
import org.jsoup.parser.Tag;


public class HtmlSignatureRemover {
    private static final Pattern DASH_SIGNATURE_HTML = Pattern.compile("(<br( /)?>|\r?\n)-- <br( /)?>", Pattern.CASE_INSENSITIVE);
    private static final Pattern BLOCKQUOTE_START = Pattern.compile("<blockquote", Pattern.CASE_INSENSITIVE);
    private static final Pattern BLOCKQUOTE_END = Pattern.compile("</blockquote>", Pattern.CASE_INSENSITIVE);
    public static String stripSignature(String content) {
        return new HtmlSignatureRemover().stripSignatureInternal(content);
    }

    private String stripSignatureInternal(String content) {
        Document document = Jsoup.parse(content);

    public static String stripSignature(String content) {
        Matcher dashSignatureHtml = DASH_SIGNATURE_HTML.matcher(content);
        if (dashSignatureHtml.find()) {
            Matcher blockquoteStart = BLOCKQUOTE_START.matcher(content);
            Matcher blockquoteEnd = BLOCKQUOTE_END.matcher(content);
            List<Integer> start = new ArrayList<>();
            List<Integer> end = new ArrayList<>();

            while (blockquoteStart.find()) {
                start.add(blockquoteStart.start());
        AdvancedNodeTraversor nodeTraversor = new AdvancedNodeTraversor(new StripSignatureFilter());
        nodeTraversor.filter(document.body());

        return HtmlProcessor.toCompactString(document);
    }


    static class StripSignatureFilter implements NodeFilter {
        private static final Pattern DASH_SIGNATURE_HTML = Pattern.compile("\\s*-- \\s*", Pattern.CASE_INSENSITIVE);
        private static final Tag BLOCKQUOTE = Tag.valueOf("blockquote");
        private static final Tag BR = Tag.valueOf("br");
        private static final Tag P = Tag.valueOf("p");


        private boolean signatureFound = false;
        private boolean lastElementCausedLineBreak = false;
        private Element brElementPrecedingDashes;


        @NonNull
        @Override
        public HeadFilterDecision head(Node node, int depth) {
            if (signatureFound) {
                return HeadFilterDecision.REMOVE;
            }
            while (blockquoteEnd.find()) {
                end.add(blockquoteEnd.start());

            if (node instanceof Element) {
                lastElementCausedLineBreak = false;

                Element element = (Element) node;
                if (element.tag().equals(BLOCKQUOTE)) {
                    return HeadFilterDecision.SKIP_ENTIRELY;
                }
            if (start.size() != end.size()) {
                Timber.d("There are %d <blockquote> tags, but %d </blockquote> tags. Refusing to strip.",
                        start.size(), end.size());
            } else if (start.size() > 0) {
                // Ignore quoted signatures in blockquotes.
                dashSignatureHtml.region(0, start.get(0));
                if (dashSignatureHtml.find()) {
                    // before first <blockquote>.
                    content = content.substring(0, dashSignatureHtml.start());
                } else {
                    for (int i = 0; i < start.size() - 1; i++) {
                        // within blockquotes.
                        if (end.get(i) < start.get(i + 1)) {
                            dashSignatureHtml.region(end.get(i), start.get(i + 1));
                            if (dashSignatureHtml.find()) {
                                content = content.substring(0, dashSignatureHtml.start());
                                break;
            } else if (node instanceof TextNode) {
                TextNode textNode = (TextNode) node;
                if (lastElementCausedLineBreak && DASH_SIGNATURE_HTML.matcher(textNode.getWholeText()).matches()) {
                    Node nextNode = node.nextSibling();
                    if (nextNode instanceof Element && ((Element) nextNode).tag().equals(BR)) {
                        signatureFound = true;
                        if (brElementPrecedingDashes != null) {
                            brElementPrecedingDashes.remove();
                            brElementPrecedingDashes = null;
                        }

                        return HeadFilterDecision.REMOVE;
                    }
                }
                    if (end.get(end.size() - 1) < content.length()) {
                        // after last </blockquote>.
                        dashSignatureHtml.region(end.get(end.size() - 1), content.length());
                        if (dashSignatureHtml.find()) {
                            content = content.substring(0, dashSignatureHtml.start());
            }

            return HeadFilterDecision.CONTINUE;
        }

        @NonNull
        @Override
        public TailFilterDecision tail(Node node, int depth) {
            if (signatureFound) {
                return TailFilterDecision.CONTINUE;
            }
            } else {
                // No blockquotes found.
                content = content.substring(0, dashSignatureHtml.start());

            if (node instanceof Element) {
                Element element = (Element) node;
                boolean elementIsBr = element.tag().equals(BR);
                if (elementIsBr || element.tag().equals(P)) {
                    lastElementCausedLineBreak = true;
                    brElementPrecedingDashes = elementIsBr ? element : null;
                    return TailFilterDecision.CONTINUE;
                }
            }

        // Fix the stripping off of closing tags if a signature was stripped,
        // as well as clean up the HTML of the quoted message.
        HtmlCleaner cleaner = new HtmlCleaner();
        CleanerProperties properties = cleaner.getProperties();

        // see http://htmlcleaner.sourceforge.net/parameters.php for descriptions
        properties.setNamespacesAware(false);
        properties.setAdvancedXmlEscape(false);
        properties.setOmitXmlDeclaration(true);
        properties.setOmitDoctypeDeclaration(false);
        properties.setTranslateSpecialEntities(false);
        properties.setRecognizeUnicodeChars(false);

        TagNode node = cleaner.clean(content);
        SimpleHtmlSerializer htmlSerialized = new SimpleHtmlSerializer(properties);
        content = htmlSerialized.getAsString(node, "UTF8");
        return content;
            lastElementCausedLineBreak = false;
            return TailFilterDecision.CONTINUE;
        }
    }
}
+3 −5
Original line number Diff line number Diff line
@@ -3,7 +3,6 @@ package com.fsck.k9.message.signature;

import com.fsck.k9.K9RobolectricTestRunner;

import org.junit.Ignore;
import org.junit.Test;
import org.junit.runner.RunWith;
import org.robolectric.annotation.Config;
@@ -27,7 +26,6 @@ public class HtmlSignatureRemoverTest {
        assertEquals("This is the body text", extractText(withoutSignature));
    }

    @Ignore
    @Test
    public void shouldStripSignatureFromThunderbirdStyleHtml() throws Exception {
        String html = "<html>\r\n" +
@@ -88,8 +86,8 @@ public class HtmlSignatureRemoverTest {
        assertEquals("<html><head></head><body>" +
                        "<blockquote>" +
                        "This is some quoted text" +
                        "<br />" +
                        "-- <br />" +
                        "<br>" +
                        "-- <br>" +
                        "Inner signature" +
                        "</blockquote>" +
                        "<div>This is the body text</div>" +
@@ -141,7 +139,7 @@ public class HtmlSignatureRemoverTest {
        String withoutSignature = HtmlSignatureRemover.stripSignature(html);

        assertEquals("<html><head></head><body>" +
                        "This is the body text<br />" +
                        "This is the body text<br>" +
                        "<blockquote>Some quote</blockquote>" +
                        "</body></html>",
                withoutSignature);