Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Unverified Commit e18e6179 authored by cketti's avatar cketti Committed by GitHub
Browse files

Merge pull request #5765 from k9mail/decode_ISO-2022-JP

Properly decode multiple encoded-words using ISO-2022-JP
parents 3887681d 9861fc4d
Loading
Loading
Loading
Loading
+1 −0
Original line number Diff line number Diff line
@@ -19,6 +19,7 @@ dependencies {
    testImplementation "com.google.truth:truth:${versions.truth}"
    testImplementation "org.mockito:mockito-core:${versions.mockito}"
    testImplementation "org.mockito.kotlin:mockito-kotlin:${versions.mockitoKotlin}"
    testImplementation "com.ibm.icu:icu4j-charset:70.1"
}

android {
+0 −217
Original line number Diff line number Diff line

package com.fsck.k9.mail.internet;


import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.Charset;

import com.fsck.k9.mail.Message;
import com.fsck.k9.mail.MessagingException;
import okio.Buffer;
import okio.ByteString;
import okio.Okio;
import org.apache.james.mime4j.codec.QuotedPrintableInputStream;
import org.apache.james.mime4j.util.CharsetUtil;
import timber.log.Timber;


/**
 * Static methods for decoding strings, byte arrays and encoded words.
 *
 * This class is copied from the org.apache.james.mime4j.decoder.DecoderUtil class.  It's modified here in order to
 * decode emoji characters in the Subject headers.  The method to decode emoji depends on the MimeMessage class because
 * it has to be determined with the sender address, the mailer and so on.
 */
class DecoderUtil {
    /**
     * Charset name prefix for which the special escape-sequence handling in
     * {@link #canBeCombined(EncodedWord, EncodedWord)} applies.
     */
    private static final String ISO_2022_JP_PREFIX = "ISO-2022-JP";

    /**
     * The bytes {@code ESC ( B}. In ISO-2022-JP this escape sequence switches the stream back to ASCII.
     */
    private static final byte[] ASCII_ESCAPE_SEQUENCE = { 0x1B, 0x28, 0x42 };

    /**
     * Decodes a string containing encoded words as defined by RFC 2047.
     * Encoded words have the form
     * =?charset?enc?Encoded word?= where enc is either 'Q' or 'q' for
     * quoted-printable and 'B' or 'b' for Base64.
     *
     * Adjacent encoded words that are separated only by whitespace and share charset and encoding have their raw
     * bytes joined before charset decoding, so multi-byte characters split across two words are decoded correctly.
     *
     * ANDROID:  COPIED FROM A NEWER VERSION OF MIME4J
     *
     * @param body the string to decode.
     * @param message the message which has the string.
     * @return the decoded string.
     */
    public static String decodeEncodedWords(String body, Message message) {

        // ANDROID:  Most strings will not include "=?" so a quick test can prevent unneeded
        // object creation.  This could also be handled via lazy creation of the StringBuilder.
        if (!body.contains("=?")) {
            return body;
        }

        EncodedWord previousWord = null;
        int previousEnd = 0;

        StringBuilder sb = new StringBuilder();

        while (true) {
            int begin = body.indexOf("=?", previousEnd);
            int end = (begin == -1) ? -1 : findEncodedWordEnd(body, begin);
            if (end == -1) {
                // No further complete encoded word: flush what is pending and copy the rest verbatim.
                decodePreviousAndAppendSuffix(sb, previousWord, body, previousEnd);
                return sb.toString();
            }

            String sep = body.substring(previousEnd, begin);
            EncodedWord word = extractEncodedWord(body, begin, end, message);

            if (previousWord == null) {
                sb.append(sep);
                if (word == null) {
                    // Not a valid encoded word; keep the raw text.
                    sb.append(body, begin, end);
                }
            } else if (word == null) {
                sb.append(charsetDecode(previousWord));
                sb.append(sep);
                sb.append(body, begin, end);
            } else if (!CharsetUtil.isWhitespace(sep)) {
                sb.append(charsetDecode(previousWord));
                sb.append(sep);
            } else if (canBeCombined(previousWord, word)) {
                // Defer charset decoding until the combined byte sequence is complete.
                word.data = concat(previousWord.data, word.data);
            } else {
                // Whitespace between two encoded words is dropped.
                sb.append(charsetDecode(previousWord));
            }

            previousWord = word;
            previousEnd = end;
        }
    }

    /**
     * Finds the end of the encoded word candidate starting at {@code begin}.
     *
     * ANDROID:  The mime4j original version has an error here.  It gets confused if the encoded string begins with
     * an '=' (just after "?Q?").  This method seeks forward to find the two '?' in the "header", before looking for
     * the final "?=".
     *
     * @param body the string being decoded.
     * @param begin index of the "=?" that starts the candidate.
     * @return the index just past the terminating "?=", or -1 if the candidate is incomplete.
     */
    private static int findEncodedWordEnd(String body, int begin) {
        int qm1 = body.indexOf('?', begin + 2);
        if (qm1 == -1) {
            return -1;
        }

        int qm2 = body.indexOf('?', qm1 + 1);
        if (qm2 == -1) {
            return -1;
        }

        int terminator = body.indexOf("?=", qm2 + 1);
        return (terminator == -1) ? -1 : terminator + 2;
    }

    /**
     * Appends the decoded pending word (if any) followed by everything in {@code body} from {@code previousEnd} on.
     */
    private static void decodePreviousAndAppendSuffix(StringBuilder sb, EncodedWord previousWord, String body,
            int previousEnd) {

        if (previousWord != null) {
            sb.append(charsetDecode(previousWord));
        }

        sb.append(body, previousEnd, body.length());
    }

    /**
     * Converts the raw bytes of an encoded word to text using the word's charset.
     *
     * @return the decoded text, or an empty string if reading the data failed. Returning {@code null} here would
     *         make {@link StringBuilder#append(String)} insert the literal text "null" into the result.
     */
    private static String charsetDecode(EncodedWord word) {
        try {
            InputStream inputStream = new Buffer().write(word.data).inputStream();
            return CharsetSupport.readToString(inputStream, word.charset);
        } catch (IOException e) {
            Timber.w(e, "Failed to decode encoded word using charset '%s'", word.charset);
            return "";
        }
    }

    /**
     * Parses the encoded word located at {@code body[begin, end)}.
     *
     * @return the parsed word with its transfer encoding already removed, or {@code null} if the text is not a
     *         usable encoded word (unsupported charset, empty payload or unknown encoding).
     */
    private static EncodedWord extractEncodedWord(String body, int begin, int end, Message message) {
        int qm1 = body.indexOf('?', begin + 2);
        if (qm1 == end - 2) {
            return null;
        }

        int qm2 = body.indexOf('?', qm1 + 1);
        if (qm2 == end - 2) {
            return null;
        }

        // Extract charset, skipping language information if present (example: =?utf-8*en?Q?Text?=)
        String charsetPart = body.substring(begin + 2, qm1);
        int languageSuffixStart = charsetPart.indexOf('*');
        boolean languageSuffixFound = languageSuffixStart != -1;
        String mimeCharset = languageSuffixFound ? charsetPart.substring(0, languageSuffixStart) : charsetPart;

        String encoding = body.substring(qm1 + 1, qm2);
        String encodedText = body.substring(qm2 + 1, end - 2);

        String charset;
        try {
            charset = CharsetSupport.fixupCharset(mimeCharset, message);
        } catch (MessagingException e) {
            return null;
        }

        if (encodedText.isEmpty()) {
            Timber.w("Missing encoded text in encoded word: '%s'", body.substring(begin, end));
            return null;
        }

        EncodedWord encodedWord = new EncodedWord();
        encodedWord.charset = charset;
        if (encoding.equalsIgnoreCase("Q")) {
            encodedWord.encoding = "Q";
            encodedWord.data = decodeQ(encodedText);
        } else if (encoding.equalsIgnoreCase("B")) {
            encodedWord.encoding = "B";
            encodedWord.data = decodeB(encodedText);
        } else {
            Timber.w("Warning: Unknown encoding in encoded word '%s'", body.substring(begin, end));
            return null;
        }
        return encodedWord;
    }

    /**
     * Decodes the payload of a "Q"-encoded word.
     *
     * @return the decoded bytes, or {@link ByteString#EMPTY} if decoding failed. Never {@code null}, so the result
     *         can always be written to a {@link Buffer}.
     */
    private static ByteString decodeQ(String encodedWord) {
        // In the "Q" encoding an underscore stands for a space; rewrite it so the quoted-printable decoder
        // understands it.
        byte[] bytes = encodedWord.replace("_", "=20").getBytes(Charset.forName("US-ASCII"));

        try (QuotedPrintableInputStream is = new QuotedPrintableInputStream(new ByteArrayInputStream(bytes))) {
            return Okio.buffer(Okio.source(is)).readByteString();
        } catch (IOException e) {
            return ByteString.EMPTY;
        }
    }

    /**
     * Decodes the payload of a "B"-encoded word.
     *
     * @return the decoded bytes, or {@link ByteString#EMPTY} if the text is not valid Base64.
     */
    private static ByteString decodeB(String encodedText) {
        ByteString decoded = ByteString.decodeBase64(encodedText);
        return decoded == null ? ByteString.EMPTY : decoded;
    }

    /**
     * Returns a new byte string consisting of {@code first} followed by {@code second}.
     */
    private static ByteString concat(ByteString first, ByteString second) {
        return new Buffer().write(first).write(second).readByteString();
    }

    /**
     * Checks whether the raw bytes of two adjacent encoded words may be joined before charset decoding.
     *
     * An ISO-2022-JP word that already ends with the switch-to-ASCII escape sequence is complete on its own.
     * Joining it with a following word would put two escape sequences back to back, which stricter decoders
     * report as an error. Such a word is therefore decoded separately.
     */
    private static boolean canBeCombined(EncodedWord first, EncodedWord second) {
        return first.encoding.equals(second.encoding) &&
                first.charset.equals(second.charset) &&
                !endsWithAsciiEscapeSequence(first);
    }

    /**
     * Returns {@code true} if the word uses an ISO-2022-JP charset and its data ends with {@code ESC ( B}.
     */
    private static boolean endsWithAsciiEscapeSequence(EncodedWord word) {
        boolean isIso2022Jp = word.charset.regionMatches(true, 0, ISO_2022_JP_PREFIX, 0,
                ISO_2022_JP_PREFIX.length());
        return isIso2022Jp && word.data.endsWith(ASCII_ESCAPE_SEQUENCE);
    }


    /**
     * An encoded word after its transfer encoding was removed but before charset decoding.
     */
    private static class EncodedWord {
        private String charset;
        private String encoding;
        private ByteString data;
    }
}
+201 −0
Original line number Diff line number Diff line
package com.fsck.k9.mail.internet

import com.fsck.k9.mail.Message
import com.fsck.k9.mail.MessagingException
import java.io.ByteArrayInputStream
import java.io.IOException
import okio.Buffer
import okio.ByteString
import okio.ByteString.Companion.decodeBase64
import okio.buffer
import okio.source
import org.apache.james.mime4j.codec.QuotedPrintableInputStream
import org.apache.james.mime4j.util.CharsetUtil
import timber.log.Timber

/**
 * Decoder for encoded words (RFC 2047).
 *
 * This class is based on `org.apache.james.mime4j.decoder.DecoderUtil`. It was modified in order to support early
 * non-Unicode emoji variants.
 */
internal object DecoderUtil {
    /**
     * Decodes a string containing encoded words as defined by RFC 2047.
     *
     * Encoded words have the form `=?charset?enc?Encoded word?=` where `enc` is either 'Q' or 'q' for
     * quoted-printable and 'B' or 'b' for Base64.
     *
     * Adjacent encoded words that are separated only by whitespace and can be combined (see
     * [EncodedWord.canBeCombinedWith]) have their raw bytes joined before charset decoding.
     *
     * @param body The string to decode.
     * @param message The message containing the string. It will be used to figure out which JIS variant to use for
     *     charset decoding. May be `null`.
     * @return The decoded string.
     */
    @JvmStatic
    fun decodeEncodedWords(body: String, message: Message?): String {
        // Most strings will not include "=?". So a quick test can prevent unneeded work.
        if (!body.contains("=?")) return body

        var previousWord: EncodedWord? = null
        var previousEnd = 0
        val output = StringBuilder()

        while (true) {
            val begin = body.indexOf("=?", previousEnd)
            val end = if (begin == -1) -1 else findEncodedWordEnd(body, begin)
            if (end == -1) {
                // No further complete encoded word: flush what is pending and copy the rest verbatim.
                decodePreviousAndAppendSuffix(output, previousWord, body, previousEnd)
                return output.toString()
            }

            val sep = body.substring(previousEnd, begin)
            val word = extractEncodedWord(body, begin, end, message)

            if (previousWord == null) {
                output.append(sep)
                if (word == null) {
                    // Not a valid encoded word; keep the raw text.
                    output.append(body, begin, end)
                }
            } else if (word == null) {
                output.append(charsetDecode(previousWord))
                output.append(sep)
                output.append(body, begin, end)
            } else if (!CharsetUtil.isWhitespace(sep)) {
                output.append(charsetDecode(previousWord))
                output.append(sep)
            } else if (previousWord.canBeCombinedWith(word)) {
                // Defer charset decoding until the combined byte sequence is complete.
                word.data = previousWord.data + word.data
            } else {
                // Whitespace between two encoded words is dropped.
                output.append(charsetDecode(previousWord))
            }

            previousWord = word
            previousEnd = end
        }
    }

    /**
     * Finds the end of the encoded word candidate starting at [begin].
     *
     * The two '?' characters of the "header" part are located first, so an encoded text that itself starts with
     * '=' is not mistaken for the terminating "?=".
     *
     * @return The index just past the terminating "?=", or -1 if the candidate is incomplete.
     */
    private fun findEncodedWordEnd(body: String, begin: Int): Int {
        val qm1 = body.indexOf('?', begin + 2)
        if (qm1 == -1) return -1

        val qm2 = body.indexOf('?', qm1 + 1)
        if (qm2 == -1) return -1

        val terminator = body.indexOf("?=", qm2 + 1)
        return if (terminator == -1) -1 else terminator + 2
    }

    /**
     * Appends the decoded pending word (if any) followed by everything in [body] from [previousEnd] on.
     */
    private fun decodePreviousAndAppendSuffix(
        output: StringBuilder,
        previousWord: EncodedWord?,
        body: String,
        previousEnd: Int
    ) {
        if (previousWord != null) {
            output.append(charsetDecode(previousWord))
        }
        output.append(body, previousEnd, body.length)
    }

    /**
     * Converts the raw bytes of an encoded word to text using the word's charset.
     *
     * @return The decoded text, or an empty string if reading the data failed. Returning `null` here would make
     *     `StringBuilder.append()` insert the literal text "null" into the result.
     */
    private fun charsetDecode(word: EncodedWord): String {
        return try {
            val inputStream = Buffer().write(word.data).inputStream()
            CharsetSupport.readToString(inputStream, word.charset)
        } catch (e: IOException) {
            Timber.w(e, "Failed to decode encoded word using charset '%s'", word.charset)
            ""
        }
    }

    /**
     * Parses the encoded word located at `body[begin, end)`.
     *
     * @return The parsed word with its transfer encoding already removed, or `null` if the text is not a usable
     *     encoded word (unsupported charset, empty payload or unknown encoding).
     */
    private fun extractEncodedWord(body: String, begin: Int, end: Int, message: Message?): EncodedWord? {
        val qm1 = body.indexOf('?', begin + 2)
        if (qm1 == end - 2) return null

        val qm2 = body.indexOf('?', qm1 + 1)
        if (qm2 == end - 2) return null

        // Extract charset, skipping language information if present (example: =?utf-8*en?Q?Text?=)
        val charsetPart = body.substring(begin + 2, qm1)
        val languageSuffixStart = charsetPart.indexOf('*')
        val languageSuffixFound = languageSuffixStart != -1
        val mimeCharset = if (languageSuffixFound) charsetPart.substring(0, languageSuffixStart) else charsetPart

        val encoding = body.substring(qm1 + 1, qm2)
        val encodedText = body.substring(qm2 + 1, end - 2)

        val charset = try {
            CharsetSupport.fixupCharset(mimeCharset, message)
        } catch (e: MessagingException) {
            return null
        }

        if (encodedText.isEmpty()) {
            Timber.w("Missing encoded text in encoded word: '%s'", body.substring(begin, end))
            return null
        }

        return when {
            encoding.equals("Q", ignoreCase = true) -> EncodedWord(charset, Encoding.Q, decodeQ(encodedText))
            encoding.equals("B", ignoreCase = true) -> EncodedWord(charset, Encoding.B, decodeB(encodedText))
            else -> {
                Timber.w("Warning: Unknown encoding in encoded word '%s'", body.substring(begin, end))
                null
            }
        }
    }

    /**
     * Decodes the payload of a "Q"-encoded word.
     *
     * @return The decoded bytes, or [ByteString.EMPTY] if decoding or closing the stream failed.
     */
    private fun decodeQ(encodedWord: String): ByteString {
        // In the "Q" encoding an underscore stands for a space; rewrite it so the quoted-printable decoder
        // understands it.
        val bytes = encodedWord.replace("_", "=20").toByteArray(Charsets.US_ASCII)

        // The try block wraps use() so that an IOException thrown while closing the stream is handled as well.
        return try {
            QuotedPrintableInputStream(ByteArrayInputStream(bytes)).use { inputStream ->
                inputStream.source().buffer().readByteString()
            }
        } catch (e: IOException) {
            ByteString.EMPTY
        }
    }

    /**
     * Decodes the payload of a "B"-encoded word.
     *
     * @return The decoded bytes, or [ByteString.EMPTY] if the text is not valid Base64.
     */
    private fun decodeB(encodedText: String): ByteString {
        return encodedText.decodeBase64() ?: ByteString.EMPTY
    }

    /**
     * Returns a new byte string consisting of the receiver followed by [second].
     */
    private operator fun ByteString.plus(second: ByteString): ByteString {
        return Buffer().write(this).write(second).readByteString()
    }

    /**
     * The bytes `ESC ( B`. In ISO-2022-JP this escape sequence switches the stream back to ASCII.
     */
    private val ASCII_ESCAPE_SEQUENCE = byteArrayOf(0x1B, 0x28, 0x42)

    /**
     * An encoded word after its transfer encoding was removed but before charset decoding.
     */
    private class EncodedWord(
        val charset: String,
        val encoding: Encoding,
        var data: ByteString
    ) {
        /**
         * Checks whether the raw bytes of this word may be joined with those of [other] before charset decoding.
         *
         * An ISO-2022-JP word that already ends with the switch-to-ASCII escape sequence is complete on its own.
         * Joining it with a following word would put two escape sequences back to back, which the decoder on
         * Android reports as an error. Such a word is therefore decoded separately.
         */
        fun canBeCombinedWith(other: EncodedWord): Boolean {
            return encoding == other.encoding && charset == other.charset && !endsWithAsciiEscapeSequence()
        }

        private fun endsWithAsciiEscapeSequence(): Boolean {
            return charset.startsWith("ISO-2022-JP", ignoreCase = true) && data.endsWith(ASCII_ESCAPE_SEQUENCE)
        }
    }

    /**
     * Transfer encodings that can be used in an encoded word.
     */
    private enum class Encoding {
        Q, B
    }
}
+10 −0
Original line number Diff line number Diff line
@@ -220,6 +220,16 @@ public class DecoderUtilTest {
        assertInputDecodesToExpected("=?utf-8*de?b?R3LDvMOfZQ==?=", "Grüße");
    }

    @Test
    public void decodeEncodedWords_withMultipleIso2022JpEncodedWordsProperlyEndingWithSwitchingToAscii() {
        // The first encoded word ends with an escape sequence switching to ASCII and the second one starts with an
        // escape sequence switching to JIS X 0208:1983. Concatenating the base64-decoded bytes of both words before
        // charset decoding therefore puts those two escape sequences right next to each other. Android's decoder
        // treats that as an error and inserts a replacement character.
        // The ISO-2022-JP-TEST charset gives us Android's behavior on the JVM. See TestCharsetProvider.
        String firstEncodedWord = "=?ISO-2022-JP-TEST?B?GyRCRnxLXDhsJEhGfEtcOGwkSEZ8S1w4bCROJUElJyVDGyhC?=";
        String secondEncodedWord = "=?ISO-2022-JP-TEST?B?GyRCJS8bKEI=?=";
        String foldedHeaderValue = firstEncodedWord + "\r\n " + secondEncodedWord;

        assertInputDecodesToExpected(foldedHeaderValue, "日本語と日本語と日本語のチェック");
    }

    private void assertInputDecodesToExpected(String input, String expected) {
        String decodedText = DecoderUtil.decodeEncodedWords(input, null);
+28 −0
Original line number Diff line number Diff line
package com.fsck.k9.mail.internet

import com.ibm.icu.charset.CharsetProviderICU
import java.nio.charset.Charset
import java.nio.charset.spi.CharsetProvider

/**
 * A [CharsetProvider] exposing the ICU4J "ISO-2022-JP" charset under the name "ISO-2022-JP-TEST".
 *
 * The "ISO-2022-JP" decoder on the JVM is more lenient than the ICU4J decoder that is used on Android. Tests that
 * need Android's behavior can request the "ISO-2022-JP-TEST" charset to get the ICU4J implementation.
 */
class TestCharsetProvider : CharsetProvider() {
    private val icuIso2022Jp = CharsetProviderICU().charsetForName("ISO-2022-JP")

    /** Returns an iterator over the single charset supplied by this provider. */
    override fun charsets(): Iterator<Charset> = listOf(icuIso2022Jp).iterator()

    /** Returns the ICU4J charset for "ISO-2022-JP-TEST" (compared ignoring case), `null` for any other name. */
    override fun charsetForName(charsetName: String?): Charset? {
        val isTestCharsetName = "ISO-2022-JP-TEST".equals(charsetName, ignoreCase = true)
        return if (isTestCharsetName) icuIso2022Jp else null
    }
}
Loading