Merge pull request #5765 from k9mail/decode_ISO-2022-JP (e18e6179) · Commits · e / os / Mail

mail/common/build.gradle

+1 −0

Original line number	Diff line number	Diff line
		@@ -19,6 +19,7 @@ dependencies {
		testImplementation "com.google.truth:truth:${versions.truth}"
		testImplementation "org.mockito:mockito-core:${versions.mockito}"
		testImplementation "org.mockito.kotlin:mockito-kotlin:${versions.mockitoKotlin}"
		testImplementation "com.ibm.icu:icu4j-charset:70.1"
		}

		android {

mail/common/src/main/java/com/fsck/k9/mail/internet/DecoderUtil.java

deleted100644 → 0

+0 −217

Original line number	Diff line number	Diff line

		package com.fsck.k9.mail.internet;


		import java.io.ByteArrayInputStream;
		import java.io.IOException;
		import java.io.InputStream;
		import java.nio.charset.Charset;

		import com.fsck.k9.mail.Message;
		import com.fsck.k9.mail.MessagingException;
		import okio.Buffer;
		import okio.ByteString;
		import okio.Okio;
		import org.apache.james.mime4j.codec.QuotedPrintableInputStream;
		import org.apache.james.mime4j.util.CharsetUtil;
		import timber.log.Timber;


		/**
		* Static methods for decoding strings, byte arrays and encoded words.
		*
		* This class is copied from the org.apache.james.mime4j.decoder.DecoderUtil class. It's modified here in order to
		* decode emoji characters in the Subject headers. The method to decode emoji depends on the MimeMessage class because
		* it has to be determined with the sender address, the mailer and so on.
		*/
		class DecoderUtil {
		/**
		 * Decodes every RFC 2047 encoded word found in a string.
		 *
		 * An encoded word has the form =?charset?enc?encoded text?= where enc is either 'Q' or 'q' for
		 * quoted-printable and 'B' or 'b' for Base64. Text that is not part of a usable encoded word is copied to the
		 * result unchanged. Two consecutive encoded words whose separator is reported as whitespace by
		 * CharsetUtil.isWhitespace and that share encoding and charset have their bytes joined before the charset
		 * decoding happens.
		 *
		 * ANDROID: COPIED FROM A NEWER VERSION OF MIME4J
		 *
		 * @param body the string to decode.
		 * @param message the message which has the string; it is handed to the charset lookup.
		 * @return the decoded string.
		 */
		public static String decodeEncodedWords(String body, Message message) {
		    // ANDROID: Most strings will not include "=?", so a quick test avoids any object creation.
		    if (!body.contains("=?")) {
		        return body;
		    }

		    StringBuilder result = new StringBuilder();
		    EncodedWord pending = null;
		    int cursor = 0;

		    while (true) {
		        // ANDROID: The mime4j original gets confused if the encoded text begins with an '=' (just after
		        // "?Q?"). So we first find "=?", then the two '?' of the "header", and only then the final "?=".
		        // As soon as one of the lookups fails, all following ones are skipped and the loop ends.
		        int start = body.indexOf("=?", cursor);
		        int firstMark = (start == -1) ? -1 : body.indexOf('?', start + 2);
		        int secondMark = (firstMark == -1) ? -1 : body.indexOf('?', firstMark + 1);
		        int terminator = (secondMark == -1) ? -1 : body.indexOf("?=", secondMark + 1);
		        if (terminator == -1) {
		            decodePreviousAndAppendSuffix(result, pending, body, cursor);
		            return result.toString();
		        }
		        int stop = terminator + 2;

		        String gap = body.substring(cursor, start);
		        EncodedWord current = extractEncodedWord(body, start, stop, message);

		        if (pending == null) {
		            // Nothing is waiting to be decoded; keep the text in front of the candidate.
		            result.append(gap);
		            if (current == null) {
		                // Not a usable encoded word; copy it verbatim.
		                result.append(body, start, stop);
		            }
		        } else if (current == null) {
		            // Flush the pending word, then copy the separator and the unusable candidate verbatim.
		            result.append(charsetDecode(pending));
		            result.append(gap);
		            result.append(body, start, stop);
		        } else if (!CharsetUtil.isWhitespace(gap)) {
		            // Real text between the two words; the pending word has to be decoded on its own.
		            result.append(charsetDecode(pending));
		            result.append(gap);
		        } else if (pending.encoding.equals(current.encoding) && pending.charset.equals(current.charset)) {
		            // Same encoding and charset; join the raw bytes and decode them later in one go.
		            current.data = concat(pending.data, current.data);
		        } else {
		            // The separator is dropped, but the words cannot be joined.
		            result.append(charsetDecode(pending));
		        }

		        pending = current;
		        cursor = stop;
		    }
		}

		/**
		 * Finishes the output: appends the decoded form of the pending encoded word (if there is one), followed by
		 * the part of {@code body} that starts at {@code previousEnd}.
		 */
		private static void decodePreviousAndAppendSuffix(StringBuilder sb, EncodedWord previousWord, String body,
		        int previousEnd) {
		    if (previousWord != null) {
		        String decoded = charsetDecode(previousWord);
		        sb.append(decoded);
		    }

		    sb.append(body, previousEnd, body.length());
		}

		/**
		 * Converts the raw bytes of an encoded word to text using the charset stored in the word.
		 *
		 * @param word the encoded word whose {@code data} should be decoded.
		 * @return the decoded text, or an empty string if reading the data failed. The result of a failed read used
		 *         to be {@code null}; callers append the result to a StringBuilder, which turned that into the
		 *         literal text "null" in the decoded output.
		 */
		private static String charsetDecode(EncodedWord word) {
		    try {
		        InputStream inputStream = new Buffer().write(word.data).inputStream();
		        return CharsetSupport.readToString(inputStream, word.charset);
		    } catch (IOException e) {
		        Timber.w(e, "Failed to decode encoded word using charset '%s'", word.charset);
		        return "";
		    }
		}

		/**
		 * Parses the encoded word candidate located at {@code body[begin, end)}.
		 *
		 * @param body the string containing the candidate.
		 * @param begin index of the leading "=?".
		 * @param end index just past the trailing "?=".
		 * @param message handed to CharsetSupport.fixupCharset together with the charset name.
		 * @return the parsed word with its payload already converted to bytes, or {@code null} if the candidate has
		 *         no charset/encoding section, the charset lookup throws a MessagingException, the encoded text is
		 *         empty, or the encoding is neither "Q" nor "B" (case-insensitive).
		 */
		private static EncodedWord extractEncodedWord(String body, int begin, int end, Message message) {
		    // Index of the '?' that belongs to the closing "?=".
		    int closingMark = end - 2;

		    int firstMark = body.indexOf('?', begin + 2);
		    if (firstMark == closingMark) {
		        return null;
		    }

		    int secondMark = body.indexOf('?', firstMark + 1);
		    if (secondMark == closingMark) {
		        return null;
		    }

		    // Extract charset, skipping language information if present (example: =?utf-8*en?Q?Text?=)
		    String charsetPart = body.substring(begin + 2, firstMark);
		    String mimeCharset = charsetPart;
		    int languageSuffixStart = charsetPart.indexOf('*');
		    if (languageSuffixStart != -1) {
		        mimeCharset = charsetPart.substring(0, languageSuffixStart);
		    }

		    String encoding = body.substring(firstMark + 1, secondMark);
		    String encodedText = body.substring(secondMark + 1, closingMark);

		    String charset;
		    try {
		        charset = CharsetSupport.fixupCharset(mimeCharset, message);
		    } catch (MessagingException e) {
		        return null;
		    }

		    if (encodedText.isEmpty()) {
		        Timber.w("Missing encoded text in encoded word: '%s'", body.substring(begin, end));
		        return null;
		    }

		    boolean isQ = encoding.equalsIgnoreCase("Q");
		    boolean isB = !isQ && encoding.equalsIgnoreCase("B");
		    if (!isQ && !isB) {
		        Timber.w("Warning: Unknown encoding in encoded word '%s'", body.substring(begin, end));
		        return null;
		    }

		    EncodedWord result = new EncodedWord();
		    result.charset = charset;
		    if (isQ) {
		        result.encoding = "Q";
		        result.data = decodeQ(encodedText);
		    } else {
		        result.encoding = "B";
		        result.data = decodeB(encodedText);
		    }
		    return result;
		}

		/**
		 * Converts the text of a "Q" encoded word to bytes.
		 *
		 * @param encodedWord the encoded text (the part between the second '?' and the closing "?=").
		 * @return the decoded bytes, or an empty ByteString if the quoted-printable stream could not be read. This
		 *         matches decodeB and keeps {@code null} away from the callers, which pass the result straight to
		 *         Buffer.write.
		 */
		private static ByteString decodeQ(String encodedWord) {
		    // Replace _ with =20 so that the remaining text can be handled by the quoted-printable decoder.
		    String quotedPrintable = encodedWord.replace("_", "=20");
		    byte[] bytes = quotedPrintable.getBytes(Charset.forName("US-ASCII"));

		    QuotedPrintableInputStream is = new QuotedPrintableInputStream(new ByteArrayInputStream(bytes));
		    try {
		        return Okio.buffer(Okio.source(is)).readByteString();
		    } catch (IOException e) {
		        Timber.w(e, "Failed to decode quoted-printable text of encoded word");
		        return ByteString.EMPTY;
		    }
		}

		/**
		 * Converts the text of a "B" (Base64) encoded word to bytes.
		 *
		 * @return the decoded bytes, or an empty ByteString when ByteString.decodeBase64 returns {@code null}.
		 */
		private static ByteString decodeB(String encodedText) {
		    ByteString bytes = ByteString.decodeBase64(encodedText);
		    if (bytes != null) {
		        return bytes;
		    }
		    return ByteString.EMPTY;
		}

		/**
		 * Returns a ByteString holding the bytes of {@code first} followed by the bytes of {@code second}.
		 */
		private static ByteString concat(ByteString first, ByteString second) {
		    Buffer joined = new Buffer();
		    joined.write(first);
		    joined.write(second);
		    return joined.readByteString();
		}


		/**
		 * Mutable holder for one parsed encoded word. Instances are filled in by extractEncodedWord.
		 */
		private static class EncodedWord {
		// Charset name as returned by CharsetSupport.fixupCharset.
		private String charset;
		// Normalized encoding name: "Q" or "B".
		private String encoding;
		// Payload after Q/Base64 decoding but before charset decoding. decodeEncodedWords may replace it with the
		// joined bytes of the previous word and this one.
		private ByteString data;
		}
		}

mail/common/src/main/java/com/fsck/k9/mail/internet/DecoderUtil.kt

0 → 100644

+201 −0

Original line number	Diff line number	Diff line
		package com.fsck.k9.mail.internet

		import com.fsck.k9.mail.Message
		import com.fsck.k9.mail.MessagingException
		import java.io.ByteArrayInputStream
		import java.io.IOException
		import okio.Buffer
		import okio.ByteString
		import okio.ByteString.Companion.decodeBase64
		import okio.buffer
		import okio.source
		import org.apache.james.mime4j.codec.QuotedPrintableInputStream
		import org.apache.james.mime4j.util.CharsetUtil
		import timber.log.Timber

		/**
		* Decoder for encoded words (RFC 2047).
		*
		* This class is based on `org.apache.james.mime4j.decoder.DecoderUtil`. It was modified in order to support early
		* non-Unicode emoji variants.
		*/
		internal object DecoderUtil {
		/**
		 * Decodes all encoded words (RFC 2047) contained in [body].
		 *
		 * An encoded word looks like `=?charset?enc?Encoded word?=`, with `enc` being 'Q' or 'q' for quoted-printable
		 * and 'B' or 'b' for Base64. Everything that is not a usable encoded word is copied to the result as is.
		 * Consecutive encoded words whose separator is reported as whitespace by `CharsetUtil.isWhitespace` are joined
		 * on the byte level when [EncodedWord.canBeCombinedWith] allows it; otherwise each one is decoded separately.
		 *
		 * @param body The string to decode.
		 * @param message The message containing the string. It will be used to figure out which JIS variant to use for
		 * charset decoding. May be `null`.
		 * @return The decoded string.
		 */
		@JvmStatic
		fun decodeEncodedWords(body: String, message: Message?): String {
		    // Most strings will not include "=?". So a quick test can prevent unneeded work.
		    if (!body.contains("=?")) return body

		    val output = StringBuilder()
		    var pending: EncodedWord? = null
		    var cursor = 0

		    while (true) {
		        // Find "=?", then the two '?' of the "header" and only then the closing "?=". Once a lookup fails,
		        // the remaining ones are skipped and the loop ends.
		        val start = body.indexOf("=?", cursor)
		        val firstMark = if (start == -1) -1 else body.indexOf('?', start + 2)
		        val secondMark = if (firstMark == -1) -1 else body.indexOf('?', firstMark + 1)
		        val terminator = if (secondMark == -1) -1 else body.indexOf("?=", secondMark + 1)
		        if (terminator == -1) {
		            decodePreviousAndAppendSuffix(output, pending, body, cursor)
		            return output.toString()
		        }
		        val stop = terminator + 2

		        val gap = body.substring(cursor, start)
		        val current = extractEncodedWord(body, start, stop, message)

		        when {
		            pending == null -> {
		                // Nothing is waiting to be decoded; keep the text in front of the candidate.
		                output.append(gap)
		                if (current == null) {
		                    output.append(body, start, stop)
		                }
		            }
		            current == null -> {
		                // Flush the pending word, then copy separator and unusable candidate verbatim.
		                output.append(charsetDecode(pending))
		                output.append(gap)
		                output.append(body, start, stop)
		            }
		            !CharsetUtil.isWhitespace(gap) -> {
		                // Real text between the two words; the pending word is decoded on its own.
		                output.append(charsetDecode(pending))
		                output.append(gap)
		            }
		            pending.canBeCombinedWith(current) -> {
		                // Join the raw bytes and decode them later in one go.
		                current.data = pending.data + current.data
		            }
		            else -> {
		                // The separator is dropped, but the words cannot be joined.
		                output.append(charsetDecode(pending))
		            }
		        }

		        pending = current
		        cursor = stop
		    }
		}

		/**
		 * Finishes [output]: appends the decoded form of [previousWord] (if there is one), followed by the part of
		 * [body] that starts at [previousEnd].
		 */
		private fun decodePreviousAndAppendSuffix(
		    output: StringBuilder,
		    previousWord: EncodedWord?,
		    body: String,
		    previousEnd: Int
		) {
		    previousWord?.let { word -> output.append(charsetDecode(word)) }
		    output.append(body.substring(previousEnd))
		}

		/**
		 * Converts the raw bytes of [word] to text using the charset stored in the word.
		 *
		 * @return The decoded text, or an empty string if reading the data failed. A failed read used to produce
		 * `null`; callers append the result to a `StringBuilder`, which turned that into the literal text "null" in
		 * the decoded output.
		 */
		private fun charsetDecode(word: EncodedWord): String {
		    return try {
		        val inputStream = Buffer().write(word.data).inputStream()
		        // orEmpty() keeps a null coming back from the helper from reaching the StringBuilder as well.
		        CharsetSupport.readToString(inputStream, word.charset).orEmpty()
		    } catch (e: IOException) {
		        Timber.w(e, "Failed to decode encoded word using charset '%s'", word.charset)
		        ""
		    }
		}

		/**
		 * Parses the encoded word candidate located at `body[begin, end)`.
		 *
		 * @param body The string containing the candidate.
		 * @param begin Index of the leading "=?".
		 * @param end Index just past the trailing "?=".
		 * @param message Handed to `CharsetSupport.fixupCharset` together with the charset name. May be `null`.
		 * @return The parsed word with its payload already converted to bytes, or `null` if the candidate has no
		 * charset/encoding section, the charset lookup throws a [MessagingException], the encoded text is empty, or
		 * the encoding is neither "Q" nor "B" (case-insensitive).
		 */
		private fun extractEncodedWord(body: String, begin: Int, end: Int, message: Message?): EncodedWord? {
		    // Index of the '?' that belongs to the closing "?=".
		    val closingMark = end - 2

		    val firstMark = body.indexOf('?', begin + 2)
		    if (firstMark == closingMark) return null

		    val secondMark = body.indexOf('?', firstMark + 1)
		    if (secondMark == closingMark) return null

		    // Extract charset, skipping language information if present (example: =?utf-8*en?Q?Text?=)
		    val mimeCharset = body.substring(begin + 2, firstMark).substringBefore('*')

		    val encoding = body.substring(firstMark + 1, secondMark)
		    val encodedText = body.substring(secondMark + 1, closingMark)

		    val charset = try {
		        CharsetSupport.fixupCharset(mimeCharset, message)
		    } catch (e: MessagingException) {
		        return null
		    }

		    if (encodedText.isEmpty()) {
		        Timber.w("Missing encoded text in encoded word: '%s'", body.substring(begin, end))
		        return null
		    }

		    return when {
		        encoding.equals("Q", ignoreCase = true) -> EncodedWord(charset, Encoding.Q, decodeQ(encodedText))
		        encoding.equals("B", ignoreCase = true) -> EncodedWord(charset, Encoding.B, decodeB(encodedText))
		        else -> {
		            Timber.w("Warning: Unknown encoding in encoded word '%s'", body.substring(begin, end))
		            null
		        }
		    }
		}

		/**
		 * Converts the text of a "Q" encoded word to bytes.
		 *
		 * @return The decoded bytes, or [ByteString.EMPTY] if reading from the quoted-printable stream fails.
		 */
		private fun decodeQ(encodedWord: String): ByteString {
		    // Replace _ with =20 so that the remaining text can be handled by the quoted-printable decoder.
		    val quotedPrintable = encodedWord.replace("_", "=20")
		    val bytes = quotedPrintable.toByteArray(Charsets.US_ASCII)
		    val stream = QuotedPrintableInputStream(ByteArrayInputStream(bytes))

		    return stream.use { inputStream ->
		        try {
		            inputStream.source().buffer().readByteString()
		        } catch (e: IOException) {
		            ByteString.EMPTY
		        }
		    }
		}

		/**
		 * Converts the text of a "B" (Base64) encoded word to bytes; yields [ByteString.EMPTY] when `decodeBase64`
		 * returns `null`.
		 */
		private fun decodeB(encodedText: String): ByteString = encodedText.decodeBase64() ?: ByteString.EMPTY

		/**
		 * Returns a [ByteString] holding the bytes of the receiver followed by the bytes of [second].
		 */
		private operator fun ByteString.plus(second: ByteString): ByteString {
		    val joined = Buffer()
		    joined.write(this)
		    joined.write(second)
		    return joined.readByteString()
		}

		// The bytes ESC ( B, i.e. the escape sequence that switches an ISO-2022-JP stream to ASCII. EncodedWord uses it
		// to detect encoded words whose data ends with this sequence.
		private val ASCII_ESCAPE_SEQUENCE = byteArrayOf(0x1B, 0x28, 0x42)

		/**
		 * One parsed encoded word.
		 *
		 * @property charset Charset name as returned by `CharsetSupport.fixupCharset`.
		 * @property encoding The encoding the word used.
		 * @property data Payload after Q/Base64 decoding but before charset decoding. Mutable because the decoder
		 * replaces it with the joined bytes when two words are combined.
		 */
		private class EncodedWord(
		    val charset: String,
		    val encoding: Encoding,
		    var data: ByteString
		) {
		    /**
		     * Tells whether the bytes of this word may be joined with the bytes of [other] before charset decoding.
		     *
		     * This requires both words to use the same encoding and charset. In addition, an ISO-2022-JP word whose
		     * data already ends with the escape sequence switching to ASCII is never combined with the following word.
		     */
		    fun canBeCombinedWith(other: EncodedWord): Boolean {
		        if (encoding != other.encoding || charset != other.charset) return false
		        return !endsWithSwitchToAscii()
		    }

		    private fun endsWithSwitchToAscii(): Boolean =
		        charset.startsWith("ISO-2022-JP", ignoreCase = true) && data.endsWith(ASCII_ESCAPE_SEQUENCE)
		}

		/**
		 * The encodings an encoded word can use.
		 */
		private enum class Encoding {
		    /** Quoted-printable style encoding, selected by 'Q' or 'q'. */
		    Q,

		    /** Base64 encoding, selected by 'B' or 'b'. */
		    B
		}
		}

mail/common/src/test/java/com/fsck/k9/mail/internet/DecoderUtilTest.java

+10 −0

Original line number	Diff line number	Diff line
		@@ -220,6 +220,16 @@ public class DecoderUtilTest {
		assertInputDecodesToExpected("=?utf-8*de?b?R3LDvMOfZQ==?=", "Grüße");
		}

		@Test
		public void decodeEncodedWords_withMultipleIso2022JpEncodedWordsProperlyEndingWithSwitchingToAscii() {
		    // Joining the base64-decoded data of both encoded words before the charset decoding produces an escape
		    // sequence switching to ASCII (end of first encoded word) directly followed by an escape sequence switching
		    // to JIS X 0208:1983 (start of second encoded word). The decoder on Android reports an error for this case,
		    // leading to a replacement character being inserted.
		    // The ISO-2022-JP-TEST charset gives us Android's behavior on the JVM. See TestCharsetProvider.
		    String firstLine = "=?ISO-2022-JP-TEST?B?GyRCRnxLXDhsJEhGfEtcOGwkSEZ8S1w4bCROJUElJyVDGyhC?=\r\n";
		    String secondLine = " =?ISO-2022-JP-TEST?B?GyRCJS8bKEI=?=";
		    String input = firstLine + secondLine;
		    String expected = "日本語と日本語と日本語のチェック";

		    assertInputDecodesToExpected(input, expected);
		}

		private void assertInputDecodesToExpected(String input, String expected) {
		String decodedText = DecoderUtil.decodeEncodedWords(input, null);

mail/common/src/test/java/com/fsck/k9/mail/internet/TestCharsetProvider.kt

0 → 100644

+28 −0

Original line number	Diff line number	Diff line
		package com.fsck.k9.mail.internet

		import com.ibm.icu.charset.CharsetProviderICU
		import java.nio.charset.Charset
		import java.nio.charset.spi.CharsetProvider

		/**
		 * [CharsetProvider] that makes a charset available under the name "ISO-2022-JP-TEST".
		 *
		 * The JVM's own "ISO-2022-JP" decoder is more lenient than the ICU4J decoder that is used on Android. Tests that
		 * need Android's behavior use this name to get the ICU4J implementation of "ISO-2022-JP".
		 */
		class TestCharsetProvider : CharsetProvider() {
		    private val icuCharsetProvider = CharsetProviderICU()
		    private val charset = icuCharsetProvider.charsetForName("ISO-2022-JP")

		    /** Returns an iterator over the single charset supplied by this provider. */
		    override fun charsets(): Iterator<Charset> = listOf(charset).iterator()

		    /** Returns the ICU4J charset for "ISO-2022-JP-TEST" (case-insensitive) and `null` for any other name. */
		    override fun charsetForName(charsetName: String?): Charset? {
		        val isTestCharset = charsetName.equals("ISO-2022-JP-TEST", ignoreCase = true)
		        return if (isTestCharset) charset else null
		    }
		}