Utf8Utils for dicttoolkit. (f0c303dd) · Commits · e / os / android_packages_inputmethods_LatinIME

native/dicttoolkit/NativeFileList.mk

+5 −2

Original line number	Diff line number	Diff line
		@@ -24,11 +24,14 @@ LATIN_IME_DICT_TOOLKIT_SRC_FILES := \
		makedict_executor.cpp) \
		$(addprefix offdevice_intermediate_dict/, \
		offdevice_intermediate_dict.cpp) \
		utils/command_utils.cpp
		$(addprefix utils/, \
		command_utils.cpp \
		utf8_utils.cpp)

		# Test source files of the dict toolkit. Entries that live in a sub directory are listed
		# through $(addprefix <dir>/, ...), so only the file names are written inside the call.
		# NOTE(review): this listing comes from a rendered diff, so the line ending in
		# "command_utils_test.cpp)" and the following "command_utils_test.cpp \" line look like the
		# removed and added versions of the same entry - confirm against the real file.
		LATIN_IME_DICT_TOOLKIT_TEST_FILES := \
		dict_toolkit_defines_test.cpp \
		$(addprefix offdevice_intermediate_dict/, \
		offdevice_intermediate_dict_test.cpp) \
		$(addprefix utils/, \
		command_utils_test.cpp)
		command_utils_test.cpp \
		utf8_utils_test.cpp)

native/dicttoolkit/src/utils/utf8_utils.cpp

0 → 100644

+119 −0

Original line number	Diff line number	Diff line
		/*
		* Copyright (C) 2014 The Android Open Source Project
		*
		* Licensed under the Apache License, Version 2.0 (the "License");
		* you may not use this file except in compliance with the License.
		* You may obtain a copy of the License at
		*
		* http://www.apache.org/licenses/LICENSE-2.0
		*
		* Unless required by applicable law or agreed to in writing, software
		* distributed under the License is distributed on an "AS IS" BASIS,
		* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
		* See the License for the specific language governing permissions and
		* limitations under the License.
		*/

		#include "utils/utf8_utils.h"

		#include "utils/char_utils.h"

		namespace latinime {
		namespace dicttoolkit {

		// Longest UTF-8 sequence, in bytes, handled for a single code point.
		const size_t Utf8Utils::MAX_SEQUENCE_SIZE_FOR_A_CODE_POINT = 4;

		// The four tables below are indexed by sequence size (1..4); index 0 is a placeholder.
		//
		// A first byte introduces an N bytes sequence when
		//     (firstByte & FIRST_BYTE_MARKER_MASKS[N]) == FIRST_BYTE_MARKERS[N].
		const uint8_t Utf8Utils::FIRST_BYTE_MARKER_MASKS[] = {0, 0x80, 0xE0, 0xF0, 0xF8};
		const uint8_t Utf8Utils::FIRST_BYTE_MARKERS[] = {0, 0x00, 0xC0, 0xE0, 0xF0};
		// Bits of the first byte that carry code point data. The first byte of a 4 bytes sequence
		// is 11110xxx and therefore carries three payload bits: the mask has to be 0x07. With 0x03
		// the top bit is dropped and U+100000..U+10FFFF (first byte 0xF4) cannot be decoded.
		const uint8_t Utf8Utils::FIRST_BYTE_CODE_POINT_BITS_MASKS[] = {0, 0x7F, 0x1F, 0x0F, 0x07};
		// Largest code point that can be encoded with N bytes. Entry 0 is -1 because the decoder
		// compares a decoded value against entry (N - 1) to detect redundant sequences, and no
		// 1 byte sequence is redundant.
		const int Utf8Utils::MAX_ENCODED_CODE_POINT_VALUES[] = {-1, 0x7F, 0x7FF, 0xFFFF, 0x10FFFF};

		// Trailing (second or later) bytes look like 10xxxxxx: a 0x80 marker and 6 payload bits.
		const uint8_t Utf8Utils::TRAILING_BYTE_CODE_POINT_BITS_MASK = 0x3F;
		const uint8_t Utf8Utils::TRAILING_BYTE_MARKER = 0x80;
		const size_t Utf8Utils::CODE_POINT_BIT_COUNT_IN_TRAILING_BYTE = 6;

		/* static */ std::vector<int> Utf8Utils::getCodePoints(const std::string &utf8Str) {
		std::vector<int> codePoints;
		int remainingByteCountForCurrentCodePoint = 0;
		int currentCodePointSequenceSize = 0;
		int codePoint = 0;
		for (const char c : utf8Str) {
		if (remainingByteCountForCurrentCodePoint == 0) {
		currentCodePointSequenceSize = getSequenceSizeByCheckingFirstByte(c);
		if (currentCodePointSequenceSize <= 0) {
		AKLOGE("%x is an invalid utf8 first byte value.", c);
		return std::vector<int>();
		}
		remainingByteCountForCurrentCodePoint = currentCodePointSequenceSize;
		codePoint = maskFirstByte(c, remainingByteCountForCurrentCodePoint);
		} else {
		codePoint <<= CODE_POINT_BIT_COUNT_IN_TRAILING_BYTE;
		codePoint += maskTrailingByte(c);
		}
		remainingByteCountForCurrentCodePoint--;
		if (remainingByteCountForCurrentCodePoint == 0) {
		if (codePoint <= MAX_ENCODED_CODE_POINT_VALUES[currentCodePointSequenceSize - 1]) {
		AKLOGE("%d bytes encode for codePoint(%x) is a redundant UTF-8 sequence.",
		currentCodePointSequenceSize, codePoint);
		return std::vector<int>();
		}
		codePoints.push_back(codePoint);
		}
		}
		return codePoints;
		}

		/**
		 * Finds the size of the sequence introduced by the given first byte.
		 *
		 * Sizes are tried in increasing order and the first one whose marker matches wins.
		 *
		 * @param firstByte the first byte of a sequence.
		 * @return the sequence size in bytes, or -1 when the byte matches no first byte marker.
		 */
		/* static */ int Utf8Utils::getSequenceSizeByCheckingFirstByte(const uint8_t firstByte) {
		    size_t candidateSize = 1;
		    while (candidateSize <= MAX_SEQUENCE_SIZE_FOR_A_CODE_POINT) {
		        const int markerBits = firstByte & FIRST_BYTE_MARKER_MASKS[candidateSize];
		        if (markerBits == FIRST_BYTE_MARKERS[candidateSize]) {
		            return static_cast<int>(candidateSize);
		        }
		        ++candidateSize;
		    }
		    // No marker matched: this is not a valid utf8 first byte.
		    return -1;
		}

		/**
		 * Extracts the code point bits stored in the first byte of a sequence.
		 *
		 * @param firstByte the first byte of the sequence.
		 * @param sequenceSize the size of the sequence; used as index into the mask table.
		 * @return the first byte with its marker bits cleared.
		 */
		/* static */ AK_FORCE_INLINE int Utf8Utils::maskFirstByte(const uint8_t firstByte,
		        const int sequenceSize) {
		    const uint8_t codePointBitsMask = FIRST_BYTE_CODE_POINT_BITS_MASKS[sequenceSize];
		    return firstByte & codePointBitsMask;
		}

		/**
		 * Extracts the code point bits stored in a second or later byte of a sequence.
		 *
		 * @param secondOrLaterByte a byte that follows the first byte of a sequence.
		 * @return the byte masked with TRAILING_BYTE_CODE_POINT_BITS_MASK.
		 */
		/* static */ AK_FORCE_INLINE int Utf8Utils::maskTrailingByte(const uint8_t secondOrLaterByte) {
		    const int codePointBits = secondOrLaterByte & TRAILING_BYTE_CODE_POINT_BITS_MASK;
		    return codePointBits;
		}

		/**
		 * Encodes the given code points as a UTF-8 string.
		 *
		 * Each code point is written as one first byte followed by zero or more trailing bytes,
		 * most significant bits first. When any code point cannot be encoded, an error is logged
		 * and an empty string is returned instead of a partial result.
		 *
		 * @param codePoints the code points to encode.
		 * @return the UTF-8 encoded string, or an empty string on error.
		 */
		/* static */ std::string Utf8Utils::getUtf8String(const CodePointArrayView codePoints) {
		    // Keep the shift arithmetic in int instead of mixing int and size_t.
		    const int bitCountInTrailingByte =
		            static_cast<int>(CODE_POINT_BIT_COUNT_IN_TRAILING_BYTE);
		    std::string utf8String;
		    for (const int codePoint : codePoints) {
		        const int sequenceSize = getSequenceSizeToEncodeCodePoint(codePoint);
		        if (sequenceSize <= 0) {
		            AKLOGE("Cannot encode code point (%d).", codePoint);
		            return std::string();
		        }
		        const int trailingByteCount = sequenceSize - 1;
		        // Output first byte: the bits above all trailing bytes plus the size marker.
		        const int firstByteBits = codePoint >> (trailingByteCount * bitCountInTrailingByte);
		        utf8String.push_back(
		                static_cast<char>(firstByteBits | FIRST_BYTE_MARKERS[sequenceSize]));
		        // Output second and later bytes, 6 bits each, from the most significant group.
		        for (int i = 1; i < sequenceSize; ++i) {
		            const int shiftAmount = (trailingByteCount - i) * bitCountInTrailingByte;
		            const int trailingByteBits =
		                    (codePoint >> shiftAmount) & TRAILING_BYTE_CODE_POINT_BITS_MASK;
		            utf8String.push_back(static_cast<char>(trailingByteBits | TRAILING_BYTE_MARKER));
		        }
		    }
		    return utf8String;
		}

		/**
		 * Computes how many bytes are needed to encode the given code point.
		 *
		 * @param codePoint the code point to encode.
		 * @return the smallest sequence size whose maximum value is not below codePoint, or -1
		 *         when codePoint is negative or larger than every table entry.
		 */
		/* static */ int Utf8Utils::getSequenceSizeToEncodeCodePoint(const int codePoint) {
		    if (codePoint >= 0) {
		        size_t sequenceSize = 1;
		        while (sequenceSize <= MAX_SEQUENCE_SIZE_FOR_A_CODE_POINT) {
		            if (MAX_ENCODED_CODE_POINT_VALUES[sequenceSize] >= codePoint) {
		                return static_cast<int>(sequenceSize);
		            }
		            ++sequenceSize;
		        }
		    }
		    // Negative, or too large for the longest sequence.
		    return -1;
		}

		} // namespace dicttoolkit
		} // namespace latinime

native/dicttoolkit/src/utils/utf8_utils.h

0 → 100644

+56 −0

Original line number	Diff line number	Diff line
		/*
		* Copyright (C) 2014 The Android Open Source Project
		*
		* Licensed under the Apache License, Version 2.0 (the "License");
		* you may not use this file except in compliance with the License.
		* You may obtain a copy of the License at
		*
		* http://www.apache.org/licenses/LICENSE-2.0
		*
		* Unless required by applicable law or agreed to in writing, software
		* distributed under the License is distributed on an "AS IS" BASIS,
		* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
		* See the License for the specific language governing permissions and
		* limitations under the License.
		*/

		#ifndef LATINIME_DICT_TOOLKIT_UTF8_UTILS_H
		#define LATINIME_DICT_TOOLKIT_UTF8_UTILS_H

		#include <cstdint>
		#include <string>
		#include <vector>

		#include "dict_toolkit_defines.h"
		#include "utils/int_array_view.h"

		namespace latinime {
		namespace dicttoolkit {

		/**
		 * Static helpers converting between UTF-8 encoded strings and code point sequences.
		 */
		class Utf8Utils {
		 public:
		    /**
		     * Decodes a UTF-8 encoded string into code points.
		     *
		     * @return the decoded code points; an empty vector when the input is rejected
		     *         (for example an invalid first byte or a redundant sequence).
		     */
		    static std::vector<int> getCodePoints(const std::string &utf8Str);

		    /**
		     * Encodes code points as a UTF-8 string.
		     *
		     * @return the encoded string; an empty string when a code point cannot be encoded.
		     */
		    static std::string getUtf8String(const CodePointArrayView codePoints);

		 private:
		    DISALLOW_IMPLICIT_CONSTRUCTORS(Utf8Utils);

		    // Longest sequence, in bytes, handled for a single code point.
		    static const size_t MAX_SEQUENCE_SIZE_FOR_A_CODE_POINT;

		    // Values indexed by sequence size.
		    static const uint8_t FIRST_BYTE_MARKER_MASKS[];
		    static const uint8_t FIRST_BYTE_MARKERS[];
		    static const uint8_t FIRST_BYTE_CODE_POINT_BITS_MASKS[];
		    static const int MAX_ENCODED_CODE_POINT_VALUES[];

		    // Layout of the second and later bytes of a sequence.
		    static const uint8_t TRAILING_BYTE_CODE_POINT_BITS_MASK;
		    static const uint8_t TRAILING_BYTE_MARKER;
		    static const size_t CODE_POINT_BIT_COUNT_IN_TRAILING_BYTE;

		    // Returns the sequence size announced by firstByte, or -1 when it is not a first byte.
		    static int getSequenceSizeByCheckingFirstByte(const uint8_t firstByte);
		    // Returns the code point bits held by the first byte of a sequenceSize bytes sequence.
		    // The parameter name matches the one used by the definition.
		    static int maskFirstByte(const uint8_t firstByte, const int sequenceSize);
		    // Returns the code point bits held by a second or later byte.
		    static int maskTrailingByte(const uint8_t secondOrLaterByte);
		    // Returns the number of bytes needed to encode codePoint, or -1 when not encodable.
		    static int getSequenceSizeToEncodeCodePoint(const int codePoint);
		};
		} // namespace dicttoolkit
		} // namespace latinime
		#endif // LATINIME_DICT_TOOLKIT_UTF8_UTILS_H

native/dicttoolkit/tests/utils/utf8_utils_test.cpp

0 → 100644

+85 −0

Original line number	Diff line number	Diff line
		/*
		* Copyright (C) 2014 The Android Open Source Project
		*
		* Licensed under the Apache License, Version 2.0 (the "License");
		* you may not use this file except in compliance with the License.
		* You may obtain a copy of the License at
		*
		* http://www.apache.org/licenses/LICENSE-2.0
		*
		* Unless required by applicable law or agreed to in writing, software
		* distributed under the License is distributed on an "AS IS" BASIS,
		* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
		* See the License for the specific language governing permissions and
		* limitations under the License.
		*/

		#include "utils/utf8_utils.h"

		#include <gtest/gtest.h>

		#include <vector>

		#include "utils/int_array_view.h"

		namespace latinime {
		namespace dicttoolkit {
		namespace {

		// Checks decoding of empty, ASCII, 2/3 bytes and 4 bytes inputs, and that redundant
		// sequences are rejected. Sizes are checked with ASSERT_EQ because getCodePoints returns an
		// empty vector on failure and the element checks below would then index out of bounds.
		TEST(Utf8UtilsTests, TestGetCodePoints) {
		    {
		        const std::vector<int> codePoints = Utf8Utils::getCodePoints("");
		        EXPECT_EQ(0u, codePoints.size());
		    }
		    {
		        const std::vector<int> codePoints = Utf8Utils::getCodePoints("test");
		        ASSERT_EQ(4u, codePoints.size());
		        EXPECT_EQ('t', codePoints[0]);
		        EXPECT_EQ('e', codePoints[1]);
		        EXPECT_EQ('s', codePoints[2]);
		        EXPECT_EQ('t', codePoints[3]);
		    }
		    {
		        const std::vector<int> codePoints = Utf8Utils::getCodePoints(u8"\u3042a\u03C2\u0410");
		        ASSERT_EQ(4u, codePoints.size());
		        EXPECT_EQ(0x3042, codePoints[0]); // HIRAGANA LETTER A
		        EXPECT_EQ('a', codePoints[1]);
		        EXPECT_EQ(0x03C2, codePoints[2]); // GREEK SMALL LETTER FINAL SIGMA
		        EXPECT_EQ(0x0410, codePoints[3]); // CYRILLIC CAPITAL LETTER A
		    }
		    {
		        const std::vector<int> codePoints = Utf8Utils::getCodePoints(u8"\U0001F36A?\U0001F752");
		        ASSERT_EQ(3u, codePoints.size());
		        EXPECT_EQ(0x1F36A, codePoints[0]); // COOKIE
		        EXPECT_EQ('?', codePoints[1]);
		        EXPECT_EQ(0x1F752, codePoints[2]); // ALCHEMICAL SYMBOL FOR STARRED TRIDENT
		    }

		    // Redundant UTF-8 sequences must be rejected.
		    EXPECT_TRUE(Utf8Utils::getCodePoints("\xC0\xAF").empty());
		    EXPECT_TRUE(Utf8Utils::getCodePoints("\xE0\x80\xAF").empty());
		    EXPECT_TRUE(Utf8Utils::getCodePoints("\xF0\x80\x80\xAF").empty());
		}

		// Checks that code points are encoded to the expected UTF-8 strings, first for plain ASCII
		// and then for code points needing 2, 3 and 4 bytes.
		TEST(Utf8UtilsTests, TestGetUtf8String) {
		    {
		        const std::vector<int> asciiCodePoints{'t', 'e', 's', 't'};
		        const CodePointArrayView asciiView(asciiCodePoints);
		        EXPECT_EQ("test", Utf8Utils::getUtf8String(asciiView));
		    }
		    {
		        const std::vector<int> multiByteCodePoints{
		                0x00E0,  // LATIN SMALL LETTER A WITH GRAVE
		                0x03C2,  // GREEK SMALL LETTER FINAL SIGMA
		                0x0430,  // CYRILLIC SMALL LETTER A
		                0x3042,  // HIRAGANA LETTER A
		                0x1F36A, // COOKIE
		                0x1F752  // ALCHEMICAL SYMBOL FOR STARRED TRIDENT
		        };
		        const CodePointArrayView multiByteView(multiByteCodePoints);
		        EXPECT_EQ(u8"\u00E0\u03C2\u0430\u3042\U0001F36A\U0001F752",
		                Utf8Utils::getUtf8String(multiByteView));
		    }
		}

		} // namespace
		} // namespace dicttoolkit
		} // namespace latinime