Loading native/dicttoolkit/NativeFileList.mk +5 −2 Original line number Diff line number Diff line Loading @@ -24,11 +24,14 @@ LATIN_IME_DICT_TOOLKIT_SRC_FILES := \ makedict_executor.cpp) \ $(addprefix offdevice_intermediate_dict/, \ offdevice_intermediate_dict.cpp) \ utils/command_utils.cpp $(addprefix utils/, \ command_utils.cpp \ utf8_utils.cpp) LATIN_IME_DICT_TOOLKIT_TEST_FILES := \ dict_toolkit_defines_test.cpp \ $(addprefix offdevice_intermediate_dict/, \ offdevice_intermediate_dict_test.cpp) \ $(addprefix utils/, \ command_utils_test.cpp) command_utils_test.cpp \ utf8_utils_test.cpp) native/dicttoolkit/src/utils/utf8_utils.cpp 0 → 100644 +119 −0 Original line number Diff line number Diff line /* * Copyright (C) 2014 The Android Open Source Project * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ #include "utils/utf8_utils.h" #include "utils/char_utils.h" namespace latinime { namespace dicttoolkit { const size_t Utf8Utils::MAX_SEQUENCE_SIZE_FOR_A_CODE_POINT = 4; const uint8_t Utf8Utils::FIRST_BYTE_MARKER_MASKS[] = {0, 0x80, 0xE0, 0xF0, 0xF8}; const uint8_t Utf8Utils::FIRST_BYTE_MARKERS[] = {0, 0x00, 0xC0, 0xE0, 0xF0}; const uint8_t Utf8Utils::FIRST_BYTE_CODE_POINT_BITS_MASKS[] = {0, 0x7F, 0x1F, 0x0F, 0x03}; const int Utf8Utils::MAX_ENCODED_CODE_POINT_VALUES[] = {-1, 0x7F, 0x7FF, 0xFFFF, 0x10FFFF}; const uint8_t Utf8Utils::TRAILING_BYTE_CODE_POINT_BITS_MASK = 0x3F; const uint8_t Utf8Utils::TRAILING_BYTE_MARKER = 0x80; const size_t Utf8Utils::CODE_POINT_BIT_COUNT_IN_TRAILING_BYTE = 6; /* static */ std::vector<int> Utf8Utils::getCodePoints(const std::string &utf8Str) { std::vector<int> codePoints; int remainingByteCountForCurrentCodePoint = 0; int currentCodePointSequenceSize = 0; int codePoint = 0; for (const char c : utf8Str) { if (remainingByteCountForCurrentCodePoint == 0) { currentCodePointSequenceSize = getSequenceSizeByCheckingFirstByte(c); if (currentCodePointSequenceSize <= 0) { AKLOGE("%x is an invalid utf8 first byte value.", c); return std::vector<int>(); } remainingByteCountForCurrentCodePoint = currentCodePointSequenceSize; codePoint = maskFirstByte(c, remainingByteCountForCurrentCodePoint); } else { codePoint <<= CODE_POINT_BIT_COUNT_IN_TRAILING_BYTE; codePoint += maskTrailingByte(c); } remainingByteCountForCurrentCodePoint--; if (remainingByteCountForCurrentCodePoint == 0) { if (codePoint <= MAX_ENCODED_CODE_POINT_VALUES[currentCodePointSequenceSize - 1]) { AKLOGE("%d bytes encode for codePoint(%x) is a redundant UTF-8 sequence.", currentCodePointSequenceSize, codePoint); return std::vector<int>(); } codePoints.push_back(codePoint); } } return codePoints; } /* static */ int Utf8Utils::getSequenceSizeByCheckingFirstByte(const uint8_t firstByte) { for (size_t i = 1; i <= MAX_SEQUENCE_SIZE_FOR_A_CODE_POINT; ++i) { if ((firstByte & 
FIRST_BYTE_MARKER_MASKS[i]) == FIRST_BYTE_MARKERS[i]) { return i; } } // Not a valid utf8 char first byte. return -1; } /* static */ AK_FORCE_INLINE int Utf8Utils::maskFirstByte(const uint8_t firstByte, const int sequenceSize) { return firstByte & FIRST_BYTE_CODE_POINT_BITS_MASKS[sequenceSize]; } /* static */ AK_FORCE_INLINE int Utf8Utils::maskTrailingByte(const uint8_t secondOrLaterByte) { return secondOrLaterByte & TRAILING_BYTE_CODE_POINT_BITS_MASK; } /* static */ std::string Utf8Utils::getUtf8String(const CodePointArrayView codePoints) { std::string utf8String; for (const int codePoint : codePoints) { const int sequenceSize = getSequenceSizeToEncodeCodePoint(codePoint); if (sequenceSize <= 0) { AKLOGE("Cannot encode code point (%d).", codePoint); return std::string(); } const int trailingByteCount = sequenceSize - 1; // Output first byte. const int value = codePoint >> (trailingByteCount * CODE_POINT_BIT_COUNT_IN_TRAILING_BYTE); utf8String.push_back(static_cast<char>(value | FIRST_BYTE_MARKERS[sequenceSize])); // Output second and later bytes. for (int i = 1; i < sequenceSize; ++i) { const int shiftAmount = (trailingByteCount - i) * CODE_POINT_BIT_COUNT_IN_TRAILING_BYTE; const int value = (codePoint >> shiftAmount) & TRAILING_BYTE_CODE_POINT_BITS_MASK; utf8String.push_back(static_cast<char>(value | TRAILING_BYTE_MARKER)); } } return utf8String; } /* static */ int Utf8Utils::getSequenceSizeToEncodeCodePoint(const int codePoint) { if (codePoint < 0) { return -1; } for (size_t i = 1; i <= MAX_SEQUENCE_SIZE_FOR_A_CODE_POINT; ++i) { if (codePoint <= MAX_ENCODED_CODE_POINT_VALUES[i]) { return i; } } return -1; } } // namespace dicttoolkit } // namespace latinime native/dicttoolkit/src/utils/utf8_utils.h 0 → 100644 +56 −0 Original line number Diff line number Diff line /* * Copyright (C) 2014 The Android Open Source Project * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
* You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ #ifndef LATINIME_DICT_TOOLKIT_UTF8_UTILS_H #define LATINIME_DICT_TOOLKIT_UTF8_UTILS_H #include <cstdint> #include <string> #include <vector> #include "dict_toolkit_defines.h" #include "utils/int_array_view.h" namespace latinime { namespace dicttoolkit { class Utf8Utils { public: static std::vector<int> getCodePoints(const std::string &utf8Str); static std::string getUtf8String(const CodePointArrayView codePoints); private: DISALLOW_IMPLICIT_CONSTRUCTORS(Utf8Utils); // Values indexed by sequence size. static const size_t MAX_SEQUENCE_SIZE_FOR_A_CODE_POINT; static const uint8_t FIRST_BYTE_MARKER_MASKS[]; static const uint8_t FIRST_BYTE_MARKERS[]; static const uint8_t FIRST_BYTE_CODE_POINT_BITS_MASKS[]; static const int MAX_ENCODED_CODE_POINT_VALUES[]; static const uint8_t TRAILING_BYTE_CODE_POINT_BITS_MASK; static const uint8_t TRAILING_BYTE_MARKER; static const size_t CODE_POINT_BIT_COUNT_IN_TRAILING_BYTE; static int getSequenceSizeByCheckingFirstByte(const uint8_t firstByte); static int maskFirstByte(const uint8_t firstByte, const int encodeSize); static int maskTrailingByte(const uint8_t secondOrLaterByte); static int getSequenceSizeToEncodeCodePoint(const int codePoint); }; } // namespace dicttoolkit } // namespace latinime #endif // LATINIME_DICT_TOOLKIT_UTF8_UTILS_H native/dicttoolkit/tests/utils/utf8_utils_test.cpp 0 → 100644 +85 −0 Original line number Diff line number Diff line /* * Copyright (C) 2014 The Android Open Source Project * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the 
License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ #include "utils/utf8_utils.h" #include <gtest/gtest.h> #include <vector> #include "utils/int_array_view.h" namespace latinime { namespace dicttoolkit { namespace { TEST(Utf8UtilsTests, TestGetCodePoints) { { const std::vector<int> codePoints = Utf8Utils::getCodePoints(""); EXPECT_EQ(0u, codePoints.size()); } { const std::vector<int> codePoints = Utf8Utils::getCodePoints("test"); EXPECT_EQ(4u, codePoints.size()); EXPECT_EQ('t', codePoints[0]); EXPECT_EQ('e', codePoints[1]); EXPECT_EQ('s', codePoints[2]); EXPECT_EQ('t', codePoints[3]); } { const std::vector<int> codePoints = Utf8Utils::getCodePoints(u8"\u3042a\u03C2\u0410"); EXPECT_EQ(4u, codePoints.size()); EXPECT_EQ(0x3042, codePoints[0]); // HIRAGANA LETTER A EXPECT_EQ('a', codePoints[1]); EXPECT_EQ(0x03C2, codePoints[2]); // CYRILLIC CAPITAL LETTER A EXPECT_EQ(0x0410, codePoints[3]); // GREEK SMALL LETTER FINAL SIGMA } { const std::vector<int> codePoints = Utf8Utils::getCodePoints(u8"\U0001F36A?\U0001F752"); EXPECT_EQ(3u, codePoints.size()); EXPECT_EQ(0x1F36A, codePoints[0]); // COOKIE EXPECT_EQ('?', codePoints[1]); EXPECT_EQ(0x1F752, codePoints[2]); // ALCHEMICAL SYMBOL FOR STARRED TRIDENT } // Redundant UTF-8 sequences must be rejected. 
EXPECT_TRUE(Utf8Utils::getCodePoints("\xC0\xAF").empty()); EXPECT_TRUE(Utf8Utils::getCodePoints("\xE0\x80\xAF").empty()); EXPECT_TRUE(Utf8Utils::getCodePoints("\xF0\x80\x80\xAF").empty()); } TEST(Utf8UtilsTests, TestGetUtf8String) { { const std::vector<int> codePoints = {'t', 'e', 's', 't'}; EXPECT_EQ("test", Utf8Utils::getUtf8String(CodePointArrayView(codePoints))); } { const std::vector<int> codePoints = { 0x00E0 /* LATIN SMALL LETTER A WITH GRAVE */, 0x03C2 /* GREEK SMALL LETTER FINAL SIGMA */, 0x0430 /* CYRILLIC SMALL LETTER A */, 0x3042 /* HIRAGANA LETTER A */, 0x1F36A /* COOKIE */, 0x1F752 /* ALCHEMICAL SYMBOL FOR STARRED TRIDENT */ }; EXPECT_EQ(u8"\u00E0\u03C2\u0430\u3042\U0001F36A\U0001F752", Utf8Utils::getUtf8String(CodePointArrayView(codePoints))); } } } // namespace } // namespace dicttoolkit } // namespace latinime Loading
native/dicttoolkit/NativeFileList.mk (+5 −2)
@@ -24,11 +24,14 @@ LATIN_IME_DICT_TOOLKIT_SRC_FILES := \
         makedict_executor.cpp) \
     $(addprefix offdevice_intermediate_dict/, \
         offdevice_intermediate_dict.cpp) \
-    utils/command_utils.cpp
+    $(addprefix utils/, \
+        command_utils.cpp \
+        utf8_utils.cpp)
 
 LATIN_IME_DICT_TOOLKIT_TEST_FILES := \
     dict_toolkit_defines_test.cpp \
     $(addprefix offdevice_intermediate_dict/, \
         offdevice_intermediate_dict_test.cpp) \
     $(addprefix utils/, \
-        command_utils_test.cpp)
+        command_utils_test.cpp \
+        utf8_utils_test.cpp)
native/dicttoolkit/src/utils/utf8_utils.cpp 0 → 100644 +119 −0 Original line number Diff line number Diff line /* * Copyright (C) 2014 The Android Open Source Project * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ #include "utils/utf8_utils.h" #include "utils/char_utils.h" namespace latinime { namespace dicttoolkit { const size_t Utf8Utils::MAX_SEQUENCE_SIZE_FOR_A_CODE_POINT = 4; const uint8_t Utf8Utils::FIRST_BYTE_MARKER_MASKS[] = {0, 0x80, 0xE0, 0xF0, 0xF8}; const uint8_t Utf8Utils::FIRST_BYTE_MARKERS[] = {0, 0x00, 0xC0, 0xE0, 0xF0}; const uint8_t Utf8Utils::FIRST_BYTE_CODE_POINT_BITS_MASKS[] = {0, 0x7F, 0x1F, 0x0F, 0x03}; const int Utf8Utils::MAX_ENCODED_CODE_POINT_VALUES[] = {-1, 0x7F, 0x7FF, 0xFFFF, 0x10FFFF}; const uint8_t Utf8Utils::TRAILING_BYTE_CODE_POINT_BITS_MASK = 0x3F; const uint8_t Utf8Utils::TRAILING_BYTE_MARKER = 0x80; const size_t Utf8Utils::CODE_POINT_BIT_COUNT_IN_TRAILING_BYTE = 6; /* static */ std::vector<int> Utf8Utils::getCodePoints(const std::string &utf8Str) { std::vector<int> codePoints; int remainingByteCountForCurrentCodePoint = 0; int currentCodePointSequenceSize = 0; int codePoint = 0; for (const char c : utf8Str) { if (remainingByteCountForCurrentCodePoint == 0) { currentCodePointSequenceSize = getSequenceSizeByCheckingFirstByte(c); if (currentCodePointSequenceSize <= 0) { AKLOGE("%x is an invalid utf8 first byte value.", c); return std::vector<int>(); } remainingByteCountForCurrentCodePoint = currentCodePointSequenceSize; codePoint = 
maskFirstByte(c, remainingByteCountForCurrentCodePoint); } else { codePoint <<= CODE_POINT_BIT_COUNT_IN_TRAILING_BYTE; codePoint += maskTrailingByte(c); } remainingByteCountForCurrentCodePoint--; if (remainingByteCountForCurrentCodePoint == 0) { if (codePoint <= MAX_ENCODED_CODE_POINT_VALUES[currentCodePointSequenceSize - 1]) { AKLOGE("%d bytes encode for codePoint(%x) is a redundant UTF-8 sequence.", currentCodePointSequenceSize, codePoint); return std::vector<int>(); } codePoints.push_back(codePoint); } } return codePoints; } /* static */ int Utf8Utils::getSequenceSizeByCheckingFirstByte(const uint8_t firstByte) { for (size_t i = 1; i <= MAX_SEQUENCE_SIZE_FOR_A_CODE_POINT; ++i) { if ((firstByte & FIRST_BYTE_MARKER_MASKS[i]) == FIRST_BYTE_MARKERS[i]) { return i; } } // Not a valid utf8 char first byte. return -1; } /* static */ AK_FORCE_INLINE int Utf8Utils::maskFirstByte(const uint8_t firstByte, const int sequenceSize) { return firstByte & FIRST_BYTE_CODE_POINT_BITS_MASKS[sequenceSize]; } /* static */ AK_FORCE_INLINE int Utf8Utils::maskTrailingByte(const uint8_t secondOrLaterByte) { return secondOrLaterByte & TRAILING_BYTE_CODE_POINT_BITS_MASK; } /* static */ std::string Utf8Utils::getUtf8String(const CodePointArrayView codePoints) { std::string utf8String; for (const int codePoint : codePoints) { const int sequenceSize = getSequenceSizeToEncodeCodePoint(codePoint); if (sequenceSize <= 0) { AKLOGE("Cannot encode code point (%d).", codePoint); return std::string(); } const int trailingByteCount = sequenceSize - 1; // Output first byte. const int value = codePoint >> (trailingByteCount * CODE_POINT_BIT_COUNT_IN_TRAILING_BYTE); utf8String.push_back(static_cast<char>(value | FIRST_BYTE_MARKERS[sequenceSize])); // Output second and later bytes. 
for (int i = 1; i < sequenceSize; ++i) { const int shiftAmount = (trailingByteCount - i) * CODE_POINT_BIT_COUNT_IN_TRAILING_BYTE; const int value = (codePoint >> shiftAmount) & TRAILING_BYTE_CODE_POINT_BITS_MASK; utf8String.push_back(static_cast<char>(value | TRAILING_BYTE_MARKER)); } } return utf8String; } /* static */ int Utf8Utils::getSequenceSizeToEncodeCodePoint(const int codePoint) { if (codePoint < 0) { return -1; } for (size_t i = 1; i <= MAX_SEQUENCE_SIZE_FOR_A_CODE_POINT; ++i) { if (codePoint <= MAX_ENCODED_CODE_POINT_VALUES[i]) { return i; } } return -1; } } // namespace dicttoolkit } // namespace latinime
native/dicttoolkit/src/utils/utf8_utils.h 0 → 100644 +56 −0 Original line number Diff line number Diff line /* * Copyright (C) 2014 The Android Open Source Project * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ #ifndef LATINIME_DICT_TOOLKIT_UTF8_UTILS_H #define LATINIME_DICT_TOOLKIT_UTF8_UTILS_H #include <cstdint> #include <string> #include <vector> #include "dict_toolkit_defines.h" #include "utils/int_array_view.h" namespace latinime { namespace dicttoolkit { class Utf8Utils { public: static std::vector<int> getCodePoints(const std::string &utf8Str); static std::string getUtf8String(const CodePointArrayView codePoints); private: DISALLOW_IMPLICIT_CONSTRUCTORS(Utf8Utils); // Values indexed by sequence size. static const size_t MAX_SEQUENCE_SIZE_FOR_A_CODE_POINT; static const uint8_t FIRST_BYTE_MARKER_MASKS[]; static const uint8_t FIRST_BYTE_MARKERS[]; static const uint8_t FIRST_BYTE_CODE_POINT_BITS_MASKS[]; static const int MAX_ENCODED_CODE_POINT_VALUES[]; static const uint8_t TRAILING_BYTE_CODE_POINT_BITS_MASK; static const uint8_t TRAILING_BYTE_MARKER; static const size_t CODE_POINT_BIT_COUNT_IN_TRAILING_BYTE; static int getSequenceSizeByCheckingFirstByte(const uint8_t firstByte); static int maskFirstByte(const uint8_t firstByte, const int encodeSize); static int maskTrailingByte(const uint8_t secondOrLaterByte); static int getSequenceSizeToEncodeCodePoint(const int codePoint); }; } // namespace dicttoolkit } // namespace latinime #endif // LATINIME_DICT_TOOLKIT_UTF8_UTILS_H
native/dicttoolkit/tests/utils/utf8_utils_test.cpp 0 → 100644 +85 −0 Original line number Diff line number Diff line /* * Copyright (C) 2014 The Android Open Source Project * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ #include "utils/utf8_utils.h" #include <gtest/gtest.h> #include <vector> #include "utils/int_array_view.h" namespace latinime { namespace dicttoolkit { namespace { TEST(Utf8UtilsTests, TestGetCodePoints) { { const std::vector<int> codePoints = Utf8Utils::getCodePoints(""); EXPECT_EQ(0u, codePoints.size()); } { const std::vector<int> codePoints = Utf8Utils::getCodePoints("test"); EXPECT_EQ(4u, codePoints.size()); EXPECT_EQ('t', codePoints[0]); EXPECT_EQ('e', codePoints[1]); EXPECT_EQ('s', codePoints[2]); EXPECT_EQ('t', codePoints[3]); } { const std::vector<int> codePoints = Utf8Utils::getCodePoints(u8"\u3042a\u03C2\u0410"); EXPECT_EQ(4u, codePoints.size()); EXPECT_EQ(0x3042, codePoints[0]); // HIRAGANA LETTER A EXPECT_EQ('a', codePoints[1]); EXPECT_EQ(0x03C2, codePoints[2]); // CYRILLIC CAPITAL LETTER A EXPECT_EQ(0x0410, codePoints[3]); // GREEK SMALL LETTER FINAL SIGMA } { const std::vector<int> codePoints = Utf8Utils::getCodePoints(u8"\U0001F36A?\U0001F752"); EXPECT_EQ(3u, codePoints.size()); EXPECT_EQ(0x1F36A, codePoints[0]); // COOKIE EXPECT_EQ('?', codePoints[1]); EXPECT_EQ(0x1F752, codePoints[2]); // ALCHEMICAL SYMBOL FOR STARRED TRIDENT } // Redundant UTF-8 sequences must be rejected. 
EXPECT_TRUE(Utf8Utils::getCodePoints("\xC0\xAF").empty()); EXPECT_TRUE(Utf8Utils::getCodePoints("\xE0\x80\xAF").empty()); EXPECT_TRUE(Utf8Utils::getCodePoints("\xF0\x80\x80\xAF").empty()); } TEST(Utf8UtilsTests, TestGetUtf8String) { { const std::vector<int> codePoints = {'t', 'e', 's', 't'}; EXPECT_EQ("test", Utf8Utils::getUtf8String(CodePointArrayView(codePoints))); } { const std::vector<int> codePoints = { 0x00E0 /* LATIN SMALL LETTER A WITH GRAVE */, 0x03C2 /* GREEK SMALL LETTER FINAL SIGMA */, 0x0430 /* CYRILLIC SMALL LETTER A */, 0x3042 /* HIRAGANA LETTER A */, 0x1F36A /* COOKIE */, 0x1F752 /* ALCHEMICAL SYMBOL FOR STARRED TRIDENT */ }; EXPECT_EQ(u8"\u00E0\u03C2\u0430\u3042\U0001F36A\U0001F752", Utf8Utils::getUtf8String(CodePointArrayView(codePoints))); } } } // namespace } // namespace dicttoolkit } // namespace latinime