Loading native/jni/NativeFileList.mk +1 −0 Original line number Diff line number Diff line Loading @@ -134,5 +134,6 @@ LATIN_IME_CORE_TEST_FILES := \ suggest/policyimpl/dictionary/utils/trie_map_test.cpp \ suggest/policyimpl/utils/damerau_levenshtein_edit_distance_policy_test.cpp \ utils/autocorrection_threshold_utils_test.cpp \ utils/char_utils_test.cpp \ utils/int_array_view_test.cpp \ utils/time_keeper_test.cpp native/jni/src/utils/char_utils.cpp +2 −2 Original line number Diff line number Diff line Loading @@ -1057,11 +1057,11 @@ static int compare_pair_capital(const void *a, const void *b) { - static_cast<int>((static_cast<const struct LatinCapitalSmallPair *>(b))->capital); } /* static */ unsigned short CharUtils::latin_tolower(const unsigned short c) { /* static */ int CharUtils::latin_tolower(const int c) { struct LatinCapitalSmallPair *p = static_cast<struct LatinCapitalSmallPair *>(bsearch(&c, SORTED_CHAR_MAP, NELEMS(SORTED_CHAR_MAP), sizeof(SORTED_CHAR_MAP[0]), compare_pair_capital)); return p ? p->small : c; return p ? static_cast<int>(p->small) : c; } /* Loading native/jni/src/utils/char_utils.h +13 −13 Original line number Diff line number Diff line Loading @@ -27,20 +27,14 @@ namespace latinime { class CharUtils { public: static const std::vector<int> EMPTY_STRING; static AK_FORCE_INLINE bool isAsciiUpper(int c) { // Note: isupper(...) reports false positives for some Cyrillic characters, causing them to // be incorrectly lower-cased using toAsciiLower(...) rather than latin_tolower(...). return (c >= 'A' && c <= 'Z'); } static AK_FORCE_INLINE int toAsciiLower(int c) { return c - 'A' + 'a'; } static AK_FORCE_INLINE bool isAscii(int c) { return isascii(c) != 0; } static AK_FORCE_INLINE int toLowerCase(const int c) { if (isAsciiUpper(c)) { return toAsciiLower(c); Loading @@ -48,7 +42,7 @@ class CharUtils { if (isAscii(c)) { return c; } return static_cast<int>(latin_tolower(static_cast<unsigned short>(c))); return latin_tolower(c); } static AK_FORCE_INLINE int toBaseLowerCase(const int c) { Loading @@ -59,7 +53,6 @@ class CharUtils { // TODO: Do not hardcode here return codePoint == KEYCODE_SINGLE_QUOTE || codePoint == KEYCODE_HYPHEN_MINUS; } static AK_FORCE_INLINE int getCodePointCount(const int arraySize, const int *const codePoints) { int size = 0; for (; size < arraySize; ++size) { Loading Loading @@ -91,9 +84,6 @@ class CharUtils { return codePoint >= MIN_UNICODE_CODE_POINT && codePoint <= MAX_UNICODE_CODE_POINT; } static unsigned short latin_tolower(const unsigned short c); static const std::vector<int> EMPTY_STRING; // Returns updated code point count. Returns 0 when the code points cannot be marked as a // Beginning-of-Sentence. static AK_FORCE_INLINE int attachBeginningOfSentenceMarker(int *const codePoints, Loading Loading @@ -125,6 +115,16 @@ class CharUtils { */ static const int BASE_CHARS_SIZE = 0x0500; static const unsigned short BASE_CHARS[BASE_CHARS_SIZE]; static AK_FORCE_INLINE bool isAscii(int c) { return isascii(c) != 0; } static AK_FORCE_INLINE int toAsciiLower(int c) { return c - 'A' + 'a'; } static int latin_tolower(const int c); }; } // namespace latinime #endif // LATINIME_CHAR_UTILS_H native/jni/tests/utils/char_utils_test.cpp 0 → 100644 +122 −0 Original line number Diff line number Diff line /* * Copyright (C) 2014 The Android Open Source Project * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ #include "utils/char_utils.h" #include <gtest/gtest.h> #include "defines.h" namespace latinime { namespace { TEST(CharUtilsTest, TestIsAsciiUpper) { EXPECT_TRUE(CharUtils::isAsciiUpper('A')); EXPECT_TRUE(CharUtils::isAsciiUpper('Z')); EXPECT_FALSE(CharUtils::isAsciiUpper('a')); EXPECT_FALSE(CharUtils::isAsciiUpper('z')); EXPECT_FALSE(CharUtils::isAsciiUpper('@')); EXPECT_FALSE(CharUtils::isAsciiUpper(' ')); EXPECT_FALSE(CharUtils::isAsciiUpper(0x00C0 /* LATIN CAPITAL LETTER A WITH GRAVE */)); EXPECT_FALSE(CharUtils::isAsciiUpper(0x00E0 /* LATIN SMALL LETTER A WITH GRAVE */)); EXPECT_FALSE(CharUtils::isAsciiUpper(0x03C2 /* GREEK SMALL LETTER FINAL SIGMA */)); EXPECT_FALSE(CharUtils::isAsciiUpper(0x0410 /* CYRILLIC CAPITAL LETTER A */)); EXPECT_FALSE(CharUtils::isAsciiUpper(0x0430 /* CYRILLIC SMALL LETTER A */)); EXPECT_FALSE(CharUtils::isAsciiUpper(0x3042 /* HIRAGANA LETTER A */)); EXPECT_FALSE(CharUtils::isAsciiUpper(0x1F36A /* COOKIE */)); } TEST(CharUtilsTest, TestToLowerCase) { EXPECT_EQ('a', CharUtils::toLowerCase('A')); EXPECT_EQ('z', CharUtils::toLowerCase('Z')); EXPECT_EQ('a', CharUtils::toLowerCase('a')); EXPECT_EQ('z', CharUtils::toLowerCase('z')); EXPECT_EQ('@', CharUtils::toLowerCase('@')); EXPECT_EQ(' ', CharUtils::toLowerCase(' ')); EXPECT_EQ(0x00E0 /* LATIN SMALL LETTER A WITH GRAVE */, CharUtils::toLowerCase(0x00C0 /* LATIN CAPITAL LETTER A WITH GRAVE */)); EXPECT_EQ(0x00E0 /* LATIN SMALL LETTER A WITH GRAVE */, CharUtils::toLowerCase(0x00E0 /* LATIN SMALL LETTER A WITH GRAVE */)); EXPECT_EQ(0x03C2 /* GREEK SMALL LETTER FINAL SIGMA */, CharUtils::toLowerCase(0x03C2 /* GREEK SMALL LETTER FINAL SIGMA */)); EXPECT_EQ(0x0430 /* CYRILLIC SMALL LETTER A */, CharUtils::toLowerCase(0x0410 /* CYRILLIC CAPITAL LETTER A */)); EXPECT_EQ(0x0430 /* CYRILLIC SMALL LETTER A */, CharUtils::toLowerCase(0x0430 /* CYRILLIC SMALL LETTER A */)); EXPECT_EQ(0x3042 /* HIRAGANA LETTER A */, CharUtils::toLowerCase(0x3042 /* HIRAGANA LETTER A */)); EXPECT_EQ(0x1F36A /* COOKIE */, CharUtils::toLowerCase(0x1F36A /* COOKIE */)); } TEST(CharUtilsTest, TestToBaseLowerCase) { EXPECT_EQ('a', CharUtils::toBaseLowerCase('A')); EXPECT_EQ('z', CharUtils::toBaseLowerCase('Z')); EXPECT_EQ('a', CharUtils::toBaseLowerCase('a')); EXPECT_EQ('z', CharUtils::toBaseLowerCase('z')); EXPECT_EQ('@', CharUtils::toBaseLowerCase('@')); EXPECT_EQ(' ', CharUtils::toBaseLowerCase(' ')); EXPECT_EQ('a', CharUtils::toBaseLowerCase(0x00C0 /* LATIN CAPITAL LETTER A WITH GRAVE */)); EXPECT_EQ('a', CharUtils::toBaseLowerCase(0x00E0 /* LATIN SMALL LETTER A WITH GRAVE */)); EXPECT_EQ(0x03C2 /* GREEK SMALL LETTER FINAL SIGMA */, CharUtils::toBaseLowerCase(0x03C2 /* GREEK SMALL LETTER FINAL SIGMA */)); EXPECT_EQ(0x0430 /* CYRILLIC SMALL LETTER A */, CharUtils::toBaseLowerCase(0x0410 /* CYRILLIC CAPITAL LETTER A */)); EXPECT_EQ(0x0430 /* CYRILLIC SMALL LETTER A */, CharUtils::toBaseLowerCase(0x0430 /* CYRILLIC SMALL LETTER A */)); EXPECT_EQ(0x3042 /* HIRAGANA LETTER A */, CharUtils::toBaseLowerCase(0x3042 /* HIRAGANA LETTER A */)); EXPECT_EQ(0x1F36A /* COOKIE */, CharUtils::toBaseLowerCase(0x1F36A /* COOKIE */)); } TEST(CharUtilsTest, TestToBaseCodePoint) { EXPECT_EQ('A', CharUtils::toBaseCodePoint('A')); EXPECT_EQ('Z', CharUtils::toBaseCodePoint('Z')); EXPECT_EQ('a', CharUtils::toBaseCodePoint('a')); EXPECT_EQ('z', CharUtils::toBaseCodePoint('z')); EXPECT_EQ('@', CharUtils::toBaseCodePoint('@')); EXPECT_EQ(' ', CharUtils::toBaseCodePoint(' ')); EXPECT_EQ('A', CharUtils::toBaseCodePoint(0x00C0 /* LATIN CAPITAL LETTER A WITH GRAVE */)); EXPECT_EQ('a', CharUtils::toBaseCodePoint(0x00E0 /* LATIN SMALL LETTER A WITH GRAVE */)); EXPECT_EQ(0x03C2 /* GREEK SMALL LETTER FINAL SIGMA */, CharUtils::toBaseLowerCase(0x03C2 /* GREEK SMALL LETTER FINAL SIGMA */)); EXPECT_EQ(0x0410 /* CYRILLIC CAPITAL LETTER A */, CharUtils::toBaseCodePoint(0x0410 /* CYRILLIC CAPITAL LETTER A */)); EXPECT_EQ(0x0430 /* CYRILLIC SMALL LETTER A */, CharUtils::toBaseCodePoint(0x0430 /* CYRILLIC SMALL LETTER A */)); EXPECT_EQ(0x3042 /* HIRAGANA LETTER A */, CharUtils::toBaseCodePoint(0x3042 /* HIRAGANA LETTER A */)); EXPECT_EQ(0x1F36A /* COOKIE */, CharUtils::toBaseCodePoint(0x1F36A /* COOKIE */)); } TEST(CharUtilsTest, TestIsIntentionalOmissionCodePoint) { EXPECT_TRUE(CharUtils::isIntentionalOmissionCodePoint('\'')); EXPECT_TRUE(CharUtils::isIntentionalOmissionCodePoint('-')); EXPECT_FALSE(CharUtils::isIntentionalOmissionCodePoint('a')); EXPECT_FALSE(CharUtils::isIntentionalOmissionCodePoint('?')); EXPECT_FALSE(CharUtils::isIntentionalOmissionCodePoint('/')); } TEST(CharUtilsTest, TestIsInUnicodeSpace) { EXPECT_FALSE(CharUtils::isInUnicodeSpace(NOT_A_CODE_POINT)); EXPECT_FALSE(CharUtils::isInUnicodeSpace(CODE_POINT_BEGINNING_OF_SENTENCE)); EXPECT_TRUE(CharUtils::isInUnicodeSpace('a')); EXPECT_TRUE(CharUtils::isInUnicodeSpace(0x0410 /* CYRILLIC CAPITAL LETTER A */)); EXPECT_TRUE(CharUtils::isInUnicodeSpace(0x3042 /* HIRAGANA LETTER A */)); EXPECT_TRUE(CharUtils::isInUnicodeSpace(0x1F36A /* COOKIE */)); } } // namespace } // namespace latinime Loading
native/jni/NativeFileList.mk +1 −0 Original line number Diff line number Diff line Loading @@ -134,5 +134,6 @@ LATIN_IME_CORE_TEST_FILES := \ suggest/policyimpl/dictionary/utils/trie_map_test.cpp \ suggest/policyimpl/utils/damerau_levenshtein_edit_distance_policy_test.cpp \ utils/autocorrection_threshold_utils_test.cpp \ utils/char_utils_test.cpp \ utils/int_array_view_test.cpp \ utils/time_keeper_test.cpp
native/jni/src/utils/char_utils.cpp +2 −2 Original line number Diff line number Diff line Loading @@ -1057,11 +1057,11 @@ static int compare_pair_capital(const void *a, const void *b) { - static_cast<int>((static_cast<const struct LatinCapitalSmallPair *>(b))->capital); } /* static */ unsigned short CharUtils::latin_tolower(const unsigned short c) { /* static */ int CharUtils::latin_tolower(const int c) { struct LatinCapitalSmallPair *p = static_cast<struct LatinCapitalSmallPair *>(bsearch(&c, SORTED_CHAR_MAP, NELEMS(SORTED_CHAR_MAP), sizeof(SORTED_CHAR_MAP[0]), compare_pair_capital)); return p ? p->small : c; return p ? static_cast<int>(p->small) : c; } /* Loading
native/jni/src/utils/char_utils.h +13 −13 Original line number Diff line number Diff line Loading @@ -27,20 +27,14 @@ namespace latinime { class CharUtils { public: static const std::vector<int> EMPTY_STRING; static AK_FORCE_INLINE bool isAsciiUpper(int c) { // Note: isupper(...) reports false positives for some Cyrillic characters, causing them to // be incorrectly lower-cased using toAsciiLower(...) rather than latin_tolower(...). return (c >= 'A' && c <= 'Z'); } static AK_FORCE_INLINE int toAsciiLower(int c) { return c - 'A' + 'a'; } static AK_FORCE_INLINE bool isAscii(int c) { return isascii(c) != 0; } static AK_FORCE_INLINE int toLowerCase(const int c) { if (isAsciiUpper(c)) { return toAsciiLower(c); Loading @@ -48,7 +42,7 @@ class CharUtils { if (isAscii(c)) { return c; } return static_cast<int>(latin_tolower(static_cast<unsigned short>(c))); return latin_tolower(c); } static AK_FORCE_INLINE int toBaseLowerCase(const int c) { Loading @@ -59,7 +53,6 @@ class CharUtils { // TODO: Do not hardcode here return codePoint == KEYCODE_SINGLE_QUOTE || codePoint == KEYCODE_HYPHEN_MINUS; } static AK_FORCE_INLINE int getCodePointCount(const int arraySize, const int *const codePoints) { int size = 0; for (; size < arraySize; ++size) { Loading Loading @@ -91,9 +84,6 @@ class CharUtils { return codePoint >= MIN_UNICODE_CODE_POINT && codePoint <= MAX_UNICODE_CODE_POINT; } static unsigned short latin_tolower(const unsigned short c); static const std::vector<int> EMPTY_STRING; // Returns updated code point count. Returns 0 when the code points cannot be marked as a // Beginning-of-Sentence. static AK_FORCE_INLINE int attachBeginningOfSentenceMarker(int *const codePoints, Loading Loading @@ -125,6 +115,16 @@ class CharUtils { */ static const int BASE_CHARS_SIZE = 0x0500; static const unsigned short BASE_CHARS[BASE_CHARS_SIZE]; static AK_FORCE_INLINE bool isAscii(int c) { return isascii(c) != 0; } static AK_FORCE_INLINE int toAsciiLower(int c) { return c - 'A' + 'a'; } static int latin_tolower(const int c); }; } // namespace latinime #endif // LATINIME_CHAR_UTILS_H
native/jni/tests/utils/char_utils_test.cpp 0 → 100644 +122 −0 Original line number Diff line number Diff line /* * Copyright (C) 2014 The Android Open Source Project * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ #include "utils/char_utils.h" #include <gtest/gtest.h> #include "defines.h" namespace latinime { namespace { TEST(CharUtilsTest, TestIsAsciiUpper) { EXPECT_TRUE(CharUtils::isAsciiUpper('A')); EXPECT_TRUE(CharUtils::isAsciiUpper('Z')); EXPECT_FALSE(CharUtils::isAsciiUpper('a')); EXPECT_FALSE(CharUtils::isAsciiUpper('z')); EXPECT_FALSE(CharUtils::isAsciiUpper('@')); EXPECT_FALSE(CharUtils::isAsciiUpper(' ')); EXPECT_FALSE(CharUtils::isAsciiUpper(0x00C0 /* LATIN CAPITAL LETTER A WITH GRAVE */)); EXPECT_FALSE(CharUtils::isAsciiUpper(0x00E0 /* LATIN SMALL LETTER A WITH GRAVE */)); EXPECT_FALSE(CharUtils::isAsciiUpper(0x03C2 /* GREEK SMALL LETTER FINAL SIGMA */)); EXPECT_FALSE(CharUtils::isAsciiUpper(0x0410 /* CYRILLIC CAPITAL LETTER A */)); EXPECT_FALSE(CharUtils::isAsciiUpper(0x0430 /* CYRILLIC SMALL LETTER A */)); EXPECT_FALSE(CharUtils::isAsciiUpper(0x3042 /* HIRAGANA LETTER A */)); EXPECT_FALSE(CharUtils::isAsciiUpper(0x1F36A /* COOKIE */)); } TEST(CharUtilsTest, TestToLowerCase) { EXPECT_EQ('a', CharUtils::toLowerCase('A')); EXPECT_EQ('z', CharUtils::toLowerCase('Z')); EXPECT_EQ('a', CharUtils::toLowerCase('a')); EXPECT_EQ('z', CharUtils::toLowerCase('z')); EXPECT_EQ('@', CharUtils::toLowerCase('@')); EXPECT_EQ(' ', CharUtils::toLowerCase(' ')); EXPECT_EQ(0x00E0 /* LATIN SMALL LETTER A WITH GRAVE */, CharUtils::toLowerCase(0x00C0 /* LATIN CAPITAL LETTER A WITH GRAVE */)); EXPECT_EQ(0x00E0 /* LATIN SMALL LETTER A WITH GRAVE */, CharUtils::toLowerCase(0x00E0 /* LATIN SMALL LETTER A WITH GRAVE */)); EXPECT_EQ(0x03C2 /* GREEK SMALL LETTER FINAL SIGMA */, CharUtils::toLowerCase(0x03C2 /* GREEK SMALL LETTER FINAL SIGMA */)); EXPECT_EQ(0x0430 /* CYRILLIC SMALL LETTER A */, CharUtils::toLowerCase(0x0410 /* CYRILLIC CAPITAL LETTER A */)); EXPECT_EQ(0x0430 /* CYRILLIC SMALL LETTER A */, CharUtils::toLowerCase(0x0430 /* CYRILLIC SMALL LETTER A */)); EXPECT_EQ(0x3042 /* HIRAGANA LETTER A */, CharUtils::toLowerCase(0x3042 /* HIRAGANA LETTER A */)); EXPECT_EQ(0x1F36A /* COOKIE */, CharUtils::toLowerCase(0x1F36A /* COOKIE */)); } TEST(CharUtilsTest, TestToBaseLowerCase) { EXPECT_EQ('a', CharUtils::toBaseLowerCase('A')); EXPECT_EQ('z', CharUtils::toBaseLowerCase('Z')); EXPECT_EQ('a', CharUtils::toBaseLowerCase('a')); EXPECT_EQ('z', CharUtils::toBaseLowerCase('z')); EXPECT_EQ('@', CharUtils::toBaseLowerCase('@')); EXPECT_EQ(' ', CharUtils::toBaseLowerCase(' ')); EXPECT_EQ('a', CharUtils::toBaseLowerCase(0x00C0 /* LATIN CAPITAL LETTER A WITH GRAVE */)); EXPECT_EQ('a', CharUtils::toBaseLowerCase(0x00E0 /* LATIN SMALL LETTER A WITH GRAVE */)); EXPECT_EQ(0x03C2 /* GREEK SMALL LETTER FINAL SIGMA */, CharUtils::toBaseLowerCase(0x03C2 /* GREEK SMALL LETTER FINAL SIGMA */)); EXPECT_EQ(0x0430 /* CYRILLIC SMALL LETTER A */, CharUtils::toBaseLowerCase(0x0410 /* CYRILLIC CAPITAL LETTER A */)); EXPECT_EQ(0x0430 /* CYRILLIC SMALL LETTER A */, CharUtils::toBaseLowerCase(0x0430 /* CYRILLIC SMALL LETTER A */)); EXPECT_EQ(0x3042 /* HIRAGANA LETTER A */, CharUtils::toBaseLowerCase(0x3042 /* HIRAGANA LETTER A */)); EXPECT_EQ(0x1F36A /* COOKIE */, CharUtils::toBaseLowerCase(0x1F36A /* COOKIE */)); } TEST(CharUtilsTest, TestToBaseCodePoint) { EXPECT_EQ('A', CharUtils::toBaseCodePoint('A')); EXPECT_EQ('Z', CharUtils::toBaseCodePoint('Z')); EXPECT_EQ('a', CharUtils::toBaseCodePoint('a')); EXPECT_EQ('z', CharUtils::toBaseCodePoint('z')); EXPECT_EQ('@', CharUtils::toBaseCodePoint('@')); EXPECT_EQ(' ', CharUtils::toBaseCodePoint(' ')); EXPECT_EQ('A', CharUtils::toBaseCodePoint(0x00C0 /* LATIN CAPITAL LETTER A WITH GRAVE */)); EXPECT_EQ('a', CharUtils::toBaseCodePoint(0x00E0 /* LATIN SMALL LETTER A WITH GRAVE */)); EXPECT_EQ(0x03C2 /* GREEK SMALL LETTER FINAL SIGMA */, CharUtils::toBaseLowerCase(0x03C2 /* GREEK SMALL LETTER FINAL SIGMA */)); EXPECT_EQ(0x0410 /* CYRILLIC CAPITAL LETTER A */, CharUtils::toBaseCodePoint(0x0410 /* CYRILLIC CAPITAL LETTER A */)); EXPECT_EQ(0x0430 /* CYRILLIC SMALL LETTER A */, CharUtils::toBaseCodePoint(0x0430 /* CYRILLIC SMALL LETTER A */)); EXPECT_EQ(0x3042 /* HIRAGANA LETTER A */, CharUtils::toBaseCodePoint(0x3042 /* HIRAGANA LETTER A */)); EXPECT_EQ(0x1F36A /* COOKIE */, CharUtils::toBaseCodePoint(0x1F36A /* COOKIE */)); } TEST(CharUtilsTest, TestIsIntentionalOmissionCodePoint) { EXPECT_TRUE(CharUtils::isIntentionalOmissionCodePoint('\'')); EXPECT_TRUE(CharUtils::isIntentionalOmissionCodePoint('-')); EXPECT_FALSE(CharUtils::isIntentionalOmissionCodePoint('a')); EXPECT_FALSE(CharUtils::isIntentionalOmissionCodePoint('?')); EXPECT_FALSE(CharUtils::isIntentionalOmissionCodePoint('/')); } TEST(CharUtilsTest, TestIsInUnicodeSpace) { EXPECT_FALSE(CharUtils::isInUnicodeSpace(NOT_A_CODE_POINT)); EXPECT_FALSE(CharUtils::isInUnicodeSpace(CODE_POINT_BEGINNING_OF_SENTENCE)); EXPECT_TRUE(CharUtils::isInUnicodeSpace('a')); EXPECT_TRUE(CharUtils::isInUnicodeSpace(0x0410 /* CYRILLIC CAPITAL LETTER A */)); EXPECT_TRUE(CharUtils::isInUnicodeSpace(0x3042 /* HIRAGANA LETTER A */)); EXPECT_TRUE(CharUtils::isInUnicodeSpace(0x1F36A /* COOKIE */)); } } // namespace } // namespace latinime