Loading native/jni/src/char_utils.cpp +59 −61 Original line number Diff line number Diff line Loading @@ -26,71 +26,61 @@ struct LatinCapitalSmallPair { unsigned short small; }; // Generated from http://unicode.org/Public/UNIDATA/UnicodeData.txt // // 1. Run the following code. Bascially taken from // Dictionary::toLowerCase(unsigned short c) in dictionary.cpp. // Then, get the list of chars where cc != ccc. // // unsigned short c, cc, ccc, ccc2; // for (c = 0; c < 0xFFFF ; c++) { // if (c < NELEMS(BASE_CHARS)) { // cc = BASE_CHARS[c]; // } else { // cc = c; // } // // // tolower // int isBase = 0; // if (cc >='A' && cc <= 'Z') { // ccc = (cc | 0x20); // ccc2 = ccc; // isBase = 1; // } else if (cc > 0x7F) { // ccc = u_tolower(cc); // ccc2 = latin_tolower(cc); // } else { // ccc = cc; // ccc2 = ccc; // } // if (!isBase && cc != ccc) { // wprintf(L" 0x%04X => 0x%04X => 0x%04X %lc => %lc => %lc \n", // c, cc, ccc, c, cc, ccc); // //assert(ccc == ccc2); // } // } // // Initially, started with an empty latin_tolower() as below. // // unsigned short latin_tolower(unsigned short c) { // return c; // } // // // 2. Process the list obtained by 1 by the following perl script and apply // 'sort -u' as well. Get the SORTED_CHAR_MAP[]. // Note that '$1' in the perl script is 'cc' in the above C code. // // while(<>) { // / 0x\w* => 0x(\w*) =/; // open(HDL, "grep -iw ^" . $1 . " UnicodeData.txt | "); // $line = <HDL>; // chomp $line; // @cols = split(/;/, $line); // print " { 0x$1, 0x$cols[13] }, // $cols[1]\n"; // } // // // 3. Update the latin_tolower() function above with SORTED_CHAR_MAP. Enable // the assert(ccc == ccc2) above and confirm the function exits successfully. // // TODO: Regenerate this map by using the updated BASE_CHARS table in this file. /* * How to update the SORTED_CHAR_MAP[] array. * * 1. Download http://unicode.org/Public/UNIDATA/UnicodeData.txt * * 2. Have a latest version of ICU4C dev package installed * (Note: the current data has been generated with version 4.8) * $ apt-get install libicu-dev * * 3. Build the following code * (You need this file, char_utils.h, and defines.h) * $ g++ -o char_utils -DUPDATING_CHAR_UTILS char_utils.cpp -licuuc */ #ifdef UPDATING_CHAR_UTILS #include <stdio.h> #include <unicode/uchar.h> // ICU4C extern "C" int main() { for (unsigned short c = 0; c < 0xFFFF; c++) { const unsigned short baseC = c < NELEMS(BASE_CHARS) ? BASE_CHARS[c] : c; if (baseC <= 0x7F) continue; const unsigned short icu4cLowerBaseC = u_tolower(baseC); const unsigned short myLowerBaseC = latin_tolower(baseC); if (baseC != icu4cLowerBaseC) { #ifdef CONFIRMING_CHAR_UTILS if (icu4cLowerBaseC != myLowerBaseC) { fprintf(stderr, "icu4cLowerBaseC != myLowerBaseC, 0x%04X, 0x%04X\n", icu4cLowerBaseC, myLowerBaseC); } #else // CONFIRMING_CHAR_UTILS printf("0x%04X, 0x%04X\n", baseC, icu4cLowerBaseC); #endif // CONFIRMING_CHAR_UTILS } } } #endif // UPDATING_CHAR_UTILS /* * 4. Process the list with UnicodeData.txt * (You need UnicodeData.txt in the current directory) * $ ./char_utils | sort -u | \ * perl -e 'open(FH, "UnicodeData.txt"); @buf = <FH>; close(FH); \ * while(<>){/0x(\w*), 0x(\w*)/; @lines = grep(/^$1/, @buf); @cols = split(/;/, $lines[0]); \ * print " { 0x$1, 0x$cols[13] }, // $cols[1]\n";}' * * 5. Update the SORTED_CHAR_MAP[] array below with the output above. * Then, rebuild with -DCONFIRMING_CHAR_UTILS and confirm the program exits successfully. * $ g++ -o char_utils -DUPDATING_CHAR_UTILS -DCONFIRMING_CHAR_UTILS char_utils.cpp -licuuc * $ ./char_utils * $ */ static const struct LatinCapitalSmallPair SORTED_CHAR_MAP[] = { { 0x00C4, 0x00E4 }, // LATIN CAPITAL LETTER A WITH DIAERESIS { 0x00C5, 0x00E5 }, // LATIN CAPITAL LETTER A WITH RING ABOVE { 0x00C6, 0x00E6 }, // LATIN CAPITAL LETTER AE { 0x00D0, 0x00F0 }, // LATIN CAPITAL LETTER ETH { 0x00D1, 0x00F1 }, // LATIN CAPITAL LETTER N WITH TILDE { 0x00D5, 0x00F5 }, // LATIN CAPITAL LETTER O WITH TILDE { 0x00D6, 0x00F6 }, // LATIN CAPITAL LETTER O WITH DIAERESIS { 0x00D8, 0x00F8 }, // LATIN CAPITAL LETTER O WITH STROKE Loading @@ -98,7 +88,6 @@ static const struct LatinCapitalSmallPair SORTED_CHAR_MAP[] = { { 0x00DE, 0x00FE }, // LATIN CAPITAL LETTER THORN { 0x0110, 0x0111 }, // LATIN CAPITAL LETTER D WITH STROKE { 0x0126, 0x0127 }, // LATIN CAPITAL LETTER H WITH STROKE { 0x0141, 0x0142 }, // LATIN CAPITAL LETTER L WITH STROKE { 0x014A, 0x014B }, // LATIN CAPITAL LETTER ENG { 0x0152, 0x0153 }, // LATIN CAPITAL LIGATURE OE { 0x0166, 0x0167 }, // LATIN CAPITAL LETTER T WITH STROKE Loading Loading @@ -322,6 +311,7 @@ static const struct LatinCapitalSmallPair SORTED_CHAR_MAP[] = { { 0x0520, 0x0521 }, // CYRILLIC CAPITAL LETTER EL WITH MIDDLE HOOK { 0x0522, 0x0523 }, // CYRILLIC CAPITAL LETTER EN WITH MIDDLE HOOK { 0x0524, 0x0525 }, // CYRILLIC CAPITAL LETTER PE WITH DESCENDER { 0x0526, 0x0527 }, // CYRILLIC CAPITAL LETTER SHHA WITH DESCENDER { 0x0531, 0x0561 }, // ARMENIAN CAPITAL LETTER AYB { 0x0532, 0x0562 }, // ARMENIAN CAPITAL LETTER BEN { 0x0533, 0x0563 }, // ARMENIAN CAPITAL LETTER GIM Loading Loading @@ -795,6 +785,7 @@ static const struct LatinCapitalSmallPair SORTED_CHAR_MAP[] = { { 0xA65A, 0xA65B }, // CYRILLIC CAPITAL LETTER BLENDED YUS { 0xA65C, 0xA65D }, // CYRILLIC CAPITAL LETTER IOTIFIED CLOSED LITTLE YUS { 0xA65E, 0xA65F }, // CYRILLIC CAPITAL LETTER YN { 0xA660, 0xA661 }, // CYRILLIC CAPITAL LETTER REVERSED TSE { 0xA662, 0xA663 }, // CYRILLIC CAPITAL LETTER SOFT DE { 0xA664, 0xA665 }, // CYRILLIC CAPITAL LETTER SOFT EL { 0xA666, 0xA667 }, // CYRILLIC CAPITAL LETTER SOFT EM Loading Loading @@ -860,6 +851,13 @@ static const struct LatinCapitalSmallPair SORTED_CHAR_MAP[] = { { 0xA784, 0xA785 }, // LATIN CAPITAL LETTER INSULAR S { 0xA786, 0xA787 }, // LATIN CAPITAL LETTER INSULAR T { 0xA78B, 0xA78C }, // LATIN CAPITAL LETTER SALTILLO { 0xA78D, 0x0265 }, // LATIN CAPITAL LETTER TURNED H { 0xA790, 0xA791 }, // LATIN CAPITAL LETTER N WITH DESCENDER { 0xA7A0, 0xA7A1 }, // LATIN CAPITAL LETTER G WITH OBLIQUE STROKE { 0xA7A2, 0xA7A3 }, // LATIN CAPITAL LETTER K WITH OBLIQUE STROKE { 0xA7A4, 0xA7A5 }, // LATIN CAPITAL LETTER N WITH OBLIQUE STROKE { 0xA7A6, 0xA7A7 }, // LATIN CAPITAL LETTER R WITH OBLIQUE STROKE { 0xA7A8, 0xA7A9 }, // LATIN CAPITAL LETTER S WITH OBLIQUE STROKE { 0xFF21, 0xFF41 }, // FULLWIDTH LATIN CAPITAL LETTER A { 0xFF22, 0xFF42 }, // FULLWIDTH LATIN CAPITAL LETTER B { 0xFF23, 0xFF43 }, // FULLWIDTH LATIN CAPITAL LETTER C Loading Loading
native/jni/src/char_utils.cpp +59 −61 Original line number Diff line number Diff line Loading @@ -26,71 +26,61 @@ struct LatinCapitalSmallPair { unsigned short small; }; // Generated from http://unicode.org/Public/UNIDATA/UnicodeData.txt // // 1. Run the following code. Bascially taken from // Dictionary::toLowerCase(unsigned short c) in dictionary.cpp. // Then, get the list of chars where cc != ccc. // // unsigned short c, cc, ccc, ccc2; // for (c = 0; c < 0xFFFF ; c++) { // if (c < NELEMS(BASE_CHARS)) { // cc = BASE_CHARS[c]; // } else { // cc = c; // } // // // tolower // int isBase = 0; // if (cc >='A' && cc <= 'Z') { // ccc = (cc | 0x20); // ccc2 = ccc; // isBase = 1; // } else if (cc > 0x7F) { // ccc = u_tolower(cc); // ccc2 = latin_tolower(cc); // } else { // ccc = cc; // ccc2 = ccc; // } // if (!isBase && cc != ccc) { // wprintf(L" 0x%04X => 0x%04X => 0x%04X %lc => %lc => %lc \n", // c, cc, ccc, c, cc, ccc); // //assert(ccc == ccc2); // } // } // // Initially, started with an empty latin_tolower() as below. // // unsigned short latin_tolower(unsigned short c) { // return c; // } // // // 2. Process the list obtained by 1 by the following perl script and apply // 'sort -u' as well. Get the SORTED_CHAR_MAP[]. // Note that '$1' in the perl script is 'cc' in the above C code. // // while(<>) { // / 0x\w* => 0x(\w*) =/; // open(HDL, "grep -iw ^" . $1 . " UnicodeData.txt | "); // $line = <HDL>; // chomp $line; // @cols = split(/;/, $line); // print " { 0x$1, 0x$cols[13] }, // $cols[1]\n"; // } // // // 3. Update the latin_tolower() function above with SORTED_CHAR_MAP. Enable // the assert(ccc == ccc2) above and confirm the function exits successfully. // // TODO: Regenerate this map by using the updated BASE_CHARS table in this file. /* * How to update the SORTED_CHAR_MAP[] array. * * 1. Download http://unicode.org/Public/UNIDATA/UnicodeData.txt * * 2. Have a latest version of ICU4C dev package installed * (Note: the current data has been generated with version 4.8) * $ apt-get install libicu-dev * * 3. Build the following code * (You need this file, char_utils.h, and defines.h) * $ g++ -o char_utils -DUPDATING_CHAR_UTILS char_utils.cpp -licuuc */ #ifdef UPDATING_CHAR_UTILS #include <stdio.h> #include <unicode/uchar.h> // ICU4C extern "C" int main() { for (unsigned short c = 0; c < 0xFFFF; c++) { const unsigned short baseC = c < NELEMS(BASE_CHARS) ? BASE_CHARS[c] : c; if (baseC <= 0x7F) continue; const unsigned short icu4cLowerBaseC = u_tolower(baseC); const unsigned short myLowerBaseC = latin_tolower(baseC); if (baseC != icu4cLowerBaseC) { #ifdef CONFIRMING_CHAR_UTILS if (icu4cLowerBaseC != myLowerBaseC) { fprintf(stderr, "icu4cLowerBaseC != myLowerBaseC, 0x%04X, 0x%04X\n", icu4cLowerBaseC, myLowerBaseC); } #else // CONFIRMING_CHAR_UTILS printf("0x%04X, 0x%04X\n", baseC, icu4cLowerBaseC); #endif // CONFIRMING_CHAR_UTILS } } } #endif // UPDATING_CHAR_UTILS /* * 4. Process the list with UnicodeData.txt * (You need UnicodeData.txt in the current directory) * $ ./char_utils | sort -u | \ * perl -e 'open(FH, "UnicodeData.txt"); @buf = <FH>; close(FH); \ * while(<>){/0x(\w*), 0x(\w*)/; @lines = grep(/^$1/, @buf); @cols = split(/;/, $lines[0]); \ * print " { 0x$1, 0x$cols[13] }, // $cols[1]\n";}' * * 5. Update the SORTED_CHAR_MAP[] array below with the output above. * Then, rebuild with -DCONFIRMING_CHAR_UTILS and confirm the program exits successfully. * $ g++ -o char_utils -DUPDATING_CHAR_UTILS -DCONFIRMING_CHAR_UTILS char_utils.cpp -licuuc * $ ./char_utils * $ */ static const struct LatinCapitalSmallPair SORTED_CHAR_MAP[] = { { 0x00C4, 0x00E4 }, // LATIN CAPITAL LETTER A WITH DIAERESIS { 0x00C5, 0x00E5 }, // LATIN CAPITAL LETTER A WITH RING ABOVE { 0x00C6, 0x00E6 }, // LATIN CAPITAL LETTER AE { 0x00D0, 0x00F0 }, // LATIN CAPITAL LETTER ETH { 0x00D1, 0x00F1 }, // LATIN CAPITAL LETTER N WITH TILDE { 0x00D5, 0x00F5 }, // LATIN CAPITAL LETTER O WITH TILDE { 0x00D6, 0x00F6 }, // LATIN CAPITAL LETTER O WITH DIAERESIS { 0x00D8, 0x00F8 }, // LATIN CAPITAL LETTER O WITH STROKE Loading @@ -98,7 +88,6 @@ static const struct LatinCapitalSmallPair SORTED_CHAR_MAP[] = { { 0x00DE, 0x00FE }, // LATIN CAPITAL LETTER THORN { 0x0110, 0x0111 }, // LATIN CAPITAL LETTER D WITH STROKE { 0x0126, 0x0127 }, // LATIN CAPITAL LETTER H WITH STROKE { 0x0141, 0x0142 }, // LATIN CAPITAL LETTER L WITH STROKE { 0x014A, 0x014B }, // LATIN CAPITAL LETTER ENG { 0x0152, 0x0153 }, // LATIN CAPITAL LIGATURE OE { 0x0166, 0x0167 }, // LATIN CAPITAL LETTER T WITH STROKE Loading Loading @@ -322,6 +311,7 @@ static const struct LatinCapitalSmallPair SORTED_CHAR_MAP[] = { { 0x0520, 0x0521 }, // CYRILLIC CAPITAL LETTER EL WITH MIDDLE HOOK { 0x0522, 0x0523 }, // CYRILLIC CAPITAL LETTER EN WITH MIDDLE HOOK { 0x0524, 0x0525 }, // CYRILLIC CAPITAL LETTER PE WITH DESCENDER { 0x0526, 0x0527 }, // CYRILLIC CAPITAL LETTER SHHA WITH DESCENDER { 0x0531, 0x0561 }, // ARMENIAN CAPITAL LETTER AYB { 0x0532, 0x0562 }, // ARMENIAN CAPITAL LETTER BEN { 0x0533, 0x0563 }, // ARMENIAN CAPITAL LETTER GIM Loading Loading @@ -795,6 +785,7 @@ static const struct LatinCapitalSmallPair SORTED_CHAR_MAP[] = { { 0xA65A, 0xA65B }, // CYRILLIC CAPITAL LETTER BLENDED YUS { 0xA65C, 0xA65D }, // CYRILLIC CAPITAL LETTER IOTIFIED CLOSED LITTLE YUS { 0xA65E, 0xA65F }, // CYRILLIC CAPITAL LETTER YN { 0xA660, 0xA661 }, // CYRILLIC CAPITAL LETTER REVERSED TSE { 0xA662, 0xA663 }, // CYRILLIC CAPITAL LETTER SOFT DE { 0xA664, 0xA665 }, // CYRILLIC CAPITAL LETTER SOFT EL { 0xA666, 0xA667 }, // CYRILLIC CAPITAL LETTER SOFT EM Loading Loading @@ -860,6 +851,13 @@ static const struct LatinCapitalSmallPair SORTED_CHAR_MAP[] = { { 0xA784, 0xA785 }, // LATIN CAPITAL LETTER INSULAR S { 0xA786, 0xA787 }, // LATIN CAPITAL LETTER INSULAR T { 0xA78B, 0xA78C }, // LATIN CAPITAL LETTER SALTILLO { 0xA78D, 0x0265 }, // LATIN CAPITAL LETTER TURNED H { 0xA790, 0xA791 }, // LATIN CAPITAL LETTER N WITH DESCENDER { 0xA7A0, 0xA7A1 }, // LATIN CAPITAL LETTER G WITH OBLIQUE STROKE { 0xA7A2, 0xA7A3 }, // LATIN CAPITAL LETTER K WITH OBLIQUE STROKE { 0xA7A4, 0xA7A5 }, // LATIN CAPITAL LETTER N WITH OBLIQUE STROKE { 0xA7A6, 0xA7A7 }, // LATIN CAPITAL LETTER R WITH OBLIQUE STROKE { 0xA7A8, 0xA7A9 }, // LATIN CAPITAL LETTER S WITH OBLIQUE STROKE { 0xFF21, 0xFF41 }, // FULLWIDTH LATIN CAPITAL LETTER A { 0xFF22, 0xFF42 }, // FULLWIDTH LATIN CAPITAL LETTER B { 0xFF23, 0xFF43 }, // FULLWIDTH LATIN CAPITAL LETTER C Loading