Loading native/jni/src/suggest/policyimpl/dictionary/utils/byte_array_utils.cpp +2 −1 Original line number Diff line number Diff line Loading @@ -18,7 +18,8 @@ namespace latinime { const uint8_t ByteArrayUtils::MINIMAL_ONE_BYTE_CHARACTER_VALUE = 0x20; const uint8_t ByteArrayUtils::MINIMUM_ONE_BYTE_CHARACTER_VALUE = 0x20; const uint8_t ByteArrayUtils::MAXIMUM_ONE_BYTE_CHARACTER_VALUE = 0xFF; const uint8_t ByteArrayUtils::CHARACTER_ARRAY_TERMINATOR = 0x1F; } // namespace latinime native/jni/src/suggest/policyimpl/dictionary/utils/byte_array_utils.h +7 −4 Original line number Diff line number Diff line Loading @@ -135,7 +135,7 @@ class ByteArrayUtils { static AK_FORCE_INLINE int readCodePointAndAdvancePosition( const uint8_t *const buffer, int *const pos) { const uint8_t firstByte = readUint8(buffer, *pos); if (firstByte < MINIMAL_ONE_BYTE_CHARACTER_VALUE) { if (firstByte < MINIMUM_ONE_BYTE_CHARACTER_VALUE) { if (firstByte == CHARACTER_ARRAY_TERMINATOR) { *pos += 1; return NOT_A_CODE_POINT; Loading Loading @@ -187,7 +187,8 @@ class ByteArrayUtils { const int codePoint = codePoints[i]; if (codePoint == NOT_A_CODE_POINT || codePoint == CHARACTER_ARRAY_TERMINATOR) { break; } else if (codePoint < MINIMAL_ONE_BYTE_CHARACTER_VALUE) { } else if (codePoint < MINIMUM_ONE_BYTE_CHARACTER_VALUE || codePoint > MAXIMUM_ONE_BYTE_CHARACTER_VALUE) { // three bytes character. writeUint24AndAdvancePosition(buffer, codePoint, pos); } else { Loading @@ -207,7 +208,8 @@ class ByteArrayUtils { const int codePoint = codePoints[i]; if (codePoint == NOT_A_CODE_POINT || codePoint == CHARACTER_ARRAY_TERMINATOR) { break; } else if (codePoint < MINIMAL_ONE_BYTE_CHARACTER_VALUE) { } else if (codePoint < MINIMUM_ONE_BYTE_CHARACTER_VALUE || codePoint > MAXIMUM_ONE_BYTE_CHARACTER_VALUE) { // three bytes character. 
byteCount += 3; } else { Loading @@ -225,7 +227,8 @@ class ByteArrayUtils { private: DISALLOW_IMPLICIT_CONSTRUCTORS(ByteArrayUtils); static const uint8_t MINIMAL_ONE_BYTE_CHARACTER_VALUE; static const uint8_t MINIMUM_ONE_BYTE_CHARACTER_VALUE; static const uint8_t MAXIMUM_ONE_BYTE_CHARACTER_VALUE; static const uint8_t CHARACTER_ARRAY_TERMINATOR; static AK_FORCE_INLINE void writeUint32AndAdvancePosition(uint8_t *const buffer, Loading tests/src/com/android/inputmethod/latin/BinaryDictionaryTests.java +39 −1 Original line number Diff line number Diff line Loading @@ -19,6 +19,7 @@ package com.android.inputmethod.latin; import android.test.AndroidTestCase; import android.test.suitebuilder.annotation.LargeTest; import com.android.inputmethod.latin.makedict.CodePointUtils; import com.android.inputmethod.latin.makedict.DictEncoder; import com.android.inputmethod.latin.makedict.FormatSpec; import com.android.inputmethod.latin.makedict.FusionDictionary; Loading @@ -30,6 +31,7 @@ import java.io.File; import java.io.IOException; import java.util.HashMap; import java.util.Locale; import java.util.Random; @LargeTest public class BinaryDictionaryTests extends AndroidTestCase { Loading Loading @@ -117,10 +119,46 @@ public class BinaryDictionaryTests extends AndroidTestCase { assertEquals(probability, binaryDictionary.getFrequency("aab")); assertEquals(probability, binaryDictionary.getFrequency("aac")); assertEquals(probability, binaryDictionary.getFrequency("aac")); assertEquals(probability, binaryDictionary.getFrequency("aa")); assertEquals(probability, binaryDictionary.getFrequency("aaaa")); assertEquals(probability, binaryDictionary.getFrequency("a")); assertEquals(updatedProbability, binaryDictionary.getFrequency("aaa")); dictFile.delete(); } public void testRandomlyAddUnigramWord() { final int wordCount = 1000; final int codePointSetSize = 50; final int seed = 123456789; File dictFile = null; try { dictFile = createEmptyDictionaryAndGetFile("TestBinaryDictionary"); } catch 
(IOException e) { fail("IOException while writing an initial dictionary : " + e); } catch (UnsupportedFormatException e) { fail("UnsupportedFormatException while writing an initial dictionary : " + e); } BinaryDictionary binaryDictionary = new BinaryDictionary(dictFile.getAbsolutePath(), 0 /* offset */, dictFile.length(), true /* useFullEditDistance */, Locale.getDefault(), TEST_LOCALE, true /* isUpdatable */); final HashMap<String, Integer> probabilityMap = new HashMap<String, Integer>(); // Test a word that isn't contained within the dictionary. final Random random = new Random(seed); final int[] codePointSet = CodePointUtils.generateCodePointSet(codePointSetSize, random); for (int i = 0; i < wordCount; ++i) { final String word = CodePointUtils.generateWord(random, codePointSet); probabilityMap.put(word, random.nextInt() & 0xFF); } for (String word : probabilityMap.keySet()) { binaryDictionary.addUnigramWord(word, probabilityMap.get(word)); } for (String word : probabilityMap.keySet()) { assertEquals(word, (int)probabilityMap.get(word), binaryDictionary.getFrequency(word)); } dictFile.delete(); } public void testAddBigramWords() { Loading tests/src/com/android/inputmethod/latin/makedict/BinaryDictDecoderEncoderTests.java +6 −45 Original line number Diff line number Diff line Loading @@ -87,7 +87,8 @@ public class BinaryDictDecoderEncoderTests extends AndroidTestCase { Log.e(TAG, "Testing dictionary: seed is " + seed); final Random random = new Random(seed); sWords.clear(); final int[] codePointSet = generateCodePointSet(DEFAULT_CODE_POINT_SET_SIZE, random); final int[] codePointSet = CodePointUtils.generateCodePointSet(DEFAULT_CODE_POINT_SET_SIZE, random); generateWords(maxUnigrams, random, codePointSet); for (int i = 0; i < sWords.size(); ++i) { Loading @@ -113,51 +114,10 @@ public class BinaryDictDecoderEncoderTests extends AndroidTestCase { } } private int[] generateCodePointSet(final int codePointSetSize, final Random random) { final int[] codePointSet = new 
int[codePointSetSize]; for (int i = codePointSet.length - 1; i >= 0; ) { final int r = Math.abs(random.nextInt()); if (r < 0) continue; // Don't insert 0~0x20, but insert any other code point. // Code points are in the range 0~0x10FFFF. final int candidateCodePoint = 0x20 + r % (Character.MAX_CODE_POINT - 0x20); // Code points between MIN_ and MAX_SURROGATE are not valid on their own. if (candidateCodePoint >= Character.MIN_SURROGATE && candidateCodePoint <= Character.MAX_SURROGATE) continue; codePointSet[i] = candidateCodePoint; --i; } return codePointSet; } // Utilities for test /** * Generates a random word. */ private String generateWord(final Random random, final int[] codePointSet) { StringBuilder builder = new StringBuilder(); // 8 * 4 = 32 chars max, but we do it the following way so as to bias the random toward // longer words. This should be closer to natural language, and more importantly, it will // exercise the algorithms in dicttool much more. final int count = 1 + (Math.abs(random.nextInt()) % 5) + (Math.abs(random.nextInt()) % 5) + (Math.abs(random.nextInt()) % 5) + (Math.abs(random.nextInt()) % 5) + (Math.abs(random.nextInt()) % 5) + (Math.abs(random.nextInt()) % 5) + (Math.abs(random.nextInt()) % 5) + (Math.abs(random.nextInt()) % 5); while (builder.length() < count) { builder.appendCodePoint(codePointSet[Math.abs(random.nextInt()) % codePointSet.length]); } return builder.toString(); } private void generateWords(final int number, final Random random, final int[] codePointSet) { final Set<String> wordSet = CollectionUtils.newHashSet(); while (wordSet.size() < number) { wordSet.add(generateWord(random, codePointSet)); wordSet.add(CodePointUtils.generateWord(random, codePointSet)); } sWords.addAll(wordSet); } Loading Loading @@ -606,9 +566,10 @@ public class BinaryDictDecoderEncoderTests extends AndroidTestCase { // Test a word that isn't contained within the dictionary. 
final Random random = new Random((int)System.currentTimeMillis()); final int[] codePointSet = generateCodePointSet(DEFAULT_CODE_POINT_SET_SIZE, random); final int[] codePointSet = CodePointUtils.generateCodePointSet(DEFAULT_CODE_POINT_SET_SIZE, random); for (int i = 0; i < 1000; ++i) { final String word = generateWord(random, codePointSet); final String word = CodePointUtils.generateWord(random, codePointSet); if (sWords.indexOf(word) != -1) continue; runGetTerminalPosition(dictDecoder, word, i, false); } Loading tests/src/com/android/inputmethod/latin/makedict/CodePointUtils.java 0 → 100644 +65 −0 Original line number Diff line number Diff line /* * Copyright (C) 2013 The Android Open Source Project * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.android.inputmethod.latin.makedict; import java.util.Random; // Utility methods related with code points used for tests. public class CodePointUtils { private CodePointUtils() { // This utility class is not publicly instantiable. } public static int[] generateCodePointSet(final int codePointSetSize, final Random random) { final int[] codePointSet = new int[codePointSetSize]; for (int i = codePointSet.length - 1; i >= 0; ) { final int r = Math.abs(random.nextInt()); if (r < 0) continue; // Don't insert 0~0x20, but insert any other code point. // Code points are in the range 0~0x10FFFF. 
final int candidateCodePoint = 0x20 + r % (Character.MAX_CODE_POINT - 0x20); // Code points between MIN_ and MAX_SURROGATE are not valid on their own. if (candidateCodePoint >= Character.MIN_SURROGATE && candidateCodePoint <= Character.MAX_SURROGATE) continue; codePointSet[i] = candidateCodePoint; --i; } return codePointSet; } /** * Generates a random word. */ public static String generateWord(final Random random, final int[] codePointSet) { StringBuilder builder = new StringBuilder(); // 8 * 4 = 32 chars max, but we do it the following way so as to bias the random toward // longer words. This should be closer to natural language, and more importantly, it will // exercise the algorithms in dicttool much more. final int count = 1 + (Math.abs(random.nextInt()) % 5) + (Math.abs(random.nextInt()) % 5) + (Math.abs(random.nextInt()) % 5) + (Math.abs(random.nextInt()) % 5) + (Math.abs(random.nextInt()) % 5) + (Math.abs(random.nextInt()) % 5) + (Math.abs(random.nextInt()) % 5) + (Math.abs(random.nextInt()) % 5); while (builder.length() < count) { builder.appendCodePoint(codePointSet[Math.abs(random.nextInt()) % codePointSet.length]); } return builder.toString(); } } Loading
native/jni/src/suggest/policyimpl/dictionary/utils/byte_array_utils.cpp +2 −1 Original line number Diff line number Diff line Loading @@ -18,7 +18,8 @@ namespace latinime { const uint8_t ByteArrayUtils::MINIMAL_ONE_BYTE_CHARACTER_VALUE = 0x20; const uint8_t ByteArrayUtils::MINIMUM_ONE_BYTE_CHARACTER_VALUE = 0x20; const uint8_t ByteArrayUtils::MAXIMUM_ONE_BYTE_CHARACTER_VALUE = 0xFF; const uint8_t ByteArrayUtils::CHARACTER_ARRAY_TERMINATOR = 0x1F; } // namespace latinime
native/jni/src/suggest/policyimpl/dictionary/utils/byte_array_utils.h +7 −4 Original line number Diff line number Diff line Loading @@ -135,7 +135,7 @@ class ByteArrayUtils { static AK_FORCE_INLINE int readCodePointAndAdvancePosition( const uint8_t *const buffer, int *const pos) { const uint8_t firstByte = readUint8(buffer, *pos); if (firstByte < MINIMAL_ONE_BYTE_CHARACTER_VALUE) { if (firstByte < MINIMUM_ONE_BYTE_CHARACTER_VALUE) { if (firstByte == CHARACTER_ARRAY_TERMINATOR) { *pos += 1; return NOT_A_CODE_POINT; Loading Loading @@ -187,7 +187,8 @@ class ByteArrayUtils { const int codePoint = codePoints[i]; if (codePoint == NOT_A_CODE_POINT || codePoint == CHARACTER_ARRAY_TERMINATOR) { break; } else if (codePoint < MINIMAL_ONE_BYTE_CHARACTER_VALUE) { } else if (codePoint < MINIMUM_ONE_BYTE_CHARACTER_VALUE || codePoint > MAXIMUM_ONE_BYTE_CHARACTER_VALUE) { // three bytes character. writeUint24AndAdvancePosition(buffer, codePoint, pos); } else { Loading @@ -207,7 +208,8 @@ class ByteArrayUtils { const int codePoint = codePoints[i]; if (codePoint == NOT_A_CODE_POINT || codePoint == CHARACTER_ARRAY_TERMINATOR) { break; } else if (codePoint < MINIMAL_ONE_BYTE_CHARACTER_VALUE) { } else if (codePoint < MINIMUM_ONE_BYTE_CHARACTER_VALUE || codePoint > MAXIMUM_ONE_BYTE_CHARACTER_VALUE) { // three bytes character. byteCount += 3; } else { Loading @@ -225,7 +227,8 @@ class ByteArrayUtils { private: DISALLOW_IMPLICIT_CONSTRUCTORS(ByteArrayUtils); static const uint8_t MINIMAL_ONE_BYTE_CHARACTER_VALUE; static const uint8_t MINIMUM_ONE_BYTE_CHARACTER_VALUE; static const uint8_t MAXIMUM_ONE_BYTE_CHARACTER_VALUE; static const uint8_t CHARACTER_ARRAY_TERMINATOR; static AK_FORCE_INLINE void writeUint32AndAdvancePosition(uint8_t *const buffer, Loading
tests/src/com/android/inputmethod/latin/BinaryDictionaryTests.java +39 −1 Original line number Diff line number Diff line Loading @@ -19,6 +19,7 @@ package com.android.inputmethod.latin; import android.test.AndroidTestCase; import android.test.suitebuilder.annotation.LargeTest; import com.android.inputmethod.latin.makedict.CodePointUtils; import com.android.inputmethod.latin.makedict.DictEncoder; import com.android.inputmethod.latin.makedict.FormatSpec; import com.android.inputmethod.latin.makedict.FusionDictionary; Loading @@ -30,6 +31,7 @@ import java.io.File; import java.io.IOException; import java.util.HashMap; import java.util.Locale; import java.util.Random; @LargeTest public class BinaryDictionaryTests extends AndroidTestCase { Loading Loading @@ -117,10 +119,46 @@ public class BinaryDictionaryTests extends AndroidTestCase { assertEquals(probability, binaryDictionary.getFrequency("aab")); assertEquals(probability, binaryDictionary.getFrequency("aac")); assertEquals(probability, binaryDictionary.getFrequency("aac")); assertEquals(probability, binaryDictionary.getFrequency("aa")); assertEquals(probability, binaryDictionary.getFrequency("aaaa")); assertEquals(probability, binaryDictionary.getFrequency("a")); assertEquals(updatedProbability, binaryDictionary.getFrequency("aaa")); dictFile.delete(); } public void testRandomlyAddUnigramWord() { final int wordCount = 1000; final int codePointSetSize = 50; final int seed = 123456789; File dictFile = null; try { dictFile = createEmptyDictionaryAndGetFile("TestBinaryDictionary"); } catch (IOException e) { fail("IOException while writing an initial dictionary : " + e); } catch (UnsupportedFormatException e) { fail("UnsupportedFormatException while writing an initial dictionary : " + e); } BinaryDictionary binaryDictionary = new BinaryDictionary(dictFile.getAbsolutePath(), 0 /* offset */, dictFile.length(), true /* useFullEditDistance */, Locale.getDefault(), TEST_LOCALE, true /* isUpdatable */); final HashMap<String, 
Integer> probabilityMap = new HashMap<String, Integer>(); // Test a word that isn't contained within the dictionary. final Random random = new Random(seed); final int[] codePointSet = CodePointUtils.generateCodePointSet(codePointSetSize, random); for (int i = 0; i < wordCount; ++i) { final String word = CodePointUtils.generateWord(random, codePointSet); probabilityMap.put(word, random.nextInt() & 0xFF); } for (String word : probabilityMap.keySet()) { binaryDictionary.addUnigramWord(word, probabilityMap.get(word)); } for (String word : probabilityMap.keySet()) { assertEquals(word, (int)probabilityMap.get(word), binaryDictionary.getFrequency(word)); } dictFile.delete(); } public void testAddBigramWords() { Loading
tests/src/com/android/inputmethod/latin/makedict/BinaryDictDecoderEncoderTests.java +6 −45 Original line number Diff line number Diff line Loading @@ -87,7 +87,8 @@ public class BinaryDictDecoderEncoderTests extends AndroidTestCase { Log.e(TAG, "Testing dictionary: seed is " + seed); final Random random = new Random(seed); sWords.clear(); final int[] codePointSet = generateCodePointSet(DEFAULT_CODE_POINT_SET_SIZE, random); final int[] codePointSet = CodePointUtils.generateCodePointSet(DEFAULT_CODE_POINT_SET_SIZE, random); generateWords(maxUnigrams, random, codePointSet); for (int i = 0; i < sWords.size(); ++i) { Loading @@ -113,51 +114,10 @@ public class BinaryDictDecoderEncoderTests extends AndroidTestCase { } } private int[] generateCodePointSet(final int codePointSetSize, final Random random) { final int[] codePointSet = new int[codePointSetSize]; for (int i = codePointSet.length - 1; i >= 0; ) { final int r = Math.abs(random.nextInt()); if (r < 0) continue; // Don't insert 0~0x20, but insert any other code point. // Code points are in the range 0~0x10FFFF. final int candidateCodePoint = 0x20 + r % (Character.MAX_CODE_POINT - 0x20); // Code points between MIN_ and MAX_SURROGATE are not valid on their own. if (candidateCodePoint >= Character.MIN_SURROGATE && candidateCodePoint <= Character.MAX_SURROGATE) continue; codePointSet[i] = candidateCodePoint; --i; } return codePointSet; } // Utilities for test /** * Generates a random word. */ private String generateWord(final Random random, final int[] codePointSet) { StringBuilder builder = new StringBuilder(); // 8 * 4 = 32 chars max, but we do it the following way so as to bias the random toward // longer words. This should be closer to natural language, and more importantly, it will // exercise the algorithms in dicttool much more. 
final int count = 1 + (Math.abs(random.nextInt()) % 5) + (Math.abs(random.nextInt()) % 5) + (Math.abs(random.nextInt()) % 5) + (Math.abs(random.nextInt()) % 5) + (Math.abs(random.nextInt()) % 5) + (Math.abs(random.nextInt()) % 5) + (Math.abs(random.nextInt()) % 5) + (Math.abs(random.nextInt()) % 5); while (builder.length() < count) { builder.appendCodePoint(codePointSet[Math.abs(random.nextInt()) % codePointSet.length]); } return builder.toString(); } private void generateWords(final int number, final Random random, final int[] codePointSet) { final Set<String> wordSet = CollectionUtils.newHashSet(); while (wordSet.size() < number) { wordSet.add(generateWord(random, codePointSet)); wordSet.add(CodePointUtils.generateWord(random, codePointSet)); } sWords.addAll(wordSet); } Loading Loading @@ -606,9 +566,10 @@ public class BinaryDictDecoderEncoderTests extends AndroidTestCase { // Test a word that isn't contained within the dictionary. final Random random = new Random((int)System.currentTimeMillis()); final int[] codePointSet = generateCodePointSet(DEFAULT_CODE_POINT_SET_SIZE, random); final int[] codePointSet = CodePointUtils.generateCodePointSet(DEFAULT_CODE_POINT_SET_SIZE, random); for (int i = 0; i < 1000; ++i) { final String word = generateWord(random, codePointSet); final String word = CodePointUtils.generateWord(random, codePointSet); if (sWords.indexOf(word) != -1) continue; runGetTerminalPosition(dictDecoder, word, i, false); } Loading
// tests/src/com/android/inputmethod/latin/makedict/CodePointUtils.java
/*
 * Copyright (C) 2013 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.android.inputmethod.latin.makedict;

import java.util.Random;

/**
 * Utility methods related to code points, used by tests to build random words.
 *
 * <p>Every random draw goes through {@link Random#nextInt()}, so a {@code Random} created with a
 * fixed seed always yields the same code point set and the same words.
 */
public final class CodePointUtils {
    /** Smallest code point that {@link #generateCodePointSet(int, Random)} can produce. */
    private static final int MIN_GENERATED_CODE_POINT = 0x20;

    /** Number of random draws summed to pick the target length of a word. */
    private static final int WORD_LENGTH_DRAWS = 8;

    /** Each length draw contributes a value in {@code [0, WORD_LENGTH_DRAW_BOUND)}. */
    private static final int WORD_LENGTH_DRAW_BOUND = 5;

    private CodePointUtils() {
        // This utility class is not publicly instantiable.
    }

    /**
     * Generates an array of random code points.
     *
     * <p>Each entry lies in {@code [0x20, Character.MAX_CODE_POINT)} and is never a surrogate.
     * Entries are not guaranteed to be distinct. The array is filled from its last index down to
     * index 0.
     *
     * @param codePointSetSize the number of code points to generate
     * @param random the source of randomness
     * @return a new array holding {@code codePointSetSize} code points
     */
    public static int[] generateCodePointSet(final int codePointSetSize, final Random random) {
        final int[] codePointSet = new int[codePointSetSize];
        int remaining = codePointSet.length;
        while (remaining > 0) {
            final int r = nextNonNegativeInt(random);
            // Don't insert anything below 0x20, but insert any other code point.
            // Code points are in the range 0~0x10FFFF.
            final int candidateCodePoint = MIN_GENERATED_CODE_POINT
                    + r % (Character.MAX_CODE_POINT - MIN_GENERATED_CODE_POINT);
            // Code points between MIN_ and MAX_SURROGATE are not valid on their own.
            final boolean isSurrogate = candidateCodePoint >= Character.MIN_SURROGATE
                    && candidateCodePoint <= Character.MAX_SURROGATE;
            if (isSurrogate) {
                continue;
            }
            --remaining;
            codePointSet[remaining] = candidateCodePoint;
        }
        return codePointSet;
    }

    /**
     * Generates a random word made of code points picked from {@code codePointSet}.
     *
     * <p>The target length is {@code 1} plus the sum of eight draws in {@code [0, 4]}, i.e. between
     * 1 and 33. Summing several small draws rather than taking a single one biases the result
     * toward mid-to-long words, which should be closer to natural language and, more importantly,
     * exercises the dictionary algorithms much more. Code points are appended until the builder
     * holds at least that many UTF-16 chars, so a word ending with a supplementary code point can
     * be one char longer than the target.
     *
     * @param random the source of randomness
     * @param codePointSet the code points to pick from; must not be empty
     * @return a non-empty random word
     * @throws IllegalArgumentException if {@code codePointSet} is empty
     */
    public static String generateWord(final Random random, final int[] codePointSet) {
        if (codePointSet.length == 0) {
            throw new IllegalArgumentException("codePointSet must not be empty");
        }
        int targetLength = 1;
        for (int draw = 0; draw < WORD_LENGTH_DRAWS; ++draw) {
            targetLength += nextNonNegativeInt(random) % WORD_LENGTH_DRAW_BOUND;
        }
        final StringBuilder builder = new StringBuilder();
        while (builder.length() < targetLength) {
            final int index = nextNonNegativeInt(random) % codePointSet.length;
            builder.appendCodePoint(codePointSet[index]);
        }
        return builder.toString();
    }

    /**
     * Returns the absolute value of the next {@code int} drawn from {@code random}.
     *
     * <p>{@code Math.abs(Integer.MIN_VALUE)} is still negative, which would turn the modulo
     * operations above into negative lengths or array indices. That single value is therefore
     * discarded and another one is drawn; every other draw is used as is.
     */
    private static int nextNonNegativeInt(final Random random) {
        while (true) {
            final int r = Math.abs(random.nextInt());
            if (r >= 0) {
                return r;
            }
        }
    }
}