Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit 1517b68b authored by Keisuke Kuroyanagi's avatar Keisuke Kuroyanagi Committed by Android Git Automerger
Browse files

am 280fb1a1: Merge "Fix handling multi-bytes characters and add a test."

* commit '280fb1a1':
  Fix handling multi-bytes characters and add a test.
parents 50ba8769 280fb1a1
Loading
Loading
Loading
Loading
+2 −1
Original line number Diff line number Diff line
@@ -18,7 +18,8 @@

namespace latinime {

// NOTE(review): MINIMAL_... and MINIMUM_... below look like the before/after lines of a rename
// shown side by side by the diff view - confirm only MINIMUM_... exists in the real file.
const uint8_t ByteArrayUtils::MINIMAL_ONE_BYTE_CHARACTER_VALUE = 0x20;
// Lower bound of the one-byte range: code points below this value are written as three bytes,
// and a first byte below this value is not read as a plain one-byte character.
const uint8_t ByteArrayUtils::MINIMUM_ONE_BYTE_CHARACTER_VALUE = 0x20;
// Upper bound of the one-byte range: code points above this value are written as three bytes.
const uint8_t ByteArrayUtils::MAXIMUM_ONE_BYTE_CHARACTER_VALUE = 0xFF;
// Marks the end of a character array: reading it yields NOT_A_CODE_POINT, and a code point
// equal to it stops the write/size loops.
const uint8_t ByteArrayUtils::CHARACTER_ARRAY_TERMINATOR = 0x1F;

} // namespace latinime
+7 −4
Original line number Diff line number Diff line
@@ -135,7 +135,7 @@ class ByteArrayUtils {
    static AK_FORCE_INLINE int readCodePointAndAdvancePosition(
            const uint8_t *const buffer, int *const pos) {
        const uint8_t firstByte = readUint8(buffer, *pos);
        if (firstByte < MINIMAL_ONE_BYTE_CHARACTER_VALUE) {
        if (firstByte < MINIMUM_ONE_BYTE_CHARACTER_VALUE) {
            if (firstByte == CHARACTER_ARRAY_TERMINATOR) {
                *pos += 1;
                return NOT_A_CODE_POINT;
@@ -187,7 +187,8 @@ class ByteArrayUtils {
            const int codePoint = codePoints[i];
            if (codePoint == NOT_A_CODE_POINT || codePoint == CHARACTER_ARRAY_TERMINATOR) {
                break;
            } else if (codePoint < MINIMAL_ONE_BYTE_CHARACTER_VALUE) {
            } else if (codePoint < MINIMUM_ONE_BYTE_CHARACTER_VALUE
                    || codePoint > MAXIMUM_ONE_BYTE_CHARACTER_VALUE) {
                // three bytes character.
                writeUint24AndAdvancePosition(buffer, codePoint, pos);
            } else {
@@ -207,7 +208,8 @@ class ByteArrayUtils {
            const int codePoint = codePoints[i];
            if (codePoint == NOT_A_CODE_POINT || codePoint == CHARACTER_ARRAY_TERMINATOR) {
                break;
            } else if (codePoint < MINIMAL_ONE_BYTE_CHARACTER_VALUE) {
            } else if (codePoint < MINIMUM_ONE_BYTE_CHARACTER_VALUE
                    || codePoint > MAXIMUM_ONE_BYTE_CHARACTER_VALUE) {
                // three bytes character.
                byteCount += 3;
            } else {
@@ -225,7 +227,8 @@ class ByteArrayUtils {
 private:
    DISALLOW_IMPLICIT_CONSTRUCTORS(ByteArrayUtils);

    static const uint8_t MINIMAL_ONE_BYTE_CHARACTER_VALUE;
    static const uint8_t MINIMUM_ONE_BYTE_CHARACTER_VALUE;
    static const uint8_t MAXIMUM_ONE_BYTE_CHARACTER_VALUE;
    static const uint8_t CHARACTER_ARRAY_TERMINATOR;

    static AK_FORCE_INLINE void writeUint32AndAdvancePosition(uint8_t *const buffer,
+39 −1
Original line number Diff line number Diff line
@@ -19,6 +19,7 @@ package com.android.inputmethod.latin;
import android.test.AndroidTestCase;
import android.test.suitebuilder.annotation.LargeTest;

import com.android.inputmethod.latin.makedict.CodePointUtils;
import com.android.inputmethod.latin.makedict.DictEncoder;
import com.android.inputmethod.latin.makedict.FormatSpec;
import com.android.inputmethod.latin.makedict.FusionDictionary;
@@ -30,6 +31,7 @@ import java.io.File;
import java.io.IOException;
import java.util.HashMap;
import java.util.Locale;
import java.util.Random;

@LargeTest
public class BinaryDictionaryTests extends AndroidTestCase {
@@ -117,10 +119,46 @@ public class BinaryDictionaryTests extends AndroidTestCase {

        assertEquals(probability, binaryDictionary.getFrequency("aab"));
        assertEquals(probability, binaryDictionary.getFrequency("aac"));
        assertEquals(probability, binaryDictionary.getFrequency("aac"));
        assertEquals(probability, binaryDictionary.getFrequency("aa"));
        assertEquals(probability, binaryDictionary.getFrequency("aaaa"));
        assertEquals(probability, binaryDictionary.getFrequency("a"));
        assertEquals(updatedProbability, binaryDictionary.getFrequency("aaa"));

        dictFile.delete();
    }

    /**
     * Adds randomly generated words (including multi-byte code points) with random
     * probabilities to an updatable dictionary and checks that every word can be looked up
     * with the probability it was stored with.
     *
     * The random seed is fixed so that a failure is reproducible.
     */
    public void testRandomlyAddUnigramWord() {
        final int wordCount = 1000;
        final int codePointSetSize = 50;
        final int seed = 123456789;

        File dictFile = null;
        try {
            dictFile = createEmptyDictionaryAndGetFile("TestBinaryDictionary");
        } catch (IOException e) {
            fail("IOException while writing an initial dictionary : " + e);
        } catch (UnsupportedFormatException e) {
            fail("UnsupportedFormatException while writing an initial dictionary : " + e);
        }
        try {
            final BinaryDictionary binaryDictionary = new BinaryDictionary(
                    dictFile.getAbsolutePath(), 0 /* offset */, dictFile.length(),
                    true /* useFullEditDistance */, Locale.getDefault(), TEST_LOCALE,
                    true /* isUpdatable */);

            // Build the expected word -> probability mapping. Probabilities are in [0, 255].
            // If the generator produces the same word twice, the later probability wins, so
            // the map may hold fewer than wordCount entries.
            final HashMap<String, Integer> probabilityMap = new HashMap<String, Integer>();
            final Random random = new Random(seed);
            final int[] codePointSet =
                    CodePointUtils.generateCodePointSet(codePointSetSize, random);
            for (int i = 0; i < wordCount; ++i) {
                final String word = CodePointUtils.generateWord(random, codePointSet);
                probabilityMap.put(word, random.nextInt() & 0xFF);
            }
            // Insert everything first, then verify, so that later insertions get a chance to
            // disturb earlier entries before they are checked.
            for (String word : probabilityMap.keySet()) {
                binaryDictionary.addUnigramWord(word, probabilityMap.get(word));
            }
            for (String word : probabilityMap.keySet()) {
                assertEquals(word, (int)probabilityMap.get(word),
                        binaryDictionary.getFrequency(word));
            }
        } finally {
            // Remove the temporary dictionary even when an assertion above fails.
            dictFile.delete();
        }
    }

    public void testAddBigramWords() {
+6 −45
Original line number Diff line number Diff line
@@ -87,7 +87,8 @@ public class BinaryDictDecoderEncoderTests extends AndroidTestCase {
        Log.e(TAG, "Testing dictionary: seed is " + seed);
        final Random random = new Random(seed);
        sWords.clear();
        final int[] codePointSet = generateCodePointSet(DEFAULT_CODE_POINT_SET_SIZE, random);
        final int[] codePointSet = CodePointUtils.generateCodePointSet(DEFAULT_CODE_POINT_SET_SIZE,
                random);
        generateWords(maxUnigrams, random, codePointSet);

        for (int i = 0; i < sWords.size(); ++i) {
@@ -113,51 +114,10 @@ public class BinaryDictDecoderEncoderTests extends AndroidTestCase {
        }
    }

    /**
     * Builds an array of random code points drawn from the given random source.
     *
     * Every entry lies in [0x20, Character.MAX_CODE_POINT) and is never a surrogate.
     * The array is filled from its last index down to index 0; duplicates are possible.
     */
    private int[] generateCodePointSet(final int codePointSetSize, final Random random) {
        final int[] result = new int[codePointSetSize];
        final int span = Character.MAX_CODE_POINT - 0x20;
        int remaining = result.length;
        while (remaining > 0) {
            final int draw = Math.abs(random.nextInt());
            // Math.abs(Integer.MIN_VALUE) is still negative: throw that draw away.
            if (draw < 0) {
                continue;
            }
            final int candidate = 0x20 + draw % span;
            // A lone surrogate is not a valid code point, so redraw.
            final boolean isSurrogate = candidate >= Character.MIN_SURROGATE
                    && candidate <= Character.MAX_SURROGATE;
            if (isSurrogate) {
                continue;
            }
            result[remaining - 1] = candidate;
            --remaining;
        }
        return result;
    }

    // Utilities for test

    /**
     * Generates a random word made of code points picked from {@code codePointSet}.
     *
     * The target length is 1 plus the sum of eight draws in [0, 4], i.e. between 1 and 33,
     * which biases the result toward medium-length words. The length is measured with
     * {@link StringBuilder#length()} (UTF-16 units), so a word ending in a supplementary
     * code point can be one unit longer than the target.
     *
     * @param random the source of randomness.
     * @param codePointSet the code points to choose from; must not be empty.
     * @return the generated word, never empty.
     * @throws IllegalArgumentException if {@code codePointSet} is empty.
     */
    private String generateWord(final Random random, final int[] codePointSet) {
        if (codePointSet.length == 0) {
            throw new IllegalArgumentException("codePointSet must not be empty");
        }
        // Summing several small draws (instead of one large draw) biases the length toward
        // longer words. This should be closer to natural language, and more importantly, it
        // will exercise the algorithms in dicttool much more.
        int count = 1;
        for (int i = 0; i < 8; ++i) {
            count += nextNonNegativeInt(random) % 5;
        }
        final StringBuilder builder = new StringBuilder();
        while (builder.length() < count) {
            final int index = nextNonNegativeInt(random) % codePointSet.length;
            builder.appendCodePoint(codePointSet[index]);
        }
        return builder.toString();
    }

    /**
     * Returns {@code Math.abs(random.nextInt())}, redrawing in the one case where that is
     * negative (Integer.MIN_VALUE), so the result is always safe to use with {@code %} as a
     * length or an array index.
     */
    private static int nextNonNegativeInt(final Random random) {
        int value;
        do {
            value = Math.abs(random.nextInt());
        } while (value < 0);
        return value;
    }

    /**
     * Collects random words built from {@code codePointSet} into a set until the set holds at
     * least {@code number} entries, then appends all of them to {@code sWords}.
     *
     * Using a set means the words added by one call are distinct from each other.
     */
    private void generateWords(final int number, final Random random, final int[] codePointSet) {
        final Set<String> wordSet = CollectionUtils.newHashSet();
        while (wordSet.size() < number) {
            // NOTE(review): the two add() calls below look like the before/after lines of a
            // diff shown together - confirm that only one of them is meant to remain.
            wordSet.add(generateWord(random, codePointSet));
            wordSet.add(CodePointUtils.generateWord(random, codePointSet));
        }
        sWords.addAll(wordSet);
    }
@@ -606,9 +566,10 @@ public class BinaryDictDecoderEncoderTests extends AndroidTestCase {

        // Test a word that isn't contained within the dictionary.
        final Random random = new Random((int)System.currentTimeMillis());
        final int[] codePointSet = generateCodePointSet(DEFAULT_CODE_POINT_SET_SIZE, random);
        final int[] codePointSet = CodePointUtils.generateCodePointSet(DEFAULT_CODE_POINT_SET_SIZE,
                random);
        for (int i = 0; i < 1000; ++i) {
            final String word = generateWord(random, codePointSet);
            final String word = CodePointUtils.generateWord(random, codePointSet);
            if (sWords.indexOf(word) != -1) continue;
            runGetTerminalPosition(dictDecoder, word, i, false);
        }
+65 −0
Original line number Diff line number Diff line
/*
 * Copyright (C) 2013 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.android.inputmethod.latin.makedict;

import java.util.Random;

/**
 * Utility methods related to code points, used by tests to generate random words.
 */
public final class CodePointUtils {
    // Smallest code point that generateCodePointSet() can produce.
    private static final int MIN_GENERATED_CODE_POINT = 0x20;
    // Number of small random draws summed up to pick a word length.
    private static final int LENGTH_DRAW_COUNT = 8;
    // Each length draw contributes a value in [0, LENGTH_DRAW_BOUND).
    private static final int LENGTH_DRAW_BOUND = 5;

    private CodePointUtils() {
        // This utility class is not publicly instantiable.
    }

    /**
     * Generates an array of random code points.
     *
     * Every entry lies in [0x20, Character.MAX_CODE_POINT) - so 0x20 itself can appear but
     * 0x10FFFF cannot - and is never a surrogate. Duplicates are possible.
     *
     * @param codePointSetSize the number of code points to generate.
     * @param random the source of randomness.
     * @return a new array of {@code codePointSetSize} code points.
     */
    public static int[] generateCodePointSet(final int codePointSetSize, final Random random) {
        final int[] codePointSet = new int[codePointSetSize];
        final int range = Character.MAX_CODE_POINT - MIN_GENERATED_CODE_POINT;
        for (int i = codePointSet.length - 1; i >= 0; ) {
            final int candidateCodePoint =
                    MIN_GENERATED_CODE_POINT + nextNonNegativeInt(random) % range;
            // Code points between MIN_ and MAX_SURROGATE are not valid on their own.
            if (candidateCodePoint >= Character.MIN_SURROGATE
                    && candidateCodePoint <= Character.MAX_SURROGATE) {
                continue;
            }
            codePointSet[i] = candidateCodePoint;
            --i;
        }
        return codePointSet;
    }

    /**
     * Generates a random word made of code points picked from {@code codePointSet}.
     *
     * The target length is 1 plus the sum of eight draws in [0, 4], i.e. between 1 and 33.
     * The length is measured with {@link StringBuilder#length()} (UTF-16 units), so a word
     * ending in a supplementary code point can be one unit longer than the target.
     *
     * @param random the source of randomness.
     * @param codePointSet the code points to choose from; must not be empty.
     * @return the generated word, never empty.
     * @throws IllegalArgumentException if {@code codePointSet} is empty.
     */
    public static String generateWord(final Random random, final int[] codePointSet) {
        if (codePointSet.length == 0) {
            throw new IllegalArgumentException("codePointSet must not be empty");
        }
        // Summing several small draws (instead of one large draw) biases the length toward
        // longer words. This should be closer to natural language, and more importantly, it
        // will exercise the algorithms in dicttool much more.
        int count = 1;
        for (int i = 0; i < LENGTH_DRAW_COUNT; ++i) {
            count += nextNonNegativeInt(random) % LENGTH_DRAW_BOUND;
        }
        final StringBuilder builder = new StringBuilder();
        while (builder.length() < count) {
            final int index = nextNonNegativeInt(random) % codePointSet.length;
            builder.appendCodePoint(codePointSet[index]);
        }
        return builder.toString();
    }

    /**
     * Returns {@code Math.abs(random.nextInt())}, redrawing in the one case where that is
     * negative (Integer.MIN_VALUE). Without the redraw a later {@code %} would yield a
     * negative length contribution or a negative array index.
     */
    private static int nextNonNegativeInt(final Random random) {
        int value;
        do {
            value = Math.abs(random.nextInt());
        } while (value < 0);
        return value;
    }
}