Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit e0e67373 authored by Satoshi Kataoka's avatar Satoshi Kataoka
Browse files

Refactor parameters by naming convention

Change-Id: I8bda8075b33f656ecbec08320afcd864b620fe77
parent a5067333
Loading
Loading
Loading
Loading
+2 −2
Original line number Diff line number Diff line
@@ -91,7 +91,7 @@ public final class BinaryDictionary extends Dictionary {

    private static native long openNative(String sourceDir, long dictOffset, long dictSize);
    private static native void closeNative(long dict);
    private static native int getFrequencyNative(long dict, int[] word);
    private static native int getProbabilityNative(long dict, int[] word);
    private static native boolean isValidBigramNative(long dict, int[] word1, int[] word2);
    private static native int getSuggestionsNative(long dict, long proximityInfo,
            long traverseSession, int[] xCoordinates, int[] yCoordinates, int[] times,
@@ -186,7 +186,7 @@ public final class BinaryDictionary extends Dictionary {
    /**
     * Looks up the given word in the native dictionary referenced by mNativeDict.
     *
     * Diff context: the getFrequencyNative call is the pre-change line and the
     * getProbabilityNative call is the line that replaces it in this commit.
     *
     * @param word the word to look up; may be null.
     * @return -1 if word is null, otherwise the value returned by the native lookup.
     */
    public int getFrequency(final String word) {
        if (word == null) return -1;
        // The native side takes the word as an array of code points.
        int[] codePoints = StringUtils.toCodePointArray(word);
        return getFrequencyNative(mNativeDict, codePoints);
        return getProbabilityNative(mNativeDict, codePoints);
    }

    // TODO: Add a batch process version (isValidBigramMultiple?) to avoid excessive numbers of jni
+4 −4
Original line number Diff line number Diff line
@@ -203,14 +203,14 @@ static int latinime_BinaryDictionary_getSuggestions(JNIEnv *env, jclass clazz, j
    return count;
}

// JNI bridge for the single-word lookup: copies the Java int[] of code points into a native
// buffer and queries the native Dictionary referenced by the dict handle.
// Returns 0 when the handle is null; otherwise returns the result of the dictionary lookup.
// Diff context: the getFrequency lines are the pre-change versions of the getProbability lines
// that follow them.
static jint latinime_BinaryDictionary_getFrequency(JNIEnv *env, jclass clazz, jlong dict,
static jint latinime_BinaryDictionary_getProbability(JNIEnv *env, jclass clazz, jlong dict,
        jintArray wordArray) {
    // dict is a native Dictionary pointer carried through Java as a long.
    Dictionary *dictionary = reinterpret_cast<Dictionary *>(dict);
    if (!dictionary) return 0;
    const jsize codePointLength = env->GetArrayLength(wordArray);
    // NOTE(review): this is a variable-length array sized by the incoming Java array length;
    // VLAs are a compiler extension in C++ - confirm callers bound the word length.
    int codePoints[codePointLength];
    env->GetIntArrayRegion(wordArray, 0, codePointLength, codePoints);
    return dictionary->getFrequency(codePoints, codePointLength);
    return dictionary->getProbability(codePoints, codePointLength);
}

static jboolean latinime_BinaryDictionary_isValidBigram(JNIEnv *env, jclass clazz, jlong dict,
@@ -285,8 +285,8 @@ static JNINativeMethod sMethods[] = {
    {"closeNative", "(J)V", reinterpret_cast<void *>(latinime_BinaryDictionary_close)},
    {"getSuggestionsNative", "(JJJ[I[I[I[I[IIIZ[IZ[I[I[I[I)I",
            reinterpret_cast<void *>(latinime_BinaryDictionary_getSuggestions)},
    {"getFrequencyNative", "(J[I)I",
            reinterpret_cast<void *>(latinime_BinaryDictionary_getFrequency)},
    {"getProbabilityNative", "(J[I)I",
            reinterpret_cast<void *>(latinime_BinaryDictionary_getProbability)},
    {"isValidBigramNative", "(J[I[I)Z",
            reinterpret_cast<void *>(latinime_BinaryDictionary_isValidBigram)},
    {"calcNormalizedScoreNative", "([I[II)F",
+23 −22
Original line number Diff line number Diff line
@@ -36,21 +36,21 @@ BigramDictionary::BigramDictionary(const uint8_t *const streamStart) : DICT_ROOT
// Empty destructor: the body performs no cleanup.
BigramDictionary::~BigramDictionary() {
}

void BigramDictionary::addWordBigram(int *word, int length, int frequency, int *bigramFreq,
void BigramDictionary::addWordBigram(int *word, int length, int probability, int *bigramProbability,
        int *bigramCodePoints, int *outputTypes) const {
    word[length] = 0;
    if (DEBUG_DICT) {
#ifdef FLAG_DBG
        char s[length + 1];
        for (int i = 0; i <= length; i++) s[i] = static_cast<char>(word[i]);
        AKLOGI("Bigram: Found word = %s, freq = %d :", s, frequency);
        AKLOGI("Bigram: Found word = %s, freq = %d :", s, probability);
#endif
    }

    // Find the right insertion point
    int insertAt = 0;
    while (insertAt < MAX_RESULTS) {
        if (frequency > bigramFreq[insertAt] || (bigramFreq[insertAt] == frequency
        if (probability > bigramProbability[insertAt] || (bigramProbability[insertAt] == probability
                && length < getCodePointCount(MAX_WORD_LENGTH,
                        bigramCodePoints + insertAt * MAX_WORD_LENGTH))) {
            break;
@@ -63,10 +63,10 @@ void BigramDictionary::addWordBigram(int *word, int length, int frequency, int *
    if (insertAt >= MAX_RESULTS) {
        return;
    }
    memmove(bigramFreq + (insertAt + 1),
            bigramFreq + insertAt,
            (MAX_RESULTS - insertAt - 1) * sizeof(bigramFreq[0]));
    bigramFreq[insertAt] = frequency;
    memmove(bigramProbability + (insertAt + 1),
            bigramProbability + insertAt,
            (MAX_RESULTS - insertAt - 1) * sizeof(bigramProbability[0]));
    bigramProbability[insertAt] = probability;
    outputTypes[insertAt] = Dictionary::KIND_PREDICTION;
    memmove(bigramCodePoints + (insertAt + 1) * MAX_WORD_LENGTH,
            bigramCodePoints + insertAt * MAX_WORD_LENGTH,
@@ -87,7 +87,7 @@ void BigramDictionary::addWordBigram(int *word, int length, int frequency, int *
 * inputCodePoints: what user typed, in the same format as for UnigramDictionary::getSuggestions.
 * inputSize: the size of the codes array.
 * bigramCodePoints: an array for output, at the same format as outwords for getSuggestions.
 * bigramFreq: an array to output frequencies.
 * bigramProbability: an array to output probabilities.
 * outputTypes: an array to output types.
 * This method returns the number of bigrams this word has, for backward compatibility.
 * Note: this is not the number of bigrams output in the array, which is the number of
@@ -98,7 +98,7 @@ void BigramDictionary::addWordBigram(int *word, int length, int frequency, int *
 * reduce their scope to the ones that match the first letter.
 */
int BigramDictionary::getBigrams(const int *prevWord, int prevWordLength, int *inputCodePoints,
        int inputSize, int *bigramCodePoints, int *bigramFreq, int *outputTypes) const {
        int inputSize, int *bigramCodePoints, int *bigramProbability, int *outputTypes) const {
    // TODO: remove unused arguments, and refrain from storing stuff in members of this class
    // TODO: have "in" arguments before "out" ones, and make out args explicit in the name

@@ -118,23 +118,24 @@ int BigramDictionary::getBigrams(const int *prevWord, int prevWordLength, int *i
    do {
        bigramFlags = BinaryFormat::getFlagsAndForwardPointer(root, &pos);
        int bigramBuffer[MAX_WORD_LENGTH];
        int unigramFreq = 0;
        int unigramProbability = 0;
        const int bigramPos = BinaryFormat::getAttributeAddressAndForwardPointer(root, bigramFlags,
                &pos);
        const int length = BinaryFormat::getWordAtAddress(root, bigramPos, MAX_WORD_LENGTH,
                bigramBuffer, &unigramFreq);
                bigramBuffer, &unigramProbability);

        // inputSize == 0 means we are trying to find bigram predictions.
        if (inputSize < 1 || checkFirstCharacter(bigramBuffer, inputCodePoints)) {
            const int bigramFreqTemp = BinaryFormat::MASK_ATTRIBUTE_FREQUENCY & bigramFlags;
            // Due to space constraints, the frequency for bigrams is approximate - the lower the
            // unigram frequency, the worse the precision. The theoritical maximum error in
            // resulting frequency is 8 - although in the practice it's never bigger than 3 or 4
            const int bigramProbabilityTemp =
                    BinaryFormat::MASK_ATTRIBUTE_PROBABILITY & bigramFlags;
            // Due to space constraints, the probability for bigrams is approximate - the lower the
            // unigram probability, the worse the precision. The theoretical maximum error in
            // resulting probability is 8 - although in practice it's never bigger than 3 or 4
            // in very bad cases. This means that sometimes, we'll see some bigrams swapped
            // here, but it can't get too bad.
            const int frequency =
                    BinaryFormat::computeFrequencyForBigram(unigramFreq, bigramFreqTemp);
            addWordBigram(bigramBuffer, length, frequency, bigramFreq, bigramCodePoints,
            const int probability = BinaryFormat::computeProbabilityForBigram(
                    unigramProbability, bigramProbabilityTemp);
            addWordBigram(bigramBuffer, length, probability, bigramProbability, bigramCodePoints,
                    outputTypes);
            ++bigramCount;
        }
@@ -159,13 +160,13 @@ int BigramDictionary::getBigramListPositionForWord(const int *prevWord, const in
    } else {
        pos = BinaryFormat::skipOtherCharacters(root, pos);
    }
    pos = BinaryFormat::skipFrequency(flags, pos);
    pos = BinaryFormat::skipProbability(flags, pos);
    pos = BinaryFormat::skipChildrenPosition(flags, pos);
    pos = BinaryFormat::skipShortcuts(root, flags, pos);
    return pos;
}

void BigramDictionary::fillBigramAddressToFrequencyMapAndFilter(const int *prevWord,
void BigramDictionary::fillBigramAddressToProbabilityMapAndFilter(const int *prevWord,
        const int prevWordLength, std::map<int, int> *map, uint8_t *filter) const {
    memset(filter, 0, BIGRAM_FILTER_BYTE_SIZE);
    const uint8_t *const root = DICT_ROOT;
@@ -181,10 +182,10 @@ void BigramDictionary::fillBigramAddressToFrequencyMapAndFilter(const int *prevW
    uint8_t bigramFlags;
    do {
        bigramFlags = BinaryFormat::getFlagsAndForwardPointer(root, &pos);
        const int frequency = BinaryFormat::MASK_ATTRIBUTE_FREQUENCY & bigramFlags;
        const int probability = BinaryFormat::MASK_ATTRIBUTE_PROBABILITY & bigramFlags;
        const int bigramPos = BinaryFormat::getAttributeAddressAndForwardPointer(root, bigramFlags,
                &pos);
        (*map)[bigramPos] = frequency;
        (*map)[bigramPos] = probability;
        setInFilter(filter, bigramPos);
    } while (0 != (BinaryFormat::FLAG_ATTRIBUTE_HAS_NEXT & bigramFlags));
}
+3 −3
Original line number Diff line number Diff line
@@ -29,14 +29,14 @@ class BigramDictionary {
    BigramDictionary(const uint8_t *const streamStart);
    int getBigrams(const int *word, int length, int *inputCodePoints, int inputSize, int *outWords,
            int *frequencies, int *outputTypes) const;
    void fillBigramAddressToFrequencyMapAndFilter(const int *prevWord, const int prevWordLength,
    void fillBigramAddressToProbabilityMapAndFilter(const int *prevWord, const int prevWordLength,
            std::map<int, int> *map, uint8_t *filter) const;
    bool isValidBigram(const int *word1, int length1, const int *word2, int length2) const;
    ~BigramDictionary();
 private:
    DISALLOW_IMPLICIT_CONSTRUCTORS(BigramDictionary);
    void addWordBigram(int *word, int length, int frequency, int *bigramFreq, int *bigramCodePoints,
            int *outputTypes) const;
    void addWordBigram(int *word, int length, int probability, int *bigramProbability,
            int *bigramCodePoints, int *outputTypes) const;
    bool checkFirstCharacter(int *word, int *inputCodePoints) const;
    int getBigramListPositionForWord(const int *prevWord, const int prevWordLength,
            const bool forceLowerCaseSearch) const;
+44 −40
Original line number Diff line number Diff line
@@ -52,10 +52,10 @@ class BinaryFormat {
    // Flag for sign of offset. If this flag is set, the offset value must be negated.
    static const int FLAG_ATTRIBUTE_OFFSET_NEGATIVE = 0x40;

    // Mask for attribute frequency, stored on 4 bits inside the flags byte.
    static const int MASK_ATTRIBUTE_FREQUENCY = 0x0F;
    // The numeric value of the shortcut frequency that means 'whitelist'.
    static const int WHITELIST_SHORTCUT_FREQUENCY = 15;
    // Mask for attribute probability, stored on 4 bits inside the flags byte.
    static const int MASK_ATTRIBUTE_PROBABILITY = 0x0F;
    // The numeric value of the shortcut probability that means 'whitelist'.
    static const int WHITELIST_SHORTCUT_PROBABILITY = 15;

    // Mask and flags for attribute address type selection.
    static const int MASK_ATTRIBUTE_ADDRESS_TYPE = 0x30;
@@ -72,10 +72,10 @@ class BinaryFormat {
    static int getGroupCountAndForwardPointer(const uint8_t *const dict, int *pos);
    static uint8_t getFlagsAndForwardPointer(const uint8_t *const dict, int *pos);
    static int getCodePointAndForwardPointer(const uint8_t *const dict, int *pos);
    static int readFrequencyWithoutMovingPointer(const uint8_t *const dict, const int pos);
    static int readProbabilityWithoutMovingPointer(const uint8_t *const dict, const int pos);
    static int skipOtherCharacters(const uint8_t *const dict, const int pos);
    static int skipChildrenPosition(const uint8_t flags, const int pos);
    static int skipFrequency(const uint8_t flags, const int pos);
    static int skipProbability(const uint8_t flags, const int pos);
    static int skipShortcuts(const uint8_t *const dict, const uint8_t flags, const int pos);
    static int skipChildrenPosAndAttributes(const uint8_t *const dict, const uint8_t flags,
            const int pos);
@@ -83,14 +83,15 @@ class BinaryFormat {
    static bool hasChildrenInFlags(const uint8_t flags);
    static int getAttributeAddressAndForwardPointer(const uint8_t *const dict, const uint8_t flags,
            int *pos);
    static int getAttributeFrequencyFromFlags(const int flags);
    static int getAttributeProbabilityFromFlags(const int flags);
    static int getTerminalPosition(const uint8_t *const root, const int *const inWord,
            const int length, const bool forceLowerCaseSearch);
    static int getWordAtAddress(const uint8_t *const root, const int address, const int maxDepth,
            int *outWord, int *outUnigramFrequency);
    static int computeFrequencyForBigram(const int unigramFreq, const int bigramFreq);
            int *outWord, int *outUnigramProbability);
    static int computeProbabilityForBigram(
            const int unigramProbability, const int bigramProbability);
    static int getProbability(const int position, const std::map<int, int> *bigramMap,
            const uint8_t *bigramFilter, const int unigramFreq);
            const uint8_t *bigramFilter, const int unigramProbability);

    // Flags for special processing
    // Those *must* match the flags in makedict (BinaryDictInputOutput#*_PROCESSING_FLAG) or
@@ -264,7 +265,7 @@ AK_FORCE_INLINE int BinaryFormat::getCodePointAndForwardPointer(const uint8_t *c
    }
}

// Returns the byte stored at dict[pos], widened to int. pos is taken by value, so the caller's
// position is not advanced (unlike the *AndForwardPointer readers, which take int *pos).
// Diff context: the first signature line is the pre-rename version of the second.
inline int BinaryFormat::readFrequencyWithoutMovingPointer(const uint8_t *const dict,
inline int BinaryFormat::readProbabilityWithoutMovingPointer(const uint8_t *const dict,
        const int pos) {
    return dict[pos];
}
@@ -320,7 +321,7 @@ inline int BinaryFormat::skipChildrenPosition(const uint8_t flags, const int pos
    return pos + childrenAddressSize(flags);
}

// Returns pos + 1 when FLAG_IS_TERMINAL is set in flags, and pos unchanged otherwise.
// Diff context: the first signature line is the pre-rename version of the second.
inline int BinaryFormat::skipFrequency(const uint8_t flags, const int pos) {
inline int BinaryFormat::skipProbability(const uint8_t flags, const int pos) {
    return FLAG_IS_TERMINAL & flags ? pos + 1 : pos;
}

@@ -415,8 +416,8 @@ AK_FORCE_INLINE int BinaryFormat::getAttributeAddressAndForwardPointer(const uin
    }
}

// Extracts the attribute probability from a flags value by masking it with
// MASK_ATTRIBUTE_PROBABILITY.
// Diff context: the getAttributeFrequencyFromFlags / MASK_ATTRIBUTE_FREQUENCY lines are the
// pre-rename versions of the lines that follow them.
inline int BinaryFormat::getAttributeFrequencyFromFlags(const int flags) {
    return flags & MASK_ATTRIBUTE_FREQUENCY;
inline int BinaryFormat::getAttributeProbabilityFromFlags(const int flags) {
    return flags & MASK_ATTRIBUTE_PROBABILITY;
}

// This function gets the byte position of the last chargroup of the exact matching word in the
@@ -466,7 +467,7 @@ AK_FORCE_INLINE int BinaryFormat::getTerminalPosition(const uint8_t *const root,
                    if (wordPos == length) {
                        return charGroupPos;
                    }
                    pos = BinaryFormat::skipFrequency(FLAG_IS_TERMINAL, pos);
                    pos = BinaryFormat::skipProbability(FLAG_IS_TERMINAL, pos);
                }
                if (FLAG_GROUP_ADDRESS_TYPE_NOADDRESS == (MASK_GROUP_ADDRESS_TYPE & flags)) {
                    return NOT_VALID_WORD;
@@ -481,7 +482,7 @@ AK_FORCE_INLINE int BinaryFormat::getTerminalPosition(const uint8_t *const root,
                if (FLAG_HAS_MULTIPLE_CHARS & flags) {
                    pos = BinaryFormat::skipOtherCharacters(root, pos);
                }
                pos = BinaryFormat::skipFrequency(flags, pos);
                pos = BinaryFormat::skipProbability(flags, pos);
                pos = BinaryFormat::skipChildrenPosAndAttributes(root, flags, pos);
            }
            --charGroupCount;
@@ -504,11 +505,11 @@ AK_FORCE_INLINE int BinaryFormat::getTerminalPosition(const uint8_t *const root,
 * address: the byte position of the last chargroup of the word we are searching for (this is
 *   what is stored as the "bigram address" in each bigram)
 * outword: an array to write the found word, with MAX_WORD_LENGTH size.
 * outUnigramFrequency: a pointer to an int to write the frequency into.
 * outUnigramProbability: a pointer to an int to write the probability into.
 * Return value : the length of the word, or 0 if the word was not found.
 */
AK_FORCE_INLINE int BinaryFormat::getWordAtAddress(const uint8_t *const root, const int address,
        const int maxDepth, int *outWord, int *outUnigramFrequency) {
        const int maxDepth, int *outWord, int *outUnigramProbability) {
    int pos = 0;
    int wordPos = 0;

@@ -541,15 +542,15 @@ AK_FORCE_INLINE int BinaryFormat::getWordAtAddress(const uint8_t *const root, co
                        nextChar = getCodePointAndForwardPointer(root, &pos);
                    }
                }
                *outUnigramFrequency = readFrequencyWithoutMovingPointer(root, pos);
                *outUnigramProbability = readProbabilityWithoutMovingPointer(root, pos);
                return ++wordPos;
            }
            // We need to skip past this char group, so skip any remaining chars after the
            // first and possibly the frequency.
            // first and possibly the probability.
            if (FLAG_HAS_MULTIPLE_CHARS & flags) {
                pos = skipOtherCharacters(root, pos);
            }
            pos = skipFrequency(flags, pos);
            pos = skipProbability(flags, pos);

            // The fact that this group has children is very important. Since we already know
            // that this group does not match, if it has no children we know it is irrelevant
@@ -604,9 +605,9 @@ AK_FORCE_INLINE int BinaryFormat::getWordAtAddress(const uint8_t *const root, co
                        }
                    }
                    ++wordPos;
                    // Now we only need to branch to the children address. Skip the frequency if
                    // Now we only need to branch to the children address. Skip the probability if
                    // it's there, read pos, and break to resume the search at pos.
                    lastCandidateGroupPos = skipFrequency(lastFlags, lastCandidateGroupPos);
                    lastCandidateGroupPos = skipProbability(lastFlags, lastCandidateGroupPos);
                    pos = readChildrenPosition(root, lastFlags, lastCandidateGroupPos);
                    break;
                } else {
@@ -635,36 +636,39 @@ AK_FORCE_INLINE int BinaryFormat::getWordAtAddress(const uint8_t *const root, co
    return 0;
}

// Backoff value for a unigram: currently returns its argument unchanged. The weighted
// variant is kept below as a commented-out return statement.
// Diff context: the unigramFreq lines are the pre-rename versions of the unigramProbability
// lines that follow them.
static inline int backoff(const int unigramFreq) {
    return unigramFreq;
static inline int backoff(const int unigramProbability) {
    return unigramProbability;
    // For some reason, applying the backoff weight gives bad results in tests. To apply the
    // backoff weight, we divide the probability by 2, which in our storing format means
    // decreasing the score by 8.
    // TODO: figure out what's wrong with this.
    // return unigramFreq > 8 ? unigramFreq - 8 : (0 == unigramFreq ? 0 : 8);
    // return unigramProbability > 8 ? unigramProbability - 8 : (0 == unigramProbability ? 0 : 8);
}

// Combines a unigram probability with an encoded bigram probability into one value:
//   unigramProbability + (int)((bigramProbability + 1) * stepSize)
// where stepSize = (MAX_PROBABILITY - unigramProbability) / (1.5 + MAX_BIGRAM_ENCODED_PROBABILITY).
// Diff context: the lines using unigramFreq / bigramFreq / MAX_FREQ / MAX_BIGRAM_FREQ are the
// pre-rename versions of the lines that follow them.
inline int BinaryFormat::computeFrequencyForBigram(const int unigramFreq, const int bigramFreq) {
    // We divide the range [unigramFreq..255] in 16.5 steps - in other words, we want the
    // unigram frequency to be the median value of the 17th step from the top. A value of
    // 0 for the bigram frequency represents the middle of the 16th step from the top,
inline int BinaryFormat::computeProbabilityForBigram(
        const int unigramProbability, const int bigramProbability) {
    // We divide the range [unigramProbability..255] in 16.5 steps - in other words, we want the
    // unigram probability to be the median value of the 17th step from the top. A value of
    // 0 for the bigram probability represents the middle of the 16th step from the top,
    // while a value of 15 represents the middle of the top step.
    // See makedict.BinaryDictInputOutput for details.
    const float stepSize = static_cast<float>(MAX_FREQ - unigramFreq) / (1.5f + MAX_BIGRAM_FREQ);
    return unigramFreq + static_cast<int>(static_cast<float>(bigramFreq + 1) * stepSize);
    const float stepSize = static_cast<float>(MAX_PROBABILITY - unigramProbability)
            / (1.5f + MAX_BIGRAM_ENCODED_PROBABILITY);
    // The scaled step is converted to int before being added, so its fractional part is dropped.
    return unigramProbability
            + static_cast<int>(static_cast<float>(bigramProbability + 1) * stepSize);
}

// This returns a probability in log space.
// Returns backoff(unigramProbability) when bigramMap or bigramFilter is null, when position is
// not in the filter, or when the map has no entry for position. Otherwise the map's value for
// position is combined with the unigram value via computeProbabilityForBigram.
// Diff context: the lines using unigramFreq / bigramFreq / computeFrequencyForBigram are the
// pre-rename versions of the lines that follow them.
inline int BinaryFormat::getProbability(const int position, const std::map<int, int> *bigramMap,
        const uint8_t *bigramFilter, const int unigramFreq) {
    if (!bigramMap || !bigramFilter) return backoff(unigramFreq);
    if (!isInFilter(bigramFilter, position)) return backoff(unigramFreq);
    const std::map<int, int>::const_iterator bigramFreqIt = bigramMap->find(position);
    if (bigramFreqIt != bigramMap->end()) {
        const int bigramFreq = bigramFreqIt->second;
        return computeFrequencyForBigram(unigramFreq, bigramFreq);
    }
    return backoff(unigramFreq);
        const uint8_t *bigramFilter, const int unigramProbability) {
    if (!bigramMap || !bigramFilter) return backoff(unigramProbability);
    // The filter is consulted before the map; presumably a cheap pre-check - TODO confirm.
    if (!isInFilter(bigramFilter, position)) return backoff(unigramProbability);
    const std::map<int, int>::const_iterator bigramProbabilityIt = bigramMap->find(position);
    if (bigramProbabilityIt != bigramMap->end()) {
        const int bigramProbability = bigramProbabilityIt->second;
        return computeProbabilityForBigram(unigramProbability, bigramProbability);
    }
    // Passed the filter but absent from the map: fall back to the unigram value.
    return backoff(unigramProbability);
}
} // namespace latinime
#endif // LATINIME_BINARY_FORMAT_H
Loading