Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit a71ed8ca authored by Keisuke Kuroynagi's avatar Keisuke Kuroynagi
Browse files

Introduce BinaryDictionaryBigramsIterator to access bigrams attributes in binary dictionaries.

Bug: 6669677

Change-Id: Ifb1adebc5305a930c80396f6b4ec31d84400a9dc
parent 2933f9e8
Loading
Loading
Loading
Loading
+1 −0
Original line number Diff line number Diff line
@@ -53,6 +53,7 @@ LATIN_IME_CORE_SRC_FILES := \
        dic_nodes_cache.cpp) \
    $(addprefix suggest/core/dictionary/, \
        bigram_dictionary.cpp \
        binary_dictionary_bigrams_reading_utils.cpp \
        binary_dictionary_format_utils.cpp \
        binary_dictionary_header.cpp \
        binary_dictionary_header_reading_utils.cpp \
+1 −2
Original line number Diff line number Diff line
@@ -233,8 +233,7 @@ namespace latinime {
        return multiBigramMap->getBigramProbability(
                binaryDictionaryInfo, prevWordPos, wordPos, unigramProbability);
    }
    return BinaryFormat::getBigramProbability(
            binaryDictionaryInfo->getDictRoot(), prevWordPos, wordPos, unigramProbability);
    return ProbabilityUtils::backoff(unigramProbability);
}

///////////////////////////////////////
+21 −24
Original line number Diff line number Diff line
@@ -21,6 +21,7 @@
#include "bigram_dictionary.h"

#include "defines.h"
#include "suggest/core/dictionary/binary_dictionary_bigrams_iterator.h"
#include "suggest/core/dictionary/binary_dictionary_info.h"
#include "suggest/core/dictionary/binary_format.h"
#include "suggest/core/dictionary/dictionary.h"
@@ -100,12 +101,11 @@ void BigramDictionary::addWordBigram(int *word, int length, int probability, int
 * and the bigrams are used to boost unigram result scores, it makes little sense to
 * reduce their scope to the ones that match the first letter.
 */
int BigramDictionary::getBigrams(const int *prevWord, int prevWordLength, int *inputCodePoints,
int BigramDictionary::getPredictions(const int *prevWord, int prevWordLength, int *inputCodePoints,
        int inputSize, int *bigramCodePoints, int *bigramProbability, int *outputTypes) const {
    // TODO: remove unused arguments, and refrain from storing stuff in members of this class
    // TODO: have "in" arguments before "out" ones, and make out args explicit in the name

    const uint8_t *const root = mBinaryDictionaryInfo->getDictRoot();
    int pos = getBigramListPositionForWord(prevWord, prevWordLength,
            false /* forceLowerCaseSearch */);
    // getBigramListPositionForWord returns 0 if this word isn't in the dictionary or has no bigrams
@@ -116,21 +116,20 @@ int BigramDictionary::getBigrams(const int *prevWord, int prevWordLength, int *i
    }
    // If still no bigrams, we really don't have them!
    if (0 == pos) return 0;
    uint8_t bigramFlags;

    int bigramCount = 0;
    do {
        bigramFlags = BinaryFormat::getFlagsAndForwardPointer(root, &pos);
        int bigramBuffer[MAX_WORD_LENGTH];
    int unigramProbability = 0;
        const int bigramPos = BinaryFormat::getAttributeAddressAndForwardPointer(root, bigramFlags,
                &pos);
        const int length = BinaryFormat::getWordAtAddress(root, bigramPos, MAX_WORD_LENGTH,
                bigramBuffer, &unigramProbability);
    int bigramBuffer[MAX_WORD_LENGTH];
    for (BinaryDictionaryBigramsIterator bigramsIt(mBinaryDictionaryInfo, pos);
            bigramsIt.hasNext(); /* no-op */) {
        bigramsIt.next();
        const int length = BinaryFormat::getWordAtAddress(
                mBinaryDictionaryInfo->getDictRoot(), bigramsIt.getBigramPos(),
                MAX_WORD_LENGTH, bigramBuffer, &unigramProbability);

        // inputSize == 0 means we are trying to find bigram predictions.
        if (inputSize < 1 || checkFirstCharacter(bigramBuffer, inputCodePoints)) {
            const int bigramProbabilityTemp =
                    BinaryFormat::MASK_ATTRIBUTE_PROBABILITY & bigramFlags;
            const int bigramProbabilityTemp = bigramsIt.getProbability();
            // Due to space constraints, the probability for bigrams is approximate - the lower the
            // unigram probability, the worse the precision. The theoretical maximum error in
            // resulting probability is 8 - although in practice it's never bigger than 3 or 4
@@ -142,7 +141,7 @@ int BigramDictionary::getBigrams(const int *prevWord, int prevWordLength, int *i
                    outputTypes);
            ++bigramCount;
        }
    } while (BinaryFormat::FLAG_ATTRIBUTE_HAS_NEXT & bigramFlags);
    }
    return min(bigramCount, MAX_RESULTS);
}

@@ -187,22 +186,20 @@ bool BigramDictionary::checkFirstCharacter(int *word, int *inputCodePoints) cons

bool BigramDictionary::isValidBigram(const int *word1, int length1, const int *word2,
        int length2) const {
    const uint8_t *const root = mBinaryDictionaryInfo->getDictRoot();
    int pos = getBigramListPositionForWord(word1, length1, false /* forceLowerCaseSearch */);
    // getBigramListPositionForWord returns 0 if this word isn't in the dictionary or has no bigrams
    if (0 == pos) return false;
    int nextWordPos = BinaryFormat::getTerminalPosition(root, word2, length2,
            false /* forceLowerCaseSearch */);
    int nextWordPos = BinaryFormat::getTerminalPosition(mBinaryDictionaryInfo->getDictRoot(),
            word2, length2, false /* forceLowerCaseSearch */);
    if (NOT_VALID_WORD == nextWordPos) return false;
    uint8_t bigramFlags;
    do {
        bigramFlags = BinaryFormat::getFlagsAndForwardPointer(root, &pos);
        const int bigramPos = BinaryFormat::getAttributeAddressAndForwardPointer(root, bigramFlags,
                &pos);
        if (bigramPos == nextWordPos) {

    for (BinaryDictionaryBigramsIterator bigramsIt(mBinaryDictionaryInfo, pos);
            bigramsIt.hasNext(); /* no-op */) {
        bigramsIt.next();
        if (bigramsIt.getBigramPos() == nextWordPos) {
            return true;
        }
    } while (BinaryFormat::FLAG_ATTRIBUTE_HAS_NEXT & bigramFlags);
    }
    return false;
}

+2 −2
Original line number Diff line number Diff line
@@ -27,8 +27,8 @@ class BigramDictionary {
 public:
    BigramDictionary(const BinaryDictionaryInfo *const binaryDictionaryInfo);

    int getBigrams(const int *word, int length, int *inputCodePoints, int inputSize, int *outWords,
            int *frequencies, int *outputTypes) const;
    int getPredictions(const int *word, int length, int *inputCodePoints, int inputSize,
            int *outWords, int *frequencies, int *outputTypes) const;
    bool isValidBigram(const int *word1, int length1, const int *word2, int length2) const;
    ~BigramDictionary();

+67 −0
Original line number Diff line number Diff line
/*
 * Copyright (C) 2013 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef LATINIME_BINARY_DICTIONARY_BIGRAMS_ITERATOR_H
#define LATINIME_BINARY_DICTIONARY_BIGRAMS_ITERATOR_H

#include "defines.h"
#include "suggest/core/dictionary/binary_dictionary_bigrams_reading_utils.h"
#include "suggest/core/dictionary/binary_dictionary_info.h"

namespace latinime {

/**
 * Cursor that walks the entries of one bigram list in a binary dictionary.
 *
 * Typical use:
 *     for (BinaryDictionaryBigramsIterator it(info, pos); it.hasNext(); ) {
 *         it.next();
 *         ... it.getBigramPos() / it.getProbability() / it.getFlags() ...
 *     }
 *
 * hasNext() is true on a freshly constructed iterator and the constructor does not validate
 * pos, so the first next() always reads an entry at the given position. Until next() has been
 * called, the accessors report the zero-initialized flags and bigram position.
 */
class BinaryDictionaryBigramsIterator {
 public:
    // Starts a walk at pos inside the dictionary described by binaryDictionaryInfo.
    BinaryDictionaryBigramsIterator(
            const BinaryDictionaryInfo *const binaryDictionaryInfo, const int pos)
            : mBinaryDictionaryInfo(binaryDictionaryInfo),
              mPos(pos),
              mBigramFlags(0),
              mBigramPos(0),
              mHasNext(true) {}

    // Reads one entry: first its flags, then the bigram address decoded with those flags.
    // Both reading utilities receive &mPos, so the order of the two calls must be kept.
    AK_FORCE_INLINE void next() {
        const BinaryDictionaryBigramsReadingUtils::BigramFlags entryFlags =
                BinaryDictionaryBigramsReadingUtils::getFlagsAndForwardPointer(
                        mBinaryDictionaryInfo, &mPos);
        mBigramFlags = entryFlags;
        mBigramPos = BinaryDictionaryBigramsReadingUtils::getBigramAddressAndForwardPointer(
                mBinaryDictionaryInfo, entryFlags, &mPos);
        // Whether another entry follows is derived from the flags that were just read.
        mHasNext = BinaryDictionaryBigramsReadingUtils::hasNext(entryFlags);
    }

    // True while next() may be called again; always true before the first next().
    AK_FORCE_INLINE bool hasNext() const {
        return mHasNext;
    }

    // Bigram position stored by the most recent next().
    AK_FORCE_INLINE int getBigramPos() const {
        return mBigramPos;
    }

    // Flags stored by the most recent next().
    AK_FORCE_INLINE int getFlags() const {
        return mBigramFlags;
    }

    // Probability that the reading utilities extract from the current entry's flags.
    AK_FORCE_INLINE int getProbability() const {
        return BinaryDictionaryBigramsReadingUtils::getBigramProbability(mBigramFlags);
    }

 private:
    DISALLOW_COPY_AND_ASSIGN(BinaryDictionaryBigramsIterator);

    // Dictionary being read; handed to every reading-utility call.
    const BinaryDictionaryInfo *const mBinaryDictionaryInfo;
    // Read position, passed by address to the reading utilities in next().
    int mPos;
    // State of the entry most recently read by next().
    BinaryDictionaryBigramsReadingUtils::BigramFlags mBigramFlags;
    int mBigramPos;
    // Result of the last hasNext() check on the flags; starts out true.
    bool mHasNext;
};
} // namespace latinime
#endif // LATINIME_BINARY_DICTIONARY_BIGRAMS_ITERATOR_H
Loading