Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit 07e29d57 authored by satok's avatar satok Committed by Android (Google) Code Review
Browse files

Merge "Make native bigram dictionary const"

parents a19c5e63 b1ed1d47
Loading
Loading
Loading
Loading
+29 −33
Original line number Diff line number Diff line
@@ -27,9 +27,8 @@

namespace latinime {

BigramDictionary::BigramDictionary(const unsigned char *dict, int maxWordLength,
        Dictionary *parentDictionary)
    : DICT(dict), MAX_WORD_LENGTH(maxWordLength), mParentDictionary(parentDictionary) {
BigramDictionary::BigramDictionary(const unsigned char *dict, int maxWordLength)
        : DICT(dict), MAX_WORD_LENGTH(maxWordLength) {
    if (DEBUG_DICT) {
        AKLOGI("BigramDictionary - constructor");
    }
@@ -38,7 +37,8 @@ BigramDictionary::BigramDictionary(const unsigned char *dict, int maxWordLength,
BigramDictionary::~BigramDictionary() {
}

bool BigramDictionary::addWordBigram(unsigned short *word, int length, int frequency) {
bool BigramDictionary::addWordBigram(unsigned short *word, int length, int frequency,
        const int maxBigrams, int *bigramFreq, unsigned short *bigramChars) const {
    word[length] = 0;
    if (DEBUG_DICT) {
#ifdef FLAG_DBG
@@ -50,25 +50,25 @@ bool BigramDictionary::addWordBigram(unsigned short *word, int length, int frequ

    // Find the right insertion point
    int insertAt = 0;
    while (insertAt < mMaxBigrams) {
        if (frequency > mBigramFreq[insertAt] || (mBigramFreq[insertAt] == frequency
                && length < Dictionary::wideStrLen(mBigramChars + insertAt * MAX_WORD_LENGTH))) {
    while (insertAt < maxBigrams) {
        if (frequency > bigramFreq[insertAt] || (bigramFreq[insertAt] == frequency
                && length < Dictionary::wideStrLen(bigramChars + insertAt * MAX_WORD_LENGTH))) {
            break;
        }
        insertAt++;
    }
    if (DEBUG_DICT) {
        AKLOGI("Bigram: InsertAt -> %d maxBigrams: %d", insertAt, mMaxBigrams);
    }
    if (insertAt < mMaxBigrams) {
        memmove((char*) mBigramFreq + (insertAt + 1) * sizeof(mBigramFreq[0]),
               (char*) mBigramFreq + insertAt * sizeof(mBigramFreq[0]),
               (mMaxBigrams - insertAt - 1) * sizeof(mBigramFreq[0]));
        mBigramFreq[insertAt] = frequency;
        memmove((char*) mBigramChars + (insertAt + 1) * MAX_WORD_LENGTH * sizeof(short),
               (char*) mBigramChars + (insertAt    ) * MAX_WORD_LENGTH * sizeof(short),
               (mMaxBigrams - insertAt - 1) * sizeof(short) * MAX_WORD_LENGTH);
        unsigned short *dest = mBigramChars + (insertAt    ) * MAX_WORD_LENGTH;
        AKLOGI("Bigram: InsertAt -> %d maxBigrams: %d", insertAt, maxBigrams);
    }
    if (insertAt < maxBigrams) {
        memmove((char*) bigramFreq + (insertAt + 1) * sizeof(bigramFreq[0]),
               (char*) bigramFreq + insertAt * sizeof(bigramFreq[0]),
               (maxBigrams - insertAt - 1) * sizeof(bigramFreq[0]));
        bigramFreq[insertAt] = frequency;
        memmove((char*) bigramChars + (insertAt + 1) * MAX_WORD_LENGTH * sizeof(short),
               (char*) bigramChars + (insertAt    ) * MAX_WORD_LENGTH * sizeof(short),
               (maxBigrams - insertAt - 1) * sizeof(short) * MAX_WORD_LENGTH);
        unsigned short *dest = bigramChars + (insertAt    ) * MAX_WORD_LENGTH;
        while (length--) {
            *dest++ = *word++;
        }
@@ -84,7 +84,7 @@ bool BigramDictionary::addWordBigram(unsigned short *word, int length, int frequ
/* Parameters :
 * prevWord: the word before, the one for which we need to look up bigrams.
 * prevWordLength: its length.
 * codes: what user typed, in the same format as for UnigramDictionary::getSuggestions.
 * inputCodes: what user typed, in the same format as for UnigramDictionary::getSuggestions.
 * codesSize: the size of the codes array.
 * bigramChars: an array for output, in the same format as outWords for getSuggestions.
 * bigramFreq: an array to output frequencies.
@@ -98,15 +98,11 @@ bool BigramDictionary::addWordBigram(unsigned short *word, int length, int frequ
 * and the bigrams are used to boost unigram result scores, it makes little sense to
 * reduce their scope to the ones that match the first letter.
 */
int BigramDictionary::getBigrams(const int32_t *prevWord, int prevWordLength, int *codes,
int BigramDictionary::getBigrams(const int32_t *prevWord, int prevWordLength, int *inputCodes,
        int codesSize, unsigned short *bigramChars, int *bigramFreq, int maxWordLength,
        int maxBigrams) {
        int maxBigrams) const {
    // TODO: remove unused arguments, and refrain from storing stuff in members of this class
    // TODO: have "in" arguments before "out" ones, and make out args explicit in the name
    mBigramFreq = bigramFreq;
    mBigramChars = bigramChars;
    mInputCodes = codes;
    mMaxBigrams = maxBigrams;

    const uint8_t* const root = DICT;
    int pos = getBigramListPositionForWord(prevWord, prevWordLength);
@@ -124,16 +120,17 @@ int BigramDictionary::getBigrams(const int32_t *prevWord, int prevWordLength, in
                bigramBuffer, &unigramFreq);

        // codesSize == 0 means we are trying to find bigram predictions.
        if (codesSize < 1 || checkFirstCharacter(bigramBuffer)) {
            const int bigramFreq = UnigramDictionary::MASK_ATTRIBUTE_FREQUENCY & bigramFlags;
        if (codesSize < 1 || checkFirstCharacter(bigramBuffer, inputCodes)) {
            const int bigramFreqTemp = UnigramDictionary::MASK_ATTRIBUTE_FREQUENCY & bigramFlags;
            // Due to space constraints, the frequency for bigrams is approximate - the lower the
            // unigram frequency, the worse the precision. The theoretical maximum error in
            // the resulting frequency is 8 - although in practice it's never bigger than 3 or 4
            // in very bad cases. This means that sometimes, we'll see some bigrams inverted
            // here, but it can't get too bad.
            const int frequency =
                    BinaryFormat::computeFrequencyForBigram(unigramFreq, bigramFreq);
            if (addWordBigram(bigramBuffer, length, frequency)) {
                    BinaryFormat::computeFrequencyForBigram(unigramFreq, bigramFreqTemp);
            if (addWordBigram(
                    bigramBuffer, length, frequency, maxBigrams, bigramFreq, bigramChars)) {
                ++bigramCount;
            }
        }
@@ -144,7 +141,7 @@ int BigramDictionary::getBigrams(const int32_t *prevWord, int prevWordLength, in
// Returns the position of the start of the bigram list.
// If the word is not found or has no bigrams, this function returns 0.
int BigramDictionary::getBigramListPositionForWord(const int32_t *prevWord,
        const int prevWordLength) {
        const int prevWordLength) const {
    if (0 >= prevWordLength) return 0;
    const uint8_t* const root = DICT;
    int pos = BinaryFormat::getTerminalPosition(root, prevWord, prevWordLength);
@@ -164,7 +161,7 @@ int BigramDictionary::getBigramListPositionForWord(const int32_t *prevWord,
}

void BigramDictionary::fillBigramAddressToFrequencyMapAndFilter(const int32_t *prevWord,
        const int prevWordLength, std::map<int, int> *map, uint8_t *filter) {
        const int prevWordLength, std::map<int, int> *map, uint8_t *filter) const {
    memset(filter, 0, BIGRAM_FILTER_BYTE_SIZE);
    const uint8_t* const root = DICT;
    int pos = getBigramListPositionForWord(prevWord, prevWordLength);
@@ -181,11 +178,10 @@ void BigramDictionary::fillBigramAddressToFrequencyMapAndFilter(const int32_t *p
    } while (0 != (UnigramDictionary::FLAG_ATTRIBUTE_HAS_NEXT & bigramFlags));
}

bool BigramDictionary::checkFirstCharacter(unsigned short *word) {
bool BigramDictionary::checkFirstCharacter(unsigned short *word, int *inputCodes) const {
    // Checks whether this word starts with the same character as, or a neighboring character
    // of, what the user typed.

    int *inputCodes = mInputCodes;
    int maxAlt = MAX_ALTERNATIVES;
    const unsigned short firstBaseChar = toBaseLowerCase(*word);
    while (maxAlt > 0) {
@@ -199,7 +195,7 @@ bool BigramDictionary::checkFirstCharacter(unsigned short *word) {
}

bool BigramDictionary::isValidBigram(const int32_t *word1, int length1, const int32_t *word2,
        int length2) {
        int length2) const {
    const uint8_t* const root = DICT;
    int pos = getBigramListPositionForWord(word1, length1);
    // getBigramListPositionForWord returns 0 if this word isn't in the dictionary or has no bigrams
+9 −15
Original line number Diff line number Diff line
@@ -27,35 +27,29 @@ namespace latinime {
class Dictionary;
class BigramDictionary {
 public:
    BigramDictionary(const unsigned char *dict, int maxWordLength, Dictionary *parentDictionary);
    int getBigrams(const int32_t *word, int length, int *codes, int codesSize,
            unsigned short *outWords, int *frequencies, int maxWordLength, int maxBigrams);
    int getBigramListPositionForWord(const int32_t *prevWord, const int prevWordLength);
    BigramDictionary(const unsigned char *dict, int maxWordLength);
    int getBigrams(const int32_t *word, int length, int *inputCodes, int codesSize,
            unsigned short *outWords, int *frequencies, int maxWordLength, int maxBigrams) const;
    int getBigramListPositionForWord(const int32_t *prevWord, const int prevWordLength) const;
    void fillBigramAddressToFrequencyMapAndFilter(const int32_t *prevWord, const int prevWordLength,
            std::map<int, int> *map, uint8_t *filter);
    bool isValidBigram(const int32_t *word1, int length1, const int32_t *word2, int length2);
            std::map<int, int> *map, uint8_t *filter) const;
    bool isValidBigram(const int32_t *word1, int length1, const int32_t *word2, int length2) const;
    ~BigramDictionary();
 private:
    DISALLOW_IMPLICIT_CONSTRUCTORS(BigramDictionary);
    bool addWordBigram(unsigned short *word, int length, int frequency);
    bool addWordBigram(unsigned short *word, int length, int frequency, const int maxBigrams,
            int *bigramFreq, unsigned short *bigramChars) const;
    int getBigramAddress(int *pos, bool advance);
    int getBigramFreq(int *pos);
    void searchForTerminalNode(int addressLookingFor, int frequency);
    bool getFirstBitOfByte(int *pos) { return (DICT[*pos] & 0x80) > 0; }
    bool getSecondBitOfByte(int *pos) { return (DICT[*pos] & 0x40) > 0; }
    bool checkFirstCharacter(unsigned short *word);
    bool checkFirstCharacter(unsigned short *word, int *inputCodes) const;

    const unsigned char *DICT;
    const int MAX_WORD_LENGTH;
    // TODO: Re-implement proximity correction for bigram correction
    static const int MAX_ALTERNATIVES = 1;

    Dictionary *mParentDictionary;
    int *mBigramFreq;
    int mMaxBigrams;
    unsigned short *mBigramChars;
    int *mInputCodes;
    int mInputLength;
};

} // namespace latinime
+3 −6
Original line number Diff line number Diff line
@@ -38,27 +38,24 @@ Dictionary::Dictionary(void *dict, int dictSize, int mmapFd, int dictBufAdjust,
            AKLOGI("IN NATIVE SUGGEST Version: %d", (mDict[0] & 0xFF));
        }
    }
    mWordsPriorityQueuePool = new WordsPriorityQueuePool(
            maxWords, SUB_QUEUE_MAX_WORDS, maxWordLength);
    const unsigned int headerSize = BinaryFormat::getHeaderSize(mDict);
    const unsigned int options = BinaryFormat::getFlags(mDict);
    mUnigramDictionary = new UnigramDictionary(mDict + headerSize, typedLetterMultiplier,
            fullWordMultiplier, maxWordLength, maxWords, options);
    mBigramDictionary = new BigramDictionary(mDict + headerSize, maxWordLength, this);
    mBigramDictionary = new BigramDictionary(mDict + headerSize, maxWordLength);
}

Dictionary::~Dictionary() {
    delete mWordsPriorityQueuePool;
    delete mUnigramDictionary;
    delete mBigramDictionary;
}

int Dictionary::getFrequency(const int32_t *word, int length) {
int Dictionary::getFrequency(const int32_t *word, int length) const {
    return mUnigramDictionary->getFrequency(word, length);
}

bool Dictionary::isValidBigram(const int32_t *word1, int length1, const int32_t *word2,
        int length2) {
        int length2) const {
    return mBigramDictionary->isValidBigram(word1, length1, word2, length2);
}

+10 −11
Original line number Diff line number Diff line
@@ -35,28 +35,28 @@ class Dictionary {

    int getSuggestions(ProximityInfo *proximityInfo, int *xcoordinates, int *ycoordinates,
            int *codes, int codesSize, const int32_t* prevWordChars, const int prevWordLength,
            bool useFullEditDistance, unsigned short *outWords, int *frequencies) {
            bool useFullEditDistance, unsigned short *outWords, int *frequencies) const {
        std::map<int, int> bigramMap;
        uint8_t bigramFilter[BIGRAM_FILTER_BYTE_SIZE];
        mBigramDictionary->fillBigramAddressToFrequencyMapAndFilter(prevWordChars,
                prevWordLength, &bigramMap, bigramFilter);
        return mUnigramDictionary->getSuggestions(proximityInfo, mWordsPriorityQueuePool,
        return mUnigramDictionary->getSuggestions(proximityInfo,
                xcoordinates, ycoordinates, codes, codesSize, &bigramMap,
                bigramFilter, useFullEditDistance, outWords, frequencies);
    }

    int getBigrams(const int32_t *word, int length, int *codes, int codesSize,
            unsigned short *outWords, int *frequencies, int maxWordLength, int maxBigrams) {
            unsigned short *outWords, int *frequencies, int maxWordLength, int maxBigrams) const {
        return mBigramDictionary->getBigrams(word, length, codes, codesSize, outWords, frequencies,
                maxWordLength, maxBigrams);
    }

    int getFrequency(const int32_t *word, int length);
    bool isValidBigram(const int32_t *word1, int length1, const int32_t *word2, int length2);
    void *getDict() { return (void *)mDict; }
    int getDictSize() { return mDictSize; }
    int getMmapFd() { return mMmapFd; }
    int getDictBufAdjust() { return mDictBufAdjust; }
    int getFrequency(const int32_t *word, int length) const;
    bool isValidBigram(const int32_t *word1, int length1, const int32_t *word2, int length2) const;
    void *getDict() const { return (void *)mDict; }
    int getDictSize() const { return mDictSize; }
    int getMmapFd() const { return mMmapFd; }
    int getDictBufAdjust() const { return mDictBufAdjust; }
    ~Dictionary();

    // public static utility methods
@@ -74,8 +74,7 @@ class Dictionary {
    const int mDictBufAdjust;

    const UnigramDictionary *mUnigramDictionary;
    BigramDictionary *mBigramDictionary;
    WordsPriorityQueuePool *mWordsPriorityQueuePool;
    const BigramDictionary *mBigramDictionary;
};

// public static utility methods
+9 −8
Original line number Diff line number Diff line
@@ -170,12 +170,13 @@ void UnigramDictionary::getWordWithDigraphSuggestionsRec(ProximityInfo *proximit
// bigramFilter is a bloom filter for fast rejection: see functions setInFilter and isInFilter
// in bigram_dictionary.cpp
int UnigramDictionary::getSuggestions(ProximityInfo *proximityInfo,
        WordsPriorityQueuePool *queuePool, const int *xcoordinates,
        const int *xcoordinates,
        const int *ycoordinates, const int *codes, const int codesSize,
        const std::map<int, int> *bigramMap, const uint8_t *bigramFilter,
        const bool useFullEditDistance, unsigned short *outWords, int *frequencies) const {

    queuePool->clearAll();
    WordsPriorityQueuePool queuePool(MAX_WORDS, SUB_QUEUE_MAX_WORDS, MAX_WORD_LENGTH);
    queuePool.clearAll();
    Correction masterCorrection;
    masterCorrection.resetCorrection();
    if (BinaryFormat::REQUIRES_GERMAN_UMLAUT_PROCESSING & FLAGS)
@@ -186,7 +187,7 @@ int UnigramDictionary::getSuggestions(ProximityInfo *proximityInfo,
        getWordWithDigraphSuggestionsRec(proximityInfo, xcoordinates, ycoordinates, codesBuffer,
                xCoordinatesBuffer, yCoordinatesBuffer, codesSize, bigramMap, bigramFilter,
                useFullEditDistance, codes, codesSize, 0, codesBuffer, &masterCorrection,
                queuePool, GERMAN_UMLAUT_DIGRAPHS,
                &queuePool, GERMAN_UMLAUT_DIGRAPHS,
                sizeof(GERMAN_UMLAUT_DIGRAPHS) / sizeof(GERMAN_UMLAUT_DIGRAPHS[0]));
    } else if (BinaryFormat::REQUIRES_FRENCH_LIGATURES_PROCESSING & FLAGS) {
        int codesBuffer[getCodesBufferSize(codes, codesSize)];
@@ -195,26 +196,26 @@ int UnigramDictionary::getSuggestions(ProximityInfo *proximityInfo,
        getWordWithDigraphSuggestionsRec(proximityInfo, xcoordinates, ycoordinates, codesBuffer,
                xCoordinatesBuffer, yCoordinatesBuffer, codesSize, bigramMap, bigramFilter,
                useFullEditDistance, codes, codesSize, 0, codesBuffer, &masterCorrection,
                queuePool, FRENCH_LIGATURES_DIGRAPHS,
                &queuePool, FRENCH_LIGATURES_DIGRAPHS,
                sizeof(FRENCH_LIGATURES_DIGRAPHS) / sizeof(FRENCH_LIGATURES_DIGRAPHS[0]));
    } else { // Normal processing
        getWordSuggestions(proximityInfo, xcoordinates, ycoordinates, codes, codesSize,
                bigramMap, bigramFilter, useFullEditDistance, &masterCorrection, queuePool);
                bigramMap, bigramFilter, useFullEditDistance, &masterCorrection, &queuePool);
    }

    PROF_START(20);
    if (DEBUG_DICT) {
        float ns = queuePool->getMasterQueue()->getHighestNormalizedScore(
        float ns = queuePool.getMasterQueue()->getHighestNormalizedScore(
                masterCorrection.getPrimaryInputWord(), codesSize, 0, 0, 0);
        ns += 0;
        AKLOGI("Max normalized score = %f", ns);
    }
    const int suggestedWordsCount =
            queuePool->getMasterQueue()->outputSuggestions(
            queuePool.getMasterQueue()->outputSuggestions(
                    masterCorrection.getPrimaryInputWord(), codesSize, frequencies, outWords);

    if (DEBUG_DICT) {
        float ns = queuePool->getMasterQueue()->getHighestNormalizedScore(
        float ns = queuePool.getMasterQueue()->getHighestNormalizedScore(
                masterCorrection.getPrimaryInputWord(), codesSize, 0, 0, 0);
        ns += 0;
        AKLOGI("Returning %d words", suggestedWordsCount);
Loading