Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit 8950ce6c authored by Jean Chalard's avatar Jean Chalard
Browse files

Replace the bigram list position with the map and filter

Passing the position will not allow us a reasonable lookup
time. Replace this with a map and bloom filter for very fast
lookup.

Bug: 6313806
Change-Id: I3a61c0001cbc987c1c3c7b8df635d4590a370144
parent f1634c87
Loading
Loading
Loading
Loading
+5 −0
Original line number Diff line number Diff line
@@ -158,6 +158,11 @@ static inline void setInFilter(uint8_t *filter, const int position) {
    filter[bucket >> 3] |= (1 << (bucket & 0x7));
}

static inline bool isInFilter(uint8_t *filter, const int position) {
    const unsigned int bucket = position % BIGRAM_FILTER_MODULO;
    return filter[bucket >> 3] & (1 << (bucket & 0x7));
}

void BigramDictionary::fillBigramAddressToFrequencyMapAndFilter(const int32_t *prevWord,
        const int prevWordLength, std::map<int, int> *map, uint8_t *filter) {
    memset(filter, 0, BIGRAM_FILTER_BYTE_SIZE);
+7 −4
Original line number Diff line number Diff line
@@ -66,7 +66,8 @@ class BinaryFormat {
            const int length);
    static int getWordAtAddress(const uint8_t* const root, const int address, const int maxDepth,
            uint16_t* outWord);
    static int getProbability(const int bigramListPosition, const int unigramFreq);
    static int getProbability(const std::map<int, int> *bigramMap, const uint8_t *bigramFilter,
            const int unigramFreq);

    // Flags for special processing
    // Those *must* match the flags in makedict (BinaryDictInputOutput#*_PROCESSING_FLAG) or
@@ -519,9 +520,11 @@ inline int BinaryFormat::getWordAtAddress(const uint8_t* const root, const int a
}

// This should probably return a probability in log space.
inline int BinaryFormat::getProbability(const int bigramListPosition, const int unigramFreq) {
    // TODO: use the bigram list position to get the bigram probability. If the bigram
    // is not found, use the unigram frequency.
inline int BinaryFormat::getProbability(const std::map<int, int> *bigramMap,
        const uint8_t *bigramFilter, const int unigramFreq) {
    // TODO: use the bigram filter for fast rejection, then the bigram map for lookup
    // to get the bigram probability. If the bigram is not found, use the unigram frequency.
    // Don't forget that they can be null.
    // TODO: if the unigram frequency is used, compute the actual probability
    return unigramFreq;
}
+2 −6
Original line number Diff line number Diff line
@@ -37,17 +37,13 @@ class Dictionary {
    int getSuggestions(ProximityInfo *proximityInfo, int *xcoordinates, int *ycoordinates,
            int *codes, int codesSize, const int32_t* prevWordChars, const int prevWordLength,
            bool useFullEditDistance, unsigned short *outWords, int *frequencies) {
        // bigramListPosition is, as an int, the offset of the bigram list in the file.
        // If none, it's zero.
        const int bigramListPosition = !prevWordChars ? 0
                : mBigramDictionary->getBigramListPositionForWord(prevWordChars, prevWordLength);
        std::map<int, int> bigramMap;
        uint8_t bigramFilter[BIGRAM_FILTER_BYTE_SIZE];
        mBigramDictionary->fillBigramAddressToFrequencyMapAndFilter(prevWordChars,
                prevWordLength, &bigramMap, bigramFilter);
        return mUnigramDictionary->getSuggestions(proximityInfo, mWordsPriorityQueuePool,
                mCorrection, xcoordinates, ycoordinates, codes, codesSize, bigramListPosition,
                useFullEditDistance, outWords, frequencies);
                mCorrection, xcoordinates, ycoordinates, codes, codesSize, &bigramMap,
                bigramFilter, useFullEditDistance, outWords, frequencies);
    }

    int getBigrams(const int32_t *word, int length, int *codes, int codesSize,
+28 −25
Original line number Diff line number Diff line
@@ -98,7 +98,7 @@ int UnigramDictionary::getDigraphReplacement(const int *codes, const int i, cons
void UnigramDictionary::getWordWithDigraphSuggestionsRec(ProximityInfo *proximityInfo,
        const int *xcoordinates, const int *ycoordinates, const int *codesBuffer,
        int *xCoordinatesBuffer, int *yCoordinatesBuffer,
        const int codesBufferSize, const int bigramListPosition,
        const int codesBufferSize, const std::map<int, int> *bigramMap, const uint8_t *bigramFilter,
        const bool useFullEditDistance, const int *codesSrc,
        const int codesRemain, const int currentDepth, int *codesDest, Correction *correction,
        WordsPriorityQueuePool *queuePool,
@@ -128,7 +128,7 @@ void UnigramDictionary::getWordWithDigraphSuggestionsRec(ProximityInfo *proximit
                        replacementCodePoint;
                getWordWithDigraphSuggestionsRec(proximityInfo, xcoordinates, ycoordinates,
                        codesBuffer, xCoordinatesBuffer, yCoordinatesBuffer, codesBufferSize,
                        bigramListPosition, useFullEditDistance, codesSrc + i + 1,
                        bigramMap, bigramFilter, useFullEditDistance, codesSrc + i + 1,
                        codesRemain - i - 1, currentDepth + 1, codesDest + i, correction,
                        queuePool, digraphs, digraphsSize);

@@ -138,7 +138,7 @@ void UnigramDictionary::getWordWithDigraphSuggestionsRec(ProximityInfo *proximit
                memcpy(codesDest + i, codesSrc + i, BYTES_IN_ONE_CHAR);
                getWordWithDigraphSuggestionsRec(proximityInfo, xcoordinates, ycoordinates,
                        codesBuffer, xCoordinatesBuffer, yCoordinatesBuffer, codesBufferSize,
                        bigramListPosition, useFullEditDistance, codesSrc + i, codesRemain - i,
                        bigramMap, bigramFilter, useFullEditDistance, codesSrc + i, codesRemain - i,
                        currentDepth + 1, codesDest + i, correction, queuePool, digraphs,
                        digraphsSize);
                return;
@@ -161,16 +161,18 @@ void UnigramDictionary::getWordWithDigraphSuggestionsRec(ProximityInfo *proximit
    }

    getWordSuggestions(proximityInfo, xCoordinatesBuffer, yCoordinatesBuffer, codesBuffer,
            startIndex + codesRemain, bigramListPosition, useFullEditDistance, correction,
            startIndex + codesRemain, bigramMap, bigramFilter, useFullEditDistance, correction,
            queuePool);
}

// bigramListPosition is the offset in the file to the list of bigrams for the previous word.
// bigramMap contains the association <bigram address> -> <bigram frequency>
// bigramFilter is a bloom filter for fast rejection: see functions setInFilter and isInFilter
// in bigram_dictionary.cpp
int UnigramDictionary::getSuggestions(ProximityInfo *proximityInfo,
        WordsPriorityQueuePool *queuePool, Correction *correction, const int *xcoordinates,
        const int *ycoordinates, const int *codes, const int codesSize,
        const int bigramListPosition, const bool useFullEditDistance, unsigned short *outWords,
        int *frequencies) {
        const std::map<int, int> *bigramMap, const uint8_t *bigramFilter,
        const bool useFullEditDistance, unsigned short *outWords, int *frequencies) {

    queuePool->clearAll();
    Correction* masterCorrection = correction;
@@ -180,7 +182,7 @@ int UnigramDictionary::getSuggestions(ProximityInfo *proximityInfo,
        int xCoordinatesBuffer[codesSize];
        int yCoordinatesBuffer[codesSize];
        getWordWithDigraphSuggestionsRec(proximityInfo, xcoordinates, ycoordinates, codesBuffer,
                xCoordinatesBuffer, yCoordinatesBuffer, codesSize, bigramListPosition,
                xCoordinatesBuffer, yCoordinatesBuffer, codesSize, bigramMap, bigramFilter,
                useFullEditDistance, codes, codesSize, 0, codesBuffer, masterCorrection,
                queuePool, GERMAN_UMLAUT_DIGRAPHS,
                sizeof(GERMAN_UMLAUT_DIGRAPHS) / sizeof(GERMAN_UMLAUT_DIGRAPHS[0]));
@@ -189,13 +191,13 @@ int UnigramDictionary::getSuggestions(ProximityInfo *proximityInfo,
        int xCoordinatesBuffer[codesSize];
        int yCoordinatesBuffer[codesSize];
        getWordWithDigraphSuggestionsRec(proximityInfo, xcoordinates, ycoordinates, codesBuffer,
                xCoordinatesBuffer, yCoordinatesBuffer, codesSize, bigramListPosition,
                xCoordinatesBuffer, yCoordinatesBuffer, codesSize, bigramMap, bigramFilter,
                useFullEditDistance, codes, codesSize, 0, codesBuffer, masterCorrection,
                queuePool, FRENCH_LIGATURES_DIGRAPHS,
                sizeof(FRENCH_LIGATURES_DIGRAPHS) / sizeof(FRENCH_LIGATURES_DIGRAPHS[0]));
    } else { // Normal processing
        getWordSuggestions(proximityInfo, xcoordinates, ycoordinates, codes, codesSize,
                bigramListPosition, useFullEditDistance, masterCorrection, queuePool);
                bigramMap, bigramFilter, useFullEditDistance, masterCorrection, queuePool);
    }

    PROF_START(20);
@@ -228,15 +230,15 @@ int UnigramDictionary::getSuggestions(ProximityInfo *proximityInfo,

void UnigramDictionary::getWordSuggestions(ProximityInfo *proximityInfo,
        const int *xcoordinates, const int *ycoordinates, const int *codes,
        const int inputLength, const int bigramListPosition, const bool useFullEditDistance,
        Correction *correction, WordsPriorityQueuePool *queuePool) {
        const int inputLength, const std::map<int, int> *bigramMap, const uint8_t *bigramFilter,
        const bool useFullEditDistance, Correction *correction, WordsPriorityQueuePool *queuePool) {

    PROF_OPEN;
    PROF_START(0);
    PROF_END(0);

    PROF_START(1);
    getOneWordSuggestions(proximityInfo, xcoordinates, ycoordinates, codes, bigramListPosition,
    getOneWordSuggestions(proximityInfo, xcoordinates, ycoordinates, codes, bigramMap, bigramFilter,
            useFullEditDistance, inputLength, correction, queuePool);
    PROF_END(1);

@@ -308,15 +310,16 @@ static const char SPACE = ' ';

void UnigramDictionary::getOneWordSuggestions(ProximityInfo *proximityInfo,
        const int *xcoordinates, const int *ycoordinates, const int *codes,
        const int bigramListPosition, const bool useFullEditDistance, const int inputLength,
        const std::map<int, int> *bigramMap, const uint8_t *bigramFilter,
        const bool useFullEditDistance, const int inputLength,
        Correction *correction, WordsPriorityQueuePool *queuePool) {
    initSuggestions(proximityInfo, xcoordinates, ycoordinates, codes, inputLength, correction);
    getSuggestionCandidates(useFullEditDistance, inputLength, bigramListPosition, correction,
    getSuggestionCandidates(useFullEditDistance, inputLength, bigramMap, bigramFilter, correction,
            queuePool, true /* doAutoCompletion */, DEFAULT_MAX_ERRORS, FIRST_WORD_INDEX);
}

void UnigramDictionary::getSuggestionCandidates(const bool useFullEditDistance,
        const int inputLength, const int bigramListPosition,
        const int inputLength, const std::map<int, int> *bigramMap, const uint8_t *bigramFilter,
        Correction *correction, WordsPriorityQueuePool *queuePool,
        const bool doAutoCompletion, const int maxErrors, const int currentWordIndex) {
    // TODO: Remove setCorrectionParams
@@ -337,7 +340,7 @@ void UnigramDictionary::getSuggestionCandidates(const bool useFullEditDistance,
            int firstChildPos;

            const bool needsToTraverseChildrenNodes = processCurrentNode(siblingPos,
                    bigramListPosition, correction, &childCount, &firstChildPos, &siblingPos,
                    bigramMap, bigramFilter, correction, &childCount, &firstChildPos, &siblingPos,
                    queuePool, currentWordIndex);
            // Update next sibling pos
            correction->setTreeSiblingPos(outputIndex, siblingPos);
@@ -432,8 +435,8 @@ bool UnigramDictionary::getSubStringSuggestion(
            queuePool->clearSubQueue(currentWordIndex);
            // TODO: pass the bigram list for substring suggestion
            getSuggestionCandidates(useFullEditDistance, inputWordLength,
                    0 /* bigramListPosition */, correction, queuePool, false /* doAutoCompletion */,
                    MAX_ERRORS_FOR_TWO_WORDS, currentWordIndex);
                    0 /* bigramMap */, 0 /* bigramFilter */, correction, queuePool,
                    false /* doAutoCompletion */, MAX_ERRORS_FOR_TWO_WORDS, currentWordIndex);
            if (DEBUG_DICT) {
                if (currentWordIndex < MULTIPLE_WORDS_SUGGESTION_MAX_WORDS) {
                    AKLOGI("Dump word candidates(%d) %d", currentWordIndex, inputWordLength);
@@ -763,9 +766,9 @@ int UnigramDictionary::getBigramPosition(int pos, unsigned short *word, int offs
// the current node in nextSiblingPosition. Thus, the caller must keep count of the nodes at any
// given level, as output into newCount when traversing this level's parent.
inline bool UnigramDictionary::processCurrentNode(const int initialPos,
        const int bigramListPosition, Correction *correction, int *newCount,
        int *newChildrenPosition, int *nextSiblingPosition, WordsPriorityQueuePool *queuePool,
        const int currentWordIndex) {
        const std::map<int, int> *bigramMap, const uint8_t *bigramFilter, Correction *correction,
        int *newCount, int *newChildrenPosition, int *nextSiblingPosition,
        WordsPriorityQueuePool *queuePool, const int currentWordIndex) {
    if (DEBUG_DICT) {
        correction->checkState();
    }
@@ -846,9 +849,9 @@ inline bool UnigramDictionary::processCurrentNode(const int initialPos,
        const int childrenAddressPos = BinaryFormat::skipFrequency(flags, pos);
        const int attributesPos = BinaryFormat::skipChildrenPosition(flags, childrenAddressPos);
        TerminalAttributes terminalAttributes(DICT_ROOT, flags, attributesPos);
        // The bigramListPosition is the offset in the file of the bigrams for the previous word,
        // or zero if we don't know of any bigrams for it.
        const int probability = BinaryFormat::getProbability(bigramListPosition, unigramFreq);
        // bigramMap contains the bigram frequencies indexed by addresses for fast lookup.
        // bigramFilter is a bloom filter of said frequencies for even faster rejection.
        const int probability = BinaryFormat::getProbability(bigramMap, bigramFilter, unigramFreq);
        onTerminal(probability, terminalAttributes, correction, queuePool, needsToInvokeOnTerminal,
                currentWordIndex);

+18 −12
Original line number Diff line number Diff line
@@ -17,6 +17,7 @@
#ifndef LATINIME_UNIGRAM_DICTIONARY_H
#define LATINIME_UNIGRAM_DICTIONARY_H

#include <map>
#include <stdint.h>
#include "correction.h"
#include "correction_state.h"
@@ -75,32 +76,36 @@ class UnigramDictionary {
    int getBigramPosition(int pos, unsigned short *word, int offset, int length) const;
    int getSuggestions(ProximityInfo *proximityInfo, WordsPriorityQueuePool *queuePool,
            Correction *correction, const int *xcoordinates, const int *ycoordinates,
            const int *codes, const int codesSize, const int bigramListPosition,
            const bool useFullEditDistance, unsigned short *outWords, int *frequencies);
            const int *codes, const int codesSize, const std::map<int, int> *bigramMap,
            const uint8_t *bigramFilter, const bool useFullEditDistance, unsigned short *outWords,
            int *frequencies);
    virtual ~UnigramDictionary();

 private:
    void getWordSuggestions(ProximityInfo *proximityInfo, const int *xcoordinates,
            const int *ycoordinates, const int *codes, const int inputLength,
            const int bigramListPosition, const bool useFullEditDistance, Correction *correction,
            const std::map<int, int> *bigramMap, const uint8_t *bigramFilter,
            const bool useFullEditDistance, Correction *correction,
            WordsPriorityQueuePool *queuePool);
    int getDigraphReplacement(const int *codes, const int i, const int codesSize,
            const digraph_t* const digraphs, const unsigned int digraphsSize) const;
    void getWordWithDigraphSuggestionsRec(ProximityInfo *proximityInfo,
        const int *xcoordinates, const int* ycoordinates, const int *codesBuffer,
        int *xCoordinatesBuffer, int *yCoordinatesBuffer, const int codesBufferSize,
        const int bigramListPosition, const bool useFullEditDistance, const int* codesSrc,
        const int codesRemain, const int currentDepth, int* codesDest, Correction *correction,
        const std::map<int, int> *bigramMap, const uint8_t *bigramFilter,
        const bool useFullEditDistance, const int* codesSrc, const int codesRemain,
        const int currentDepth, int* codesDest, Correction *correction,
        WordsPriorityQueuePool* queuePool, const digraph_t* const digraphs,
        const unsigned int digraphsSize);
    void initSuggestions(ProximityInfo *proximityInfo, const int *xcoordinates,
            const int *ycoordinates, const int *codes, const int codesSize, Correction *correction);
    void getOneWordSuggestions(ProximityInfo *proximityInfo, const int *xcoordinates,
            const int *ycoordinates, const int *codes, const int bigramListPosition,
            const bool useFullEditDistance, const int inputLength, Correction *correction,
            WordsPriorityQueuePool* queuePool);
            const int *ycoordinates, const int *codes, const std::map<int, int> *bigramMap,
            const uint8_t *bigramFilter, const bool useFullEditDistance, const int inputLength,
            Correction *correction, WordsPriorityQueuePool* queuePool);
    void getSuggestionCandidates(
            const bool useFullEditDistance, const int inputLength, const int bigramListPosition,
            const bool useFullEditDistance, const int inputLength,
            const std::map<int, int> *bigramMap, const uint8_t *bigramFilter,
            Correction *correction, WordsPriorityQueuePool* queuePool, const bool doAutoCompletion,
            const int maxErrors, const int currentWordIndex);
    void getSplitMultipleWordsSuggestions(ProximityInfo *proximityInfo,
@@ -114,9 +119,10 @@ class UnigramDictionary {
    bool needsToSkipCurrentNode(const unsigned short c,
            const int inputIndex, const int skipPos, const int depth);
    // Process a node by considering proximity, missing and excessive character
    bool processCurrentNode(const int initialPos, const int bigramListPosition,
            Correction *correction, int *newCount, int *newChildPosition, int *nextSiblingPosition,
            WordsPriorityQueuePool *queuePool, const int currentWordIndex);
    bool processCurrentNode(const int initialPos, const std::map<int, int> *bigramMap,
            const uint8_t *bigramFilter, Correction *correction, int *newCount,
            int *newChildPosition, int *nextSiblingPosition, WordsPriorityQueuePool *queuePool,
            const int currentWordIndex);
    int getMostFrequentWordLike(const int startInputIndex, const int inputLength,
            ProximityInfo *proximityInfo, unsigned short *word);
    int getMostFrequentWordLikeInner(const uint16_t* const inWord, const int length,