Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit 6cb23a49, authored by Jean Chalard and committed by Android (Google) Code Review
Browse files

Merge "Perform the actual bigram frequency lookup." into jb-dev

parents 0462f4dc 49ba135f
Loading
Loading
Loading
Loading
+2 −11
Original line number Diff line number Diff line
@@ -20,8 +20,9 @@
#define LOG_TAG "LatinIME: bigram_dictionary.cpp"

#include "bigram_dictionary.h"
#include "dictionary.h"
#include "binary_format.h"
#include "bloom_filter.h"
#include "dictionary.h"

namespace latinime {

@@ -153,16 +154,6 @@ int BigramDictionary::getBigramListPositionForWord(const int32_t *prevWord,
    return pos;
}

// Marks `position` as present in the bigram bloom filter.
// `position` is reduced modulo BIGRAM_FILTER_MODULO to a bucket number; bit (bucket & 0x7)
// of byte (bucket >> 3) of `filter` is then set.
// The reduction is performed on the unsigned value of `position`: a signed modulo of a
// negative position yields a negative remainder, which would wrap to a huge index once
// stored in an unsigned bucket and write far outside the filter.
// NOTE(review): assumes `filter` holds at least (BIGRAM_FILTER_MODULO + 7) / 8 bytes - confirm
// against BIGRAM_FILTER_BYTE_SIZE.
static inline void setInFilter(uint8_t *filter, const int position) {
    const unsigned int bucket = static_cast<unsigned int>(position)
            % static_cast<unsigned int>(BIGRAM_FILTER_MODULO);
    filter[bucket >> 3] |= static_cast<uint8_t>(1u << (bucket & 0x7));
}

// Tests whether the bucket for `position` is marked in the bigram bloom filter.
// Returns true when bit (bucket & 0x7) of byte (bucket >> 3) of `filter` is set, where
// bucket is `position` reduced modulo BIGRAM_FILTER_MODULO. Several positions can share a
// bucket, so a true result only means the position may have been added.
// `filter` is only read, so it is taken as pointer-to-const; this also lets callers that
// hold a `const uint8_t *` use it directly.
// The reduction is performed on the unsigned value of `position`: a signed modulo of a
// negative position yields a negative remainder, which would wrap to a huge index once
// stored in an unsigned bucket and read far outside the filter.
// NOTE(review): assumes `filter` holds at least (BIGRAM_FILTER_MODULO + 7) / 8 bytes - confirm
// against BIGRAM_FILTER_BYTE_SIZE.
static inline bool isInFilter(const uint8_t *filter, const int position) {
    const unsigned int bucket = static_cast<unsigned int>(position)
            % static_cast<unsigned int>(BIGRAM_FILTER_MODULO);
    return (filter[bucket >> 3] & (1u << (bucket & 0x7))) != 0;
}

void BigramDictionary::fillBigramAddressToFrequencyMapAndFilter(const int32_t *prevWord,
        const int prevWordLength, std::map<int, int> *map, uint8_t *filter) {
    memset(filter, 0, BIGRAM_FILTER_BYTE_SIZE);
+13 −7
Original line number Diff line number Diff line
@@ -18,6 +18,7 @@
#define LATINIME_BINARY_FORMAT_H

#include <limits>
#include "bloom_filter.h"
#include "unigram_dictionary.h"

namespace latinime {
@@ -66,8 +67,8 @@ class BinaryFormat {
            const int length);
    static int getWordAtAddress(const uint8_t* const root, const int address, const int maxDepth,
            uint16_t* outWord);
    // Probability lookup for a word given optional bigram data.
    // bigramMap and bigramFilter may each be null; the inline definition further down in
    // this header returns unigramFreq in that case. In the overload that takes `position`,
    // that value is the key tested against bigramFilter and looked up in bigramMap.
    static int getProbability(const std::map<int, int> *bigramMap, const uint8_t *bigramFilter,
            const int unigramFreq);
    static int getProbability(const int position, const std::map<int, int> *bigramMap,
            const uint8_t *bigramFilter, const int unigramFreq);

    // Flags for special processing
    // Those *must* match the flags in makedict (BinaryDictInputOutput#*_PROCESSING_FLAG) or
@@ -520,14 +521,19 @@ inline int BinaryFormat::getWordAtAddress(const uint8_t* const root, const int a
}

// This should probably return a probability in log space.
// Looks up `position` in the bigram data and returns the probability to use for it.
//   position     - key tested against bigramFilter and searched for in bigramMap.
//   bigramMap    - map keyed by position; may be null.
//   bigramFilter - bloom filter over the same keys, used for fast rejection; may be null.
//   unigramFreq  - value returned when no bigram information applies.
// As written, every path returns unigramFreq: the map lookup is performed but its result
// is not used yet (see the TODO inside the found branch).
inline int BinaryFormat::getProbability(const std::map<int, int> *bigramMap,
inline int BinaryFormat::getProbability(const int position, const std::map<int, int> *bigramMap,
        const uint8_t *bigramFilter, const int unigramFreq) {
    // TODO: use the bigram filter for fast rejection, then the bigram map for lookup
    // to get the bigram probability. If the bigram is not found, use the unigram frequency.
    // Don't forget that they can be null.
    // TODO: if the unigram frequency is used, compute the actual probability
    // Without both the map and the filter there is nothing to look up.
    if (!bigramMap || !bigramFilter) return unigramFreq;
    // Fast rejection: if the filter bit is clear, skip the map search entirely.
    if (!isInFilter(bigramFilter, position)) return unigramFreq;
    // The filter bit is set, so check the map for an actual entry at this position.
    const std::map<int, int>::const_iterator bigramFreq = bigramMap->find(position);
    if (bigramFreq != bigramMap->end()) {
        // TODO: return the frequency in bigramFreq->second
        return unigramFreq;
    } else {
        // Filter bit was set but the map has no entry for this position.
        return unigramFreq;
    }
    // TODO: if the unigram frequency is used, compute the actual probability
}

} // namespace latinime

+38 −0
Original line number Diff line number Diff line
/*
 * Copyright (C) 2012 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef LATINIME_BLOOM_FILTER_H
#define LATINIME_BLOOM_FILTER_H

#include <stdint.h>

#include "defines.h"

namespace latinime {

// Marks `position` as present in the bigram bloom filter.
// `position` is reduced modulo BIGRAM_FILTER_MODULO to a bucket number; bit (bucket & 0x7)
// of byte (bucket >> 3) of `filter` is then set.
// The reduction is performed on the unsigned value of `position`: a signed modulo of a
// negative position yields a negative remainder, which would wrap to a huge index once
// stored in an unsigned bucket and write far outside the filter.
// NOTE(review): assumes `filter` holds at least (BIGRAM_FILTER_MODULO + 7) / 8 bytes - confirm
// against BIGRAM_FILTER_BYTE_SIZE.
static inline void setInFilter(uint8_t *filter, const int position) {
    const unsigned int bucket = static_cast<unsigned int>(position)
            % static_cast<unsigned int>(BIGRAM_FILTER_MODULO);
    filter[bucket >> 3] |= static_cast<uint8_t>(1u << (bucket & 0x7));
}

// Tests whether the bucket for `position` is marked in the bigram bloom filter.
// Returns true when bit (bucket & 0x7) of byte (bucket >> 3) of `filter` is set, where
// bucket is `position` reduced modulo BIGRAM_FILTER_MODULO. Several positions can share a
// bucket, so a true result only means the position may have been added.
// The reduction is performed on the unsigned value of `position`: a signed modulo of a
// negative position yields a negative remainder, which would wrap to a huge index once
// stored in an unsigned bucket and read far outside the filter.
// NOTE(review): assumes `filter` holds at least (BIGRAM_FILTER_MODULO + 7) / 8 bytes - confirm
// against BIGRAM_FILTER_BYTE_SIZE.
static inline bool isInFilter(const uint8_t *filter, const int position) {
    const unsigned int bucket = static_cast<unsigned int>(position)
            % static_cast<unsigned int>(BIGRAM_FILTER_MODULO);
    return (filter[bucket >> 3] & (1u << (bucket & 0x7))) != 0;
}

} // namespace latinime

#endif // LATINIME_BLOOM_FILTER_H
+2 −1
Original line number Diff line number Diff line
@@ -851,7 +851,8 @@ inline bool UnigramDictionary::processCurrentNode(const int initialPos,
        TerminalAttributes terminalAttributes(DICT_ROOT, flags, attributesPos);
        // bigramMap contains the bigram frequencies indexed by addresses for fast lookup.
        // bigramFilter is a bloom filter of said frequencies for even faster rejection.
        const int probability = BinaryFormat::getProbability(bigramMap, bigramFilter, unigramFreq);
        const int probability = BinaryFormat::getProbability(initialPos, bigramMap, bigramFilter,
                unigramFreq);
        onTerminal(probability, terminalAttributes, correction, queuePool, needsToInvokeOnTerminal,
                currentWordIndex);