Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit 1ff81e88 authored by Keisuke Kuroynagi
Browse files

Use bloom filter in multi bigram map.

Execution times measured with the previous word "this":
without bloom filter (use only hash_map):
Total 147792.34 (sum of others 147771.57)
with bloom filter:
Total 145900.64 (sum of others 145874.30)
always read binary dictionary:
Total 148603.14 (sum of others 148579.90)

Bug: 8592527
Change-Id: I821dc39454543826adb73b9eeeef6408fad8ae28
parent 4f191935
Loading
Loading
Loading
Loading
+3 −1
Original line number Diff line number Diff line
@@ -57,9 +57,11 @@ LATIN_IME_CORE_SRC_FILES := \
        binary_dictionary_format_utils.cpp \
        binary_dictionary_header.cpp \
        binary_dictionary_header_reading_utils.cpp \
        bloom_filter.cpp \
        byte_array_utils.cpp \
        dictionary.cpp \
        digraph_utils.cpp) \
        digraph_utils.cpp \
        multi_bigram_map.cpp) \
    $(addprefix suggest/core/layout/, \
        additional_proximity_chars.cpp \
        proximity_info.cpp \
+0 −27
Original line number Diff line number Diff line
@@ -300,33 +300,6 @@ static inline void prof_out(void) {
#define DIC_NODES_CACHE_INITIAL_QUEUE_ID_CACHE_FOR_CONTINUOUS_SUGGESTION 3
#define DIC_NODES_CACHE_PRIORITY_QUEUES_SIZE 4

// Size, in bytes, of the bloom filter index for bigrams
// 128 gives us 1024 buckets. The probability of false positive is (1 - e ** (-kn/m))**k,
// where k is the number of hash functions, n the number of bigrams, and m the number of
// bits we can test.
// At the moment 100 is the maximum number of bigrams for a word with the current
// dictionaries, so n = 100. 1024 buckets give us m = 1024.
// With 1 hash function, our false positive rate is about 9.3%, which should be enough for
// our uses since we are only using this to increase average performance. For the record,
// k = 2 gives 3.1% and k = 3 gives 1.6%. With k = 1, making m = 2048 gives 4.8%,
// and m = 4096 gives 2.4%.
#define BIGRAM_FILTER_BYTE_SIZE 128
// Must be smaller than BIGRAM_FILTER_BYTE_SIZE * 8, and preferably prime. 1021 is the largest
// prime under 128 * 8.
#define BIGRAM_FILTER_MODULO 1021
#if BIGRAM_FILTER_BYTE_SIZE * 8 < BIGRAM_FILTER_MODULO
#error "BIGRAM_FILTER_MODULO is larger than BIGRAM_FILTER_BYTE_SIZE"
#endif

// Max number of bigram maps (previous word contexts) to be cached. Increasing this number could
// improve bigram lookup speed for multi-word suggestions, but at the cost of more memory usage.
// Also, there are diminishing returns since the most frequently used bigrams are typically near
// the beginning of the input and are thus the first ones to be cached. Note that these bigrams
// are reset for each new composing word.
#define MAX_CACHED_PREV_WORDS_IN_BIGRAM_MAP 25
// Most common previous word contexts currently have 100 bigrams
#define DEFAULT_HASH_MAP_SIZE_FOR_EACH_BIGRAM_MAP 100

// Returns a reference to a when a compares less than b; otherwise returns a reference to b
// (including when the two compare equal).
template<typename T>
AK_FORCE_INLINE const T &min(const T &a, const T &b) {
    if (a < b) {
        return a;
    }
    return b;
}
// Returns a reference to a when a compares greater than b; otherwise returns a reference to b
// (including when the two compare equal).
template<typename T>
AK_FORCE_INLINE const T &max(const T &a, const T &b) {
    if (a > b) {
        return a;
    }
    return b;
}

+25 −0
Original line number Diff line number Diff line
/*
 * Copyright (C) 2013, The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "suggest/core/dictionary/bloom_filter.h"

namespace latinime {

// Out-of-class definition of the static member declared in BloomFilter. Positions are reduced
// modulo this value to pick a bucket, so it must not exceed BIGRAM_FILTER_BYTE_SIZE * 8 (the
// number of bits in the filter; the BloomFilter constructor asserts this), and it is preferably
// prime. 1021 is the largest prime under 128 * 8.
const int BloomFilter::BIGRAM_FILTER_MODULO = 1021;

} // namespace latinime
+43 −11
Original line number Diff line number Diff line
@@ -23,16 +23,48 @@

namespace latinime {

// This bloom filter is used for optimizing bigram retrieval.
// Execution times with previous word "this" are as follows:
//  without bloom filter (use only hash_map):
//   Total 147792.34 (sum of others 147771.57)
//  with bloom filter:
//   Total 145900.64 (sum of others 145874.30)
//  always read binary dictionary:
//   Total 148603.14 (sum of others 148579.90)
class BloomFilter {
 public:
    BloomFilter() {
        ASSERT(BIGRAM_FILTER_BYTE_SIZE * 8 >= BIGRAM_FILTER_MODULO);
    }

    // TODO: uint32_t position
static inline void setInFilter(uint8_t *filter, const int32_t position) {
    AK_FORCE_INLINE void setInFilter(const int32_t position) {
        const uint32_t bucket = static_cast<uint32_t>(position % BIGRAM_FILTER_MODULO);
    filter[bucket >> 3] |= static_cast<uint8_t>(1 << (bucket & 0x7));
        mFilter[bucket >> 3] |= static_cast<uint8_t>(1 << (bucket & 0x7));
    }

    // TODO: uint32_t position
static inline bool isInFilter(const uint8_t *filter, const int32_t position) {
    AK_FORCE_INLINE bool isInFilter(const int32_t position) const {
        const uint32_t bucket = static_cast<uint32_t>(position % BIGRAM_FILTER_MODULO);
    return filter[bucket >> 3] & static_cast<uint8_t>(1 << (bucket & 0x7));
        return (mFilter[bucket >> 3] & static_cast<uint8_t>(1 << (bucket & 0x7))) != 0;
    }

 private:
    // Size, in bytes, of the bloom filter index for bigrams
    // 128 gives us 1024 buckets. The probability of false positive is (1 - e ** (-kn/m))**k,
    // where k is the number of hash functions, n the number of bigrams, and m the number of
    // bits we can test.
    // At the moment 100 is the maximum number of bigrams for a word with the current
    // dictionaries, so n = 100. 1024 buckets give us m = 1024.
    // With 1 hash function, our false positive rate is about 9.3%, which should be enough for
    // our uses since we are only using this to increase average performance. For the record,
    // k = 2 gives 3.1% and k = 3 gives 1.6%. With k = 1, making m = 2048 gives 4.8%,
    // and m = 4096 gives 2.4%.
    // This is assigned here because it is used for array size.
    static const int BIGRAM_FILTER_BYTE_SIZE = 128;
    static const int BIGRAM_FILTER_MODULO;

    uint8_t mFilter[BIGRAM_FILTER_BYTE_SIZE];
};
} // namespace latinime
#endif // LATINIME_BLOOM_FILTER_H
+33 −0
Original line number Diff line number Diff line
/*
 * Copyright (C) 2013, The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "suggest/core/dictionary/multi_bigram_map.h"

#include <cstddef>

namespace latinime {

// Out-of-class definitions of MultiBigramMap constants (presumably declared as static members
// in multi_bigram_map.h, which is not shown here).

// Max number of bigram maps (previous word contexts) to be cached. Increasing this number
// could improve bigram lookup speed for multi-word suggestions, but at the cost of more memory
// usage. Also, there are diminishing returns since the most frequently used bigrams are
// typically near the beginning of the input and are thus the first ones to be cached. Note
// that these bigrams are reset for each new composing word.
const size_t MultiBigramMap::MAX_CACHED_PREV_WORDS_IN_BIGRAM_MAP = 25;

// Default size of the hash map held by each BigramMap.
// Most common previous word contexts currently have 100 bigrams
const int MultiBigramMap::BigramMap::DEFAULT_HASH_MAP_SIZE_FOR_EACH_BIGRAM_MAP = 100;

} // namespace latinime
Loading