Loading native/jni/src/bigram_dictionary.cpp +9 −2 Original line number Diff line number Diff line Loading @@ -153,8 +153,14 @@ int BigramDictionary::getBigramListPositionForWord(const int32_t *prevWord, return pos; } void BigramDictionary::fillBigramAddressToFrequencyMap(const int32_t *prevWord, const int prevWordLength, std::map<int, int> *map) { static inline void setInFilter(uint8_t *filter, const int position) { const unsigned int bucket = position % BIGRAM_FILTER_MODULO; filter[bucket >> 3] |= (1 << (bucket & 0x7)); } void BigramDictionary::fillBigramAddressToFrequencyMapAndFilter(const int32_t *prevWord, const int prevWordLength, std::map<int, int> *map, uint8_t *filter) { memset(filter, 0, BIGRAM_FILTER_BYTE_SIZE); const uint8_t* const root = DICT; int pos = getBigramListPositionForWord(prevWord, prevWordLength); if (0 == pos) return; Loading @@ -166,6 +172,7 @@ void BigramDictionary::fillBigramAddressToFrequencyMap(const int32_t *prevWord, const int bigramPos = BinaryFormat::getAttributeAddressAndForwardPointer(root, bigramFlags, &pos); (*map)[bigramPos] = frequency; setInFilter(filter, bigramPos); } while (0 != (UnigramDictionary::FLAG_ATTRIBUTE_HAS_NEXT & bigramFlags)); } Loading native/jni/src/bigram_dictionary.h +4 −2 Original line number Diff line number Diff line Loading @@ -20,6 +20,8 @@ #include <map> #include <stdint.h> #include "defines.h" namespace latinime { class Dictionary; Loading @@ -29,8 +31,8 @@ class BigramDictionary { int getBigrams(const int32_t *word, int length, int *codes, int codesSize, unsigned short *outWords, int *frequencies, int maxWordLength, int maxBigrams); int getBigramListPositionForWord(const int32_t *prevWord, const int prevWordLength); void fillBigramAddressToFrequencyMap(const int32_t *prevWord, const int prevWordLength, std::map<int, int> *map); void fillBigramAddressToFrequencyMapAndFilter(const int32_t *prevWord, const int prevWordLength, std::map<int, int> *map, uint8_t *filter); ~BigramDictionary(); private: bool 
addWordBigram(unsigned short *word, int length, int frequency); Loading native/jni/src/defines.h +18 −0 Original line number Diff line number Diff line Loading @@ -241,6 +241,24 @@ static inline void prof_out(void) { #define MIN_USER_TYPED_LENGTH_FOR_MULTIPLE_WORD_SUGGESTION 3 #define MIN_USER_TYPED_LENGTH_FOR_EXCESSIVE_CHARACTER_SUGGESTION 3 // Size, in bytes, of the bloom filter index for bigrams. // 128 gives us 1024 buckets. The probability of a false positive is (1 - e ** (-kn/m))**k, // where k is the number of hash functions, n the number of bigrams, and m the number of // bits we can test. // At the moment 100 is the maximum number of bigrams for a word with the current // dictionaries, so n = 100. 1024 buckets give us m = 1024. // With 1 hash function, our false positive rate is about 9.3%, which should be enough for // our uses since we are only using this to increase average performance. For the record, // k = 2 gives 3.1% and k = 3 gives 1.6%. With k = 1, making m = 2048 gives 4.8%, // and m = 4096 gives 2.4%. #define BIGRAM_FILTER_BYTE_SIZE 128 // Must be no larger than BIGRAM_FILTER_BYTE_SIZE * 8, and preferably prime. 1021 is the largest // prime under 128 * 8. #define BIGRAM_FILTER_MODULO 1021 #if BIGRAM_FILTER_BYTE_SIZE * 8 < BIGRAM_FILTER_MODULO #error "BIGRAM_FILTER_MODULO is larger than BIGRAM_FILTER_BYTE_SIZE" #endif template<typename T> inline T min(T a, T b) { return a < b ? a : b; } template<typename T> inline T max(T a, T b) { return a > b ? a : b; } Loading native/jni/src/dictionary.h +3 −2 Original line number Diff line number Diff line Loading @@ -42,8 +42,9 @@ class Dictionary { const int bigramListPosition = !prevWordChars ? 
0 : mBigramDictionary->getBigramListPositionForWord(prevWordChars, prevWordLength); std::map<int, int> bigramMap; mBigramDictionary->fillBigramAddressToFrequencyMap(prevWordChars, prevWordLength, &bigramMap); uint8_t bigramFilter[BIGRAM_FILTER_BYTE_SIZE]; mBigramDictionary->fillBigramAddressToFrequencyMapAndFilter(prevWordChars, prevWordLength, &bigramMap, bigramFilter); return mUnigramDictionary->getSuggestions(proximityInfo, mWordsPriorityQueuePool, mCorrection, xcoordinates, ycoordinates, codes, codesSize, bigramListPosition, useFullEditDistance, outWords, frequencies); Loading Loading
native/jni/src/bigram_dictionary.cpp +9 −2 Original line number Diff line number Diff line Loading @@ -153,8 +153,14 @@ int BigramDictionary::getBigramListPositionForWord(const int32_t *prevWord, return pos; } void BigramDictionary::fillBigramAddressToFrequencyMap(const int32_t *prevWord, const int prevWordLength, std::map<int, int> *map) { static inline void setInFilter(uint8_t *filter, const int position) { const unsigned int bucket = position % BIGRAM_FILTER_MODULO; filter[bucket >> 3] |= (1 << (bucket & 0x7)); } void BigramDictionary::fillBigramAddressToFrequencyMapAndFilter(const int32_t *prevWord, const int prevWordLength, std::map<int, int> *map, uint8_t *filter) { memset(filter, 0, BIGRAM_FILTER_BYTE_SIZE); const uint8_t* const root = DICT; int pos = getBigramListPositionForWord(prevWord, prevWordLength); if (0 == pos) return; Loading @@ -166,6 +172,7 @@ void BigramDictionary::fillBigramAddressToFrequencyMap(const int32_t *prevWord, const int bigramPos = BinaryFormat::getAttributeAddressAndForwardPointer(root, bigramFlags, &pos); (*map)[bigramPos] = frequency; setInFilter(filter, bigramPos); } while (0 != (UnigramDictionary::FLAG_ATTRIBUTE_HAS_NEXT & bigramFlags)); } Loading
native/jni/src/bigram_dictionary.h +4 −2 Original line number Diff line number Diff line Loading @@ -20,6 +20,8 @@ #include <map> #include <stdint.h> #include "defines.h" namespace latinime { class Dictionary; Loading @@ -29,8 +31,8 @@ class BigramDictionary { int getBigrams(const int32_t *word, int length, int *codes, int codesSize, unsigned short *outWords, int *frequencies, int maxWordLength, int maxBigrams); int getBigramListPositionForWord(const int32_t *prevWord, const int prevWordLength); void fillBigramAddressToFrequencyMap(const int32_t *prevWord, const int prevWordLength, std::map<int, int> *map); void fillBigramAddressToFrequencyMapAndFilter(const int32_t *prevWord, const int prevWordLength, std::map<int, int> *map, uint8_t *filter); ~BigramDictionary(); private: bool addWordBigram(unsigned short *word, int length, int frequency); Loading
native/jni/src/defines.h +18 −0 Original line number Diff line number Diff line Loading @@ -241,6 +241,24 @@ static inline void prof_out(void) { #define MIN_USER_TYPED_LENGTH_FOR_MULTIPLE_WORD_SUGGESTION 3 #define MIN_USER_TYPED_LENGTH_FOR_EXCESSIVE_CHARACTER_SUGGESTION 3 // Size, in bytes, of the bloom filter index for bigrams. // 128 gives us 1024 buckets. The probability of a false positive is (1 - e ** (-kn/m))**k, // where k is the number of hash functions, n the number of bigrams, and m the number of // bits we can test. // At the moment 100 is the maximum number of bigrams for a word with the current // dictionaries, so n = 100. 1024 buckets give us m = 1024. // With 1 hash function, our false positive rate is about 9.3%, which should be enough for // our uses since we are only using this to increase average performance. For the record, // k = 2 gives 3.1% and k = 3 gives 1.6%. With k = 1, making m = 2048 gives 4.8%, // and m = 4096 gives 2.4%. #define BIGRAM_FILTER_BYTE_SIZE 128 // Must be no larger than BIGRAM_FILTER_BYTE_SIZE * 8, and preferably prime. 1021 is the largest // prime under 128 * 8. #define BIGRAM_FILTER_MODULO 1021 #if BIGRAM_FILTER_BYTE_SIZE * 8 < BIGRAM_FILTER_MODULO #error "BIGRAM_FILTER_MODULO is larger than BIGRAM_FILTER_BYTE_SIZE" #endif template<typename T> inline T min(T a, T b) { return a < b ? a : b; } template<typename T> inline T max(T a, T b) { return a > b ? a : b; } Loading
native/jni/src/dictionary.h +3 −2 Original line number Diff line number Diff line Loading @@ -42,8 +42,9 @@ class Dictionary { const int bigramListPosition = !prevWordChars ? 0 : mBigramDictionary->getBigramListPositionForWord(prevWordChars, prevWordLength); std::map<int, int> bigramMap; mBigramDictionary->fillBigramAddressToFrequencyMap(prevWordChars, prevWordLength, &bigramMap); uint8_t bigramFilter[BIGRAM_FILTER_BYTE_SIZE]; mBigramDictionary->fillBigramAddressToFrequencyMapAndFilter(prevWordChars, prevWordLength, &bigramMap, bigramFilter); return mUnigramDictionary->getSuggestions(proximityInfo, mWordsPriorityQueuePool, mCorrection, xcoordinates, ycoordinates, codes, codesSize, bigramListPosition, useFullEditDistance, outWords, frequencies); Loading