Loading native/jni/src/bigram_dictionary.cpp +2 −1 Original line number Diff line number Diff line Loading @@ -25,6 +25,7 @@ #include "suggest/core/dictionary/bloom_filter.h" #include "suggest/core/dictionary/char_utils.h" #include "suggest/core/dictionary/dictionary.h" #include "suggest/core/dictionary/probability_utils.h" namespace latinime { Loading Loading @@ -134,7 +135,7 @@ int BigramDictionary::getBigrams(const int *prevWord, int prevWordLength, int *i // resulting probability is 8 - although in the practice it's never bigger than 3 or 4 // in very bad cases. This means that sometimes, we'll see some bigrams interverted // here, but it can't get too bad. const int probability = BinaryFormat::computeProbabilityForBigram( const int probability = ProbabilityUtils::computeProbabilityForBigram( unigramProbability, bigramProbabilityTemp); addWordBigram(bigramBuffer, length, probability, bigramProbability, bigramCodePoints, outputTypes); Loading native/jni/src/suggest/core/dicnode/dic_node_utils.cpp +2 −1 Original line number Diff line number Diff line Loading @@ -23,6 +23,7 @@ #include "suggest/core/dictionary/binary_format.h" #include "suggest/core/dictionary/char_utils.h" #include "suggest/core/dictionary/multi_bigram_map.h" #include "suggest/core/dictionary/probability_utils.h" #include "suggest/core/layout/proximity_info.h" #include "suggest/core/layout/proximity_info_state.h" Loading Loading @@ -211,7 +212,7 @@ namespace latinime { const int prevWordPos = node->getPrevWordPos(); if (NOT_VALID_WORD == wordPos || NOT_VALID_WORD == prevWordPos) { // Note: Normally wordPos comes from the dictionary and should never equal NOT_VALID_WORD. return backoff(unigramProbability); return ProbabilityUtils::backoff(unigramProbability); } if (multiBigramMap) { return multiBigramMap->getBigramProbability( Loading native/jni/src/suggest/core/dictionary/binary_format.h +12 −46 Original line number Diff line number Diff line Loading @@ -18,12 +18,12 @@ #define LATINIME_BINARY_FORMAT_H #include <cstdlib> #include <map> #include <stdint.h> #include "hash_map_compat.h" #include "suggest/core/dictionary/bloom_filter.h" #include "suggest/core/dictionary/char_utils.h" #include "suggest/core/dictionary/probability_utils.h" namespace latinime { Loading Loading @@ -91,10 +91,6 @@ class BinaryFormat { const int length, const bool forceLowerCaseSearch); static int getWordAtAddress(const uint8_t *const root, const int address, const int maxDepth, int *outWord, int *outUnigramProbability); static int computeProbabilityForBigram( const int unigramProbability, const int bigramProbability); static int getProbability(const int position, const std::map<int, int> *bigramMap, const uint8_t *bigramFilter, const int unigramProbability); static int getBigramProbabilityFromHashMap(const int position, const hash_map_compat<int, int> *bigramMap, const int unigramProbability); static float getMultiWordCostMultiplier(const uint8_t *const dict, const int dictSize); Loading Loading @@ -678,51 +674,18 @@ AK_FORCE_INLINE int BinaryFormat::getWordAtAddress(const uint8_t *const root, co return 0; } static inline int backoff(const int unigramProbability) { return unigramProbability; // For some reason, applying the backoff weight gives bad results in tests. To apply the // backoff weight, we divide the probability by 2, which in our storing format means // decreasing the score by 8. // TODO: figure out what's wrong with this. // return unigramProbability > 8 ? unigramProbability - 8 : (0 == unigramProbability ? 0 : 8); } inline int BinaryFormat::computeProbabilityForBigram( const int unigramProbability, const int bigramProbability) { // We divide the range [unigramProbability..255] in 16.5 steps - in other words, we want the // unigram probability to be the median value of the 17th step from the top. A value of // 0 for the bigram probability represents the middle of the 16th step from the top, // while a value of 15 represents the middle of the top step. // See makedict.BinaryDictInputOutput for details. const float stepSize = static_cast<float>(MAX_PROBABILITY - unigramProbability) / (1.5f + MAX_BIGRAM_ENCODED_PROBABILITY); return unigramProbability + static_cast<int>(static_cast<float>(bigramProbability + 1) * stepSize); } // This returns a probability in log space. inline int BinaryFormat::getProbability(const int position, const std::map<int, int> *bigramMap, const uint8_t *bigramFilter, const int unigramProbability) { if (!bigramMap || !bigramFilter) return backoff(unigramProbability); if (!isInFilter(bigramFilter, position)) return backoff(unigramProbability); const std::map<int, int>::const_iterator bigramProbabilityIt = bigramMap->find(position); if (bigramProbabilityIt != bigramMap->end()) { const int bigramProbability = bigramProbabilityIt->second; return computeProbabilityForBigram(unigramProbability, bigramProbability); } return backoff(unigramProbability); } // This returns a probability in log space. inline int BinaryFormat::getBigramProbabilityFromHashMap(const int position, const hash_map_compat<int, int> *bigramMap, const int unigramProbability) { if (!bigramMap) return backoff(unigramProbability); if (!bigramMap) { return ProbabilityUtils::backoff(unigramProbability); } const hash_map_compat<int, int>::const_iterator bigramProbabilityIt = bigramMap->find(position); if (bigramProbabilityIt != bigramMap->end()) { const int bigramProbability = bigramProbabilityIt->second; return computeProbabilityForBigram(unigramProbability, bigramProbability); return ProbabilityUtils::computeProbabilityForBigram(unigramProbability, bigramProbability); } return backoff(unigramProbability); return ProbabilityUtils::backoff(unigramProbability); } AK_FORCE_INLINE void BinaryFormat::fillBigramProbabilityToHashMap( Loading @@ -743,7 +706,9 @@ AK_FORCE_INLINE void BinaryFormat::fillBigramProbabilityToHashMap( AK_FORCE_INLINE int BinaryFormat::getBigramProbability(const uint8_t *const root, int position, const int nextPosition, const int unigramProbability) { position = getBigramListPositionForWordPosition(root, position); if (0 == position) return backoff(unigramProbability); if (0 == position) { return ProbabilityUtils::backoff(unigramProbability); } uint8_t bigramFlags; do { Loading @@ -752,10 +717,11 @@ AK_FORCE_INLINE int BinaryFormat::getBigramProbability(const uint8_t *const root root, bigramFlags, &position); if (bigramPos == nextPosition) { const int bigramProbability = MASK_ATTRIBUTE_PROBABILITY & bigramFlags; return computeProbabilityForBigram(unigramProbability, bigramProbability); return ProbabilityUtils::computeProbabilityForBigram( unigramProbability, bigramProbability); } } while (FLAG_ATTRIBUTE_HAS_NEXT & bigramFlags); return backoff(unigramProbability); return ProbabilityUtils::backoff(unigramProbability); } // Returns a pointer to the start of the bigram list. Loading native/jni/src/suggest/core/dictionary/probability_utils.h 0 → 100644 +74 −0 Original line number Diff line number Diff line /* * Copyright (C) 2013, The Android Open Source Project * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ #ifndef LATINIME_PROBABILITY_UTILS_H #define LATINIME_PROBABILITY_UTILS_H #include <map> #include <stdint.h> #include "defines.h" namespace latinime { class ProbabilityUtils { public: static AK_FORCE_INLINE int backoff(const int unigramProbability) { return unigramProbability; // For some reason, applying the backoff weight gives bad results in tests. To apply the // backoff weight, we divide the probability by 2, which in our storing format means // decreasing the score by 8. // TODO: figure out what's wrong with this. // return unigramProbability > 8 ? // unigramProbability - 8 : (0 == unigramProbability ? 0 : 8); } static AK_FORCE_INLINE int computeProbabilityForBigram( const int unigramProbability, const int bigramProbability) { // We divide the range [unigramProbability..255] in 16.5 steps - in other words, we want // the unigram probability to be the median value of the 17th step from the top. A value of // 0 for the bigram probability represents the middle of the 16th step from the top, // while a value of 15 represents the middle of the top step. // See makedict.BinaryDictInputOutput for details. const float stepSize = static_cast<float>(MAX_PROBABILITY - unigramProbability) / (1.5f + MAX_BIGRAM_ENCODED_PROBABILITY); return unigramProbability + static_cast<int>(static_cast<float>(bigramProbability + 1) * stepSize); } // This returns a probability in log space. static AK_FORCE_INLINE int getProbability(const int position, const std::map<int, int> *const bigramMap, const uint8_t *bigramFilter, const int unigramProbability) { if (!bigramMap || !bigramFilter) { return backoff(unigramProbability); } if (!isInFilter(bigramFilter, position)){ return backoff(unigramProbability); } const std::map<int, int>::const_iterator bigramProbabilityIt = bigramMap->find(position); if (bigramProbabilityIt != bigramMap->end()) { const int bigramProbability = bigramProbabilityIt->second; return computeProbabilityForBigram(unigramProbability, bigramProbability); } return backoff(unigramProbability); } private: DISALLOW_IMPLICIT_CONSTRUCTORS(ProbabilityUtils); }; } #endif /* LATINIME_PROBABILITY_UTILS_H */ native/jni/src/unigram_dictionary.cpp +3 −2 Original line number Diff line number Diff line Loading @@ -23,6 +23,7 @@ #include "suggest/core/dictionary/char_utils.h" #include "suggest/core/dictionary/dictionary.h" #include "suggest/core/dictionary/digraph_utils.h" #include "suggest/core/dictionary/probability_utils.h" #include "suggest/core/dictionary/terminal_attributes.h" #include "suggest/core/layout/proximity_info.h" #include "unigram_dictionary.h" Loading Loading @@ -935,8 +936,8 @@ bool UnigramDictionary::processCurrentNode(const int initialPos, TerminalAttributes terminalAttributes(DICT_ROOT, flags, attributesPos); // bigramMap contains the bigram frequencies indexed by addresses for fast lookup. // bigramFilter is a bloom filter of said frequencies for even faster rejection. const int probability = BinaryFormat::getProbability(initialPos, bigramMap, bigramFilter, unigramProbability); const int probability = ProbabilityUtils::getProbability( initialPos, bigramMap, bigramFilter, unigramProbability); onTerminal(probability, terminalAttributes, correction, queuePool, needsToInvokeOnTerminal, currentWordIndex); Loading Loading
native/jni/src/bigram_dictionary.cpp +2 −1 Original line number Diff line number Diff line Loading @@ -25,6 +25,7 @@ #include "suggest/core/dictionary/bloom_filter.h" #include "suggest/core/dictionary/char_utils.h" #include "suggest/core/dictionary/dictionary.h" #include "suggest/core/dictionary/probability_utils.h" namespace latinime { Loading Loading @@ -134,7 +135,7 @@ int BigramDictionary::getBigrams(const int *prevWord, int prevWordLength, int *i // resulting probability is 8 - although in the practice it's never bigger than 3 or 4 // in very bad cases. This means that sometimes, we'll see some bigrams interverted // here, but it can't get too bad. const int probability = BinaryFormat::computeProbabilityForBigram( const int probability = ProbabilityUtils::computeProbabilityForBigram( unigramProbability, bigramProbabilityTemp); addWordBigram(bigramBuffer, length, probability, bigramProbability, bigramCodePoints, outputTypes); Loading
native/jni/src/suggest/core/dicnode/dic_node_utils.cpp +2 −1 Original line number Diff line number Diff line Loading @@ -23,6 +23,7 @@ #include "suggest/core/dictionary/binary_format.h" #include "suggest/core/dictionary/char_utils.h" #include "suggest/core/dictionary/multi_bigram_map.h" #include "suggest/core/dictionary/probability_utils.h" #include "suggest/core/layout/proximity_info.h" #include "suggest/core/layout/proximity_info_state.h" Loading Loading @@ -211,7 +212,7 @@ namespace latinime { const int prevWordPos = node->getPrevWordPos(); if (NOT_VALID_WORD == wordPos || NOT_VALID_WORD == prevWordPos) { // Note: Normally wordPos comes from the dictionary and should never equal NOT_VALID_WORD. return backoff(unigramProbability); return ProbabilityUtils::backoff(unigramProbability); } if (multiBigramMap) { return multiBigramMap->getBigramProbability( Loading
native/jni/src/suggest/core/dictionary/binary_format.h +12 −46 Original line number Diff line number Diff line Loading @@ -18,12 +18,12 @@ #define LATINIME_BINARY_FORMAT_H #include <cstdlib> #include <map> #include <stdint.h> #include "hash_map_compat.h" #include "suggest/core/dictionary/bloom_filter.h" #include "suggest/core/dictionary/char_utils.h" #include "suggest/core/dictionary/probability_utils.h" namespace latinime { Loading Loading @@ -91,10 +91,6 @@ class BinaryFormat { const int length, const bool forceLowerCaseSearch); static int getWordAtAddress(const uint8_t *const root, const int address, const int maxDepth, int *outWord, int *outUnigramProbability); static int computeProbabilityForBigram( const int unigramProbability, const int bigramProbability); static int getProbability(const int position, const std::map<int, int> *bigramMap, const uint8_t *bigramFilter, const int unigramProbability); static int getBigramProbabilityFromHashMap(const int position, const hash_map_compat<int, int> *bigramMap, const int unigramProbability); static float getMultiWordCostMultiplier(const uint8_t *const dict, const int dictSize); Loading Loading @@ -678,51 +674,18 @@ AK_FORCE_INLINE int BinaryFormat::getWordAtAddress(const uint8_t *const root, co return 0; } static inline int backoff(const int unigramProbability) { return unigramProbability; // For some reason, applying the backoff weight gives bad results in tests. To apply the // backoff weight, we divide the probability by 2, which in our storing format means // decreasing the score by 8. // TODO: figure out what's wrong with this. // return unigramProbability > 8 ? unigramProbability - 8 : (0 == unigramProbability ? 0 : 8); } inline int BinaryFormat::computeProbabilityForBigram( const int unigramProbability, const int bigramProbability) { // We divide the range [unigramProbability..255] in 16.5 steps - in other words, we want the // unigram probability to be the median value of the 17th step from the top. A value of // 0 for the bigram probability represents the middle of the 16th step from the top, // while a value of 15 represents the middle of the top step. // See makedict.BinaryDictInputOutput for details. const float stepSize = static_cast<float>(MAX_PROBABILITY - unigramProbability) / (1.5f + MAX_BIGRAM_ENCODED_PROBABILITY); return unigramProbability + static_cast<int>(static_cast<float>(bigramProbability + 1) * stepSize); } // This returns a probability in log space. inline int BinaryFormat::getProbability(const int position, const std::map<int, int> *bigramMap, const uint8_t *bigramFilter, const int unigramProbability) { if (!bigramMap || !bigramFilter) return backoff(unigramProbability); if (!isInFilter(bigramFilter, position)) return backoff(unigramProbability); const std::map<int, int>::const_iterator bigramProbabilityIt = bigramMap->find(position); if (bigramProbabilityIt != bigramMap->end()) { const int bigramProbability = bigramProbabilityIt->second; return computeProbabilityForBigram(unigramProbability, bigramProbability); } return backoff(unigramProbability); } // This returns a probability in log space. inline int BinaryFormat::getBigramProbabilityFromHashMap(const int position, const hash_map_compat<int, int> *bigramMap, const int unigramProbability) { if (!bigramMap) return backoff(unigramProbability); if (!bigramMap) { return ProbabilityUtils::backoff(unigramProbability); } const hash_map_compat<int, int>::const_iterator bigramProbabilityIt = bigramMap->find(position); if (bigramProbabilityIt != bigramMap->end()) { const int bigramProbability = bigramProbabilityIt->second; return computeProbabilityForBigram(unigramProbability, bigramProbability); return ProbabilityUtils::computeProbabilityForBigram(unigramProbability, bigramProbability); } return backoff(unigramProbability); return ProbabilityUtils::backoff(unigramProbability); } AK_FORCE_INLINE void BinaryFormat::fillBigramProbabilityToHashMap( Loading @@ -743,7 +706,9 @@ AK_FORCE_INLINE void BinaryFormat::fillBigramProbabilityToHashMap( AK_FORCE_INLINE int BinaryFormat::getBigramProbability(const uint8_t *const root, int position, const int nextPosition, const int unigramProbability) { position = getBigramListPositionForWordPosition(root, position); if (0 == position) return backoff(unigramProbability); if (0 == position) { return ProbabilityUtils::backoff(unigramProbability); } uint8_t bigramFlags; do { Loading @@ -752,10 +717,11 @@ AK_FORCE_INLINE int BinaryFormat::getBigramProbability(const uint8_t *const root root, bigramFlags, &position); if (bigramPos == nextPosition) { const int bigramProbability = MASK_ATTRIBUTE_PROBABILITY & bigramFlags; return computeProbabilityForBigram(unigramProbability, bigramProbability); return ProbabilityUtils::computeProbabilityForBigram( unigramProbability, bigramProbability); } } while (FLAG_ATTRIBUTE_HAS_NEXT & bigramFlags); return backoff(unigramProbability); return ProbabilityUtils::backoff(unigramProbability); } // Returns a pointer to the start of the bigram list. Loading
native/jni/src/suggest/core/dictionary/probability_utils.h 0 → 100644 +74 −0 Original line number Diff line number Diff line /* * Copyright (C) 2013, The Android Open Source Project * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ #ifndef LATINIME_PROBABILITY_UTILS_H #define LATINIME_PROBABILITY_UTILS_H #include <map> #include <stdint.h> #include "defines.h" namespace latinime { class ProbabilityUtils { public: static AK_FORCE_INLINE int backoff(const int unigramProbability) { return unigramProbability; // For some reason, applying the backoff weight gives bad results in tests. To apply the // backoff weight, we divide the probability by 2, which in our storing format means // decreasing the score by 8. // TODO: figure out what's wrong with this. // return unigramProbability > 8 ? // unigramProbability - 8 : (0 == unigramProbability ? 0 : 8); } static AK_FORCE_INLINE int computeProbabilityForBigram( const int unigramProbability, const int bigramProbability) { // We divide the range [unigramProbability..255] in 16.5 steps - in other words, we want // the unigram probability to be the median value of the 17th step from the top. A value of // 0 for the bigram probability represents the middle of the 16th step from the top, // while a value of 15 represents the middle of the top step. // See makedict.BinaryDictInputOutput for details. const float stepSize = static_cast<float>(MAX_PROBABILITY - unigramProbability) / (1.5f + MAX_BIGRAM_ENCODED_PROBABILITY); return unigramProbability + static_cast<int>(static_cast<float>(bigramProbability + 1) * stepSize); } // This returns a probability in log space. static AK_FORCE_INLINE int getProbability(const int position, const std::map<int, int> *const bigramMap, const uint8_t *bigramFilter, const int unigramProbability) { if (!bigramMap || !bigramFilter) { return backoff(unigramProbability); } if (!isInFilter(bigramFilter, position)){ return backoff(unigramProbability); } const std::map<int, int>::const_iterator bigramProbabilityIt = bigramMap->find(position); if (bigramProbabilityIt != bigramMap->end()) { const int bigramProbability = bigramProbabilityIt->second; return computeProbabilityForBigram(unigramProbability, bigramProbability); } return backoff(unigramProbability); } private: DISALLOW_IMPLICIT_CONSTRUCTORS(ProbabilityUtils); }; } #endif /* LATINIME_PROBABILITY_UTILS_H */
native/jni/src/unigram_dictionary.cpp +3 −2 Original line number Diff line number Diff line Loading @@ -23,6 +23,7 @@ #include "suggest/core/dictionary/char_utils.h" #include "suggest/core/dictionary/dictionary.h" #include "suggest/core/dictionary/digraph_utils.h" #include "suggest/core/dictionary/probability_utils.h" #include "suggest/core/dictionary/terminal_attributes.h" #include "suggest/core/layout/proximity_info.h" #include "unigram_dictionary.h" Loading Loading @@ -935,8 +936,8 @@ bool UnigramDictionary::processCurrentNode(const int initialPos, TerminalAttributes terminalAttributes(DICT_ROOT, flags, attributesPos); // bigramMap contains the bigram frequencies indexed by addresses for fast lookup. // bigramFilter is a bloom filter of said frequencies for even faster rejection. const int probability = BinaryFormat::getProbability(initialPos, bigramMap, bigramFilter, unigramProbability); const int probability = ProbabilityUtils::getProbability( initialPos, bigramMap, bigramFilter, unigramProbability); onTerminal(probability, terminalAttributes, correction, queuePool, needsToInvokeOnTerminal, currentWordIndex); Loading