Loading native/jni/Android.mk +2 −0 Original line number Diff line number Diff line Loading @@ -58,6 +58,8 @@ LATIN_IME_CORE_SRC_FILES := \ dic_nodes_cache.cpp) \ $(addprefix suggest/core/dictionary/, \ char_utils.cpp \ binary_dictionary_format.cpp \ byte_array_utils.cpp \ dictionary.cpp \ digraph_utils.cpp) \ $(addprefix suggest/core/layout/, \ Loading native/jni/com_android_inputmethod_latin_BinaryDictionary.cpp +5 −4 Original line number Diff line number Diff line Loading @@ -35,7 +35,8 @@ #include "jni.h" #include "jni_common.h" #include "suggest_options.h" #include "suggest/core/dictionary/binary_format.h" #include "suggest/core/dictionary/binary_dictionary_format.h" #include "suggest/core/dictionary/binary_dictionary_info.h" #include "suggest/core/dictionary/dictionary.h" namespace latinime { Loading Loading @@ -110,8 +111,8 @@ static jlong latinime_BinaryDictionary_open(JNIEnv *env, jclass clazz, jstring s return 0; } Dictionary *dictionary = 0; if (BinaryFormat::UNKNOWN_FORMAT == BinaryFormat::detectFormat(static_cast<uint8_t *>(dictBuf), if (BinaryDictionaryFormat::UNKNOWN_VERSION == BinaryDictionaryFormat::detectFormatVersion(static_cast<uint8_t *>(dictBuf), static_cast<int>(dictSize))) { AKLOGE("DICT: dictionary format is unknown, bad magic number"); #ifdef USE_MMAP_FOR_DICTIONARY Loading Loading @@ -260,7 +261,7 @@ static jint latinime_BinaryDictionary_editDistance(JNIEnv *env, jclass clazz, ji static void latinime_BinaryDictionary_close(JNIEnv *env, jclass clazz, jlong dict) { Dictionary *dictionary = reinterpret_cast<Dictionary *>(dict); if (!dictionary) return; const void *dictBuf = dictionary->getDict(); const void *dictBuf = dictionary->getBinaryDictionaryInfo()->getDictBuf(); if (!dictBuf) return; #ifdef USE_MMAP_FOR_DICTIONARY releaseDictBuf(static_cast<const char *>(dictBuf) - dictionary->getDictBufAdjust(), Loading native/jni/src/bigram_dictionary.cpp +7 −5 Original line number Diff line number Diff line Loading @@ -21,6 +21,7 @@ #include "bigram_dictionary.h" #include "defines.h" #include "suggest/core/dictionary/binary_dictionary_info.h" #include "suggest/core/dictionary/binary_format.h" #include "suggest/core/dictionary/bloom_filter.h" #include "suggest/core/dictionary/char_utils.h" Loading @@ -28,7 +29,8 @@ namespace latinime { BigramDictionary::BigramDictionary(const uint8_t *const streamStart) : DICT_ROOT(streamStart) { BigramDictionary::BigramDictionary(const BinaryDictionaryInfo *const binaryDictionaryInfo) : mBinaryDictionaryInfo(binaryDictionaryInfo) { if (DEBUG_DICT) { AKLOGI("BigramDictionary - constructor"); } Loading Loading @@ -103,7 +105,7 @@ int BigramDictionary::getBigrams(const int *prevWord, int prevWordLength, int *i // TODO: remove unused arguments, and refrain from storing stuff in members of this class // TODO: have "in" arguments before "out" ones, and make out args explicit in the name const uint8_t *const root = DICT_ROOT; const uint8_t *const root = mBinaryDictionaryInfo->getDictRoot(); int pos = getBigramListPositionForWord(prevWord, prevWordLength, false /* forceLowerCaseSearch */); // getBigramListPositionForWord returns 0 if this word isn't in the dictionary or has no bigrams Loading Loading @@ -149,7 +151,7 @@ int BigramDictionary::getBigrams(const int *prevWord, int prevWordLength, int *i int BigramDictionary::getBigramListPositionForWord(const int *prevWord, const int prevWordLength, const bool forceLowerCaseSearch) const { if (0 >= prevWordLength) return 0; const uint8_t *const root = DICT_ROOT; const uint8_t *const root = mBinaryDictionaryInfo->getDictRoot(); int pos = BinaryFormat::getTerminalPosition(root, prevWord, prevWordLength, forceLowerCaseSearch); Loading @@ -170,7 +172,7 @@ int BigramDictionary::getBigramListPositionForWord(const int *prevWord, const in void BigramDictionary::fillBigramAddressToProbabilityMapAndFilter(const int *prevWord, const int prevWordLength, std::map<int, int> *map, uint8_t *filter) const { memset(filter, 0, BIGRAM_FILTER_BYTE_SIZE); const uint8_t *const root = DICT_ROOT; const uint8_t *const root = mBinaryDictionaryInfo->getDictRoot(); int pos = getBigramListPositionForWord(prevWord, prevWordLength, false /* forceLowerCaseSearch */); if (0 == pos) { Loading Loading @@ -209,7 +211,7 @@ bool BigramDictionary::checkFirstCharacter(int *word, int *inputCodePoints) cons bool BigramDictionary::isValidBigram(const int *word1, int length1, const int *word2, int length2) const { const uint8_t *const root = DICT_ROOT; const uint8_t *const root = mBinaryDictionaryInfo->getDictRoot(); int pos = getBigramListPositionForWord(word1, length1, false /* forceLowerCaseSearch */); // getBigramListPositionForWord returns 0 if this word isn't in the dictionary or has no bigrams if (0 == pos) return false; Loading native/jni/src/bigram_dictionary.h +6 −2 Original line number Diff line number Diff line Loading @@ -24,9 +24,12 @@ namespace latinime { class BinaryDictionaryInfo; class BigramDictionary { public: BigramDictionary(const uint8_t *const streamStart); BigramDictionary(const BinaryDictionaryInfo *const binaryDictionaryInfo); int getBigrams(const int *word, int length, int *inputCodePoints, int inputSize, int *outWords, int *frequencies, int *outputTypes) const; void fillBigramAddressToProbabilityMapAndFilter(const int *prevWord, const int prevWordLength, Loading @@ -35,13 +38,14 @@ class BigramDictionary { ~BigramDictionary(); private: DISALLOW_IMPLICIT_CONSTRUCTORS(BigramDictionary); void addWordBigram(int *word, int length, int probability, int *bigramProbability, int *bigramCodePoints, int *outputTypes) const; bool checkFirstCharacter(int *word, int *inputCodePoints) const; int getBigramListPositionForWord(const int *prevWord, const int prevWordLength, const bool forceLowerCaseSearch) const; const uint8_t *const DICT_ROOT; const BinaryDictionaryInfo *const mBinaryDictionaryInfo; // TODO: Re-implement proximity correction for bigram correction static const int MAX_ALTERNATIVES = 1; }; Loading native/jni/src/suggest/core/dicnode/dic_node_utils.cpp +54 −38 Original line number Diff line number Diff line Loading @@ -20,6 +20,7 @@ #include "suggest/core/dicnode/dic_node.h" #include "suggest/core/dicnode/dic_node_utils.h" #include "suggest/core/dicnode/dic_node_vector.h" #include "suggest/core/dictionary/binary_dictionary_info.h" #include "suggest/core/dictionary/binary_format.h" #include "suggest/core/dictionary/char_utils.h" #include "suggest/core/dictionary/multi_bigram_map.h" Loading @@ -32,20 +33,23 @@ namespace latinime { // Node initialization utils // /////////////////////////////// /* static */ void DicNodeUtils::initAsRoot(const int rootPos, const uint8_t *const dicRoot, const int prevWordNodePos, DicNode *newRootNode) { int curPos = rootPos; /* static */ void DicNodeUtils::initAsRoot(const BinaryDictionaryInfo *const binaryDictionaryInfo, const int prevWordNodePos, DicNode *const newRootNode) { int curPos = binaryDictionaryInfo->getRootPosition(); const int pos = curPos; const int childrenCount = BinaryFormat::getGroupCountAndForwardPointer(dicRoot, &curPos); const int childrenCount = BinaryFormat::getGroupCountAndForwardPointer( binaryDictionaryInfo->getDictRoot(), &curPos); const int childrenPos = curPos; newRootNode->initAsRoot(pos, childrenPos, childrenCount, prevWordNodePos); } /*static */ void DicNodeUtils::initAsRootWithPreviousWord(const int rootPos, const uint8_t *const dicRoot, DicNode *prevWordLastNode, DicNode *newRootNode) { int curPos = rootPos; /*static */ void DicNodeUtils::initAsRootWithPreviousWord( const BinaryDictionaryInfo *const binaryDictionaryInfo, DicNode *const prevWordLastNode, DicNode *const newRootNode) { int curPos = binaryDictionaryInfo->getRootPosition(); const int pos = curPos; const int childrenCount = BinaryFormat::getGroupCountAndForwardPointer(dicRoot, &curPos); const int childrenCount = BinaryFormat::getGroupCountAndForwardPointer( binaryDictionaryInfo->getDictRoot(), &curPos); const int childrenPos = curPos; newRootNode->initAsRootWithPreviousWord(prevWordLastNode, pos, childrenPos, childrenCount); } Loading @@ -71,16 +75,19 @@ namespace latinime { } /* static */ int DicNodeUtils::createAndGetLeavingChildNode(DicNode *dicNode, int pos, const uint8_t *const dicRoot, const int terminalDepth, const ProximityInfoState *pInfoState, const int pointIndex, const bool exactOnly, const std::vector<int> *const codePointsFilter, const ProximityInfo *const pInfo, DicNodeVector *childDicNodes) { const BinaryDictionaryInfo *const binaryDictionaryInfo, const int terminalDepth, const ProximityInfoState *pInfoState, const int pointIndex, const bool exactOnly, const std::vector<int> *const codePointsFilter, const ProximityInfo *const pInfo, DicNodeVector *childDicNodes) { int nextPos = pos; const uint8_t flags = BinaryFormat::getFlagsAndForwardPointer(dicRoot, &pos); const uint8_t flags = BinaryFormat::getFlagsAndForwardPointer( binaryDictionaryInfo->getDictRoot(), &pos); const bool hasMultipleChars = (0 != (BinaryFormat::FLAG_HAS_MULTIPLE_CHARS & flags)); const bool isTerminal = (0 != (BinaryFormat::FLAG_IS_TERMINAL & flags)); const bool hasChildren = BinaryFormat::hasChildrenInFlags(flags); int codePoint = BinaryFormat::getCodePointAndForwardPointer(dicRoot, &pos); int codePoint = BinaryFormat::getCodePointAndForwardPointer( binaryDictionaryInfo->getDictRoot(), &pos); ASSERT(NOT_A_CODE_POINT != codePoint); const int nodeCodePoint = codePoint; // TODO: optimize this Loading @@ -90,7 +97,8 @@ namespace latinime { do { const int nextCodePoint = hasMultipleChars ? BinaryFormat::getCodePointAndForwardPointer(dicRoot, &pos) : NOT_A_CODE_POINT; ? BinaryFormat::getCodePointAndForwardPointer( binaryDictionaryInfo->getDictRoot(), &pos) : NOT_A_CODE_POINT; const bool isLastChar = (NOT_A_CODE_POINT == nextCodePoint); if (!isLastChar) { additionalWordBuf[additionalSubwordLength++] = nextCodePoint; Loading @@ -98,12 +106,14 @@ namespace latinime { codePoint = nextCodePoint; } while (NOT_A_CODE_POINT != codePoint); const int probability = isTerminal ? BinaryFormat::readProbabilityWithoutMovingPointer(dicRoot, pos) : -1; const int probability = isTerminal ? BinaryFormat::readProbabilityWithoutMovingPointer( binaryDictionaryInfo->getDictRoot(), pos) : -1; pos = BinaryFormat::skipProbability(flags, pos); int childrenPos = hasChildren ? BinaryFormat::readChildrenPosition(dicRoot, flags, pos) : 0; int childrenPos = hasChildren ? BinaryFormat::readChildrenPosition( binaryDictionaryInfo->getDictRoot(), flags, pos) : 0; const int attributesPos = BinaryFormat::skipChildrenPosition(flags, pos); const int siblingPos = BinaryFormat::skipChildrenPosAndAttributes(dicRoot, flags, pos); const int siblingPos = BinaryFormat::skipChildrenPosAndAttributes( binaryDictionaryInfo->getDictRoot(), flags, pos); if (isDicNodeFilteredOut(nodeCodePoint, pInfo, codePointsFilter)) { return siblingPos; Loading @@ -111,8 +121,8 @@ namespace latinime { if (!isMatchedNodeCodePoint(pInfoState, pointIndex, exactOnly, nodeCodePoint)) { return siblingPos; } const int childrenCount = hasChildren ? BinaryFormat::getGroupCountAndForwardPointer(dicRoot, &childrenPos) : 0; const int childrenCount = hasChildren ? BinaryFormat::getGroupCountAndForwardPointer( binaryDictionaryInfo->getDictRoot(), &childrenPos) : 0; childDicNodes->pushLeavingChild(dicNode, nextPos, flags, childrenPos, attributesPos, siblingPos, nodeCodePoint, childrenCount, probability, -1 /* bigramProbability */, isTerminal, hasMultipleChars, hasChildren, additionalSubwordLength, additionalWordBuf); Loading Loading @@ -148,16 +158,18 @@ namespace latinime { } /* static */ void DicNodeUtils::createAndGetAllLeavingChildNodes(DicNode *dicNode, const uint8_t *const dicRoot, const ProximityInfoState *pInfoState, const int pointIndex, const bool exactOnly, const std::vector<int> *const codePointsFilter, const ProximityInfo *const pInfo, DicNodeVector *childDicNodes) { const BinaryDictionaryInfo *const binaryDictionaryInfo, const ProximityInfoState *pInfoState, const int pointIndex, const bool exactOnly, const std::vector<int> *const codePointsFilter, const ProximityInfo *const pInfo, DicNodeVector *childDicNodes) { const int terminalDepth = dicNode->getLeavingDepth(); const int childCount = dicNode->getChildrenCount(); int nextPos = dicNode->getChildrenPos(); for (int i = 0; i < childCount; i++) { const int filterSize = codePointsFilter ? codePointsFilter->size() : 0; nextPos = createAndGetLeavingChildNode(dicNode, nextPos, dicRoot, terminalDepth, pInfoState, pointIndex, exactOnly, codePointsFilter, pInfo, childDicNodes); nextPos = createAndGetLeavingChildNode(dicNode, nextPos, binaryDictionaryInfo, terminalDepth, pInfoState, pointIndex, exactOnly, codePointsFilter, pInfo, childDicNodes); if (!pInfo && filterSize > 0 && childDicNodes->exceeds(filterSize)) { // All code points have been found. break; Loading @@ -165,14 +177,15 @@ namespace latinime { } } /* static */ void DicNodeUtils::getAllChildDicNodes(DicNode *dicNode, const uint8_t *const dicRoot, DicNodeVector *childDicNodes) { getProximityChildDicNodes(dicNode, dicRoot, 0, 0, false, childDicNodes); /* static */ void DicNodeUtils::getAllChildDicNodes(DicNode *dicNode, const BinaryDictionaryInfo *const binaryDictionaryInfo, DicNodeVector *childDicNodes) { getProximityChildDicNodes(dicNode, binaryDictionaryInfo, 0, 0, false, childDicNodes); } /* static */ void DicNodeUtils::getProximityChildDicNodes(DicNode *dicNode, const uint8_t *const dicRoot, const ProximityInfoState *pInfoState, const int pointIndex, bool exactOnly, DicNodeVector *childDicNodes) { const BinaryDictionaryInfo *const binaryDictionaryInfo, const ProximityInfoState *pInfoState, const int pointIndex, bool exactOnly, DicNodeVector *childDicNodes) { if (dicNode->isTotalInputSizeExceedingLimit()) { return; } Loading @@ -180,9 +193,9 @@ namespace latinime { DicNodeUtils::createAndGetPassingChildNode(dicNode, pInfoState, pointIndex, exactOnly, childDicNodes); } else { DicNodeUtils::createAndGetAllLeavingChildNodes(dicNode, dicRoot, pInfoState, pointIndex, exactOnly, 0 /* codePointsFilter */, 0 /* pInfo */, childDicNodes); DicNodeUtils::createAndGetAllLeavingChildNodes( dicNode, binaryDictionaryInfo, pInfoState, pointIndex, exactOnly, 0 /* codePointsFilter */, 0 /* pInfo */, childDicNodes); } } Loading @@ -192,19 +205,21 @@ namespace latinime { /** * Computes the combined bigram / unigram cost for the given dicNode. */ /* static */ float DicNodeUtils::getBigramNodeImprobability(const uint8_t *const dicRoot, /* static */ float DicNodeUtils::getBigramNodeImprobability( const BinaryDictionaryInfo *const binaryDictionaryInfo, const DicNode *const node, MultiBigramMap *multiBigramMap) { if (node->isImpossibleBigramWord()) { return static_cast<float>(MAX_VALUE_FOR_WEIGHTING); } const int probability = getBigramNodeProbability(dicRoot, node, multiBigramMap); const int probability = getBigramNodeProbability(binaryDictionaryInfo, node, multiBigramMap); // TODO: This equation to calculate the improbability looks unreasonable. Investigate this. const float cost = static_cast<float>(MAX_PROBABILITY - probability) / static_cast<float>(MAX_PROBABILITY); return cost; } /* static */ int DicNodeUtils::getBigramNodeProbability(const uint8_t *const dicRoot, /* static */ int DicNodeUtils::getBigramNodeProbability( const BinaryDictionaryInfo *const binaryDictionaryInfo, const DicNode *const node, MultiBigramMap *multiBigramMap) { const int unigramProbability = node->getProbability(); const int wordPos = node->getPos(); Loading @@ -215,9 +230,10 @@ namespace latinime { } if (multiBigramMap) { return multiBigramMap->getBigramProbability( dicRoot, prevWordPos, wordPos, unigramProbability); binaryDictionaryInfo, prevWordPos, wordPos, unigramProbability); } return BinaryFormat::getBigramProbability(dicRoot, prevWordPos, wordPos, unigramProbability); return BinaryFormat::getBigramProbability( binaryDictionaryInfo->getDictRoot(), prevWordPos, wordPos, unigramProbability); } /////////////////////////////////////// Loading Loading
native/jni/Android.mk +2 −0 Original line number Diff line number Diff line Loading @@ -58,6 +58,8 @@ LATIN_IME_CORE_SRC_FILES := \ dic_nodes_cache.cpp) \ $(addprefix suggest/core/dictionary/, \ char_utils.cpp \ binary_dictionary_format.cpp \ byte_array_utils.cpp \ dictionary.cpp \ digraph_utils.cpp) \ $(addprefix suggest/core/layout/, \ Loading
native/jni/com_android_inputmethod_latin_BinaryDictionary.cpp +5 −4 Original line number Diff line number Diff line Loading @@ -35,7 +35,8 @@ #include "jni.h" #include "jni_common.h" #include "suggest_options.h" #include "suggest/core/dictionary/binary_format.h" #include "suggest/core/dictionary/binary_dictionary_format.h" #include "suggest/core/dictionary/binary_dictionary_info.h" #include "suggest/core/dictionary/dictionary.h" namespace latinime { Loading Loading @@ -110,8 +111,8 @@ static jlong latinime_BinaryDictionary_open(JNIEnv *env, jclass clazz, jstring s return 0; } Dictionary *dictionary = 0; if (BinaryFormat::UNKNOWN_FORMAT == BinaryFormat::detectFormat(static_cast<uint8_t *>(dictBuf), if (BinaryDictionaryFormat::UNKNOWN_VERSION == BinaryDictionaryFormat::detectFormatVersion(static_cast<uint8_t *>(dictBuf), static_cast<int>(dictSize))) { AKLOGE("DICT: dictionary format is unknown, bad magic number"); #ifdef USE_MMAP_FOR_DICTIONARY Loading Loading @@ -260,7 +261,7 @@ static jint latinime_BinaryDictionary_editDistance(JNIEnv *env, jclass clazz, ji static void latinime_BinaryDictionary_close(JNIEnv *env, jclass clazz, jlong dict) { Dictionary *dictionary = reinterpret_cast<Dictionary *>(dict); if (!dictionary) return; const void *dictBuf = dictionary->getDict(); const void *dictBuf = dictionary->getBinaryDictionaryInfo()->getDictBuf(); if (!dictBuf) return; #ifdef USE_MMAP_FOR_DICTIONARY releaseDictBuf(static_cast<const char *>(dictBuf) - dictionary->getDictBufAdjust(), Loading
native/jni/src/bigram_dictionary.cpp +7 −5 Original line number Diff line number Diff line Loading @@ -21,6 +21,7 @@ #include "bigram_dictionary.h" #include "defines.h" #include "suggest/core/dictionary/binary_dictionary_info.h" #include "suggest/core/dictionary/binary_format.h" #include "suggest/core/dictionary/bloom_filter.h" #include "suggest/core/dictionary/char_utils.h" Loading @@ -28,7 +29,8 @@ namespace latinime { BigramDictionary::BigramDictionary(const uint8_t *const streamStart) : DICT_ROOT(streamStart) { BigramDictionary::BigramDictionary(const BinaryDictionaryInfo *const binaryDictionaryInfo) : mBinaryDictionaryInfo(binaryDictionaryInfo) { if (DEBUG_DICT) { AKLOGI("BigramDictionary - constructor"); } Loading Loading @@ -103,7 +105,7 @@ int BigramDictionary::getBigrams(const int *prevWord, int prevWordLength, int *i // TODO: remove unused arguments, and refrain from storing stuff in members of this class // TODO: have "in" arguments before "out" ones, and make out args explicit in the name const uint8_t *const root = DICT_ROOT; const uint8_t *const root = mBinaryDictionaryInfo->getDictRoot(); int pos = getBigramListPositionForWord(prevWord, prevWordLength, false /* forceLowerCaseSearch */); // getBigramListPositionForWord returns 0 if this word isn't in the dictionary or has no bigrams Loading Loading @@ -149,7 +151,7 @@ int BigramDictionary::getBigrams(const int *prevWord, int prevWordLength, int *i int BigramDictionary::getBigramListPositionForWord(const int *prevWord, const int prevWordLength, const bool forceLowerCaseSearch) const { if (0 >= prevWordLength) return 0; const uint8_t *const root = DICT_ROOT; const uint8_t *const root = mBinaryDictionaryInfo->getDictRoot(); int pos = BinaryFormat::getTerminalPosition(root, prevWord, prevWordLength, forceLowerCaseSearch); Loading @@ -170,7 +172,7 @@ int BigramDictionary::getBigramListPositionForWord(const int *prevWord, const in void BigramDictionary::fillBigramAddressToProbabilityMapAndFilter(const int *prevWord, const int prevWordLength, std::map<int, int> *map, uint8_t *filter) const { memset(filter, 0, BIGRAM_FILTER_BYTE_SIZE); const uint8_t *const root = DICT_ROOT; const uint8_t *const root = mBinaryDictionaryInfo->getDictRoot(); int pos = getBigramListPositionForWord(prevWord, prevWordLength, false /* forceLowerCaseSearch */); if (0 == pos) { Loading Loading @@ -209,7 +211,7 @@ bool BigramDictionary::checkFirstCharacter(int *word, int *inputCodePoints) cons bool BigramDictionary::isValidBigram(const int *word1, int length1, const int *word2, int length2) const { const uint8_t *const root = DICT_ROOT; const uint8_t *const root = mBinaryDictionaryInfo->getDictRoot(); int pos = getBigramListPositionForWord(word1, length1, false /* forceLowerCaseSearch */); // getBigramListPositionForWord returns 0 if this word isn't in the dictionary or has no bigrams if (0 == pos) return false; Loading
native/jni/src/bigram_dictionary.h +6 −2 Original line number Diff line number Diff line Loading @@ -24,9 +24,12 @@ namespace latinime { class BinaryDictionaryInfo; class BigramDictionary { public: BigramDictionary(const uint8_t *const streamStart); BigramDictionary(const BinaryDictionaryInfo *const binaryDictionaryInfo); int getBigrams(const int *word, int length, int *inputCodePoints, int inputSize, int *outWords, int *frequencies, int *outputTypes) const; void fillBigramAddressToProbabilityMapAndFilter(const int *prevWord, const int prevWordLength, Loading @@ -35,13 +38,14 @@ class BigramDictionary { ~BigramDictionary(); private: DISALLOW_IMPLICIT_CONSTRUCTORS(BigramDictionary); void addWordBigram(int *word, int length, int probability, int *bigramProbability, int *bigramCodePoints, int *outputTypes) const; bool checkFirstCharacter(int *word, int *inputCodePoints) const; int getBigramListPositionForWord(const int *prevWord, const int prevWordLength, const bool forceLowerCaseSearch) const; const uint8_t *const DICT_ROOT; const BinaryDictionaryInfo *const mBinaryDictionaryInfo; // TODO: Re-implement proximity correction for bigram correction static const int MAX_ALTERNATIVES = 1; }; Loading
native/jni/src/suggest/core/dicnode/dic_node_utils.cpp +54 −38 Original line number Diff line number Diff line Loading @@ -20,6 +20,7 @@ #include "suggest/core/dicnode/dic_node.h" #include "suggest/core/dicnode/dic_node_utils.h" #include "suggest/core/dicnode/dic_node_vector.h" #include "suggest/core/dictionary/binary_dictionary_info.h" #include "suggest/core/dictionary/binary_format.h" #include "suggest/core/dictionary/char_utils.h" #include "suggest/core/dictionary/multi_bigram_map.h" Loading @@ -32,20 +33,23 @@ namespace latinime { // Node initialization utils // /////////////////////////////// /* static */ void DicNodeUtils::initAsRoot(const int rootPos, const uint8_t *const dicRoot, const int prevWordNodePos, DicNode *newRootNode) { int curPos = rootPos; /* static */ void DicNodeUtils::initAsRoot(const BinaryDictionaryInfo *const binaryDictionaryInfo, const int prevWordNodePos, DicNode *const newRootNode) { int curPos = binaryDictionaryInfo->getRootPosition(); const int pos = curPos; const int childrenCount = BinaryFormat::getGroupCountAndForwardPointer(dicRoot, &curPos); const int childrenCount = BinaryFormat::getGroupCountAndForwardPointer( binaryDictionaryInfo->getDictRoot(), &curPos); const int childrenPos = curPos; newRootNode->initAsRoot(pos, childrenPos, childrenCount, prevWordNodePos); } /*static */ void DicNodeUtils::initAsRootWithPreviousWord(const int rootPos, const uint8_t *const dicRoot, DicNode *prevWordLastNode, DicNode *newRootNode) { int curPos = rootPos; /*static */ void DicNodeUtils::initAsRootWithPreviousWord( const BinaryDictionaryInfo *const binaryDictionaryInfo, DicNode *const prevWordLastNode, DicNode *const newRootNode) { int curPos = binaryDictionaryInfo->getRootPosition(); const int pos = curPos; const int childrenCount = BinaryFormat::getGroupCountAndForwardPointer(dicRoot, &curPos); const int childrenCount = BinaryFormat::getGroupCountAndForwardPointer( binaryDictionaryInfo->getDictRoot(), &curPos); const int childrenPos = curPos; newRootNode->initAsRootWithPreviousWord(prevWordLastNode, pos, childrenPos, childrenCount); } Loading @@ -71,16 +75,19 @@ namespace latinime { } /* static */ int DicNodeUtils::createAndGetLeavingChildNode(DicNode *dicNode, int pos, const uint8_t *const dicRoot, const int terminalDepth, const ProximityInfoState *pInfoState, const int pointIndex, const bool exactOnly, const std::vector<int> *const codePointsFilter, const ProximityInfo *const pInfo, DicNodeVector *childDicNodes) { const BinaryDictionaryInfo *const binaryDictionaryInfo, const int terminalDepth, const ProximityInfoState *pInfoState, const int pointIndex, const bool exactOnly, const std::vector<int> *const codePointsFilter, const ProximityInfo *const pInfo, DicNodeVector *childDicNodes) { int nextPos = pos; const uint8_t flags = BinaryFormat::getFlagsAndForwardPointer(dicRoot, &pos); const uint8_t flags = BinaryFormat::getFlagsAndForwardPointer( binaryDictionaryInfo->getDictRoot(), &pos); const bool hasMultipleChars = (0 != (BinaryFormat::FLAG_HAS_MULTIPLE_CHARS & flags)); const bool isTerminal = (0 != (BinaryFormat::FLAG_IS_TERMINAL & flags)); const bool hasChildren = BinaryFormat::hasChildrenInFlags(flags); int codePoint = BinaryFormat::getCodePointAndForwardPointer(dicRoot, &pos); int codePoint = BinaryFormat::getCodePointAndForwardPointer( binaryDictionaryInfo->getDictRoot(), &pos); ASSERT(NOT_A_CODE_POINT != codePoint); const int nodeCodePoint = codePoint; // TODO: optimize this Loading @@ -90,7 +97,8 @@ namespace latinime { do { const int nextCodePoint = hasMultipleChars ? BinaryFormat::getCodePointAndForwardPointer(dicRoot, &pos) : NOT_A_CODE_POINT; ? BinaryFormat::getCodePointAndForwardPointer( binaryDictionaryInfo->getDictRoot(), &pos) : NOT_A_CODE_POINT; const bool isLastChar = (NOT_A_CODE_POINT == nextCodePoint); if (!isLastChar) { additionalWordBuf[additionalSubwordLength++] = nextCodePoint; Loading @@ -98,12 +106,14 @@ namespace latinime { codePoint = nextCodePoint; } while (NOT_A_CODE_POINT != codePoint); const int probability = isTerminal ? BinaryFormat::readProbabilityWithoutMovingPointer(dicRoot, pos) : -1; const int probability = isTerminal ? BinaryFormat::readProbabilityWithoutMovingPointer( binaryDictionaryInfo->getDictRoot(), pos) : -1; pos = BinaryFormat::skipProbability(flags, pos); int childrenPos = hasChildren ? BinaryFormat::readChildrenPosition(dicRoot, flags, pos) : 0; int childrenPos = hasChildren ? BinaryFormat::readChildrenPosition( binaryDictionaryInfo->getDictRoot(), flags, pos) : 0; const int attributesPos = BinaryFormat::skipChildrenPosition(flags, pos); const int siblingPos = BinaryFormat::skipChildrenPosAndAttributes(dicRoot, flags, pos); const int siblingPos = BinaryFormat::skipChildrenPosAndAttributes( binaryDictionaryInfo->getDictRoot(), flags, pos); if (isDicNodeFilteredOut(nodeCodePoint, pInfo, codePointsFilter)) { return siblingPos; Loading @@ -111,8 +121,8 @@ namespace latinime { if (!isMatchedNodeCodePoint(pInfoState, pointIndex, exactOnly, nodeCodePoint)) { return siblingPos; } const int childrenCount = hasChildren ? BinaryFormat::getGroupCountAndForwardPointer(dicRoot, &childrenPos) : 0; const int childrenCount = hasChildren ? BinaryFormat::getGroupCountAndForwardPointer( binaryDictionaryInfo->getDictRoot(), &childrenPos) : 0; childDicNodes->pushLeavingChild(dicNode, nextPos, flags, childrenPos, attributesPos, siblingPos, nodeCodePoint, childrenCount, probability, -1 /* bigramProbability */, isTerminal, hasMultipleChars, hasChildren, additionalSubwordLength, additionalWordBuf); Loading Loading @@ -148,16 +158,18 @@ namespace latinime { } /* static */ void DicNodeUtils::createAndGetAllLeavingChildNodes(DicNode *dicNode, const uint8_t *const dicRoot, const ProximityInfoState *pInfoState, const int pointIndex, const bool exactOnly, const std::vector<int> *const codePointsFilter, const ProximityInfo *const pInfo, DicNodeVector *childDicNodes) { const BinaryDictionaryInfo *const binaryDictionaryInfo, const ProximityInfoState *pInfoState, const int pointIndex, const bool exactOnly, const std::vector<int> *const codePointsFilter, const ProximityInfo *const pInfo, DicNodeVector *childDicNodes) { const int terminalDepth = dicNode->getLeavingDepth(); const int childCount = dicNode->getChildrenCount(); int nextPos = dicNode->getChildrenPos(); for (int i = 0; i < childCount; i++) { const int filterSize = codePointsFilter ? codePointsFilter->size() : 0; nextPos = createAndGetLeavingChildNode(dicNode, nextPos, dicRoot, terminalDepth, pInfoState, pointIndex, exactOnly, codePointsFilter, pInfo, childDicNodes); nextPos = createAndGetLeavingChildNode(dicNode, nextPos, binaryDictionaryInfo, terminalDepth, pInfoState, pointIndex, exactOnly, codePointsFilter, pInfo, childDicNodes); if (!pInfo && filterSize > 0 && childDicNodes->exceeds(filterSize)) { // All code points have been found. break; Loading @@ -165,14 +177,15 @@ namespace latinime { } } /* static */ void DicNodeUtils::getAllChildDicNodes(DicNode *dicNode, const uint8_t *const dicRoot, DicNodeVector *childDicNodes) { getProximityChildDicNodes(dicNode, dicRoot, 0, 0, false, childDicNodes); /* static */ void DicNodeUtils::getAllChildDicNodes(DicNode *dicNode, const BinaryDictionaryInfo *const binaryDictionaryInfo, DicNodeVector *childDicNodes) { getProximityChildDicNodes(dicNode, binaryDictionaryInfo, 0, 0, false, childDicNodes); } /* static */ void DicNodeUtils::getProximityChildDicNodes(DicNode *dicNode, const uint8_t *const dicRoot, const ProximityInfoState *pInfoState, const int pointIndex, bool exactOnly, DicNodeVector *childDicNodes) { const BinaryDictionaryInfo *const binaryDictionaryInfo, const ProximityInfoState *pInfoState, const int pointIndex, bool exactOnly, DicNodeVector *childDicNodes) { if (dicNode->isTotalInputSizeExceedingLimit()) { return; } Loading @@ -180,9 +193,9 @@ namespace latinime { DicNodeUtils::createAndGetPassingChildNode(dicNode, pInfoState, pointIndex, exactOnly, childDicNodes); } else { DicNodeUtils::createAndGetAllLeavingChildNodes(dicNode, dicRoot, pInfoState, pointIndex, exactOnly, 0 /* codePointsFilter */, 0 /* pInfo */, childDicNodes); DicNodeUtils::createAndGetAllLeavingChildNodes( dicNode, binaryDictionaryInfo, pInfoState, pointIndex, exactOnly, 0 /* codePointsFilter */, 0 /* pInfo */, childDicNodes); } } Loading @@ -192,19 +205,21 @@ namespace latinime { /** * Computes the combined bigram / unigram cost for the given dicNode. */ /* static */ float DicNodeUtils::getBigramNodeImprobability(const uint8_t *const dicRoot, /* static */ float DicNodeUtils::getBigramNodeImprobability( const BinaryDictionaryInfo *const binaryDictionaryInfo, const DicNode *const node, MultiBigramMap *multiBigramMap) { if (node->isImpossibleBigramWord()) { return static_cast<float>(MAX_VALUE_FOR_WEIGHTING); } const int probability = getBigramNodeProbability(dicRoot, node, multiBigramMap); const int probability = getBigramNodeProbability(binaryDictionaryInfo, node, multiBigramMap); // TODO: This equation to calculate the improbability looks unreasonable. Investigate this. const float cost = static_cast<float>(MAX_PROBABILITY - probability) / static_cast<float>(MAX_PROBABILITY); return cost; } /* static */ int DicNodeUtils::getBigramNodeProbability(const uint8_t *const dicRoot, /* static */ int DicNodeUtils::getBigramNodeProbability( const BinaryDictionaryInfo *const binaryDictionaryInfo, const DicNode *const node, MultiBigramMap *multiBigramMap) { const int unigramProbability = node->getProbability(); const int wordPos = node->getPos(); Loading @@ -215,9 +230,10 @@ namespace latinime { } if (multiBigramMap) { return multiBigramMap->getBigramProbability( dicRoot, prevWordPos, wordPos, unigramProbability); binaryDictionaryInfo, prevWordPos, wordPos, unigramProbability); } return BinaryFormat::getBigramProbability(dicRoot, prevWordPos, wordPos, unigramProbability); return BinaryFormat::getBigramProbability( binaryDictionaryInfo->getDictRoot(), prevWordPos, wordPos, unigramProbability); } /////////////////////////////////////// Loading