Loading native/src/correction.cpp +6 −51 Original line number Diff line number Diff line Loading @@ -269,7 +269,7 @@ bool Correction::needsToPrune() const { // TODO: use edit distance here return mOutputIndex - 1 >= mMaxDepth || mProximityCount > mMaxEditDistance // Allow one char longer word for missing character || (!mDoAutoCompletion && (mOutputIndex + 1 >= mInputLength)); || (!mDoAutoCompletion && (mOutputIndex > mInputLength)); } void Correction::addCharToCurrentWord(const int32_t c) { Loading Loading @@ -555,55 +555,6 @@ Correction::CorrectionType Correction::processCharAndCalcState( Correction::~Correction() { } ///////////////////////// // static inline utils // ///////////////////////// static const int TWO_31ST_DIV_255 = S_INT_MAX / 255; static inline int capped255MultForFullMatchAccentsOrCapitalizationDifference(const int num) { return (num < TWO_31ST_DIV_255 ? 255 * num : S_INT_MAX); } static const int TWO_31ST_DIV_2 = S_INT_MAX / 2; inline static void multiplyIntCapped(const int multiplier, int *base) { const int temp = *base; if (temp != S_INT_MAX) { // Branch if multiplier == 2 for the optimization if (multiplier == 2) { *base = TWO_31ST_DIV_2 >= temp ? temp << 1 : S_INT_MAX; } else { // TODO: This overflow check gives a wrong answer when, for example, // temp = 2^16 + 1 and multiplier = 2^17 + 1. // Fix this behavior. const int tempRetval = temp * multiplier; *base = tempRetval >= temp ? tempRetval : S_INT_MAX; } } } inline static int powerIntCapped(const int base, const int n) { if (n <= 0) return 1; if (base == 2) { return n < 31 ? 1 << n : S_INT_MAX; } else { int ret = base; for (int i = 1; i < n; ++i) multiplyIntCapped(base, &ret); return ret; } } inline static void multiplyRate(const int rate, int *freq) { if (*freq != S_INT_MAX) { if (*freq > 1000000) { *freq /= 100; multiplyIntCapped(rate, freq); } else { multiplyIntCapped(rate, freq); *freq /= 100; } } } inline static int getQuoteCount(const unsigned short* word, const int length) { int quoteCount = 0; for (int i = 0; i < length; ++i) { Loading Loading @@ -939,7 +890,11 @@ int Correction::RankingAlgorithm::calcFreqForSplitTwoWords( multiplyRate(WORDS_WITH_PROXIMITY_CHARACTER_DEMOTION_RATE, &totalFreq); } if (isSpaceProximity) { multiplyRate(WORDS_WITH_MISTYPED_SPACE_DEMOTION_RATE, &totalFreq); } else { multiplyRate(WORDS_WITH_MISSING_SPACE_CHARACTER_DEMOTION_RATE, &totalFreq); } if (capitalizedWordDemotion) { multiplyRate(TWO_WORDS_CAPITALIZED_DEMOTION_RATE, &totalFreq); Loading native/src/correction.h +49 −0 Original line number Diff line number Diff line Loading @@ -36,6 +36,55 @@ class Correction { NOT_ON_TERMINAL } CorrectionType; ///////////////////////// // static inline utils // ///////////////////////// static const int TWO_31ST_DIV_255 = S_INT_MAX / 255; static inline int capped255MultForFullMatchAccentsOrCapitalizationDifference(const int num) { return (num < TWO_31ST_DIV_255 ? 255 * num : S_INT_MAX); } static const int TWO_31ST_DIV_2 = S_INT_MAX / 2; inline static void multiplyIntCapped(const int multiplier, int *base) { const int temp = *base; if (temp != S_INT_MAX) { // Branch if multiplier == 2 for the optimization if (multiplier == 2) { *base = TWO_31ST_DIV_2 >= temp ? temp << 1 : S_INT_MAX; } else { // TODO: This overflow check gives a wrong answer when, for example, // temp = 2^16 + 1 and multiplier = 2^17 + 1. // Fix this behavior. const int tempRetval = temp * multiplier; *base = tempRetval >= temp ? tempRetval : S_INT_MAX; } } } inline static int powerIntCapped(const int base, const int n) { if (n <= 0) return 1; if (base == 2) { return n < 31 ? 1 << n : S_INT_MAX; } else { int ret = base; for (int i = 1; i < n; ++i) multiplyIntCapped(base, &ret); return ret; } } inline static void multiplyRate(const int rate, int *freq) { if (*freq != S_INT_MAX) { if (*freq > 1000000) { *freq /= 100; multiplyIntCapped(rate, freq); } else { multiplyIntCapped(rate, freq); *freq /= 100; } } } Correction(const int typedLetterMultiplier, const int fullWordMultiplier); void initCorrection( const ProximityInfo *pi, const int inputLength, const int maxWordLength); Loading native/src/defines.h +4 −0 Original line number Diff line number Diff line Loading @@ -189,6 +189,7 @@ static void prof_out(void) { #define WORDS_WITH_MISSING_CHARACTER_DEMOTION_RATE 80 #define WORDS_WITH_MISSING_CHARACTER_DEMOTION_START_POS_10X 12 #define WORDS_WITH_MISSING_SPACE_CHARACTER_DEMOTION_RATE 58 #define WORDS_WITH_MISTYPED_SPACE_DEMOTION_RATE 50 #define WORDS_WITH_EXCESSIVE_CHARACTER_DEMOTION_RATE 75 #define WORDS_WITH_EXCESSIVE_CHARACTER_OUT_OF_PROXIMITY_DEMOTION_RATE 75 #define WORDS_WITH_TRANSPOSED_CHARACTERS_DEMOTION_RATE 70 Loading Loading @@ -222,6 +223,9 @@ static void prof_out(void) { #define MAX_DEPTH_MULTIPLIER 3 #define FIRST_WORD_INDEX 1 #define SECOND_WORD_INDEX 2 // TODO: Reduce this constant if possible; check the maximum number of umlauts in the same German // word in the dictionary #define DEFAULT_MAX_UMLAUT_SEARCH_DEPTH 5 Loading native/src/unigram_dictionary.cpp +89 −39 Original line number Diff line number Diff line Loading @@ -159,19 +159,26 @@ int UnigramDictionary::getSuggestions(ProximityInfo *proximityInfo, } PROF_START(20); if (DEBUG_DICT) { double ns = queuePool->getMasterQueue()->getHighestNormalizedScore( proximityInfo->getPrimaryInputWord(), codesSize, 0, 0, 0); ns += 0; AKLOGI("Max normalized score = %f", ns); } const int suggestedWordsCount = queuePool->getMasterQueue()->outputSuggestions(frequencies, outWords); if (DEBUG_DICT) { double ns = queuePool->getMasterQueue()->getHighestNormalizedScore( proximityInfo->getPrimaryInputWord(), codesSize, 0, 0, 0); ns += 0; AKLOGI("Returning %d words", suggestedWordsCount); /// Print the returned words for (int j = 0; j < suggestedWordsCount; ++j) { #ifdef FLAG_DBG short unsigned int* w = outWords + j * MAX_WORD_LENGTH; char s[MAX_WORD_LENGTH]; for (int i = 0; i <= MAX_WORD_LENGTH; i++) s[i] = w[i]; AKLOGI("%s %i", s, frequencies[j]); #endif } } PROF_END(20); Loading Loading @@ -205,6 +212,13 @@ void UnigramDictionary::getWordSuggestions(ProximityInfo *proximityInfo, PROF_START(4); // Note: This line is intentionally left blank bool hasAutoCorrectionCandidate = false; WordsPriorityQueue* masterQueue = queuePool->getMasterQueue(); if (masterQueue->size() > 0) { double nsForMaster = masterQueue->getHighestNormalizedScore( proximityInfo->getPrimaryInputWord(), inputLength, 0, 0, 0); hasAutoCorrectionCandidate = (nsForMaster > START_TWO_WORDS_CORRECTION_THRESHOLD); } PROF_END(4); PROF_START(5); Loading @@ -216,7 +230,8 @@ void UnigramDictionary::getWordSuggestions(ProximityInfo *proximityInfo, AKLOGI("--- Suggest missing space characters %d", i); } getMissingSpaceWords(proximityInfo, xcoordinates, ycoordinates, codes, useFullEditDistance, inputLength, i, correction, queuePool); useFullEditDistance, inputLength, i, correction, queuePool, hasAutoCorrectionCandidate); } } PROF_END(5); Loading @@ -236,7 +251,8 @@ void UnigramDictionary::getWordSuggestions(ProximityInfo *proximityInfo, } if (proximityInfo->hasSpaceProximity(x, y)) { getMistypedSpaceWords(proximityInfo, xcoordinates, ycoordinates, codes, useFullEditDistance, inputLength, i, correction, queuePool); useFullEditDistance, inputLength, i, correction, queuePool, hasAutoCorrectionCandidate); } } } Loading Loading @@ -281,12 +297,12 @@ void UnigramDictionary::getOneWordSuggestions(ProximityInfo *proximityInfo, WordsPriorityQueuePool *queuePool) { initSuggestions(proximityInfo, xcoordinates, ycoordinates, codes, inputLength, correction); getSuggestionCandidates(useFullEditDistance, inputLength, correction, queuePool, true /* doAutoCompletion */, DEFAULT_MAX_ERRORS); true /* doAutoCompletion */, DEFAULT_MAX_ERRORS, FIRST_WORD_INDEX); } void UnigramDictionary::getSuggestionCandidates(const bool useFullEditDistance, const int inputLength, Correction *correction, WordsPriorityQueuePool *queuePool, const bool doAutoCompletion, const int maxErrors) { const bool doAutoCompletion, const int maxErrors, const int currentWordIndex) { // TODO: Remove setCorrectionParams correction->setCorrectionParams(0, 0, 0, -1 /* spaceProximityPos */, -1 /* missingSpacePos */, useFullEditDistance, Loading @@ -305,7 +321,8 @@ void UnigramDictionary::getSuggestionCandidates(const bool useFullEditDistance, int firstChildPos; const bool needsToTraverseChildrenNodes = processCurrentNode(siblingPos, correction, &childCount, &firstChildPos, &siblingPos, queuePool); correction, &childCount, &firstChildPos, &siblingPos, queuePool, currentWordIndex); // Update next sibling pos correction->setTreeSiblingPos(outputIndex, siblingPos); Loading @@ -323,31 +340,32 @@ void UnigramDictionary::getSuggestionCandidates(const bool useFullEditDistance, void UnigramDictionary::getMissingSpaceWords(ProximityInfo *proximityInfo, const int *xcoordinates, const int *ycoordinates, const int *codes, const bool useFullEditDistance, const int inputLength, const int missingSpacePos, Correction *correction, WordsPriorityQueuePool* queuePool) { WordsPriorityQueuePool* queuePool, const bool hasAutoCorrectionCandidate) { getSplitTwoWordsSuggestions(proximityInfo, xcoordinates, ycoordinates, codes, useFullEditDistance, inputLength, missingSpacePos, -1/* spaceProximityPos */, correction, queuePool); correction, queuePool, hasAutoCorrectionCandidate); } void UnigramDictionary::getMistypedSpaceWords(ProximityInfo *proximityInfo, const int *xcoordinates, const int *ycoordinates, const int *codes, const bool useFullEditDistance, const int inputLength, const int spaceProximityPos, Correction *correction, WordsPriorityQueuePool* queuePool) { WordsPriorityQueuePool* queuePool, const bool hasAutoCorrectionCandidate) { getSplitTwoWordsSuggestions(proximityInfo, xcoordinates, ycoordinates, codes, useFullEditDistance, inputLength, -1 /* missingSpacePos */, spaceProximityPos, correction, queuePool); correction, queuePool, hasAutoCorrectionCandidate); } inline void UnigramDictionary::onTerminal(const int freq, const TerminalAttributes& terminalAttributes, Correction *correction, WordsPriorityQueuePool *queuePool, const bool addToMasterQueue) { WordsPriorityQueuePool *queuePool, const bool addToMasterQueue, const int currentWordIndex) { const int inputIndex = correction->getInputIndex(); const bool addToSubQueue = inputIndex < SUB_QUEUE_MAX_COUNT; int wordLength; unsigned short* wordPointer; if (addToMasterQueue) { if ((currentWordIndex == 1) && addToMasterQueue) { WordsPriorityQueue *masterQueue = queuePool->getMasterQueue(); const int finalFreq = correction->getFinalFreq(freq, &wordPointer, &wordLength); if (finalFreq != NOT_A_FREQUENCY) { Loading Loading @@ -376,9 +394,14 @@ inline void UnigramDictionary::onTerminal(const int freq, // We only allow two words + other error correction for words with SUB_QUEUE_MIN_WORD_LENGTH // or more length. if (inputIndex >= SUB_QUEUE_MIN_WORD_LENGTH && addToSubQueue) { // TODO: Check the validity of "inputIndex == wordLength" //if (addToSubQueue && inputIndex == wordLength) { WordsPriorityQueue *subQueue = queuePool->getSubQueue1(inputIndex); WordsPriorityQueue *subQueue; if (currentWordIndex == 1) { subQueue = queuePool->getSubQueue1(inputIndex); } else if (currentWordIndex == 2) { subQueue = queuePool->getSubQueue2(inputIndex); } else { return; } const int finalFreq = correction->getFinalFreqForSubQueue(freq, &wordPointer, &wordLength, inputIndex); addWord(wordPointer, wordLength, finalFreq, subQueue); Loading @@ -388,17 +411,21 @@ inline void UnigramDictionary::onTerminal(const int freq, void UnigramDictionary::getSplitTwoWordsSuggestions(ProximityInfo *proximityInfo, const int *xcoordinates, const int *ycoordinates, const int *codes, const bool useFullEditDistance, const int inputLength, const int missingSpacePos, const int spaceProximityPos, Correction *correction, WordsPriorityQueuePool* queuePool) { const int spaceProximityPos, Correction *correction, WordsPriorityQueuePool* queuePool, const bool hasAutoCorrectionCandidate) { if (inputLength >= MAX_WORD_LENGTH) return; if (DEBUG_DICT) { int inputCount = 0; if (spaceProximityPos >= 0) ++inputCount; if (missingSpacePos >= 0) ++inputCount; assert(inputCount <= 1); // MAX_PROXIMITY_CHARS_SIZE in ProximityInfo.java should be 16 assert(MAX_PROXIMITY_CHARS == 16); } initSuggestions(proximityInfo, xcoordinates, ycoordinates, codes, inputLength, correction); WordsPriorityQueue *masterQueue = queuePool->getMasterQueue(); const bool isSpaceProximity = spaceProximityPos >= 0; // First word Loading @@ -411,26 +438,22 @@ void UnigramDictionary::getSplitTwoWordsSuggestions(ProximityInfo *proximityInfo if (firstFreq > 0) { firstOutputWordLength = firstInputWordLength; firstOutputWord = mWord; } else { if (masterQueue->size() > 0) { double nsForMaster = masterQueue->getHighestNormalizedScore( proximityInfo->getPrimaryInputWord(), inputLength, 0, 0, 0); if (nsForMaster > START_TWO_WORDS_CORRECTION_THRESHOLD) { // Do nothing if the highest suggestion exceeds the threshold. return; } } } else if (!hasAutoCorrectionCandidate) { WordsPriorityQueue* firstWordQueue = queuePool->getSubQueue1(firstInputWordLength); if (firstWordQueue->size() < 1) { if (!firstWordQueue || firstWordQueue->size() < 1) { return; } int score = 0; const double ns = firstWordQueue->getHighestNormalizedScore( proximityInfo->getPrimaryInputWord(), firstInputWordLength, &firstOutputWord, &score, &firstOutputWordLength); if (DEBUG_DICT) { AKLOGI("NS1 = %f, Score = %d", ns, score); } // Two words correction won't be done if the score of the first word doesn't exceed the // threshold. if (ns < TWO_WORDS_CORRECTION_WITH_OTHER_ERROR_THRESHOLD) { if (ns < TWO_WORDS_CORRECTION_WITH_OTHER_ERROR_THRESHOLD || firstOutputWordLength < SUB_QUEUE_MIN_WORD_LENGTH) { return; } firstFreq = score >> (firstOutputWordLength Loading @@ -456,14 +479,6 @@ void UnigramDictionary::getSplitTwoWordsSuggestions(ProximityInfo *proximityInfo outputWord[firstOutputWordLength] = SPACE; outputWordLength = firstOutputWordLength + 1; //const int outputWordLength = firstOutputWordLength + secondWordLength + 1; // Space proximity preparation //WordsPriorityQueue *subQueue = queuePool->getSubQueue1(); //initSuggestions(proximityInfo, xcoordinates, ycoordinates, codes, firstOutputWordLength, //subQueue, correction); //getSuggestionCandidates(useFullEditDistance, firstOutputWordLength, correction, subQueue, //false, MAX_ERRORS_FOR_TWO_WORDS); // Second word const int secondInputWordLength = isSpaceProximity ? (inputLength - spaceProximityPos - 1) Loading @@ -478,9 +493,42 @@ void UnigramDictionary::getSplitTwoWordsSuggestions(ProximityInfo *proximityInfo if (secondFreq > 0) { secondOutputWordLength = secondInputWordLength; secondOutputWord = mWord; } else if (!hasAutoCorrectionCandidate) { const int offset = secondInputWordStartPos; initSuggestions(proximityInfo, &xcoordinates[offset], &ycoordinates[offset], codes + offset * MAX_PROXIMITY_CHARS, secondInputWordLength, correction); queuePool->clearSubQueue2(); getSuggestionCandidates(useFullEditDistance, secondInputWordLength, correction, queuePool, false, MAX_ERRORS_FOR_TWO_WORDS, SECOND_WORD_INDEX); if (DEBUG_DICT) { AKLOGI("Dump second word candidates %d", secondInputWordLength); for (int i = 0; i < SUB_QUEUE_MAX_COUNT; ++i) { queuePool->getSubQueue2(i)->dumpTopWord(); } } WordsPriorityQueue* secondWordQueue = queuePool->getSubQueue2(secondInputWordLength); if (!secondWordQueue || secondWordQueue->size() < 1) { return; } int score = 0; const double ns = secondWordQueue->getHighestNormalizedScore( proximityInfo->getPrimaryInputWord(), secondInputWordLength, &secondOutputWord, &score, &secondOutputWordLength); if (DEBUG_DICT) { AKLOGI("NS2 = %f, Score = %d", ns, score); } // Two words correction won't be done if the score of the first word doesn't exceed the // threshold. if (ns < TWO_WORDS_CORRECTION_WITH_OTHER_ERROR_THRESHOLD || secondOutputWordLength < SUB_QUEUE_MIN_WORD_LENGTH) { return; } secondFreq = score >> (secondOutputWordLength + TWO_WORDS_PLUS_OTHER_ERROR_CORRECTION_DEMOTION_DIVIDER); } if (DEBUG_DICT) { DUMP_WORD(secondOutputWord, secondOutputWordLength); AKLOGI("Second freq: %d", secondFreq); } Loading Loading @@ -742,7 +790,8 @@ int UnigramDictionary::getBigramPosition(int pos, unsigned short *word, int offs // given level, as output into newCount when traversing this level's parent. inline bool UnigramDictionary::processCurrentNode(const int initialPos, Correction *correction, int *newCount, int *newChildrenPosition, int *nextSiblingPosition, WordsPriorityQueuePool *queuePool) { int *newChildrenPosition, int *nextSiblingPosition, WordsPriorityQueuePool *queuePool, const int currentWordIndex) { if (DEBUG_DICT) { correction->checkState(); } Loading Loading @@ -823,7 +872,8 @@ inline bool UnigramDictionary::processCurrentNode(const int initialPos, const int childrenAddressPos = BinaryFormat::skipFrequency(flags, pos); const int attributesPos = BinaryFormat::skipChildrenPosition(flags, childrenAddressPos); TerminalAttributes terminalAttributes(DICT_ROOT, flags, attributesPos); onTerminal(freq, terminalAttributes, correction, queuePool, needsToInvokeOnTerminal); onTerminal(freq, terminalAttributes, correction, queuePool, needsToInvokeOnTerminal, currentWordIndex); // If there are more chars in this node, then this virtual node has children. // If we are on the last char, this virtual node has children if this node has. Loading native/src/unigram_dictionary.h +10 −6 Original line number Diff line number Diff line Loading @@ -99,11 +99,13 @@ class UnigramDictionary { const int inputLength, Correction *correction, WordsPriorityQueuePool* queuePool); void getSuggestionCandidates( const bool useFullEditDistance, const int inputLength, Correction *correction, WordsPriorityQueuePool* queuePool, const bool doAutoCompletion, const int maxErrors); WordsPriorityQueuePool* queuePool, const bool doAutoCompletion, const int maxErrors, const int currentWordIndex); void getSplitTwoWordsSuggestions(ProximityInfo *proximityInfo, const int *xcoordinates, const int *ycoordinates, const int *codes, const bool useFullEditDistance, const int inputLength, const int spaceProximityPos, const int missingSpacePos, Correction *correction, WordsPriorityQueuePool* queuePool); const int missingSpacePos, Correction *correction, WordsPriorityQueuePool* queuePool, const bool hasAutoCorrectionCandidate); void getSplitTwoWordsSuggestionsOld(ProximityInfo *proximityInfo, const int *xcoordinates, const int *ycoordinates, const int *codes, const bool useFullEditDistance, const int inputLength, const int spaceProximityPos, Loading @@ -111,18 +113,20 @@ class UnigramDictionary { void getMissingSpaceWords(ProximityInfo *proximityInfo, const int *xcoordinates, const int *ycoordinates, const int *codes, const bool useFullEditDistance, const int inputLength, const int missingSpacePos, Correction *correction, WordsPriorityQueuePool* queuePool); WordsPriorityQueuePool* queuePool, const bool hasAutoCorrectionCandidate); void getMistypedSpaceWords(ProximityInfo *proximityInfo, const int *xcoordinates, const int *ycoordinates, const int *codes, const bool useFullEditDistance, const int inputLength, const int spaceProximityPos, Correction *correction, WordsPriorityQueuePool* queuePool); WordsPriorityQueuePool* queuePool, const bool hasAutoCorrectionCandidate); void onTerminal(const int freq, const TerminalAttributes& terminalAttributes, Correction *correction, WordsPriorityQueuePool *queuePool, const bool addToMasterQueue); Correction *correction, WordsPriorityQueuePool *queuePool, const bool addToMasterQueue, const int currentWordIndex); bool needsToSkipCurrentNode(const unsigned short c, const int inputIndex, const int skipPos, const int depth); // Process a node by considering proximity, missing and excessive character bool processCurrentNode(const int initialPos, Correction *correction, int *newCount, int *newChildPosition, int *nextSiblingPosition, WordsPriorityQueuePool *queuePool); int *newChildPosition, int *nextSiblingPosition, WordsPriorityQueuePool *queuePool, const int currentWordIndex); int getMostFrequentWordLike(const int startInputIndex, const int inputLength, ProximityInfo *proximityInfo, unsigned short *word); int getMostFrequentWordLikeInner(const uint16_t* const inWord, const int length, Loading Loading
native/src/correction.cpp +6 −51 Original line number Diff line number Diff line Loading @@ -269,7 +269,7 @@ bool Correction::needsToPrune() const { // TODO: use edit distance here return mOutputIndex - 1 >= mMaxDepth || mProximityCount > mMaxEditDistance // Allow one char longer word for missing character || (!mDoAutoCompletion && (mOutputIndex + 1 >= mInputLength)); || (!mDoAutoCompletion && (mOutputIndex > mInputLength)); } void Correction::addCharToCurrentWord(const int32_t c) { Loading Loading @@ -555,55 +555,6 @@ Correction::CorrectionType Correction::processCharAndCalcState( Correction::~Correction() { } ///////////////////////// // static inline utils // ///////////////////////// static const int TWO_31ST_DIV_255 = S_INT_MAX / 255; static inline int capped255MultForFullMatchAccentsOrCapitalizationDifference(const int num) { return (num < TWO_31ST_DIV_255 ? 255 * num : S_INT_MAX); } static const int TWO_31ST_DIV_2 = S_INT_MAX / 2; inline static void multiplyIntCapped(const int multiplier, int *base) { const int temp = *base; if (temp != S_INT_MAX) { // Branch if multiplier == 2 for the optimization if (multiplier == 2) { *base = TWO_31ST_DIV_2 >= temp ? temp << 1 : S_INT_MAX; } else { // TODO: This overflow check gives a wrong answer when, for example, // temp = 2^16 + 1 and multiplier = 2^17 + 1. // Fix this behavior. const int tempRetval = temp * multiplier; *base = tempRetval >= temp ? tempRetval : S_INT_MAX; } } } inline static int powerIntCapped(const int base, const int n) { if (n <= 0) return 1; if (base == 2) { return n < 31 ? 1 << n : S_INT_MAX; } else { int ret = base; for (int i = 1; i < n; ++i) multiplyIntCapped(base, &ret); return ret; } } inline static void multiplyRate(const int rate, int *freq) { if (*freq != S_INT_MAX) { if (*freq > 1000000) { *freq /= 100; multiplyIntCapped(rate, freq); } else { multiplyIntCapped(rate, freq); *freq /= 100; } } } inline static int getQuoteCount(const unsigned short* word, const int length) { int quoteCount = 0; for (int i = 0; i < length; ++i) { Loading Loading @@ -939,7 +890,11 @@ int Correction::RankingAlgorithm::calcFreqForSplitTwoWords( multiplyRate(WORDS_WITH_PROXIMITY_CHARACTER_DEMOTION_RATE, &totalFreq); } if (isSpaceProximity) { multiplyRate(WORDS_WITH_MISTYPED_SPACE_DEMOTION_RATE, &totalFreq); } else { multiplyRate(WORDS_WITH_MISSING_SPACE_CHARACTER_DEMOTION_RATE, &totalFreq); } if (capitalizedWordDemotion) { multiplyRate(TWO_WORDS_CAPITALIZED_DEMOTION_RATE, &totalFreq); Loading
native/src/correction.h +49 −0 Original line number Diff line number Diff line Loading @@ -36,6 +36,55 @@ class Correction { NOT_ON_TERMINAL } CorrectionType; ///////////////////////// // static inline utils // ///////////////////////// static const int TWO_31ST_DIV_255 = S_INT_MAX / 255; static inline int capped255MultForFullMatchAccentsOrCapitalizationDifference(const int num) { return (num < TWO_31ST_DIV_255 ? 255 * num : S_INT_MAX); } static const int TWO_31ST_DIV_2 = S_INT_MAX / 2; inline static void multiplyIntCapped(const int multiplier, int *base) { const int temp = *base; if (temp != S_INT_MAX) { // Branch if multiplier == 2 for the optimization if (multiplier == 2) { *base = TWO_31ST_DIV_2 >= temp ? temp << 1 : S_INT_MAX; } else { // TODO: This overflow check gives a wrong answer when, for example, // temp = 2^16 + 1 and multiplier = 2^17 + 1. // Fix this behavior. const int tempRetval = temp * multiplier; *base = tempRetval >= temp ? tempRetval : S_INT_MAX; } } } inline static int powerIntCapped(const int base, const int n) { if (n <= 0) return 1; if (base == 2) { return n < 31 ? 1 << n : S_INT_MAX; } else { int ret = base; for (int i = 1; i < n; ++i) multiplyIntCapped(base, &ret); return ret; } } inline static void multiplyRate(const int rate, int *freq) { if (*freq != S_INT_MAX) { if (*freq > 1000000) { *freq /= 100; multiplyIntCapped(rate, freq); } else { multiplyIntCapped(rate, freq); *freq /= 100; } } } Correction(const int typedLetterMultiplier, const int fullWordMultiplier); void initCorrection( const ProximityInfo *pi, const int inputLength, const int maxWordLength); Loading
native/src/defines.h +4 −0 Original line number Diff line number Diff line Loading @@ -189,6 +189,7 @@ static void prof_out(void) { #define WORDS_WITH_MISSING_CHARACTER_DEMOTION_RATE 80 #define WORDS_WITH_MISSING_CHARACTER_DEMOTION_START_POS_10X 12 #define WORDS_WITH_MISSING_SPACE_CHARACTER_DEMOTION_RATE 58 #define WORDS_WITH_MISTYPED_SPACE_DEMOTION_RATE 50 #define WORDS_WITH_EXCESSIVE_CHARACTER_DEMOTION_RATE 75 #define WORDS_WITH_EXCESSIVE_CHARACTER_OUT_OF_PROXIMITY_DEMOTION_RATE 75 #define WORDS_WITH_TRANSPOSED_CHARACTERS_DEMOTION_RATE 70 Loading Loading @@ -222,6 +223,9 @@ static void prof_out(void) { #define MAX_DEPTH_MULTIPLIER 3 #define FIRST_WORD_INDEX 1 #define SECOND_WORD_INDEX 2 // TODO: Reduce this constant if possible; check the maximum number of umlauts in the same German // word in the dictionary #define DEFAULT_MAX_UMLAUT_SEARCH_DEPTH 5 Loading
native/src/unigram_dictionary.cpp +89 −39 Original line number Diff line number Diff line Loading @@ -159,19 +159,26 @@ int UnigramDictionary::getSuggestions(ProximityInfo *proximityInfo, } PROF_START(20); if (DEBUG_DICT) { double ns = queuePool->getMasterQueue()->getHighestNormalizedScore( proximityInfo->getPrimaryInputWord(), codesSize, 0, 0, 0); ns += 0; AKLOGI("Max normalized score = %f", ns); } const int suggestedWordsCount = queuePool->getMasterQueue()->outputSuggestions(frequencies, outWords); if (DEBUG_DICT) { double ns = queuePool->getMasterQueue()->getHighestNormalizedScore( proximityInfo->getPrimaryInputWord(), codesSize, 0, 0, 0); ns += 0; AKLOGI("Returning %d words", suggestedWordsCount); /// Print the returned words for (int j = 0; j < suggestedWordsCount; ++j) { #ifdef FLAG_DBG short unsigned int* w = outWords + j * MAX_WORD_LENGTH; char s[MAX_WORD_LENGTH]; for (int i = 0; i <= MAX_WORD_LENGTH; i++) s[i] = w[i]; AKLOGI("%s %i", s, frequencies[j]); #endif } } PROF_END(20); Loading Loading @@ -205,6 +212,13 @@ void UnigramDictionary::getWordSuggestions(ProximityInfo *proximityInfo, PROF_START(4); // Note: This line is intentionally left blank bool hasAutoCorrectionCandidate = false; WordsPriorityQueue* masterQueue = queuePool->getMasterQueue(); if (masterQueue->size() > 0) { double nsForMaster = masterQueue->getHighestNormalizedScore( proximityInfo->getPrimaryInputWord(), inputLength, 0, 0, 0); hasAutoCorrectionCandidate = (nsForMaster > START_TWO_WORDS_CORRECTION_THRESHOLD); } PROF_END(4); PROF_START(5); Loading @@ -216,7 +230,8 @@ void UnigramDictionary::getWordSuggestions(ProximityInfo *proximityInfo, AKLOGI("--- Suggest missing space characters %d", i); } getMissingSpaceWords(proximityInfo, xcoordinates, ycoordinates, codes, useFullEditDistance, inputLength, i, correction, queuePool); useFullEditDistance, inputLength, i, correction, queuePool, hasAutoCorrectionCandidate); } } PROF_END(5); Loading @@ -236,7 +251,8 @@ void UnigramDictionary::getWordSuggestions(ProximityInfo *proximityInfo, } if (proximityInfo->hasSpaceProximity(x, y)) { getMistypedSpaceWords(proximityInfo, xcoordinates, ycoordinates, codes, useFullEditDistance, inputLength, i, correction, queuePool); useFullEditDistance, inputLength, i, correction, queuePool, hasAutoCorrectionCandidate); } } } Loading Loading @@ -281,12 +297,12 @@ void UnigramDictionary::getOneWordSuggestions(ProximityInfo *proximityInfo, WordsPriorityQueuePool *queuePool) { initSuggestions(proximityInfo, xcoordinates, ycoordinates, codes, inputLength, correction); getSuggestionCandidates(useFullEditDistance, inputLength, correction, queuePool, true /* doAutoCompletion */, DEFAULT_MAX_ERRORS); true /* doAutoCompletion */, DEFAULT_MAX_ERRORS, FIRST_WORD_INDEX); } void UnigramDictionary::getSuggestionCandidates(const bool useFullEditDistance, const int inputLength, Correction *correction, WordsPriorityQueuePool *queuePool, const bool doAutoCompletion, const int maxErrors) { const bool doAutoCompletion, const int maxErrors, const int currentWordIndex) { // TODO: Remove setCorrectionParams correction->setCorrectionParams(0, 0, 0, -1 /* spaceProximityPos */, -1 /* missingSpacePos */, useFullEditDistance, Loading @@ -305,7 +321,8 @@ void UnigramDictionary::getSuggestionCandidates(const bool useFullEditDistance, int firstChildPos; const bool needsToTraverseChildrenNodes = processCurrentNode(siblingPos, correction, &childCount, &firstChildPos, &siblingPos, queuePool); correction, &childCount, &firstChildPos, &siblingPos, queuePool, currentWordIndex); // Update next sibling pos correction->setTreeSiblingPos(outputIndex, siblingPos); Loading @@ -323,31 +340,32 @@ void UnigramDictionary::getSuggestionCandidates(const bool useFullEditDistance, void UnigramDictionary::getMissingSpaceWords(ProximityInfo *proximityInfo, const int *xcoordinates, const int *ycoordinates, const int *codes, const bool useFullEditDistance, const int inputLength, const int missingSpacePos, Correction *correction, WordsPriorityQueuePool* queuePool) { WordsPriorityQueuePool* queuePool, const bool hasAutoCorrectionCandidate) { getSplitTwoWordsSuggestions(proximityInfo, xcoordinates, ycoordinates, codes, useFullEditDistance, inputLength, missingSpacePos, -1/* spaceProximityPos */, correction, queuePool); correction, queuePool, hasAutoCorrectionCandidate); } void UnigramDictionary::getMistypedSpaceWords(ProximityInfo *proximityInfo, const int *xcoordinates, const int *ycoordinates, const int *codes, const bool useFullEditDistance, const int inputLength, const int spaceProximityPos, Correction *correction, WordsPriorityQueuePool* queuePool) { WordsPriorityQueuePool* queuePool, const bool hasAutoCorrectionCandidate) { getSplitTwoWordsSuggestions(proximityInfo, xcoordinates, ycoordinates, codes, useFullEditDistance, inputLength, -1 /* missingSpacePos */, spaceProximityPos, correction, queuePool); correction, queuePool, hasAutoCorrectionCandidate); } inline void UnigramDictionary::onTerminal(const int freq, const TerminalAttributes& terminalAttributes, Correction *correction, WordsPriorityQueuePool *queuePool, const bool addToMasterQueue) { WordsPriorityQueuePool *queuePool, const bool addToMasterQueue, const int currentWordIndex) { const int inputIndex = correction->getInputIndex(); const bool addToSubQueue = inputIndex < SUB_QUEUE_MAX_COUNT; int wordLength; unsigned short* wordPointer; if (addToMasterQueue) { if ((currentWordIndex == 1) && addToMasterQueue) { WordsPriorityQueue *masterQueue = queuePool->getMasterQueue(); const int finalFreq = correction->getFinalFreq(freq, &wordPointer, &wordLength); if (finalFreq != NOT_A_FREQUENCY) { Loading Loading @@ -376,9 +394,14 @@ inline void UnigramDictionary::onTerminal(const int freq, // We only allow two words + other error correction for words with SUB_QUEUE_MIN_WORD_LENGTH // or more length. if (inputIndex >= SUB_QUEUE_MIN_WORD_LENGTH && addToSubQueue) { // TODO: Check the validity of "inputIndex == wordLength" //if (addToSubQueue && inputIndex == wordLength) { WordsPriorityQueue *subQueue = queuePool->getSubQueue1(inputIndex); WordsPriorityQueue *subQueue; if (currentWordIndex == 1) { subQueue = queuePool->getSubQueue1(inputIndex); } else if (currentWordIndex == 2) { subQueue = queuePool->getSubQueue2(inputIndex); } else { return; } const int finalFreq = correction->getFinalFreqForSubQueue(freq, &wordPointer, &wordLength, inputIndex); addWord(wordPointer, wordLength, finalFreq, subQueue); Loading @@ -388,17 +411,21 @@ inline void UnigramDictionary::onTerminal(const int freq, void UnigramDictionary::getSplitTwoWordsSuggestions(ProximityInfo *proximityInfo, const int *xcoordinates, const int *ycoordinates, const int *codes, const bool useFullEditDistance, const int inputLength, const int missingSpacePos, const int spaceProximityPos, Correction *correction, WordsPriorityQueuePool* queuePool) { const int spaceProximityPos, Correction *correction, WordsPriorityQueuePool* queuePool, const bool hasAutoCorrectionCandidate) { if (inputLength >= MAX_WORD_LENGTH) return; if (DEBUG_DICT) { int inputCount = 0; if (spaceProximityPos >= 0) ++inputCount; if (missingSpacePos >= 0) ++inputCount; assert(inputCount <= 1); // MAX_PROXIMITY_CHARS_SIZE in ProximityInfo.java should be 16 assert(MAX_PROXIMITY_CHARS == 16); } initSuggestions(proximityInfo, xcoordinates, ycoordinates, codes, inputLength, correction); WordsPriorityQueue *masterQueue = queuePool->getMasterQueue(); const bool isSpaceProximity = spaceProximityPos >= 0; // First word Loading @@ -411,26 +438,22 @@ void UnigramDictionary::getSplitTwoWordsSuggestions(ProximityInfo *proximityInfo if (firstFreq > 0) { firstOutputWordLength = firstInputWordLength; firstOutputWord = mWord; } else { if (masterQueue->size() > 0) { double nsForMaster = masterQueue->getHighestNormalizedScore( proximityInfo->getPrimaryInputWord(), inputLength, 0, 0, 0); if (nsForMaster > START_TWO_WORDS_CORRECTION_THRESHOLD) { // Do nothing if the highest suggestion exceeds the threshold. return; } } } else if (!hasAutoCorrectionCandidate) { WordsPriorityQueue* firstWordQueue = queuePool->getSubQueue1(firstInputWordLength); if (firstWordQueue->size() < 1) { if (!firstWordQueue || firstWordQueue->size() < 1) { return; } int score = 0; const double ns = firstWordQueue->getHighestNormalizedScore( proximityInfo->getPrimaryInputWord(), firstInputWordLength, &firstOutputWord, &score, &firstOutputWordLength); if (DEBUG_DICT) { AKLOGI("NS1 = %f, Score = %d", ns, score); } // Two words correction won't be done if the score of the first word doesn't exceed the // threshold. if (ns < TWO_WORDS_CORRECTION_WITH_OTHER_ERROR_THRESHOLD) { if (ns < TWO_WORDS_CORRECTION_WITH_OTHER_ERROR_THRESHOLD || firstOutputWordLength < SUB_QUEUE_MIN_WORD_LENGTH) { return; } firstFreq = score >> (firstOutputWordLength Loading @@ -456,14 +479,6 @@ void UnigramDictionary::getSplitTwoWordsSuggestions(ProximityInfo *proximityInfo outputWord[firstOutputWordLength] = SPACE; outputWordLength = firstOutputWordLength + 1; //const int outputWordLength = firstOutputWordLength + secondWordLength + 1; // Space proximity preparation //WordsPriorityQueue *subQueue = queuePool->getSubQueue1(); //initSuggestions(proximityInfo, xcoordinates, ycoordinates, codes, firstOutputWordLength, //subQueue, correction); //getSuggestionCandidates(useFullEditDistance, firstOutputWordLength, correction, subQueue, //false, MAX_ERRORS_FOR_TWO_WORDS); // Second word const int secondInputWordLength = isSpaceProximity ? (inputLength - spaceProximityPos - 1) Loading @@ -478,9 +493,42 @@ void UnigramDictionary::getSplitTwoWordsSuggestions(ProximityInfo *proximityInfo if (secondFreq > 0) { secondOutputWordLength = secondInputWordLength; secondOutputWord = mWord; } else if (!hasAutoCorrectionCandidate) { const int offset = secondInputWordStartPos; initSuggestions(proximityInfo, &xcoordinates[offset], &ycoordinates[offset], codes + offset * MAX_PROXIMITY_CHARS, secondInputWordLength, correction); queuePool->clearSubQueue2(); getSuggestionCandidates(useFullEditDistance, secondInputWordLength, correction, queuePool, false, MAX_ERRORS_FOR_TWO_WORDS, SECOND_WORD_INDEX); if (DEBUG_DICT) { AKLOGI("Dump second word candidates %d", secondInputWordLength); for (int i = 0; i < SUB_QUEUE_MAX_COUNT; ++i) { queuePool->getSubQueue2(i)->dumpTopWord(); } } WordsPriorityQueue* secondWordQueue = queuePool->getSubQueue2(secondInputWordLength); if (!secondWordQueue || secondWordQueue->size() < 1) { return; } int score = 0; const double ns = secondWordQueue->getHighestNormalizedScore( proximityInfo->getPrimaryInputWord(), secondInputWordLength, &secondOutputWord, &score, &secondOutputWordLength); if (DEBUG_DICT) { AKLOGI("NS2 = %f, Score = %d", ns, score); } // Two words correction won't be done if the score of the first word doesn't exceed the // threshold. if (ns < TWO_WORDS_CORRECTION_WITH_OTHER_ERROR_THRESHOLD || secondOutputWordLength < SUB_QUEUE_MIN_WORD_LENGTH) { return; } secondFreq = score >> (secondOutputWordLength + TWO_WORDS_PLUS_OTHER_ERROR_CORRECTION_DEMOTION_DIVIDER); } if (DEBUG_DICT) { DUMP_WORD(secondOutputWord, secondOutputWordLength); AKLOGI("Second freq: %d", secondFreq); } Loading Loading @@ -742,7 +790,8 @@ int UnigramDictionary::getBigramPosition(int pos, unsigned short *word, int offs // given level, as output into newCount when traversing this level's parent. inline bool UnigramDictionary::processCurrentNode(const int initialPos, Correction *correction, int *newCount, int *newChildrenPosition, int *nextSiblingPosition, WordsPriorityQueuePool *queuePool) { int *newChildrenPosition, int *nextSiblingPosition, WordsPriorityQueuePool *queuePool, const int currentWordIndex) { if (DEBUG_DICT) { correction->checkState(); } Loading Loading @@ -823,7 +872,8 @@ inline bool UnigramDictionary::processCurrentNode(const int initialPos, const int childrenAddressPos = BinaryFormat::skipFrequency(flags, pos); const int attributesPos = BinaryFormat::skipChildrenPosition(flags, childrenAddressPos); TerminalAttributes terminalAttributes(DICT_ROOT, flags, attributesPos); onTerminal(freq, terminalAttributes, correction, queuePool, needsToInvokeOnTerminal); onTerminal(freq, terminalAttributes, correction, queuePool, needsToInvokeOnTerminal, currentWordIndex); // If there are more chars in this node, then this virtual node has children. // If we are on the last char, this virtual node has children if this node has. Loading
native/src/unigram_dictionary.h +10 −6 Original line number Diff line number Diff line Loading @@ -99,11 +99,13 @@ class UnigramDictionary { const int inputLength, Correction *correction, WordsPriorityQueuePool* queuePool); void getSuggestionCandidates( const bool useFullEditDistance, const int inputLength, Correction *correction, WordsPriorityQueuePool* queuePool, const bool doAutoCompletion, const int maxErrors); WordsPriorityQueuePool* queuePool, const bool doAutoCompletion, const int maxErrors, const int currentWordIndex); void getSplitTwoWordsSuggestions(ProximityInfo *proximityInfo, const int *xcoordinates, const int *ycoordinates, const int *codes, const bool useFullEditDistance, const int inputLength, const int spaceProximityPos, const int missingSpacePos, Correction *correction, WordsPriorityQueuePool* queuePool); const int missingSpacePos, Correction *correction, WordsPriorityQueuePool* queuePool, const bool hasAutoCorrectionCandidate); void getSplitTwoWordsSuggestionsOld(ProximityInfo *proximityInfo, const int *xcoordinates, const int *ycoordinates, const int *codes, const bool useFullEditDistance, const int inputLength, const int spaceProximityPos, Loading @@ -111,18 +113,20 @@ class UnigramDictionary { void getMissingSpaceWords(ProximityInfo *proximityInfo, const int *xcoordinates, const int *ycoordinates, const int *codes, const bool useFullEditDistance, const int inputLength, const int missingSpacePos, Correction *correction, WordsPriorityQueuePool* queuePool); WordsPriorityQueuePool* queuePool, const bool hasAutoCorrectionCandidate); void getMistypedSpaceWords(ProximityInfo *proximityInfo, const int *xcoordinates, const int *ycoordinates, const int *codes, const bool useFullEditDistance, const int inputLength, const int spaceProximityPos, Correction *correction, WordsPriorityQueuePool* queuePool); WordsPriorityQueuePool* queuePool, const bool hasAutoCorrectionCandidate); void onTerminal(const int freq, const TerminalAttributes& terminalAttributes, Correction *correction, WordsPriorityQueuePool *queuePool, const bool addToMasterQueue); Correction *correction, WordsPriorityQueuePool *queuePool, const bool addToMasterQueue, const int currentWordIndex); bool needsToSkipCurrentNode(const unsigned short c, const int inputIndex, const int skipPos, const int depth); // Process a node by considering proximity, missing and excessive character bool processCurrentNode(const int initialPos, Correction *correction, int *newCount, int *newChildPosition, int *nextSiblingPosition, WordsPriorityQueuePool *queuePool); int *newChildPosition, int *nextSiblingPosition, WordsPriorityQueuePool *queuePool, const int currentWordIndex); int getMostFrequentWordLike(const int startInputIndex, const int inputLength, ProximityInfo *proximityInfo, unsigned short *word); int getMostFrequentWordLikeInner(const uint16_t* const inWord, const int length, Loading