Loading native/src/correction.cpp +95 −1 Original line number Diff line number Diff line Loading @@ -83,7 +83,7 @@ inline static void calcEditDistanceOneStep(int *editDistanceTable, const unsigne inline static int getCurrentEditDistance( int *editDistanceTable, const int inputLength, const int outputLength) { if (DEBUG_DICT) { if (DEBUG_EDIT_DISTANCE) { AKLOGI("getCurrentEditDistance %d, %d", inputLength, outputLength); } return editDistanceTable[(inputLength + 1) * (outputLength + 1) - 1]; Loading Loading @@ -935,6 +935,100 @@ int Correction::RankingAlgorithm::calcFreqForSplitTwoWords( return totalFreq; } /* static */ int Correction::RankingAlgorithm::calcFreqForSplitTwoWordsOld( const int firstFreq, const int secondFreq, const Correction* correction, const unsigned short *word) { const int spaceProximityPos = correction->mSpaceProximityPos; const int missingSpacePos = correction->mMissingSpacePos; if (DEBUG_DICT) { int inputCount = 0; if (spaceProximityPos >= 0) ++inputCount; if (missingSpacePos >= 0) ++inputCount; assert(inputCount <= 1); } const bool isSpaceProximity = spaceProximityPos >= 0; const int inputLength = correction->mInputLength; const int firstWordLength = isSpaceProximity ? spaceProximityPos : missingSpacePos; const int secondWordLength = isSpaceProximity ? (inputLength - spaceProximityPos - 1) : (inputLength - missingSpacePos); const int typedLetterMultiplier = correction->TYPED_LETTER_MULTIPLIER; bool firstCapitalizedWordDemotion = false; if (firstWordLength >= 2) { firstCapitalizedWordDemotion = isUpperCase(word[0]); } bool secondCapitalizedWordDemotion = false; if (secondWordLength >= 2) { secondCapitalizedWordDemotion = isUpperCase(word[firstWordLength + 1]); } const bool capitalizedWordDemotion = firstCapitalizedWordDemotion ^ secondCapitalizedWordDemotion; if (DEBUG_DICT_FULL) { AKLOGI("Two words: %c, %c, %d", word[0], word[firstWordLength + 1], capitalizedWordDemotion); } if (firstWordLength == 0 || secondWordLength == 0) { return 0; } const int firstDemotionRate = 100 - 100 / (firstWordLength + 1); int tempFirstFreq = firstFreq; multiplyRate(firstDemotionRate, &tempFirstFreq); const int secondDemotionRate = 100 - 100 / (secondWordLength + 1); int tempSecondFreq = secondFreq; multiplyRate(secondDemotionRate, &tempSecondFreq); const int totalLength = firstWordLength + secondWordLength; // Promote pairFreq with multiplying by 2, because the word length is the same as the typed // length. int totalFreq = tempFirstFreq + tempSecondFreq; // This is a workaround to try offsetting the not-enough-demotion which will be done in // calcNormalizedScore in Utils.java. // In calcNormalizedScore the score will be demoted by (1 - 1 / length) // but we demoted only (1 - 1 / (length + 1)) so we will additionally adjust freq by // (1 - 1 / length) / (1 - 1 / (length + 1)) = (1 - 1 / (length * length)) const int normalizedScoreNotEnoughDemotionAdjustment = 100 - 100 / (totalLength * totalLength); multiplyRate(normalizedScoreNotEnoughDemotionAdjustment, &totalFreq); // At this moment, totalFreq is calculated by the following formula: // (firstFreq * (1 - 1 / (firstWordLength + 1)) + secondFreq * (1 - 1 / (secondWordLength + 1))) // * (1 - 1 / totalLength) / (1 - 1 / (totalLength + 1)) multiplyIntCapped(powerIntCapped(typedLetterMultiplier, totalLength), &totalFreq); // This is another workaround to offset the demotion which will be done in // calcNormalizedScore in Utils.java. // In calcNormalizedScore the score will be demoted by (1 - 1 / length) so we have to promote // the same amount because we already have adjusted the synthetic freq of this "missing or // mistyped space" suggestion candidate above in this method. const int normalizedScoreDemotionRateOffset = (100 + 100 / totalLength); multiplyRate(normalizedScoreDemotionRateOffset, &totalFreq); if (isSpaceProximity) { // A word pair with one space proximity correction if (DEBUG_DICT) { AKLOGI("Found a word pair with space proximity correction."); } multiplyIntCapped(typedLetterMultiplier, &totalFreq); multiplyRate(WORDS_WITH_PROXIMITY_CHARACTER_DEMOTION_RATE, &totalFreq); } multiplyRate(WORDS_WITH_MISSING_SPACE_CHARACTER_DEMOTION_RATE, &totalFreq); if (capitalizedWordDemotion) { multiplyRate(TWO_WORDS_CAPITALIZED_DEMOTION_RATE, &totalFreq); } return totalFreq; } /* Damerau-Levenshtein distance */ inline static int editDistanceInternal( int* editDistanceTable, const unsigned short* before, Loading native/src/correction.h +2 −0 Original line number Diff line number Diff line Loading @@ -100,6 +100,8 @@ class Correction { const int freq, int *editDistanceTable, const Correction* correction); static int calcFreqForSplitTwoWords(const int firstFreq, const int secondFreq, const Correction* correction, const unsigned short *word); static int calcFreqForSplitTwoWordsOld(const int firstFreq, const int secondFreq, const Correction* correction, const unsigned short *word); static double calcNormalizedScore(const unsigned short* before, const int beforeLength, const unsigned short* after, const int afterLength, const int score); static int editDistance(const unsigned short* before, Loading native/src/defines.h +4 −2 Original line number Diff line number Diff line Loading @@ -117,8 +117,8 @@ static void prof_out(void) { #define DEBUG_TRACE DEBUG_DICT_FULL #define DEBUG_PROXIMITY_INFO false #define DEBUG_CORRECTION false #define DEBUG_CORRECTION_FREQ true #define DEBUG_WORDS_PRIORITY_QUEUE true #define DEBUG_CORRECTION_FREQ false #define DEBUG_WORDS_PRIORITY_QUEUE false #else // FLAG_DBG Loading Loading @@ -213,6 +213,8 @@ static void prof_out(void) { #define SUB_QUEUE_MAX_WORDS 1 #define SUB_QUEUE_MAX_COUNT 10 #define TWO_WORDS_CORRECTION_THRESHOLD 0.22f #define MAX_DEPTH_MULTIPLIER 3 // TODO: Reduce this constant if possible; check the maximum number of umlauts in the same German Loading native/src/unigram_dictionary.cpp +91 −1 Original line number Diff line number Diff line Loading @@ -241,8 +241,24 @@ void UnigramDictionary::getWordSuggestions(ProximityInfo *proximityInfo, } } PROF_END(6); if (DEBUG_WORDS_PRIORITY_QUEUE) { if (DEBUG_DICT) { queuePool->dumpSubQueue1TopSuggestions(); for (int i = 0; i < SUB_QUEUE_MAX_COUNT; ++i) { WordsPriorityQueue* queue = queuePool->getSubQueue1(i); if (queue->size() > 0) { WordsPriorityQueue::SuggestedWord* sw = queue->top(); const int score = sw->mScore; const unsigned short* word = sw->mWord; const int wordLength = sw->mWordLength; double ns = Correction::RankingAlgorithm::calcNormalizedScore( proximityInfo->getPrimaryInputWord(), i, word, wordLength, score); ns += 0; AKLOGI("--- TOP SUB WORDS for %d --- %d %f [%d]", i, score, ns, (ns > TWO_WORDS_CORRECTION_THRESHOLD)); DUMP_WORD(proximityInfo->getPrimaryInputWord(), i); DUMP_WORD(word, wordLength); } } } } Loading Loading @@ -441,6 +457,80 @@ void UnigramDictionary::getSplitTwoWordsSuggestions(ProximityInfo *proximityInfo return; } void UnigramDictionary::getSplitTwoWordsSuggestionsOld(ProximityInfo *proximityInfo, const int *xcoordinates, const int *ycoordinates, const int *codes, const bool useFullEditDistance, const int inputLength, const int missingSpacePos, const int spaceProximityPos, Correction *correction, WordsPriorityQueuePool* queuePool) { WordsPriorityQueue *masterQueue = queuePool->getMasterQueue(); if (DEBUG_DICT) { int inputCount = 0; if (spaceProximityPos >= 0) ++inputCount; if (missingSpacePos >= 0) ++inputCount; assert(inputCount <= 1); } const bool isSpaceProximity = spaceProximityPos >= 0; const int firstWordStartPos = 0; const int secondWordStartPos = isSpaceProximity ? (spaceProximityPos + 1) : missingSpacePos; const int firstWordLength = isSpaceProximity ? spaceProximityPos : missingSpacePos; const int secondWordLength = isSpaceProximity ? (inputLength - spaceProximityPos - 1) : (inputLength - missingSpacePos); if (inputLength >= MAX_WORD_LENGTH) return; if (0 >= firstWordLength || 0 >= secondWordLength || firstWordStartPos >= secondWordStartPos || firstWordStartPos < 0 || secondWordStartPos + secondWordLength > inputLength) return; const int newWordLength = firstWordLength + secondWordLength + 1; // Space proximity preparation //WordsPriorityQueue *subQueue = queuePool->getSubQueue1(); //initSuggestions(proximityInfo, xcoordinates, ycoordinates, codes, firstWordLength, subQueue, //correction); //getSuggestionCandidates(useFullEditDistance, firstWordLength, correction, subQueue, false, //MAX_ERRORS_FOR_TWO_WORDS); // Allocating variable length array on stack unsigned short word[newWordLength]; const int firstFreq = getMostFrequentWordLike( firstWordStartPos, firstWordLength, proximityInfo, mWord); if (DEBUG_DICT) { AKLOGI("First freq: %d", firstFreq); } if (firstFreq <= 0) return; for (int i = 0; i < firstWordLength; ++i) { word[i] = mWord[i]; } const int secondFreq = getMostFrequentWordLike( secondWordStartPos, secondWordLength, proximityInfo, mWord); if (DEBUG_DICT) { AKLOGI("Second freq: %d", secondFreq); } if (secondFreq <= 0) return; word[firstWordLength] = SPACE; for (int i = (firstWordLength + 1); i < newWordLength; ++i) { word[i] = mWord[i - firstWordLength - 1]; } // TODO: Remove initSuggestions and correction->setCorrectionParams initSuggestions(proximityInfo, xcoordinates, ycoordinates, codes, inputLength, correction); correction->setCorrectionParams(-1 /* skipPos */, -1 /* excessivePos */, -1 /* transposedPos */, spaceProximityPos, missingSpacePos, useFullEditDistance, false /* doAutoCompletion */, MAX_ERRORS_FOR_TWO_WORDS); const int pairFreq = correction->getFreqForSplitTwoWords(firstFreq, secondFreq, word); if (DEBUG_DICT) { AKLOGI("Split two words: %d, %d, %d, %d", firstFreq, secondFreq, pairFreq, inputLength); } addWord(word, newWordLength, pairFreq, masterQueue); return; } // Wrapper for getMostFrequentWordLikeInner, which matches it to the previous // interface. inline int UnigramDictionary::getMostFrequentWordLike(const int startInputIndex, Loading native/src/unigram_dictionary.h +4 −0 Original line number Diff line number Diff line Loading @@ -104,6 +104,10 @@ class UnigramDictionary { const int *xcoordinates, const int *ycoordinates, const int *codes, const bool useFullEditDistance, const int inputLength, const int spaceProximityPos, const int missingSpacePos, Correction *correction, WordsPriorityQueuePool* queuePool); void getSplitTwoWordsSuggestionsOld(ProximityInfo *proximityInfo, const int *xcoordinates, const int *ycoordinates, const int *codes, const bool useFullEditDistance, const int inputLength, const int spaceProximityPos, const int missingSpacePos, Correction *correction, WordsPriorityQueuePool* queuePool); void getMissingSpaceWords(ProximityInfo *proximityInfo, const int *xcoordinates, const int *ycoordinates, const int *codes, const bool useFullEditDistance, const int inputLength, const int missingSpacePos, Correction *correction, Loading Loading
native/src/correction.cpp +95 −1 Original line number Diff line number Diff line Loading @@ -83,7 +83,7 @@ inline static void calcEditDistanceOneStep(int *editDistanceTable, const unsigne inline static int getCurrentEditDistance( int *editDistanceTable, const int inputLength, const int outputLength) { if (DEBUG_DICT) { if (DEBUG_EDIT_DISTANCE) { AKLOGI("getCurrentEditDistance %d, %d", inputLength, outputLength); } return editDistanceTable[(inputLength + 1) * (outputLength + 1) - 1]; Loading Loading @@ -935,6 +935,100 @@ int Correction::RankingAlgorithm::calcFreqForSplitTwoWords( return totalFreq; } /* static */ int Correction::RankingAlgorithm::calcFreqForSplitTwoWordsOld( const int firstFreq, const int secondFreq, const Correction* correction, const unsigned short *word) { const int spaceProximityPos = correction->mSpaceProximityPos; const int missingSpacePos = correction->mMissingSpacePos; if (DEBUG_DICT) { int inputCount = 0; if (spaceProximityPos >= 0) ++inputCount; if (missingSpacePos >= 0) ++inputCount; assert(inputCount <= 1); } const bool isSpaceProximity = spaceProximityPos >= 0; const int inputLength = correction->mInputLength; const int firstWordLength = isSpaceProximity ? spaceProximityPos : missingSpacePos; const int secondWordLength = isSpaceProximity ? (inputLength - spaceProximityPos - 1) : (inputLength - missingSpacePos); const int typedLetterMultiplier = correction->TYPED_LETTER_MULTIPLIER; bool firstCapitalizedWordDemotion = false; if (firstWordLength >= 2) { firstCapitalizedWordDemotion = isUpperCase(word[0]); } bool secondCapitalizedWordDemotion = false; if (secondWordLength >= 2) { secondCapitalizedWordDemotion = isUpperCase(word[firstWordLength + 1]); } const bool capitalizedWordDemotion = firstCapitalizedWordDemotion ^ secondCapitalizedWordDemotion; if (DEBUG_DICT_FULL) { AKLOGI("Two words: %c, %c, %d", word[0], word[firstWordLength + 1], capitalizedWordDemotion); } if (firstWordLength == 0 || secondWordLength == 0) { return 0; } const int firstDemotionRate = 100 - 100 / (firstWordLength + 1); int tempFirstFreq = firstFreq; multiplyRate(firstDemotionRate, &tempFirstFreq); const int secondDemotionRate = 100 - 100 / (secondWordLength + 1); int tempSecondFreq = secondFreq; multiplyRate(secondDemotionRate, &tempSecondFreq); const int totalLength = firstWordLength + secondWordLength; // Promote pairFreq with multiplying by 2, because the word length is the same as the typed // length. int totalFreq = tempFirstFreq + tempSecondFreq; // This is a workaround to try offsetting the not-enough-demotion which will be done in // calcNormalizedScore in Utils.java. // In calcNormalizedScore the score will be demoted by (1 - 1 / length) // but we demoted only (1 - 1 / (length + 1)) so we will additionally adjust freq by // (1 - 1 / length) / (1 - 1 / (length + 1)) = (1 - 1 / (length * length)) const int normalizedScoreNotEnoughDemotionAdjustment = 100 - 100 / (totalLength * totalLength); multiplyRate(normalizedScoreNotEnoughDemotionAdjustment, &totalFreq); // At this moment, totalFreq is calculated by the following formula: // (firstFreq * (1 - 1 / (firstWordLength + 1)) + secondFreq * (1 - 1 / (secondWordLength + 1))) // * (1 - 1 / totalLength) / (1 - 1 / (totalLength + 1)) multiplyIntCapped(powerIntCapped(typedLetterMultiplier, totalLength), &totalFreq); // This is another workaround to offset the demotion which will be done in // calcNormalizedScore in Utils.java. // In calcNormalizedScore the score will be demoted by (1 - 1 / length) so we have to promote // the same amount because we already have adjusted the synthetic freq of this "missing or // mistyped space" suggestion candidate above in this method. const int normalizedScoreDemotionRateOffset = (100 + 100 / totalLength); multiplyRate(normalizedScoreDemotionRateOffset, &totalFreq); if (isSpaceProximity) { // A word pair with one space proximity correction if (DEBUG_DICT) { AKLOGI("Found a word pair with space proximity correction."); } multiplyIntCapped(typedLetterMultiplier, &totalFreq); multiplyRate(WORDS_WITH_PROXIMITY_CHARACTER_DEMOTION_RATE, &totalFreq); } multiplyRate(WORDS_WITH_MISSING_SPACE_CHARACTER_DEMOTION_RATE, &totalFreq); if (capitalizedWordDemotion) { multiplyRate(TWO_WORDS_CAPITALIZED_DEMOTION_RATE, &totalFreq); } return totalFreq; } /* Damerau-Levenshtein distance */ inline static int editDistanceInternal( int* editDistanceTable, const unsigned short* before, Loading
native/src/correction.h +2 −0 Original line number Diff line number Diff line Loading @@ -100,6 +100,8 @@ class Correction { const int freq, int *editDistanceTable, const Correction* correction); static int calcFreqForSplitTwoWords(const int firstFreq, const int secondFreq, const Correction* correction, const unsigned short *word); static int calcFreqForSplitTwoWordsOld(const int firstFreq, const int secondFreq, const Correction* correction, const unsigned short *word); static double calcNormalizedScore(const unsigned short* before, const int beforeLength, const unsigned short* after, const int afterLength, const int score); static int editDistance(const unsigned short* before, Loading
native/src/defines.h +4 −2 Original line number Diff line number Diff line Loading @@ -117,8 +117,8 @@ static void prof_out(void) { #define DEBUG_TRACE DEBUG_DICT_FULL #define DEBUG_PROXIMITY_INFO false #define DEBUG_CORRECTION false #define DEBUG_CORRECTION_FREQ true #define DEBUG_WORDS_PRIORITY_QUEUE true #define DEBUG_CORRECTION_FREQ false #define DEBUG_WORDS_PRIORITY_QUEUE false #else // FLAG_DBG Loading Loading @@ -213,6 +213,8 @@ static void prof_out(void) { #define SUB_QUEUE_MAX_WORDS 1 #define SUB_QUEUE_MAX_COUNT 10 #define TWO_WORDS_CORRECTION_THRESHOLD 0.22f #define MAX_DEPTH_MULTIPLIER 3 // TODO: Reduce this constant if possible; check the maximum number of umlauts in the same German Loading
native/src/unigram_dictionary.cpp +91 −1 Original line number Diff line number Diff line Loading @@ -241,8 +241,24 @@ void UnigramDictionary::getWordSuggestions(ProximityInfo *proximityInfo, } } PROF_END(6); if (DEBUG_WORDS_PRIORITY_QUEUE) { if (DEBUG_DICT) { queuePool->dumpSubQueue1TopSuggestions(); for (int i = 0; i < SUB_QUEUE_MAX_COUNT; ++i) { WordsPriorityQueue* queue = queuePool->getSubQueue1(i); if (queue->size() > 0) { WordsPriorityQueue::SuggestedWord* sw = queue->top(); const int score = sw->mScore; const unsigned short* word = sw->mWord; const int wordLength = sw->mWordLength; double ns = Correction::RankingAlgorithm::calcNormalizedScore( proximityInfo->getPrimaryInputWord(), i, word, wordLength, score); ns += 0; AKLOGI("--- TOP SUB WORDS for %d --- %d %f [%d]", i, score, ns, (ns > TWO_WORDS_CORRECTION_THRESHOLD)); DUMP_WORD(proximityInfo->getPrimaryInputWord(), i); DUMP_WORD(word, wordLength); } } } } Loading Loading @@ -441,6 +457,80 @@ void UnigramDictionary::getSplitTwoWordsSuggestions(ProximityInfo *proximityInfo return; } void UnigramDictionary::getSplitTwoWordsSuggestionsOld(ProximityInfo *proximityInfo, const int *xcoordinates, const int *ycoordinates, const int *codes, const bool useFullEditDistance, const int inputLength, const int missingSpacePos, const int spaceProximityPos, Correction *correction, WordsPriorityQueuePool* queuePool) { WordsPriorityQueue *masterQueue = queuePool->getMasterQueue(); if (DEBUG_DICT) { int inputCount = 0; if (spaceProximityPos >= 0) ++inputCount; if (missingSpacePos >= 0) ++inputCount; assert(inputCount <= 1); } const bool isSpaceProximity = spaceProximityPos >= 0; const int firstWordStartPos = 0; const int secondWordStartPos = isSpaceProximity ? (spaceProximityPos + 1) : missingSpacePos; const int firstWordLength = isSpaceProximity ? spaceProximityPos : missingSpacePos; const int secondWordLength = isSpaceProximity ? (inputLength - spaceProximityPos - 1) : (inputLength - missingSpacePos); if (inputLength >= MAX_WORD_LENGTH) return; if (0 >= firstWordLength || 0 >= secondWordLength || firstWordStartPos >= secondWordStartPos || firstWordStartPos < 0 || secondWordStartPos + secondWordLength > inputLength) return; const int newWordLength = firstWordLength + secondWordLength + 1; // Space proximity preparation //WordsPriorityQueue *subQueue = queuePool->getSubQueue1(); //initSuggestions(proximityInfo, xcoordinates, ycoordinates, codes, firstWordLength, subQueue, //correction); //getSuggestionCandidates(useFullEditDistance, firstWordLength, correction, subQueue, false, //MAX_ERRORS_FOR_TWO_WORDS); // Allocating variable length array on stack unsigned short word[newWordLength]; const int firstFreq = getMostFrequentWordLike( firstWordStartPos, firstWordLength, proximityInfo, mWord); if (DEBUG_DICT) { AKLOGI("First freq: %d", firstFreq); } if (firstFreq <= 0) return; for (int i = 0; i < firstWordLength; ++i) { word[i] = mWord[i]; } const int secondFreq = getMostFrequentWordLike( secondWordStartPos, secondWordLength, proximityInfo, mWord); if (DEBUG_DICT) { AKLOGI("Second freq: %d", secondFreq); } if (secondFreq <= 0) return; word[firstWordLength] = SPACE; for (int i = (firstWordLength + 1); i < newWordLength; ++i) { word[i] = mWord[i - firstWordLength - 1]; } // TODO: Remove initSuggestions and correction->setCorrectionParams initSuggestions(proximityInfo, xcoordinates, ycoordinates, codes, inputLength, correction); correction->setCorrectionParams(-1 /* skipPos */, -1 /* excessivePos */, -1 /* transposedPos */, spaceProximityPos, missingSpacePos, useFullEditDistance, false /* doAutoCompletion */, MAX_ERRORS_FOR_TWO_WORDS); const int pairFreq = correction->getFreqForSplitTwoWords(firstFreq, secondFreq, word); if (DEBUG_DICT) { AKLOGI("Split two words: %d, %d, %d, %d", firstFreq, secondFreq, pairFreq, inputLength); } addWord(word, newWordLength, pairFreq, masterQueue); return; } // Wrapper for getMostFrequentWordLikeInner, which matches it to the previous // interface. inline int UnigramDictionary::getMostFrequentWordLike(const int startInputIndex, Loading
native/src/unigram_dictionary.h +4 −0 Original line number Diff line number Diff line Loading @@ -104,6 +104,10 @@ class UnigramDictionary { const int *xcoordinates, const int *ycoordinates, const int *codes, const bool useFullEditDistance, const int inputLength, const int spaceProximityPos, const int missingSpacePos, Correction *correction, WordsPriorityQueuePool* queuePool); void getSplitTwoWordsSuggestionsOld(ProximityInfo *proximityInfo, const int *xcoordinates, const int *ycoordinates, const int *codes, const bool useFullEditDistance, const int inputLength, const int spaceProximityPos, const int missingSpacePos, Correction *correction, WordsPriorityQueuePool* queuePool); void getMissingSpaceWords(ProximityInfo *proximityInfo, const int *xcoordinates, const int *ycoordinates, const int *codes, const bool useFullEditDistance, const int inputLength, const int missingSpacePos, Correction *correction, Loading