Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit ff020671 authored by satok's avatar satok Committed by Android (Google) Code Review
Browse files

Merge "Prepair for advanced two words error correction"

parents aed5819a 29dc8061
Loading
Loading
Loading
Loading
+95 −1
Original line number Diff line number Diff line
@@ -83,7 +83,7 @@ inline static void calcEditDistanceOneStep(int *editDistanceTable, const unsigne

inline static int getCurrentEditDistance(
        int *editDistanceTable, const int inputLength, const int outputLength) {
    if (DEBUG_DICT) {
    if (DEBUG_EDIT_DISTANCE) {
        AKLOGI("getCurrentEditDistance %d, %d", inputLength, outputLength);
    }
    return editDistanceTable[(inputLength + 1) * (outputLength + 1) - 1];
@@ -935,6 +935,100 @@ int Correction::RankingAlgorithm::calcFreqForSplitTwoWords(
    return totalFreq;
}

// Computes a synthetic frequency for a suggestion made of two dictionary words, produced either
// by inserting a missing space or by treating one typed character as a mistyped space.
//
// firstFreq / secondFreq: frequencies of the first and the second word.
// correction: supplies mSpaceProximityPos, mMissingSpacePos, mInputLength and
//         TYPED_LETTER_MULTIPLIER. At most one of the two positions is expected to be >= 0.
// word: the candidate string. The second word's first character is read from
//         word[firstWordLength + 1], i.e. the words are separated by exactly one code unit.
//
// Returns the combined, demoted/promoted frequency, or 0 when either word would be empty (or
// have a negative length because no valid split position is set).
/* static */
int Correction::RankingAlgorithm::calcFreqForSplitTwoWordsOld(
        const int firstFreq, const int secondFreq, const Correction* correction,
        const unsigned short *word) {
    const int spaceProximityPos = correction->mSpaceProximityPos;
    const int missingSpacePos = correction->mMissingSpacePos;
    if (DEBUG_DICT) {
        int inputCount = 0;
        if (spaceProximityPos >= 0) ++inputCount;
        if (missingSpacePos >= 0) ++inputCount;
        assert(inputCount <= 1);
    }
    const bool isSpaceProximity = spaceProximityPos >= 0;
    const int inputLength = correction->mInputLength;
    const int firstWordLength = isSpaceProximity ? spaceProximityPos : missingSpacePos;
    const int secondWordLength = isSpaceProximity ? (inputLength - spaceProximityPos - 1)
            : (inputLength - missingSpacePos);

    // Reject degenerate splits before touching `word` or dividing by (length + 1).
    // A length of -1 (e.g. neither position set) would otherwise divide by zero below, and the
    // debug output would index `word` with an unvalidated offset.
    if (firstWordLength <= 0 || secondWordLength <= 0) {
        return 0;
    }

    const int typedLetterMultiplier = correction->TYPED_LETTER_MULTIPLIER;

    // Only words of two or more characters take part in the capitalization check.
    bool firstCapitalizedWordDemotion = false;
    if (firstWordLength >= 2) {
        firstCapitalizedWordDemotion = isUpperCase(word[0]);
    }

    bool secondCapitalizedWordDemotion = false;
    if (secondWordLength >= 2) {
        secondCapitalizedWordDemotion = isUpperCase(word[firstWordLength + 1]);
    }

    // Demote only when exactly one of the two words starts with an upper case letter.
    const bool capitalizedWordDemotion =
            firstCapitalizedWordDemotion ^ secondCapitalizedWordDemotion;

    if (DEBUG_DICT_FULL) {
        AKLOGI("Two words: %c, %c, %d",
                word[0], word[firstWordLength + 1], capitalizedWordDemotion);
    }

    // Each word is demoted by (1 - 1 / (length + 1)), so shorter words are demoted more.
    const int firstDemotionRate = 100 - 100 / (firstWordLength + 1);
    int tempFirstFreq = firstFreq;
    multiplyRate(firstDemotionRate, &tempFirstFreq);

    const int secondDemotionRate = 100 - 100 / (secondWordLength + 1);
    int tempSecondFreq = secondFreq;
    multiplyRate(secondDemotionRate, &tempSecondFreq);

    const int totalLength = firstWordLength + secondWordLength;

    // The pair frequency starts as the plain sum of the two demoted word frequencies.
    int totalFreq = tempFirstFreq + tempSecondFreq;

    // This is a workaround to try offsetting the not-enough-demotion which will be done in
    // calcNormalizedScore in Utils.java.
    // In calcNormalizedScore the score will be demoted by (1 - 1 / length)
    // but we demoted only (1 - 1 / (length + 1)) so we will additionally adjust freq by
    // (1 - 1 / length) / (1 - 1 / (length + 1)) = (1 - 1 / (length * length))
    const int normalizedScoreNotEnoughDemotionAdjustment = 100 - 100 / (totalLength * totalLength);
    multiplyRate(normalizedScoreNotEnoughDemotionAdjustment, &totalFreq);

    // At this moment, totalFreq is calculated by the following formula:
    // (firstFreq * (1 - 1 / (firstWordLength + 1)) + secondFreq * (1 - 1 / (secondWordLength + 1)))
    //        * (1 - 1 / totalLength) / (1 - 1 / (totalLength + 1))

    multiplyIntCapped(powerIntCapped(typedLetterMultiplier, totalLength), &totalFreq);

    // This is another workaround to offset the demotion which will be done in
    // calcNormalizedScore in Utils.java.
    // In calcNormalizedScore the score will be demoted by (1 - 1 / length) so we have to promote
    // the same amount because we already have adjusted the synthetic freq of this "missing or
    // mistyped space" suggestion candidate above in this method.
    const int normalizedScoreDemotionRateOffset = (100 + 100 / totalLength);
    multiplyRate(normalizedScoreDemotionRateOffset, &totalFreq);

    if (isSpaceProximity) {
        // A word pair with one space proximity correction
        if (DEBUG_DICT) {
            AKLOGI("Found a word pair with space proximity correction.");
        }
        multiplyIntCapped(typedLetterMultiplier, &totalFreq);
        multiplyRate(WORDS_WITH_PROXIMITY_CHARACTER_DEMOTION_RATE, &totalFreq);
    }

    // Applied to both kinds of split (missing space and space proximity).
    multiplyRate(WORDS_WITH_MISSING_SPACE_CHARACTER_DEMOTION_RATE, &totalFreq);

    if (capitalizedWordDemotion) {
        multiplyRate(TWO_WORDS_CAPITALIZED_DEMOTION_RATE, &totalFreq);
    }

    return totalFreq;
}

/* Damerau-Levenshtein distance */
inline static int editDistanceInternal(
        int* editDistanceTable, const unsigned short* before,
+2 −0
Original line number Diff line number Diff line
@@ -100,6 +100,8 @@ class Correction {
                const int freq, int *editDistanceTable, const Correction* correction);
        static int calcFreqForSplitTwoWords(const int firstFreq, const int secondFreq,
                const Correction* correction, const unsigned short *word);
        // Same parameter list as calcFreqForSplitTwoWords above. Presumably the previous
        // implementation kept under an "Old" name while the two-word correction is reworked;
        // confirm before removing either variant.
        static int calcFreqForSplitTwoWordsOld(const int firstFreq, const int secondFreq,
                const Correction* correction, const unsigned short *word);
        static double calcNormalizedScore(const unsigned short* before, const int beforeLength,
                const unsigned short* after, const int afterLength, const int score);
        static int editDistance(const unsigned short* before,
+4 −2
Original line number Diff line number Diff line
@@ -117,8 +117,8 @@ static void prof_out(void) {
#define DEBUG_TRACE DEBUG_DICT_FULL
#define DEBUG_PROXIMITY_INFO false
#define DEBUG_CORRECTION false
#define DEBUG_CORRECTION_FREQ true
#define DEBUG_WORDS_PRIORITY_QUEUE true
#define DEBUG_CORRECTION_FREQ false
#define DEBUG_WORDS_PRIORITY_QUEUE false

#else // FLAG_DBG

@@ -213,6 +213,8 @@ static void prof_out(void) {
#define SUB_QUEUE_MAX_WORDS 1
#define SUB_QUEUE_MAX_COUNT 10

// Cut-off for a normalized score as returned by Correction::RankingAlgorithm::calcNormalizedScore.
// NOTE(review): in the code visible here it is only compared against that score when dumping the
// top sub-queue words for debugging; confirm other uses before tuning the value.
#define TWO_WORDS_CORRECTION_THRESHOLD 0.22f

#define MAX_DEPTH_MULTIPLIER 3

// TODO: Reduce this constant if possible; check the maximum number of umlauts in the same German
+91 −1
Original line number Diff line number Diff line
@@ -241,8 +241,24 @@ void UnigramDictionary::getWordSuggestions(ProximityInfo *proximityInfo,
        }
    }
    PROF_END(6);
    if (DEBUG_WORDS_PRIORITY_QUEUE) {
    if (DEBUG_DICT) {
        queuePool->dumpSubQueue1TopSuggestions();
        for (int i = 0; i < SUB_QUEUE_MAX_COUNT; ++i) {
            WordsPriorityQueue* queue = queuePool->getSubQueue1(i);
            if (queue->size() > 0) {
                WordsPriorityQueue::SuggestedWord* sw = queue->top();
                const int score = sw->mScore;
                const unsigned short* word = sw->mWord;
                const int wordLength = sw->mWordLength;
                double ns = Correction::RankingAlgorithm::calcNormalizedScore(
                        proximityInfo->getPrimaryInputWord(), i, word, wordLength, score);
                ns += 0;
                AKLOGI("--- TOP SUB WORDS for %d --- %d %f [%d]", i, score, ns,
                        (ns > TWO_WORDS_CORRECTION_THRESHOLD));
                DUMP_WORD(proximityInfo->getPrimaryInputWord(), i);
                DUMP_WORD(word, wordLength);
            }
        }
    }
}

@@ -441,6 +457,80 @@ void UnigramDictionary::getSplitTwoWordsSuggestions(ProximityInfo *proximityInfo
    return;
}

// Builds a single "first word + SPACE + second word" candidate for the typed input and adds it to
// the master queue of queuePool.
//
// The input is split either at missingSpacePos (nothing is dropped) or around spaceProximityPos
// (the character at that index is dropped); at most one of the two is expected to be >= 0.
// Nothing is added when the split is invalid, the input is too long, or either half has no
// dictionary match with a positive frequency.
//
// NOTE(review): the declaration in the class header names the last two int parameters in the
// opposite order (spaceProximityPos, missingSpacePos). Confirm which order callers rely on.
void UnigramDictionary::getSplitTwoWordsSuggestionsOld(ProximityInfo *proximityInfo,
        const int *xcoordinates, const int *ycoordinates, const int *codes,
        const bool useFullEditDistance, const int inputLength, const int missingSpacePos,
        const int  spaceProximityPos, Correction *correction, WordsPriorityQueuePool* queuePool) {
    WordsPriorityQueue *const outputQueue = queuePool->getMasterQueue();

    if (DEBUG_DICT) {
        // The two split modes are mutually exclusive.
        const int activePositions =
                (spaceProximityPos >= 0 ? 1 : 0) + (missingSpacePos >= 0 ? 1 : 0);
        assert(activePositions <= 1);
    }

    // Work out where each half of the input starts and how long it is.
    const bool viaSpaceProximity = (spaceProximityPos >= 0);
    const int headStart = 0;
    int headLength;
    int tailStart;
    int tailLength;
    if (viaSpaceProximity) {
        headLength = spaceProximityPos;
        tailStart = spaceProximityPos + 1;
        tailLength = inputLength - spaceProximityPos - 1;
    } else {
        headLength = missingSpacePos;
        tailStart = missingSpacePos;
        tailLength = inputLength - missingSpacePos;
    }

    // Guard clauses: bail out on anything that is not a well-formed split.
    if (inputLength >= MAX_WORD_LENGTH) return;
    if (headLength <= 0 || tailLength <= 0) return;
    if (headStart >= tailStart || headStart < 0) return;
    if (tailStart + tailLength > inputLength) return;

    const int combinedLength = headLength + tailLength + 1;

    // Space proximity preparation
    //WordsPriorityQueue *subQueue = queuePool->getSubQueue1();
    //initSuggestions(proximityInfo, xcoordinates, ycoordinates, codes, firstWordLength, subQueue,
    //correction);
    //getSuggestionCandidates(useFullEditDistance, firstWordLength, correction, subQueue, false,
    //MAX_ERRORS_FOR_TWO_WORDS);

    // Variable length array on the stack holding both words and the separator.
    unsigned short word[combinedLength];

    // First half: the lookup writes its result into mWord, which is copied out right away
    // because the second lookup reuses the same buffer.
    const int headFreq = getMostFrequentWordLike(headStart, headLength, proximityInfo, mWord);
    if (DEBUG_DICT) {
        AKLOGI("First freq: %d", headFreq);
    }
    if (headFreq <= 0) return;

    for (int pos = 0; pos < headLength; ++pos) {
        word[pos] = mWord[pos];
    }

    // Second half.
    const int tailFreq = getMostFrequentWordLike(tailStart, tailLength, proximityInfo, mWord);
    if (DEBUG_DICT) {
        AKLOGI("Second  freq:  %d", tailFreq);
    }
    if (tailFreq <= 0) return;

    word[headLength] = SPACE;
    unsigned short *const tailDest = word + headLength + 1;
    for (int pos = 0; pos < tailLength; ++pos) {
        tailDest[pos] = mWord[pos];
    }

    // TODO: Remove initSuggestions and correction->setCorrectionParams
    initSuggestions(proximityInfo, xcoordinates, ycoordinates, codes, inputLength, correction);

    correction->setCorrectionParams(-1 /* skipPos */, -1 /* excessivePos */,
            -1 /* transposedPos */, spaceProximityPos, missingSpacePos,
            useFullEditDistance, false /* doAutoCompletion */, MAX_ERRORS_FOR_TWO_WORDS);

    const int combinedFreq = correction->getFreqForSplitTwoWords(headFreq, tailFreq, word);
    if (DEBUG_DICT) {
        AKLOGI("Split two words:  %d, %d, %d, %d", headFreq, tailFreq, combinedFreq, inputLength);
    }
    addWord(word, combinedLength, combinedFreq, outputQueue);
}

// Wrapper for getMostFrequentWordLikeInner, which matches it to the previous
// interface.
inline int UnigramDictionary::getMostFrequentWordLike(const int startInputIndex,
+4 −0
Original line number Diff line number Diff line
@@ -104,6 +104,10 @@ class UnigramDictionary {
            const int *xcoordinates, const int *ycoordinates, const int *codes,
            const bool useFullEditDistance, const int inputLength, const int spaceProximityPos,
            const int missingSpacePos, Correction *correction, WordsPriorityQueuePool* queuePool);
    // "Old" counterpart of getSplitTwoWordsSuggestions with the same parameter list.
    // NOTE(review): the out-of-class definition names the two position parameters in the
    // opposite order (missingSpacePos, then spaceProximityPos). Arguments bind by position, so
    // confirm which order callers actually pass.
    void getSplitTwoWordsSuggestionsOld(ProximityInfo *proximityInfo,
            const int *xcoordinates, const int *ycoordinates, const int *codes,
            const bool useFullEditDistance, const int inputLength, const int spaceProximityPos,
            const int missingSpacePos, Correction *correction, WordsPriorityQueuePool* queuePool);
    void getMissingSpaceWords(ProximityInfo *proximityInfo, const int *xcoordinates,
            const int *ycoordinates, const int *codes, const bool useFullEditDistance,
            const int inputLength, const int missingSpacePos, Correction *correction,
Loading