Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit 29dc8061 authored by satok's avatar satok
Browse files

Prepare for advanced two words error correction

Change-Id: I4c8a21f0f6e349ddafd9b402583321a60855cfe8
parent f611f24b
Loading
Loading
Loading
Loading
+95 −1
Original line number Diff line number Diff line
@@ -83,7 +83,7 @@ inline static void calcEditDistanceOneStep(int *editDistanceTable, const unsigne

inline static int getCurrentEditDistance(
        int *editDistanceTable, const int inputLength, const int outputLength) {
    if (DEBUG_DICT) {
    if (DEBUG_EDIT_DISTANCE) {
        AKLOGI("getCurrentEditDistance %d, %d", inputLength, outputLength);
    }
    return editDistanceTable[(inputLength + 1) * (outputLength + 1) - 1];
@@ -935,6 +935,100 @@ int Correction::RankingAlgorithm::calcFreqForSplitTwoWords(
    return totalFreq;
}

// Computes the synthetic frequency of a two-word suggestion produced by splitting the typed
// input at a missing space or at a character typed in place of a space.
//
// firstFreq / secondFreq: frequencies of the first and second word of the pair.
// correction: supplies mSpaceProximityPos, mMissingSpacePos, mInputLength and
//         TYPED_LETTER_MULTIPLIER. At most one of the two positions is expected to be >= 0
//         (asserted when DEBUG_DICT is on).
// word: the combined output; only word[0] and word[firstWordLength + 1] are read here, as the
//         first characters of the first and second word.
// Returns the combined frequency, or 0 when either word length is not positive.
/* static */
int Correction::RankingAlgorithm::calcFreqForSplitTwoWordsOld(
        const int firstFreq, const int secondFreq, const Correction* correction,
        const unsigned short *word) {
    const int spaceProximityPos = correction->mSpaceProximityPos;
    const int missingSpacePos = correction->mMissingSpacePos;
    if (DEBUG_DICT) {
        int inputCount = 0;
        if (spaceProximityPos >= 0) ++inputCount;
        if (missingSpacePos >= 0) ++inputCount;
        assert(inputCount <= 1);
    }
    const bool isSpaceProximity = spaceProximityPos >= 0;
    const int inputLength = correction->mInputLength;
    // With a space-proximity correction the character at spaceProximityPos is dropped from the
    // input; with a missing space every input character belongs to one of the two words.
    const int firstWordLength = isSpaceProximity ? spaceProximityPos : missingSpacePos;
    const int secondWordLength = isSpaceProximity ? (inputLength - spaceProximityPos - 1)
            : (inputLength - missingSpacePos);

    // Reject degenerate splits before touching word[] or dividing by a length. A length of -1
    // (e.g. when neither position is set) would otherwise make (length + 1) a zero divisor
    // below.
    if (firstWordLength <= 0 || secondWordLength <= 0) {
        return 0;
    }

    const int typedLetterMultiplier = correction->TYPED_LETTER_MULTIPLIER;

    // Single-character words are never considered for the capitalization demotion.
    bool firstCapitalizedWordDemotion = false;
    if (firstWordLength >= 2) {
        firstCapitalizedWordDemotion = isUpperCase(word[0]);
    }

    bool secondCapitalizedWordDemotion = false;
    if (secondWordLength >= 2) {
        secondCapitalizedWordDemotion = isUpperCase(word[firstWordLength + 1]);
    }

    // Demote only when exactly one of the two words starts with an upper case letter.
    const bool capitalizedWordDemotion =
            firstCapitalizedWordDemotion ^ secondCapitalizedWordDemotion;

    if (DEBUG_DICT_FULL) {
        AKLOGI("Two words: %c, %c, %d",
                word[0], word[firstWordLength + 1], capitalizedWordDemotion);
    }

    const int firstDemotionRate = 100 - 100 / (firstWordLength + 1);
    int tempFirstFreq = firstFreq;
    multiplyRate(firstDemotionRate, &tempFirstFreq);

    const int secondDemotionRate = 100 - 100 / (secondWordLength + 1);
    int tempSecondFreq = secondFreq;
    multiplyRate(secondDemotionRate, &tempSecondFreq);

    const int totalLength = firstWordLength + secondWordLength;

    // The pair frequency starts as the plain sum of the two length-demoted word frequencies.
    int totalFreq = tempFirstFreq + tempSecondFreq;

    // This is a workaround to try offsetting the not-enough-demotion which will be done in
    // calcNormalizedScore in Utils.java.
    // In calcNormalizedScore the score will be demoted by (1 - 1 / length)
    // but we demoted only (1 - 1 / (length + 1)) so we will additionally adjust freq by
    // (1 - 1 / length) / (1 - 1 / (length + 1)) = (1 - 1 / (length * length))
    const int normalizedScoreNotEnoughDemotionAdjustment = 100 - 100 / (totalLength * totalLength);
    multiplyRate(normalizedScoreNotEnoughDemotionAdjustment, &totalFreq);

    // At this moment, totalFreq is calculated by the following formula:
    // (firstFreq * (1 - 1 / (firstWordLength + 1)) + secondFreq * (1 - 1 / (secondWordLength + 1)))
    //        * (1 - 1 / totalLength) / (1 - 1 / (totalLength + 1))

    multiplyIntCapped(powerIntCapped(typedLetterMultiplier, totalLength), &totalFreq);

    // This is another workaround to offset the demotion which will be done in
    // calcNormalizedScore in Utils.java.
    // In calcNormalizedScore the score will be demoted by (1 - 1 / length) so we have to promote
    // the same amount because we already have adjusted the synthetic freq of this "missing or
    // mistyped space" suggestion candidate above in this method.
    const int normalizedScoreDemotionRateOffset = (100 + 100 / totalLength);
    multiplyRate(normalizedScoreDemotionRateOffset, &totalFreq);

    if (isSpaceProximity) {
        // A word pair with one space proximity correction
        if (DEBUG_DICT) {
            AKLOGI("Found a word pair with space proximity correction.");
        }
        multiplyIntCapped(typedLetterMultiplier, &totalFreq);
        multiplyRate(WORDS_WITH_PROXIMITY_CHARACTER_DEMOTION_RATE, &totalFreq);
    }

    // Applied to both kinds of split (missing space and space proximity).
    multiplyRate(WORDS_WITH_MISSING_SPACE_CHARACTER_DEMOTION_RATE, &totalFreq);

    if (capitalizedWordDemotion) {
        multiplyRate(TWO_WORDS_CAPITALIZED_DEMOTION_RATE, &totalFreq);
    }

    return totalFreq;
}

/* Damerau-Levenshtein distance */
inline static int editDistanceInternal(
        int* editDistanceTable, const unsigned short* before,
+2 −0
Original line number Diff line number Diff line
@@ -100,6 +100,8 @@ class Correction {
                const int freq, int *editDistanceTable, const Correction* correction);
        static int calcFreqForSplitTwoWords(const int firstFreq, const int secondFreq,
                const Correction* correction, const unsigned short *word);
        // Takes the same parameter list as calcFreqForSplitTwoWords above: the frequencies of
        // the two words, the Correction holding the split positions, and the combined word.
        // NOTE(review): the "Old" suffix suggests this keeps the previous scoring logic around
        // while calcFreqForSplitTwoWords is reworked -- confirm and remove once unused.
        static int calcFreqForSplitTwoWordsOld(const int firstFreq, const int secondFreq,
                const Correction* correction, const unsigned short *word);
        static double calcNormalizedScore(const unsigned short* before, const int beforeLength,
                const unsigned short* after, const int afterLength, const int score);
        static int editDistance(const unsigned short* before,
+4 −2
Original line number Diff line number Diff line
@@ -117,8 +117,8 @@ static void prof_out(void) {
#define DEBUG_TRACE DEBUG_DICT_FULL
#define DEBUG_PROXIMITY_INFO false
#define DEBUG_CORRECTION false
#define DEBUG_CORRECTION_FREQ true
#define DEBUG_WORDS_PRIORITY_QUEUE true
#define DEBUG_CORRECTION_FREQ false
#define DEBUG_WORDS_PRIORITY_QUEUE false

#else // FLAG_DBG

@@ -213,6 +213,8 @@ static void prof_out(void) {
#define SUB_QUEUE_MAX_WORDS 1
#define SUB_QUEUE_MAX_COUNT 10

// Threshold compared against a normalized score (the double returned by
// Correction::RankingAlgorithm::calcNormalizedScore).
// NOTE(review): presumably the cut-off above which a sub-queue top word qualifies for two words
// correction -- confirm against the non-debug call sites.
#define TWO_WORDS_CORRECTION_THRESHOLD 0.22f

#define MAX_DEPTH_MULTIPLIER 3

// TODO: Reduce this constant if possible; check the maximum number of umlauts in the same German
+91 −1
Original line number Diff line number Diff line
@@ -241,8 +241,24 @@ void UnigramDictionary::getWordSuggestions(ProximityInfo *proximityInfo,
        }
    }
    PROF_END(6);
    if (DEBUG_WORDS_PRIORITY_QUEUE) {
    if (DEBUG_DICT) {
        queuePool->dumpSubQueue1TopSuggestions();
        for (int i = 0; i < SUB_QUEUE_MAX_COUNT; ++i) {
            WordsPriorityQueue* queue = queuePool->getSubQueue1(i);
            if (queue->size() > 0) {
                WordsPriorityQueue::SuggestedWord* sw = queue->top();
                const int score = sw->mScore;
                const unsigned short* word = sw->mWord;
                const int wordLength = sw->mWordLength;
                double ns = Correction::RankingAlgorithm::calcNormalizedScore(
                        proximityInfo->getPrimaryInputWord(), i, word, wordLength, score);
                ns += 0;
                AKLOGI("--- TOP SUB WORDS for %d --- %d %f [%d]", i, score, ns,
                        (ns > TWO_WORDS_CORRECTION_THRESHOLD));
                DUMP_WORD(proximityInfo->getPrimaryInputWord(), i);
                DUMP_WORD(word, wordLength);
            }
        }
    }
}

@@ -441,6 +457,80 @@ void UnigramDictionary::getSplitTwoWordsSuggestions(ProximityInfo *proximityInfo
    return;
}

// Splits the input into two words, either at a missing space (missingSpacePos) or around a
// character typed in place of a space (spaceProximityPos), looks up a frequency for each half
// with getMostFrequentWordLike, and adds the "first second" pair to the master queue with a
// frequency obtained from correction->getFreqForSplitTwoWords.
// Returns without adding anything when the split is degenerate, the input is too long, or
// either half yields a non-positive frequency.
// NOTE(review): the copy loops assume getMostFrequentWordLike leaves the matched word in mWord;
// confirm against its implementation.
void UnigramDictionary::getSplitTwoWordsSuggestionsOld(ProximityInfo *proximityInfo,
        const int *xcoordinates, const int *ycoordinates, const int *codes,
        const bool useFullEditDistance, const int inputLength, const int missingSpacePos,
        const int  spaceProximityPos, Correction *correction, WordsPriorityQueuePool* queuePool) {
    WordsPriorityQueue *const outputQueue = queuePool->getMasterQueue();

    const bool viaSpaceProximity = (spaceProximityPos >= 0);
    if (DEBUG_DICT) {
        // At most one of the two split positions may be set.
        const int setPositions = (viaSpaceProximity ? 1 : 0) + ((missingSpacePos >= 0) ? 1 : 0);
        assert(setPositions <= 1);
    }

    // Layout of the two halves inside the typed input.
    const int headStart = 0;
    int headLength;
    int tailStart;
    int tailLength;
    if (viaSpaceProximity) {
        // The character at spaceProximityPos stands for the space and is skipped.
        headLength = spaceProximityPos;
        tailStart = spaceProximityPos + 1;
        tailLength = inputLength - spaceProximityPos - 1;
    } else {
        headLength = missingSpacePos;
        tailStart = missingSpacePos;
        tailLength = inputLength - missingSpacePos;
    }

    // Guard clauses: input too long, empty half, or inconsistent ranges.
    if (inputLength >= MAX_WORD_LENGTH) return;
    if (headLength <= 0) return;
    if (tailLength <= 0) return;
    if (headStart >= tailStart || headStart < 0) return;
    if (tailStart + tailLength > inputLength) return;

    const int pairLength = headLength + 1 + tailLength;

    // A sub-queue based preparation for space proximity (initSuggestions followed by
    // getSuggestionCandidates on the first word with MAX_ERRORS_FOR_TWO_WORDS) is currently
    // disabled here.

    // Variable length array on the stack holding "<first> <second>".
    unsigned short pair[pairLength];

    const int firstFreq = getMostFrequentWordLike(headStart, headLength, proximityInfo, mWord);
    if (DEBUG_DICT) {
        AKLOGI("First freq: %d", firstFreq);
    }
    if (firstFreq <= 0) return;

    // Save the first word before mWord is reused for the second lookup.
    for (int pos = 0; pos < headLength; ++pos) {
        pair[pos] = mWord[pos];
    }

    const int secondFreq = getMostFrequentWordLike(tailStart, tailLength, proximityInfo, mWord);
    if (DEBUG_DICT) {
        AKLOGI("Second  freq:  %d", secondFreq);
    }
    if (secondFreq <= 0) return;

    pair[headLength] = SPACE;
    unsigned short *const tailOut = pair + headLength + 1;
    for (int pos = 0; pos < tailLength; ++pos) {
        tailOut[pos] = mWord[pos];
    }

    // TODO: Remove initSuggestions and correction->setCorrectionParams
    initSuggestions(proximityInfo, xcoordinates, ycoordinates, codes, inputLength, correction);

    correction->setCorrectionParams(-1 /* skipPos */, -1 /* excessivePos */,
            -1 /* transposedPos */, spaceProximityPos, missingSpacePos,
            useFullEditDistance, false /* doAutoCompletion */, MAX_ERRORS_FOR_TWO_WORDS);

    const int pairFreq = correction->getFreqForSplitTwoWords(firstFreq, secondFreq, pair);
    if (DEBUG_DICT) {
        AKLOGI("Split two words:  %d, %d, %d, %d", firstFreq, secondFreq, pairFreq, inputLength);
    }
    addWord(pair, pairLength, pairFreq, outputQueue);
}

// Wrapper for getMostFrequentWordLikeInner, which matches it to the previous
// interface.
inline int UnigramDictionary::getMostFrequentWordLike(const int startInputIndex,
+4 −0
Original line number Diff line number Diff line
@@ -104,6 +104,10 @@ class UnigramDictionary {
            const int *xcoordinates, const int *ycoordinates, const int *codes,
            const bool useFullEditDistance, const int inputLength, const int spaceProximityPos,
            const int missingSpacePos, Correction *correction, WordsPriorityQueuePool* queuePool);
    // Adds a two-word suggestion for an input split at missingSpacePos or around
    // spaceProximityPos. Parameter names follow the out-of-class definition, which takes
    // missingSpacePos BEFORE spaceProximityPos; the declaration previously listed the two names
    // in the opposite order, which invited callers to pass the positions swapped.
    void getSplitTwoWordsSuggestionsOld(ProximityInfo *proximityInfo,
            const int *xcoordinates, const int *ycoordinates, const int *codes,
            const bool useFullEditDistance, const int inputLength, const int missingSpacePos,
            const int spaceProximityPos, Correction *correction, WordsPriorityQueuePool* queuePool);
    void getMissingSpaceWords(ProximityInfo *proximityInfo, const int *xcoordinates,
            const int *ycoordinates, const int *codes, const bool useFullEditDistance,
            const int inputLength, const int missingSpacePos, Correction *correction,
Loading