Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit 54af64ae authored by satok's avatar satok
Browse files

Two words error correction with other error correction for the first word

+1      26
-1       5
+2       0
-2       0
+3       0
-3       0
+4       9
-4      25
+5      20
-5      21
+6      13
-6       6
+7      15
-7      26

Change-Id: Iad682d417a6bb42b11ca6e60157698ca66fef3ff
parent 4dd48372
Loading
Loading
Loading
Loading
+40 −27
Original line number Diff line number Diff line
@@ -39,15 +39,15 @@ inline static void initEditDistance(int *editDistanceTable) {
    }
}

inline static void dumpEditDistance10ForDebug(int *editDistanceTable, const int inputLength,
        const int outputLength) {
inline static void dumpEditDistance10ForDebug(int *editDistanceTable,
        const int editDistanceTableWidth, const int outputLength) {
    if (DEBUG_DICT) {
        AKLOGI("EditDistanceTable");
        for (int i = 0; i <= 10; ++i) {
            int c[11];
            for (int j = 0; j <= 10; ++j) {
                if (j < inputLength + 1 && i < outputLength + 1) {
                    c[j] = (editDistanceTable + i * (inputLength + 1))[j];
                if (j < editDistanceTableWidth + 1 && i < outputLength + 1) {
                    c[j] = (editDistanceTable + i * (editDistanceTableWidth + 1))[j];
                } else {
                    c[j] = -1;
                }
@@ -81,12 +81,12 @@ inline static void calcEditDistanceOneStep(int *editDistanceTable, const unsigne
    }
}

inline static int getCurrentEditDistance(
        int *editDistanceTable, const int inputLength, const int outputLength) {
inline static int getCurrentEditDistance(int *editDistanceTable, const int editDistanceTableWidth,
        const int outputLength, const int inputLength) {
    if (DEBUG_EDIT_DISTANCE) {
        AKLOGI("getCurrentEditDistance %d, %d", inputLength, outputLength);
    }
    return editDistanceTable[(inputLength + 1) * (outputLength + 1) - 1];
    return editDistanceTable[(editDistanceTableWidth + 1) * (outputLength) + inputLength];
}

//////////////////////
@@ -165,6 +165,16 @@ int Correction::getFreqForSplitTwoWords(const int firstFreq, const int secondFre
}

int Correction::getFinalFreq(const int freq, unsigned short **word, int *wordLength) {
    return getFinalFreqInternal(freq, word, wordLength, mInputLength);
}

int Correction::getFinalFreqForSubQueue(const int freq, unsigned short **word, int *wordLength,
        const int inputLength) {
    return getFinalFreqInternal(freq, word, wordLength, inputLength);
}

int Correction::getFinalFreqInternal(const int freq, unsigned short **word, int *wordLength,
        const int inputLength) {
    const int outputIndex = mTerminalOutputIndex;
    const int inputIndex = mTerminalInputIndex;
    *wordLength = outputIndex + 1;
@@ -173,8 +183,9 @@ int Correction::getFinalFreq(const int freq, unsigned short **word, int *wordLen
    }

    *word = mWord;
    return Correction::RankingAlgorithm::calculateFinalFreq(
            inputIndex, outputIndex, freq, mEditDistanceTable, this);
    int finalFreq = Correction::RankingAlgorithm::calculateFinalFreq(
            inputIndex, outputIndex, freq, mEditDistanceTable, this, inputLength);
    return finalFreq;
}

bool Correction::initProcessState(const int outputIndex) {
@@ -613,9 +624,9 @@ inline static bool isUpperCase(unsigned short c) {

/* static */
int Correction::RankingAlgorithm::calculateFinalFreq(const int inputIndex, const int outputIndex,
        const int freq, int* editDistanceTable, const Correction* correction) {
        const int freq, int* editDistanceTable, const Correction* correction,
        const int inputLength) {
    const int excessivePos = correction->getExcessivePos();
    const int inputLength = correction->mInputLength;
    const int typedLetterMultiplier = correction->TYPED_LETTER_MULTIPLIER;
    const int fullWordMultiplier = correction->FULL_WORD_MULTIPLIER;
    const ProximityInfo *proximityInfo = correction->mProximityInfo;
@@ -640,13 +651,13 @@ int Correction::RankingAlgorithm::calculateFinalFreq(const int inputIndex, const
    const unsigned short* word = correction->mWord;
    const bool skipped = skippedCount > 0;

    const int quoteDiffCount = max(0, getQuoteCount(word, outputIndex + 1)
    const int quoteDiffCount = max(0, getQuoteCount(word, outputLength)
            - getQuoteCount(proximityInfo->getPrimaryInputWord(), inputLength));

    // TODO: Calculate edit distance for transposed and excessive
    int ed = 0;
    if (DEBUG_DICT_FULL) {
        dumpEditDistance10ForDebug(editDistanceTable, inputLength, outputIndex + 1);
        dumpEditDistance10ForDebug(editDistanceTable, correction->mInputLength, outputLength);
    }
    int adjustedProximityMatchedCount = proximityMatchedCount;

@@ -654,22 +665,22 @@ int Correction::RankingAlgorithm::calculateFinalFreq(const int inputIndex, const

    // TODO: Optimize this.
    if (transposedCount > 0 || proximityMatchedCount > 0 || skipped || excessiveCount > 0) {
        ed = getCurrentEditDistance(editDistanceTable, inputLength, outputIndex + 1)
                - transposedCount;
        ed = getCurrentEditDistance(editDistanceTable, correction->mInputLength, outputLength,
                inputLength) - transposedCount;

        const int matchWeight = powerIntCapped(typedLetterMultiplier,
                max(inputLength, outputIndex + 1) - ed);
                max(inputLength, outputLength) - ed);
        multiplyIntCapped(matchWeight, &finalFreq);

        // TODO: Demote further if there are two or more excessive chars with longer user input?
        if (inputLength > outputIndex + 1) {
        if (inputLength > outputLength) {
            multiplyRate(INPUT_EXCEEDS_OUTPUT_DEMOTION_RATE, &finalFreq);
        }

        ed = max(0, ed - quoteDiffCount);

        if (transposedCount < 1) {
            if (ed == 1 && (inputLength == outputIndex || inputLength == outputIndex + 2)) {
            if (ed == 1 && (inputLength == outputLength - 1 || inputLength == outputLength + 1)) {
                // Promote a word with just one skipped or excessive char
                if (sameLength) {
                    multiplyRate(WORDS_WITH_JUST_ONE_CORRECTION_PROMOTION_RATE, &finalFreq);
@@ -681,7 +692,7 @@ int Correction::RankingAlgorithm::calculateFinalFreq(const int inputIndex, const
                sameLength = true;
            }
        }
        adjustedProximityMatchedCount = min(max(0, ed - (outputIndex + 1 - inputLength)),
        adjustedProximityMatchedCount = min(max(0, ed - (outputLength - inputLength)),
                proximityMatchedCount);
    } else {
        const int matchWeight = powerIntCapped(typedLetterMultiplier, matchCount);
@@ -783,7 +794,8 @@ int Correction::RankingAlgorithm::calculateFinalFreq(const int inputIndex, const
    // Promotion for an exactly matched word
    if (ed == 0) {
        // Full exact match
        if (sameLength && transposedCount == 0 && !skipped && excessiveCount == 0) {
        if (sameLength && transposedCount == 0 && !skipped && excessiveCount == 0
                && quoteDiffCount == 0) {
            finalFreq = capped255MultForFullMatchAccentsOrCapitalizationDifference(finalFreq);
        }
    }
@@ -828,14 +840,14 @@ int Correction::RankingAlgorithm::calculateFinalFreq(const int inputIndex, const
    }

    if (DEBUG_DICT_FULL) {
        AKLOGI("calc: %d, %d", outputIndex, sameLength);
        AKLOGI("calc: %d, %d", outputLength, sameLength);
    }

    if (DEBUG_CORRECTION_FREQ) {
        DUMP_WORD(correction->mWord, outputIndex + 1);
        AKLOGI("FinalFreq: [P%d, S%d, T%d, E%d] %d, %d, %d, %d, %d", proximityMatchedCount,
                skippedCount, transposedCount, excessiveCount, lastCharExceeded, sameLength,
                quoteDiffCount, ed, finalFreq);
        DUMP_WORD(correction->mWord, outputLength);
        AKLOGI("FinalFreq: [P%d, S%d, T%d, E%d] %d, %d, %d, %d, %d, %d", proximityMatchedCount,
                skippedCount, transposedCount, excessiveCount, outputLength, lastCharExceeded,
                sameLength, quoteDiffCount, ed, finalFreq);
    }

    return finalFreq;
@@ -881,11 +893,12 @@ int Correction::RankingAlgorithm::calcFreqForSplitTwoWords(
    if (firstWordLength == 0 || secondWordLength == 0) {
        return 0;
    }
    const int firstDemotionRate = 100 - 100 / (firstWordLength + 1);
    const int firstDemotionRate = 100 - TWO_WORDS_CORRECTION_DEMOTION_BASE / (firstWordLength + 1);
    int tempFirstFreq = firstFreq;
    multiplyRate(firstDemotionRate, &tempFirstFreq);

    const int secondDemotionRate = 100 - 100 / (secondWordLength + 1);
    const int secondDemotionRate = 100
            - TWO_WORDS_CORRECTION_DEMOTION_BASE / (secondWordLength + 1);
    int tempSecondFreq = secondFreq;
    multiplyRate(secondDemotionRate, &tempSecondFreq);

+6 −1
Original line number Diff line number Diff line
@@ -75,6 +75,8 @@ class Correction {
    int getFreqForSplitTwoWords(
            const int firstFreq, const int secondFreq, const unsigned short *word);
    int getFinalFreq(const int freq, unsigned short **word, int* wordLength);
    int getFinalFreqForSubQueue(const int freq, unsigned short **word, int* wordLength,
            const int inputLength);

    CorrectionType processCharAndCalcState(const int32_t c, const bool isTerminal);

@@ -97,7 +99,8 @@ class Correction {
    class RankingAlgorithm {
     public:
        static int calculateFinalFreq(const int inputIndex, const int depth,
                const int freq, int *editDistanceTable, const Correction* correction);
                const int freq, int *editDistanceTable, const Correction* correction,
                const int inputLength);
        static int calcFreqForSplitTwoWords(const int firstFreq, const int secondFreq,
                const Correction* correction, const unsigned short *word);
        static int calcFreqForSplitTwoWordsOld(const int firstFreq, const int secondFreq,
@@ -122,6 +125,8 @@ class Correction {
            const int32_t c, const bool isTerminal, const bool inputIndexIncremented);
    inline CorrectionType processUnrelatedCorrectionType();
    inline void addCharToCurrentWord(const int32_t c);
    inline int getFinalFreqInternal(const int freq, unsigned short **word, int* wordLength,
            const int inputLength);

    const int TYPED_LETTER_MULTIPLIER;
    const int FULL_WORD_MULTIPLIER;
+6 −2
Original line number Diff line number Diff line
@@ -187,7 +187,7 @@ static void prof_out(void) {
// The following "rate"s are used as a multiplier before dividing by 100, so they are in percent.
#define WORDS_WITH_MISSING_CHARACTER_DEMOTION_RATE 80
#define WORDS_WITH_MISSING_CHARACTER_DEMOTION_START_POS_10X 12
#define WORDS_WITH_MISSING_SPACE_CHARACTER_DEMOTION_RATE 67
#define WORDS_WITH_MISSING_SPACE_CHARACTER_DEMOTION_RATE 58
#define WORDS_WITH_EXCESSIVE_CHARACTER_DEMOTION_RATE 75
#define WORDS_WITH_EXCESSIVE_CHARACTER_OUT_OF_PROXIMITY_DEMOTION_RATE 75
#define WORDS_WITH_TRANSPOSED_CHARACTERS_DEMOTION_RATE 70
@@ -199,6 +199,8 @@ static void prof_out(void) {
#define INPUT_EXCEEDS_OUTPUT_DEMOTION_RATE 70
#define FIRST_CHAR_DIFFERENT_DEMOTION_RATE 96
#define TWO_WORDS_CAPITALIZED_DEMOTION_RATE 50
#define TWO_WORDS_CORRECTION_DEMOTION_BASE 80
#define TWO_WORDS_PLUS_OTHER_ERROR_CORRECTION_DEMOTION_DIVIDER 1
#define ZERO_DISTANCE_PROMOTION_RATE 110
#define NEUTRAL_SCORE_SQUARED_RADIUS 8.0f
#define HALF_SCORE_SQUARED_RADIUS 32.0f
@@ -212,8 +214,10 @@ static void prof_out(void) {
// Holds up to 1 candidate for each word
#define SUB_QUEUE_MAX_WORDS 1
#define SUB_QUEUE_MAX_COUNT 10
#define SUB_QUEUE_MIN_WORD_LENGTH 4

#define TWO_WORDS_CORRECTION_THRESHOLD 0.22f
#define TWO_WORDS_CORRECTION_WITH_OTHER_ERROR_THRESHOLD 0.39
#define START_TWO_WORDS_CORRECTION_THRESHOLD 0.22

#define MAX_DEPTH_MULTIPLIER 3

+73 −37
Original line number Diff line number Diff line
@@ -254,7 +254,7 @@ void UnigramDictionary::getWordSuggestions(ProximityInfo *proximityInfo,
                        proximityInfo->getPrimaryInputWord(), i, word, wordLength, score);
                ns += 0;
                AKLOGI("--- TOP SUB WORDS for %d --- %d %f [%d]", i, score, ns,
                        (ns > TWO_WORDS_CORRECTION_THRESHOLD));
                        (ns > TWO_WORDS_CORRECTION_WITH_OTHER_ERROR_THRESHOLD));
                DUMP_WORD(proximityInfo->getPrimaryInputWord(), i);
                DUMP_WORD(word, wordLength);
            }
@@ -343,37 +343,27 @@ inline void UnigramDictionary::onTerminal(const int freq,
        WordsPriorityQueuePool *queuePool, const bool addToMasterQueue) {
    const int inputIndex = correction->getInputIndex();
    const bool addToSubQueue = inputIndex < SUB_QUEUE_MAX_COUNT;
    if (!addToMasterQueue && !addToSubQueue) {
        return;
    }
    WordsPriorityQueue *masterQueue = queuePool->getMasterQueue();
    WordsPriorityQueue *subQueue = queuePool->getSubQueue1(inputIndex);

    int wordLength;
    unsigned short* wordPointer;

    if (addToMasterQueue) {
        WordsPriorityQueue *masterQueue = queuePool->getMasterQueue();
        const int finalFreq = correction->getFinalFreq(freq, &wordPointer, &wordLength);
        if (finalFreq != NOT_A_FREQUENCY) {
            if (!terminalAttributes.isShortcutOnly()) {
            if (addToMasterQueue) {
                addWord(wordPointer, wordLength, finalFreq, masterQueue);
            }
            // TODO: Check the validity of "inputIndex == wordLength"
            //if (addToSubQueue && inputIndex == wordLength) {
            if (addToSubQueue) {
                addWord(wordPointer, wordLength, finalFreq, subQueue);
            }
        }
        // Please note that the shortcut candidates will be added to the master queue only.
        if (!addToMasterQueue) {
            return;
        }

        // From here, below is the code to add shortcut candidates.
        TerminalAttributes::ShortcutIterator iterator = terminalAttributes.getShortcutIterator();
            // Please note that the shortcut candidates will be added to the master queue only.
            TerminalAttributes::ShortcutIterator iterator =
                    terminalAttributes.getShortcutIterator();
            while (iterator.hasNextShortcutTarget()) {
            // TODO: addWord only supports weak ordering, meaning we have no means to control the
            // order of the shortcuts relative to one another or to the word. We need to either
            // modulate the frequency of each shortcut according to its own shortcut frequency or
            // to make the queue so that the insert order is protected inside the queue for words
                // TODO: addWord only supports weak ordering, meaning we have no means
                // to control the order of the shortcuts relative to one another or to the word.
                // We need to either modulate the frequency of each shortcut according
                // to its own shortcut frequency or to make the queue
                // so that the insert order is protected inside the queue for words
                // with the same score.
                uint16_t shortcutTarget[MAX_WORD_LENGTH_INTERNAL];
                const int shortcutTargetStringLength = iterator.getNextShortcutTarget(
@@ -383,6 +373,18 @@ inline void UnigramDictionary::onTerminal(const int freq,
        }
    }

    // We only allow two words + other error correction for words with SUB_QUEUE_MIN_WORD_LENGTH
    // or more length.
    if (inputIndex >= SUB_QUEUE_MIN_WORD_LENGTH && addToSubQueue) {
        // TODO: Check the validity of "inputIndex == wordLength"
        //if (addToSubQueue && inputIndex == wordLength) {
        WordsPriorityQueue *subQueue = queuePool->getSubQueue1(inputIndex);
        const int finalFreq = correction->getFinalFreqForSubQueue(freq, &wordPointer, &wordLength,
                inputIndex);
        addWord(wordPointer, wordLength, finalFreq, subQueue);
    }
}

void UnigramDictionary::getSplitTwoWordsSuggestions(ProximityInfo *proximityInfo,
        const int *xcoordinates, const int *ycoordinates, const int *codes,
        const bool useFullEditDistance, const int inputLength, const int missingSpacePos,
@@ -397,20 +399,57 @@ void UnigramDictionary::getSplitTwoWordsSuggestions(ProximityInfo *proximityInfo
    }
    const bool isSpaceProximity = spaceProximityPos >= 0;
    const int firstWordStartPos = 0;

    const int firstTypedWordLength = isSpaceProximity ? spaceProximityPos : missingSpacePos;
    int firstFreq = getMostFrequentWordLike(0, firstTypedWordLength, proximityInfo, mWord);
    unsigned short* firstWord = 0;
    int firstWordLength = 0;
    if (firstFreq > 0) {
        firstWordLength = firstTypedWordLength;
        firstWord = mWord;
    } else {
        if (masterQueue->size() > 0) {
            double nsForMaster = masterQueue->getHighestNormalizedScore(
                    proximityInfo->getPrimaryInputWord(), inputLength, 0, 0, 0);
            if (nsForMaster > START_TWO_WORDS_CORRECTION_THRESHOLD) {
                // Do nothing if the highest suggestion exceeds the threshold.
                return;
            }
        }
        WordsPriorityQueue* firstWordQueue = queuePool->getSubQueue1(firstTypedWordLength);
        if (firstWordQueue->size() < 1) {
            return;
        }
        int score = 0;
        const double ns = firstWordQueue->getHighestNormalizedScore(
                proximityInfo->getPrimaryInputWord(), firstTypedWordLength, &firstWord, &score,
                &firstWordLength);
        // Two words correction won't be done if the score of the first word doesn't exceed the
        // threshold.
        if (ns < TWO_WORDS_CORRECTION_WITH_OTHER_ERROR_THRESHOLD) {
            return;
        }
        firstFreq = score >> (firstWordLength
                + TWO_WORDS_PLUS_OTHER_ERROR_CORRECTION_DEMOTION_DIVIDER);
    }

    if (firstFreq <= 0) {
        return;
    }

    const int secondWordStartPos = isSpaceProximity ? (spaceProximityPos + 1) : missingSpacePos;
    const int firstWordLength = isSpaceProximity ? spaceProximityPos : missingSpacePos;
    const int secondWordLength = isSpaceProximity
            ? (inputLength - spaceProximityPos - 1)
            : (inputLength - missingSpacePos);

    if (inputLength >= MAX_WORD_LENGTH) return;

    if (0 >= firstWordLength || 0 >= secondWordLength || firstWordStartPos >= secondWordStartPos
            || firstWordStartPos < 0 || secondWordStartPos + secondWordLength > inputLength)
        return;

    const int newWordLength = firstWordLength + secondWordLength + 1;


    // Space proximity preparation
    //WordsPriorityQueue *subQueue = queuePool->getSubQueue1();
    //initSuggestions(proximityInfo, xcoordinates, ycoordinates, codes, firstWordLength, subQueue,
@@ -420,15 +459,12 @@ void UnigramDictionary::getSplitTwoWordsSuggestions(ProximityInfo *proximityInfo

    // Allocating variable length array on stack
    unsigned short word[newWordLength];
    const int firstFreq = getMostFrequentWordLike(
            firstWordStartPos, firstWordLength, proximityInfo, mWord);
    if (DEBUG_DICT) {
        AKLOGI("First freq: %d", firstFreq);
    }
    if (firstFreq <= 0) return;

    for (int i = 0; i < firstWordLength; ++i) {
        word[i] = mWord[i];
        word[i] = firstWord[i];
    }

    const int secondFreq = getMostFrequentWordLike(
+29 −0
Original line number Diff line number Diff line
@@ -47,6 +47,7 @@ class WordsPriorityQueue {
        for (int i = 0; i < maxWordLength; ++i) {
            mSuggestedWords[i].mUsed = false;
        }
        mHighestSuggestedWord = 0;
    }

    ~WordsPriorityQueue() {
@@ -79,6 +80,9 @@ class WordsPriorityQueue {
            DUMP_WORD(word, wordLength);
        }
        mSuggestions.push(sw);
        if (!mHighestSuggestedWord || mHighestSuggestedWord->mScore < sw->mScore) {
            mHighestSuggestedWord = sw;
        }
    }

    SuggestedWord* top() {
@@ -88,6 +92,7 @@ class WordsPriorityQueue {
    }

    int outputSuggestions(int *frequencies, unsigned short *outputChars) {
        mHighestSuggestedWord = 0;
        const unsigned int size = min(MAX_WORDS, mSuggestions.size());
        int index = size - 1;
        while (!mSuggestions.empty() && index >= 0) {
@@ -116,6 +121,7 @@ class WordsPriorityQueue {
    }

    void clear() {
        mHighestSuggestedWord = 0;
        while (!mSuggestions.empty()) {
            SuggestedWord* sw = mSuggestions.top();
            if (DEBUG_WORDS_PRIORITY_QUEUE) {
@@ -134,6 +140,28 @@ class WordsPriorityQueue {
        DUMP_WORD(mSuggestions.top()->mWord, mSuggestions.top()->mWordLength);
    }

    double getHighestNormalizedScore(const unsigned short* before, const int beforeLength,
            unsigned short** outWord, int *outScore, int *outLength) {
        if (!mHighestSuggestedWord) {
            return 0.0;
        }
        SuggestedWord* sw = mHighestSuggestedWord;
        const int score = sw->mScore;
        unsigned short* word = sw->mWord;
        const int wordLength = sw->mWordLength;
        if (outScore) {
            *outScore = score;
        }
        if (outWord) {
            *outWord = word;
        }
        if (outLength) {
            *outLength = wordLength;
        }
        return Correction::RankingAlgorithm::calcNormalizedScore(
                before, beforeLength, word, wordLength, score);
    }

 private:
    struct wordComparator {
        bool operator ()(SuggestedWord * left, SuggestedWord * right) {
@@ -158,6 +186,7 @@ class WordsPriorityQueue {
    const unsigned int MAX_WORDS;
    const unsigned int MAX_WORD_LENGTH;
    SuggestedWord* mSuggestedWords;
    SuggestedWord* mHighestSuggestedWord;
};
}