Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit bb7a39b4 authored by satok, committed by Android (Google) Code Review
Browse files

Merge "Support multi words suggestion"

parents 8174373a a85f4929
Loading
Loading
Loading
Loading
+72 −32
Original line number Diff line number Diff line
@@ -159,10 +159,10 @@ void Correction::checkState() {
    }
}

int Correction::getFreqForSplitTwoWords(const int *freqArray, const int *wordLengthArray,
        const bool isSpaceProximity, const unsigned short *word) {
    return Correction::RankingAlgorithm::calcFreqForSplitTwoWords(freqArray, wordLengthArray, this,
            isSpaceProximity, word);
int Correction::getFreqForSplitMultipleWords(const int *freqArray, const int *wordLengthArray,
        const int wordCount, const bool isSpaceProximity, const unsigned short *word) {
    return Correction::RankingAlgorithm::calcFreqForSplitMultipleWords(freqArray, wordLengthArray,
            wordCount, this, isSpaceProximity, word);
}

int Correction::getFinalFreq(const int freq, unsigned short **word, int *wordLength) {
@@ -911,45 +911,85 @@ int Correction::RankingAlgorithm::calculateFinalFreq(const int inputIndex, const
}

/* static */
int Correction::RankingAlgorithm::calcFreqForSplitTwoWords(
        const int *freqArray, const int *wordLengthArray, const Correction* correction,
        const bool isSpaceProximity, const unsigned short *word) {
    const int firstFreq = freqArray[0];
    const int secondFreq = freqArray[1];
    const int firstWordLength = wordLengthArray[0];
    const int secondWordLength = wordLengthArray[1];
int Correction::RankingAlgorithm::calcFreqForSplitMultipleWords(
        const int *freqArray, const int *wordLengthArray, const int wordCount,
        const Correction* correction, const bool isSpaceProximity, const unsigned short *word) {
    const int typedLetterMultiplier = correction->TYPED_LETTER_MULTIPLIER;

    bool firstCapitalizedWordDemotion = false;
    bool secondCapitalizedWordDemotion = false;

    {
        // TODO: Handle multiple capitalized word demotion properly
        const int firstWordLength = wordLengthArray[0];
        const int secondWordLength = wordLengthArray[1];
        if (firstWordLength >= 2) {
            firstCapitalizedWordDemotion = isUpperCase(word[0]);
        }

    bool secondCapitalizedWordDemotion = false;
        if (secondWordLength >= 2) {
            // FIXME: word[firstWordLength + 1] is incorrect.
            secondCapitalizedWordDemotion = isUpperCase(word[firstWordLength + 1]);
        }
    }


    const bool capitalizedWordDemotion =
            firstCapitalizedWordDemotion ^ secondCapitalizedWordDemotion;

    if (firstWordLength == 0 || secondWordLength == 0) {
    int totalLength = 0;
    int totalFreq = 0;
    for (int i = 0; i < wordCount; ++i){
        const int wordLength = wordLengthArray[i];
        if (wordLength <= 0) {
            return 0;
        }
    const int firstDemotionRate = 100 - TWO_WORDS_CORRECTION_DEMOTION_BASE / (firstWordLength + 1);
    int tempFirstFreq = firstFreq;
    multiplyRate(firstDemotionRate, &tempFirstFreq);

    const int secondDemotionRate = 100
            - TWO_WORDS_CORRECTION_DEMOTION_BASE / (secondWordLength + 1);
    int tempSecondFreq = secondFreq;
    multiplyRate(secondDemotionRate, &tempSecondFreq);
        totalLength += wordLength;
        const int demotionRate = 100 - TWO_WORDS_CORRECTION_DEMOTION_BASE / (wordLength + 1);
        int tempFirstFreq = freqArray[i];
        multiplyRate(demotionRate, &tempFirstFreq);
        totalFreq += tempFirstFreq;
    }

    const int totalLength = firstWordLength + secondWordLength;
    if (totalLength <= 0 || totalFreq <= 0) {
        return 0;
    }

    // TODO: Currently totalFreq is adjusted to the two-word metric.
    // Promote pairFreq by multiplying by 2, because the word length is the same as the typed
    // length.
    int totalFreq = tempFirstFreq + tempSecondFreq;
    totalFreq = totalFreq * 2 / wordCount;
    if (wordCount > 2) {
        // Safety net for 3+ words -- Caveats: many heuristics and workarounds here.
        int oneLengthCounter = 0;
        int twoLengthCounter = 0;
        for (int i = 0; i < wordCount; ++i) {
            const int wordLength = wordLengthArray[i];
            // TODO: Use bigram instead of this safety net
            if (i < wordCount - 1) {
                const int nextWordLength = wordLengthArray[i + 1];
                if (wordLength == 1 && nextWordLength == 2) {
                    // Safety net to filter 1 length and 2 length sequential words
                    return 0;
                }
            }
            const int freq = freqArray[i];
            // Demote too short weak words
            if (wordLength <= 4 && freq <= MAX_FREQ * 2 / 3 /* heuristic... */) {
                multiplyRate(100 * freq / MAX_FREQ, &totalFreq);
            }
            if (wordLength == 1) {
                ++oneLengthCounter;
            } else if (wordLength == 2) {
                ++twoLengthCounter;
            }
            if (oneLengthCounter >= 2 || (oneLengthCounter + twoLengthCounter) >= 4) {
                // Safety net to filter too many short words
                return 0;
            }
        }
        multiplyRate(MULTIPLE_WORDS_DEMOTION_RATE, &totalFreq);
    }

    // This is a workaround to try offsetting the not-enough-demotion which will be done in
    // calcNormalizedScore in Utils.java.
@@ -993,9 +1033,9 @@ int Correction::RankingAlgorithm::calcFreqForSplitTwoWords(
    }

    if (DEBUG_CORRECTION_FREQ) {
        AKLOGI("Two words (%d, %d) (%d, %d) %d, %d", firstFreq, secondFreq, firstWordLength,
                secondWordLength, capitalizedWordDemotion, totalFreq);
        DUMP_WORD(word, firstWordLength);
        AKLOGI("Multiple words (%d, %d) (%d, %d) %d, %d", freqArray[0], freqArray[1],
                wordLengthArray[0], wordLengthArray[1], capitalizedWordDemotion, totalFreq);
        DUMP_WORD(word, wordLengthArray[0]);
    }

    return totalFreq;
+5 −5
Original line number Diff line number Diff line
@@ -121,9 +121,9 @@ class Correction {

    bool needsToPrune() const;

    int getFreqForSplitTwoWords(
            const int *freqArray, const int *wordLengthArray, const bool isSpaceProximity,
            const unsigned short *word);
    int getFreqForSplitMultipleWords(
            const int *freqArray, const int *wordLengthArray, const int wordCount,
            const bool isSpaceProximity, const unsigned short *word);
    int getFinalFreq(const int freq, unsigned short **word, int* wordLength);
    int getFinalFreqForSubQueue(const int freq, unsigned short **word, int* wordLength,
            const int inputLength);
@@ -151,8 +151,8 @@ class Correction {
        static int calculateFinalFreq(const int inputIndex, const int depth,
                const int freq, int *editDistanceTable, const Correction* correction,
                const int inputLength);
        static int calcFreqForSplitTwoWords(const int *freqArray, const int *wordLengthArray,
                const Correction* correction, const bool isSpaceProximity,
        static int calcFreqForSplitMultipleWords(const int *freqArray, const int *wordLengthArray,
                const int wordCount, const Correction* correction, const bool isSpaceProximity,
                const unsigned short *word);
        static double calcNormalizedScore(const unsigned short* before, const int beforeLength,
                const unsigned short* after, const int afterLength, const int score);
+5 −3
Original line number Diff line number Diff line
@@ -208,6 +208,7 @@ static void prof_out(void) {
#define ZERO_DISTANCE_PROMOTION_RATE 110
#define NEUTRAL_SCORE_SQUARED_RADIUS 8.0f
#define HALF_SCORE_SQUARED_RADIUS 32.0f
#define MAX_FREQ 255

// This must be greater than or equal to MAX_WORD_LENGTH defined in BinaryDictionary.java
// This is only used for the size of array. Not to be used in c functions.
@@ -222,7 +223,9 @@ static void prof_out(void) {
#define SUB_QUEUE_MAX_WORDS 1
#define SUB_QUEUE_MAX_COUNT 10
#define SUB_QUEUE_MIN_WORD_LENGTH 4
#define MULTIPLE_WORDS_SUGGESTION_MAX_WORDS 2
#define MULTIPLE_WORDS_SUGGESTION_MAX_WORDS 10
#define MULTIPLE_WORDS_DEMOTION_RATE 80
#define MIN_INPUT_LENGTH_FOR_THREE_OR_MORE_WORDS_CORRECTION 6

#define TWO_WORDS_CORRECTION_WITH_OTHER_ERROR_THRESHOLD 0.39
#define START_TWO_WORDS_CORRECTION_THRESHOLD 0.22
@@ -230,7 +233,6 @@ static void prof_out(void) {
#define MAX_DEPTH_MULTIPLIER 3

#define FIRST_WORD_INDEX 0
#define SECOND_WORD_INDEX 1

// TODO: Reduce this constant if possible; check the maximum number of umlauts in the same German
// word in the dictionary
@@ -248,7 +250,7 @@ template<typename T> inline T max(T a, T b) { return a > b ? a : b; }
#define NEUTRAL_AREA_RADIUS_RATIO 1.3f

// DEBUG
#define INPUTLENGTH_FOR_DEBUG 10
#define INPUTLENGTH_FOR_DEBUG -1
#define MIN_OUTPUT_INDEX_FOR_DEBUG -1

#endif // LATINIME_DEFINES_H
+42 −23
Original line number Diff line number Diff line
@@ -224,7 +224,7 @@ void UnigramDictionary::getWordSuggestions(ProximityInfo *proximityInfo,
    // Multiple word suggestions
    if (SUGGEST_MULTIPLE_WORDS
            && inputLength >= MIN_USER_TYPED_LENGTH_FOR_MULTIPLE_WORD_SUGGESTION) {
        getSplitTwoWordsSuggestions(proximityInfo, xcoordinates, ycoordinates, codes,
        getSplitMultipleWordsSuggestions(proximityInfo, xcoordinates, ycoordinates, codes,
                useFullEditDistance, inputLength, correction, queuePool,
                hasAutoCorrectionCandidate);
    }
@@ -445,17 +445,18 @@ bool UnigramDictionary::getSubStringSuggestion(
        if (outputWordStartPos + nextWordLength >= MAX_WORD_LENGTH) {
            return false;
        }
        outputWord[outputWordStartPos + tempOutputWordLength] = SPACE;
        outputWord[tempOutputWordLength] = SPACE;
        if (outputWordLength) {
            ++*outputWordLength;
        }
    } else if (currentWordIndex >= 1) {
        // TODO: Handle 3 or more words
        const int pairFreq = correction->getFreqForSplitTwoWords(
                freqArray, wordLengthArray, isSpaceProximity, outputWord);
        const int pairFreq = correction->getFreqForSplitMultipleWords(
                freqArray, wordLengthArray, currentWordIndex + 1, isSpaceProximity, outputWord);
        if (DEBUG_DICT) {
            AKLOGI("Split two words: %d, %d, %d, %d, (%d)", freqArray[0], freqArray[1], pairFreq,
                    inputLength, wordLengthArray[0]);
            DUMP_WORD(outputWord, tempOutputWordLength);
            AKLOGI("Split two words: %d, %d, %d, %d, (%d) %d", freqArray[0], freqArray[1], pairFreq,
                    inputLength, wordLengthArray[0], tempOutputWordLength);
        }
        addWord(outputWord, tempOutputWordLength, pairFreq, queuePool->getMasterQueue());
    }
@@ -473,30 +474,46 @@ void UnigramDictionary::getMultiWordsSuggestionRec(ProximityInfo *proximityInfo,
        // Return if the last word index
        return;
    }
    for (int i = 1; i < inputLength; ++i) {
        int tempOutputWordLength = 0;
        // First word
        int inputWordStartPos = 0;
        int inputWordLength = i;
    if (startWordIndex >= 1
            && (hasAutoCorrectionCandidate
                    || inputLength < MIN_INPUT_LENGTH_FOR_THREE_OR_MORE_WORDS_CORRECTION)) {
        // Do not suggest 3+ words if there is already an auto-correction candidate or the
        // input is shorter than MIN_INPUT_LENGTH_FOR_THREE_OR_MORE_WORDS_CORRECTION
        return;
    }
    for (int i = startInputPos + 1; i < inputLength; ++i) {
        if (DEBUG_CORRECTION_FREQ) {
            AKLOGI("Two words, %d", inputWordLength);
            AKLOGI("Multi words(%d), start in %d sep %d start out %d",
                    startWordIndex, startInputPos, i, outputWordLength);
            DUMP_WORD(outputWord, outputWordLength);
        }
        int tempOutputWordLength = 0;
        // Current word
        int inputWordStartPos = startInputPos;
        int inputWordLength = i - startInputPos;
        if (!getSubStringSuggestion(proximityInfo, xcoordinates, ycoordinates, codes,
                useFullEditDistance, correction, queuePool, inputLength, hasAutoCorrectionCandidate,
                FIRST_WORD_INDEX, inputWordStartPos, inputWordLength, 0, true /* not used */,
                freqArray, wordLengthArray, outputWord, &tempOutputWordLength)) {
                startWordIndex, inputWordStartPos, inputWordLength, outputWordLength,
                true /* not used */, freqArray, wordLengthArray, outputWord,
                &tempOutputWordLength)) {
            continue;
        }

        // Second word
        if (DEBUG_CORRECTION_FREQ) {
            AKLOGI("Do missing space correction");
        }
        // Next word
        // Missing space
        inputWordStartPos = i;
        inputWordLength = inputLength - i;
        getSubStringSuggestion(proximityInfo, xcoordinates, ycoordinates, codes,
        if(!getSubStringSuggestion(proximityInfo, xcoordinates, ycoordinates, codes,
                useFullEditDistance, correction, queuePool, inputLength, hasAutoCorrectionCandidate,
                SECOND_WORD_INDEX, inputWordStartPos, inputWordLength, tempOutputWordLength,
                false /* missing space */, freqArray, wordLengthArray, outputWord,
                0);
                startWordIndex + 1, inputWordStartPos, inputWordLength, tempOutputWordLength,
                false /* missing space */, freqArray, wordLengthArray, outputWord, 0)) {
            getMultiWordsSuggestionRec(proximityInfo, xcoordinates, ycoordinates, codes,
                    useFullEditDistance, inputLength, correction, queuePool,
                    hasAutoCorrectionCandidate, inputWordStartPos, startWordIndex + 1,
                    tempOutputWordLength, freqArray, wordLengthArray, outputWord);
        }

        // Mistyped space
        ++inputWordStartPos;
@@ -512,15 +529,17 @@ void UnigramDictionary::getMultiWordsSuggestionRec(ProximityInfo *proximityInfo,
            continue;
        }

        if (DEBUG_CORRECTION_FREQ) {
            AKLOGI("Do mistyped space correction");
        }
        getSubStringSuggestion(proximityInfo, xcoordinates, ycoordinates, codes,
                useFullEditDistance, correction, queuePool, inputLength, hasAutoCorrectionCandidate,
                SECOND_WORD_INDEX, inputWordStartPos, inputWordLength, tempOutputWordLength,
                true /* mistyped space */, freqArray, wordLengthArray, outputWord,
                0);
                startWordIndex + 1, inputWordStartPos, inputWordLength, tempOutputWordLength,
                true /* mistyped space */, freqArray, wordLengthArray, outputWord, 0);
    }
}

void UnigramDictionary::getSplitTwoWordsSuggestions(ProximityInfo *proximityInfo,
void UnigramDictionary::getSplitMultipleWordsSuggestions(ProximityInfo *proximityInfo,
        const int *xcoordinates, const int *ycoordinates, const int *codes,
        const bool useFullEditDistance, const int inputLength,
        Correction *correction, WordsPriorityQueuePool* queuePool,
+1 −1
Original line number Diff line number Diff line
@@ -101,7 +101,7 @@ class UnigramDictionary {
            const bool useFullEditDistance, const int inputLength, Correction *correction,
            WordsPriorityQueuePool* queuePool, const bool doAutoCompletion, const int maxErrors,
            const int currentWordIndex);
    void getSplitTwoWordsSuggestions(ProximityInfo *proximityInfo,
    void getSplitMultipleWordsSuggestions(ProximityInfo *proximityInfo,
            const int *xcoordinates, const int *ycoordinates, const int *codes,
            const bool useFullEditDistance, const int inputLength,
            Correction *correction, WordsPriorityQueuePool* queuePool,