Merge "Two words error correction with other error correction for the first word" (2010130e) · Commits · e / os / android_packages_inputmethods_LatinIME

native/src/correction.cpp

+40 −27

Original line number	Diff line number	Diff line
		@@ -39,15 +39,15 @@ inline static void initEditDistance(int *editDistanceTable) {
		}
		}

		inline static void dumpEditDistance10ForDebug(int *editDistanceTable, const int inputLength,
		const int outputLength) {
		inline static void dumpEditDistance10ForDebug(int *editDistanceTable,
		const int editDistanceTableWidth, const int outputLength) {
		if (DEBUG_DICT) {
		AKLOGI("EditDistanceTable");
		for (int i = 0; i <= 10; ++i) {
		int c[11];
		for (int j = 0; j <= 10; ++j) {
		if (j < inputLength + 1 && i < outputLength + 1) {
		c[j] = (editDistanceTable + i * (inputLength + 1))[j];
		if (j < editDistanceTableWidth + 1 && i < outputLength + 1) {
		c[j] = (editDistanceTable + i * (editDistanceTableWidth + 1))[j];
		} else {
		c[j] = -1;
		}
		@@ -81,12 +81,12 @@ inline static void calcEditDistanceOneStep(int *editDistanceTable, const unsigne
		}
		}

		inline static int getCurrentEditDistance(
		int *editDistanceTable, const int inputLength, const int outputLength) {
		inline static int getCurrentEditDistance(int *editDistanceTable, const int editDistanceTableWidth,
		const int outputLength, const int inputLength) {
		if (DEBUG_EDIT_DISTANCE) {
		AKLOGI("getCurrentEditDistance %d, %d", inputLength, outputLength);
		}
		return editDistanceTable[(inputLength + 1) * (outputLength + 1) - 1];
		return editDistanceTable[(editDistanceTableWidth + 1) * (outputLength) + inputLength];
		}

		//////////////////////
		@@ -165,6 +165,16 @@ int Correction::getFreqForSplitTwoWords(const int firstFreq, const int secondFre
		}

		int Correction::getFinalFreq(const int freq, unsigned short *word, int wordLength) {
		return getFinalFreqInternal(freq, word, wordLength, mInputLength);
		}

		int Correction::getFinalFreqForSubQueue(const int freq, unsigned short *word, int wordLength,
		const int inputLength) {
		return getFinalFreqInternal(freq, word, wordLength, inputLength);
		}

		int Correction::getFinalFreqInternal(const int freq, unsigned short *word, int wordLength,
		const int inputLength) {
		const int outputIndex = mTerminalOutputIndex;
		const int inputIndex = mTerminalInputIndex;
		*wordLength = outputIndex + 1;
		@@ -173,8 +183,9 @@ int Correction::getFinalFreq(const int freq, unsigned short *word, int wordLen
		}

		*word = mWord;
		return Correction::RankingAlgorithm::calculateFinalFreq(
		inputIndex, outputIndex, freq, mEditDistanceTable, this);
		int finalFreq = Correction::RankingAlgorithm::calculateFinalFreq(
		inputIndex, outputIndex, freq, mEditDistanceTable, this, inputLength);
		return finalFreq;
		}

		bool Correction::initProcessState(const int outputIndex) {
		@@ -613,9 +624,9 @@ inline static bool isUpperCase(unsigned short c) {

		/* static */
		int Correction::RankingAlgorithm::calculateFinalFreq(const int inputIndex, const int outputIndex,
		const int freq, int* editDistanceTable, const Correction* correction) {
		const int freq, int* editDistanceTable, const Correction* correction,
		const int inputLength) {
		const int excessivePos = correction->getExcessivePos();
		const int inputLength = correction->mInputLength;
		const int typedLetterMultiplier = correction->TYPED_LETTER_MULTIPLIER;
		const int fullWordMultiplier = correction->FULL_WORD_MULTIPLIER;
		const ProximityInfo *proximityInfo = correction->mProximityInfo;
		@@ -640,13 +651,13 @@ int Correction::RankingAlgorithm::calculateFinalFreq(const int inputIndex, const
		const unsigned short* word = correction->mWord;
		const bool skipped = skippedCount > 0;

		const int quoteDiffCount = max(0, getQuoteCount(word, outputIndex + 1)
		const int quoteDiffCount = max(0, getQuoteCount(word, outputLength)
		- getQuoteCount(proximityInfo->getPrimaryInputWord(), inputLength));

		// TODO: Calculate edit distance for transposed and excessive
		int ed = 0;
		if (DEBUG_DICT_FULL) {
		dumpEditDistance10ForDebug(editDistanceTable, inputLength, outputIndex + 1);
		dumpEditDistance10ForDebug(editDistanceTable, correction->mInputLength, outputLength);
		}
		int adjustedProximityMatchedCount = proximityMatchedCount;

		@@ -654,22 +665,22 @@ int Correction::RankingAlgorithm::calculateFinalFreq(const int inputIndex, const

		// TODO: Optimize this.
		if (transposedCount > 0 \|\| proximityMatchedCount > 0 \|\| skipped \|\| excessiveCount > 0) {
		ed = getCurrentEditDistance(editDistanceTable, inputLength, outputIndex + 1)
		- transposedCount;
		ed = getCurrentEditDistance(editDistanceTable, correction->mInputLength, outputLength,
		inputLength) - transposedCount;

		const int matchWeight = powerIntCapped(typedLetterMultiplier,
		max(inputLength, outputIndex + 1) - ed);
		max(inputLength, outputLength) - ed);
		multiplyIntCapped(matchWeight, &finalFreq);

		// TODO: Demote further if there are two or more excessive chars with longer user input?
		if (inputLength > outputIndex + 1) {
		if (inputLength > outputLength) {
		multiplyRate(INPUT_EXCEEDS_OUTPUT_DEMOTION_RATE, &finalFreq);
		}

		ed = max(0, ed - quoteDiffCount);

		if (transposedCount < 1) {
		if (ed == 1 && (inputLength == outputIndex \|\| inputLength == outputIndex + 2)) {
		if (ed == 1 && (inputLength == outputLength - 1 \|\| inputLength == outputLength + 1)) {
		// Promote a word with just one skipped or excessive char
		if (sameLength) {
		multiplyRate(WORDS_WITH_JUST_ONE_CORRECTION_PROMOTION_RATE, &finalFreq);
		@@ -681,7 +692,7 @@ int Correction::RankingAlgorithm::calculateFinalFreq(const int inputIndex, const
		sameLength = true;
		}
		}
		adjustedProximityMatchedCount = min(max(0, ed - (outputIndex + 1 - inputLength)),
		adjustedProximityMatchedCount = min(max(0, ed - (outputLength - inputLength)),
		proximityMatchedCount);
		} else {
		const int matchWeight = powerIntCapped(typedLetterMultiplier, matchCount);
		@@ -783,7 +794,8 @@ int Correction::RankingAlgorithm::calculateFinalFreq(const int inputIndex, const
		// Promotion for an exactly matched word
		if (ed == 0) {
		// Full exact match
		if (sameLength && transposedCount == 0 && !skipped && excessiveCount == 0) {
		if (sameLength && transposedCount == 0 && !skipped && excessiveCount == 0
		&& quoteDiffCount == 0) {
		finalFreq = capped255MultForFullMatchAccentsOrCapitalizationDifference(finalFreq);
		}
		}
		@@ -828,14 +840,14 @@ int Correction::RankingAlgorithm::calculateFinalFreq(const int inputIndex, const
		}

		if (DEBUG_DICT_FULL) {
		AKLOGI("calc: %d, %d", outputIndex, sameLength);
		AKLOGI("calc: %d, %d", outputLength, sameLength);
		}

		if (DEBUG_CORRECTION_FREQ) {
		DUMP_WORD(correction->mWord, outputIndex + 1);
		AKLOGI("FinalFreq: [P%d, S%d, T%d, E%d] %d, %d, %d, %d, %d", proximityMatchedCount,
		skippedCount, transposedCount, excessiveCount, lastCharExceeded, sameLength,
		quoteDiffCount, ed, finalFreq);
		DUMP_WORD(correction->mWord, outputLength);
		AKLOGI("FinalFreq: [P%d, S%d, T%d, E%d] %d, %d, %d, %d, %d, %d", proximityMatchedCount,
		skippedCount, transposedCount, excessiveCount, outputLength, lastCharExceeded,
		sameLength, quoteDiffCount, ed, finalFreq);
		}

		return finalFreq;
		@@ -881,11 +893,12 @@ int Correction::RankingAlgorithm::calcFreqForSplitTwoWords(
		if (firstWordLength == 0 \|\| secondWordLength == 0) {
		return 0;
		}
		const int firstDemotionRate = 100 - 100 / (firstWordLength + 1);
		const int firstDemotionRate = 100 - TWO_WORDS_CORRECTION_DEMOTION_BASE / (firstWordLength + 1);
		int tempFirstFreq = firstFreq;
		multiplyRate(firstDemotionRate, &tempFirstFreq);

		const int secondDemotionRate = 100 - 100 / (secondWordLength + 1);
		const int secondDemotionRate = 100
		- TWO_WORDS_CORRECTION_DEMOTION_BASE / (secondWordLength + 1);
		int tempSecondFreq = secondFreq;
		multiplyRate(secondDemotionRate, &tempSecondFreq);

native/src/correction.h

+6 −1

Original line number	Diff line number	Diff line
		@@ -75,6 +75,8 @@ class Correction {
		int getFreqForSplitTwoWords(
		const int firstFreq, const int secondFreq, const unsigned short *word);
		int getFinalFreq(const int freq, unsigned short *word, int wordLength);
		int getFinalFreqForSubQueue(const int freq, unsigned short *word, int wordLength,
		const int inputLength);

		CorrectionType processCharAndCalcState(const int32_t c, const bool isTerminal);

		@@ -97,7 +99,8 @@ class Correction {
		class RankingAlgorithm {
		public:
		static int calculateFinalFreq(const int inputIndex, const int depth,
		const int freq, int editDistanceTable, const Correction correction);
		const int freq, int editDistanceTable, const Correction correction,
		const int inputLength);
		static int calcFreqForSplitTwoWords(const int firstFreq, const int secondFreq,
		const Correction* correction, const unsigned short *word);
		static int calcFreqForSplitTwoWordsOld(const int firstFreq, const int secondFreq,
		@@ -122,6 +125,8 @@ class Correction {
		const int32_t c, const bool isTerminal, const bool inputIndexIncremented);
		inline CorrectionType processUnrelatedCorrectionType();
		inline void addCharToCurrentWord(const int32_t c);
		inline int getFinalFreqInternal(const int freq, unsigned short *word, int wordLength,
		const int inputLength);

		const int TYPED_LETTER_MULTIPLIER;
		const int FULL_WORD_MULTIPLIER;

native/src/defines.h

+6 −2

Original line number	Diff line number	Diff line
		@@ -187,7 +187,7 @@ static void prof_out(void) {
		// The following "rate"s are used as a multiplier before dividing by 100, so they are in percent.
		#define WORDS_WITH_MISSING_CHARACTER_DEMOTION_RATE 80
		#define WORDS_WITH_MISSING_CHARACTER_DEMOTION_START_POS_10X 12
		#define WORDS_WITH_MISSING_SPACE_CHARACTER_DEMOTION_RATE 67
		#define WORDS_WITH_MISSING_SPACE_CHARACTER_DEMOTION_RATE 58
		#define WORDS_WITH_EXCESSIVE_CHARACTER_DEMOTION_RATE 75
		#define WORDS_WITH_EXCESSIVE_CHARACTER_OUT_OF_PROXIMITY_DEMOTION_RATE 75
		#define WORDS_WITH_TRANSPOSED_CHARACTERS_DEMOTION_RATE 70
		@@ -199,6 +199,8 @@ static void prof_out(void) {
		#define INPUT_EXCEEDS_OUTPUT_DEMOTION_RATE 70
		#define FIRST_CHAR_DIFFERENT_DEMOTION_RATE 96
		#define TWO_WORDS_CAPITALIZED_DEMOTION_RATE 50
		#define TWO_WORDS_CORRECTION_DEMOTION_BASE 80
		#define TWO_WORDS_PLUS_OTHER_ERROR_CORRECTION_DEMOTION_DIVIDER 1
		#define ZERO_DISTANCE_PROMOTION_RATE 110
		#define NEUTRAL_SCORE_SQUARED_RADIUS 8.0f
		#define HALF_SCORE_SQUARED_RADIUS 32.0f
		@@ -212,8 +214,10 @@ static void prof_out(void) {
		// Holds up to 1 candidate for each word
		#define SUB_QUEUE_MAX_WORDS 1
		#define SUB_QUEUE_MAX_COUNT 10
		#define SUB_QUEUE_MIN_WORD_LENGTH 4

		#define TWO_WORDS_CORRECTION_THRESHOLD 0.22f
		#define TWO_WORDS_CORRECTION_WITH_OTHER_ERROR_THRESHOLD 0.39
		#define START_TWO_WORDS_CORRECTION_THRESHOLD 0.22

		#define MAX_DEPTH_MULTIPLIER 3

native/src/unigram_dictionary.cpp

+73 −37

Original line number	Diff line number	Diff line
		@@ -254,7 +254,7 @@ void UnigramDictionary::getWordSuggestions(ProximityInfo *proximityInfo,
		proximityInfo->getPrimaryInputWord(), i, word, wordLength, score);
		ns += 0;
		AKLOGI("--- TOP SUB WORDS for %d --- %d %f [%d]", i, score, ns,
		(ns > TWO_WORDS_CORRECTION_THRESHOLD));
		(ns > TWO_WORDS_CORRECTION_WITH_OTHER_ERROR_THRESHOLD));
		DUMP_WORD(proximityInfo->getPrimaryInputWord(), i);
		DUMP_WORD(word, wordLength);
		}
		@@ -343,37 +343,27 @@ inline void UnigramDictionary::onTerminal(const int freq,
		WordsPriorityQueuePool *queuePool, const bool addToMasterQueue) {
		const int inputIndex = correction->getInputIndex();
		const bool addToSubQueue = inputIndex < SUB_QUEUE_MAX_COUNT;
		if (!addToMasterQueue && !addToSubQueue) {
		return;
		}
		WordsPriorityQueue *masterQueue = queuePool->getMasterQueue();
		WordsPriorityQueue *subQueue = queuePool->getSubQueue1(inputIndex);

		int wordLength;
		unsigned short* wordPointer;

		if (addToMasterQueue) {
		WordsPriorityQueue *masterQueue = queuePool->getMasterQueue();
		const int finalFreq = correction->getFinalFreq(freq, &wordPointer, &wordLength);
		if (finalFreq != NOT_A_FREQUENCY) {
		if (!terminalAttributes.isShortcutOnly()) {
		if (addToMasterQueue) {
		addWord(wordPointer, wordLength, finalFreq, masterQueue);
		}
		// TODO: Check the validity of "inputIndex == wordLength"
		//if (addToSubQueue && inputIndex == wordLength) {
		if (addToSubQueue) {
		addWord(wordPointer, wordLength, finalFreq, subQueue);
		}
		}
		// Please note that the shortcut candidates will be added to the master queue only.
		if (!addToMasterQueue) {
		return;
		}

		// From here, below is the code to add shortcut candidates.
		TerminalAttributes::ShortcutIterator iterator = terminalAttributes.getShortcutIterator();
		// Please note that the shortcut candidates will be added to the master queue only.
		TerminalAttributes::ShortcutIterator iterator =
		terminalAttributes.getShortcutIterator();
		while (iterator.hasNextShortcutTarget()) {
		// TODO: addWord only supports weak ordering, meaning we have no means to control the
		// order of the shortcuts relative to one another or to the word. We need to either
		// modulate the frequency of each shortcut according to its own shortcut frequency or
		// to make the queue so that the insert order is protected inside the queue for words
		// TODO: addWord only supports weak ordering, meaning we have no means
		// to control the order of the shortcuts relative to one another or to the word.
		// We need to either modulate the frequency of each shortcut according
		// to its own shortcut frequency or to make the queue
		// so that the insert order is protected inside the queue for words
		// with the same score.
		uint16_t shortcutTarget[MAX_WORD_LENGTH_INTERNAL];
		const int shortcutTargetStringLength = iterator.getNextShortcutTarget(
		@@ -383,6 +373,18 @@ inline void UnigramDictionary::onTerminal(const int freq,
		}
		}

		// We only allow two words + other error correction for words with SUB_QUEUE_MIN_WORD_LENGTH
		// or more length.
		if (inputIndex >= SUB_QUEUE_MIN_WORD_LENGTH && addToSubQueue) {
		// TODO: Check the validity of "inputIndex == wordLength"
		//if (addToSubQueue && inputIndex == wordLength) {
		WordsPriorityQueue *subQueue = queuePool->getSubQueue1(inputIndex);
		const int finalFreq = correction->getFinalFreqForSubQueue(freq, &wordPointer, &wordLength,
		inputIndex);
		addWord(wordPointer, wordLength, finalFreq, subQueue);
		}
		}

		void UnigramDictionary::getSplitTwoWordsSuggestions(ProximityInfo *proximityInfo,
		const int xcoordinates, const int ycoordinates, const int *codes,
		const bool useFullEditDistance, const int inputLength, const int missingSpacePos,
		@@ -397,20 +399,57 @@ void UnigramDictionary::getSplitTwoWordsSuggestions(ProximityInfo *proximityInfo
		}
		const bool isSpaceProximity = spaceProximityPos >= 0;
		const int firstWordStartPos = 0;

		const int firstTypedWordLength = isSpaceProximity ? spaceProximityPos : missingSpacePos;
		int firstFreq = getMostFrequentWordLike(0, firstTypedWordLength, proximityInfo, mWord);
		unsigned short* firstWord = 0;
		int firstWordLength = 0;
		if (firstFreq > 0) {
		firstWordLength = firstTypedWordLength;
		firstWord = mWord;
		} else {
		if (masterQueue->size() > 0) {
		double nsForMaster = masterQueue->getHighestNormalizedScore(
		proximityInfo->getPrimaryInputWord(), inputLength, 0, 0, 0);
		if (nsForMaster > START_TWO_WORDS_CORRECTION_THRESHOLD) {
		// Do nothing if the highest suggestion exceeds the threshold.
		return;
		}
		}
		WordsPriorityQueue* firstWordQueue = queuePool->getSubQueue1(firstTypedWordLength);
		if (firstWordQueue->size() < 1) {
		return;
		}
		int score = 0;
		const double ns = firstWordQueue->getHighestNormalizedScore(
		proximityInfo->getPrimaryInputWord(), firstTypedWordLength, &firstWord, &score,
		&firstWordLength);
		// Two words correction won't be done if the score of the first word doesn't exceed the
		// threshold.
		if (ns < TWO_WORDS_CORRECTION_WITH_OTHER_ERROR_THRESHOLD) {
		return;
		}
		firstFreq = score >> (firstWordLength
		+ TWO_WORDS_PLUS_OTHER_ERROR_CORRECTION_DEMOTION_DIVIDER);
		}

		if (firstFreq <= 0) {
		return;
		}

		const int secondWordStartPos = isSpaceProximity ? (spaceProximityPos + 1) : missingSpacePos;
		const int firstWordLength = isSpaceProximity ? spaceProximityPos : missingSpacePos;
		const int secondWordLength = isSpaceProximity
		? (inputLength - spaceProximityPos - 1)
		: (inputLength - missingSpacePos);

		if (inputLength >= MAX_WORD_LENGTH) return;

		if (0 >= firstWordLength \|\| 0 >= secondWordLength \|\| firstWordStartPos >= secondWordStartPos
		\|\| firstWordStartPos < 0 \|\| secondWordStartPos + secondWordLength > inputLength)
		return;

		const int newWordLength = firstWordLength + secondWordLength + 1;


		// Space proximity preparation
		//WordsPriorityQueue *subQueue = queuePool->getSubQueue1();
		//initSuggestions(proximityInfo, xcoordinates, ycoordinates, codes, firstWordLength, subQueue,
		@@ -420,15 +459,12 @@ void UnigramDictionary::getSplitTwoWordsSuggestions(ProximityInfo *proximityInfo

		// Allocating variable length array on stack
		unsigned short word[newWordLength];
		const int firstFreq = getMostFrequentWordLike(
		firstWordStartPos, firstWordLength, proximityInfo, mWord);
		if (DEBUG_DICT) {
		AKLOGI("First freq: %d", firstFreq);
		}
		if (firstFreq <= 0) return;

		for (int i = 0; i < firstWordLength; ++i) {
		word[i] = mWord[i];
		word[i] = firstWord[i];
		}

		const int secondFreq = getMostFrequentWordLike(

native/src/words_priority_queue.h

+29 −0

Original line number	Diff line number	Diff line
		@@ -47,6 +47,7 @@ class WordsPriorityQueue {
		for (int i = 0; i < maxWordLength; ++i) {
		mSuggestedWords[i].mUsed = false;
		}
		mHighestSuggestedWord = 0;
		}

		~WordsPriorityQueue() {
		@@ -79,6 +80,9 @@ class WordsPriorityQueue {
		DUMP_WORD(word, wordLength);
		}
		mSuggestions.push(sw);
		if (!mHighestSuggestedWord \|\| mHighestSuggestedWord->mScore < sw->mScore) {
		mHighestSuggestedWord = sw;
		}
		}

		SuggestedWord* top() {
		@@ -88,6 +92,7 @@ class WordsPriorityQueue {
		}

		int outputSuggestions(int frequencies, unsigned short outputChars) {
		mHighestSuggestedWord = 0;
		const unsigned int size = min(MAX_WORDS, mSuggestions.size());
		int index = size - 1;
		while (!mSuggestions.empty() && index >= 0) {
		@@ -116,6 +121,7 @@ class WordsPriorityQueue {
		}

		void clear() {
		mHighestSuggestedWord = 0;
		while (!mSuggestions.empty()) {
		SuggestedWord* sw = mSuggestions.top();
		if (DEBUG_WORDS_PRIORITY_QUEUE) {
		@@ -134,6 +140,28 @@ class WordsPriorityQueue {
		DUMP_WORD(mSuggestions.top()->mWord, mSuggestions.top()->mWordLength);
		}

		double getHighestNormalizedScore(const unsigned short* before, const int beforeLength,
		unsigned short** outWord, int outScore, int outLength) {
		if (!mHighestSuggestedWord) {
		return 0.0;
		}
		SuggestedWord* sw = mHighestSuggestedWord;
		const int score = sw->mScore;
		unsigned short* word = sw->mWord;
		const int wordLength = sw->mWordLength;
		if (outScore) {
		*outScore = score;
		}
		if (outWord) {
		*outWord = word;
		}
		if (outLength) {
		*outLength = wordLength;
		}
		return Correction::RankingAlgorithm::calcNormalizedScore(
		before, beforeLength, word, wordLength, score);
		}

		private:
		struct wordComparator {
		bool operator ()(SuggestedWord * left, SuggestedWord * right) {
		@@ -158,6 +186,7 @@ class WordsPriorityQueue {
		const unsigned int MAX_WORDS;
		const unsigned int MAX_WORD_LENGTH;
		SuggestedWord* mSuggestedWords;
		SuggestedWord* mHighestSuggestedWord;
		};
		}