Merge "Support multi words suggestion" (bb7a39b4) · Commits · e / os / android_packages_inputmethods_LatinIME

native/src/correction.cpp

+72 −32

Original line number	Diff line number	Diff line
		@@ -159,10 +159,10 @@ void Correction::checkState() {
		}
		}

		int Correction::getFreqForSplitTwoWords(const int freqArray, const int wordLengthArray,
		const bool isSpaceProximity, const unsigned short *word) {
		return Correction::RankingAlgorithm::calcFreqForSplitTwoWords(freqArray, wordLengthArray, this,
		isSpaceProximity, word);
		int Correction::getFreqForSplitMultipleWords(const int freqArray, const int wordLengthArray,
		const int wordCount, const bool isSpaceProximity, const unsigned short *word) {
		return Correction::RankingAlgorithm::calcFreqForSplitMultipleWords(freqArray, wordLengthArray,
		wordCount, this, isSpaceProximity, word);
		}

		int Correction::getFinalFreq(const int freq, unsigned short *word, int wordLength) {
		@@ -911,45 +911,85 @@ int Correction::RankingAlgorithm::calculateFinalFreq(const int inputIndex, const
		}

		/* static */
		int Correction::RankingAlgorithm::calcFreqForSplitTwoWords(
		const int freqArray, const int wordLengthArray, const Correction* correction,
		const bool isSpaceProximity, const unsigned short *word) {
		const int firstFreq = freqArray[0];
		const int secondFreq = freqArray[1];
		const int firstWordLength = wordLengthArray[0];
		const int secondWordLength = wordLengthArray[1];
		int Correction::RankingAlgorithm::calcFreqForSplitMultipleWords(
		const int freqArray, const int wordLengthArray, const int wordCount,
		const Correction* correction, const bool isSpaceProximity, const unsigned short *word) {
		const int typedLetterMultiplier = correction->TYPED_LETTER_MULTIPLIER;

		bool firstCapitalizedWordDemotion = false;
		bool secondCapitalizedWordDemotion = false;

		{
		// TODO: Handle multiple capitalized word demotion properly
		const int firstWordLength = wordLengthArray[0];
		const int secondWordLength = wordLengthArray[1];
		if (firstWordLength >= 2) {
		firstCapitalizedWordDemotion = isUpperCase(word[0]);
		}

		bool secondCapitalizedWordDemotion = false;
		if (secondWordLength >= 2) {
		// FIXME: word[firstWordLength + 1] is incorrect.
		secondCapitalizedWordDemotion = isUpperCase(word[firstWordLength + 1]);
		}
		}


		const bool capitalizedWordDemotion =
		firstCapitalizedWordDemotion ^ secondCapitalizedWordDemotion;

		if (firstWordLength == 0 \|\| secondWordLength == 0) {
		int totalLength = 0;
		int totalFreq = 0;
		for (int i = 0; i < wordCount; ++i){
		const int wordLength = wordLengthArray[i];
		if (wordLength <= 0) {
		return 0;
		}
		const int firstDemotionRate = 100 - TWO_WORDS_CORRECTION_DEMOTION_BASE / (firstWordLength + 1);
		int tempFirstFreq = firstFreq;
		multiplyRate(firstDemotionRate, &tempFirstFreq);

		const int secondDemotionRate = 100
		- TWO_WORDS_CORRECTION_DEMOTION_BASE / (secondWordLength + 1);
		int tempSecondFreq = secondFreq;
		multiplyRate(secondDemotionRate, &tempSecondFreq);
		totalLength += wordLength;
		const int demotionRate = 100 - TWO_WORDS_CORRECTION_DEMOTION_BASE / (wordLength + 1);
		int tempFirstFreq = freqArray[i];
		multiplyRate(demotionRate, &tempFirstFreq);
		totalFreq += tempFirstFreq;
		}

		const int totalLength = firstWordLength + secondWordLength;
		if (totalLength <= 0 \|\| totalFreq <= 0) {
		return 0;
		}

		// TODO: Currently totalFreq is adjusted to the two-word metric.
		// Promote pairFreq by multiplying by 2, because the word length is the same as the typed
		// length.
		int totalFreq = tempFirstFreq + tempSecondFreq;
		totalFreq = totalFreq * 2 / wordCount;
		if (wordCount > 2) {
		// Safety net for 3+ words -- Caveats: many heuristics and workarounds here.
		int oneLengthCounter = 0;
		int twoLengthCounter = 0;
		for (int i = 0; i < wordCount; ++i) {
		const int wordLength = wordLengthArray[i];
		// TODO: Use bigram instead of this safety net
		if (i < wordCount - 1) {
		const int nextWordLength = wordLengthArray[i + 1];
		if (wordLength == 1 && nextWordLength == 2) {
		// Safety net: reject a length-1 word immediately followed by a length-2 word
		return 0;
		}
		}
		const int freq = freqArray[i];
		// Demote words that are both short and low-frequency
		if (wordLength <= 4 && freq <= MAX_FREQ * 2 / 3 /* heuristic... */) {
		multiplyRate(100 * freq / MAX_FREQ, &totalFreq);
		}
		if (wordLength == 1) {
		++oneLengthCounter;
		} else if (wordLength == 2) {
		++twoLengthCounter;
		}
		if (oneLengthCounter >= 2 \|\| (oneLengthCounter + twoLengthCounter) >= 4) {
		// Safety net to filter too many short words
		return 0;
		}
		}
		multiplyRate(MULTIPLE_WORDS_DEMOTION_RATE, &totalFreq);
		}

		// This is a workaround to try offsetting the not-enough-demotion which will be done in
		// calcNormalizedScore in Utils.java.
		@@ -993,9 +1033,9 @@ int Correction::RankingAlgorithm::calcFreqForSplitTwoWords(
		}

		if (DEBUG_CORRECTION_FREQ) {
		AKLOGI("Two words (%d, %d) (%d, %d) %d, %d", firstFreq, secondFreq, firstWordLength,
		secondWordLength, capitalizedWordDemotion, totalFreq);
		DUMP_WORD(word, firstWordLength);
		AKLOGI("Multiple words (%d, %d) (%d, %d) %d, %d", freqArray[0], freqArray[1],
		wordLengthArray[0], wordLengthArray[1], capitalizedWordDemotion, totalFreq);
		DUMP_WORD(word, wordLengthArray[0]);
		}

		return totalFreq;

native/src/correction.h

+5 −5

Original line number	Diff line number	Diff line
		@@ -121,9 +121,9 @@ class Correction {

		bool needsToPrune() const;

		int getFreqForSplitTwoWords(
		const int freqArray, const int wordLengthArray, const bool isSpaceProximity,
		const unsigned short *word);
		int getFreqForSplitMultipleWords(
		const int freqArray, const int wordLengthArray, const int wordCount,
		const bool isSpaceProximity, const unsigned short *word);
		int getFinalFreq(const int freq, unsigned short *word, int wordLength);
		int getFinalFreqForSubQueue(const int freq, unsigned short *word, int wordLength,
		const int inputLength);
		@@ -151,8 +151,8 @@ class Correction {
		static int calculateFinalFreq(const int inputIndex, const int depth,
		const int freq, int editDistanceTable, const Correction correction,
		const int inputLength);
		static int calcFreqForSplitTwoWords(const int freqArray, const int wordLengthArray,
		const Correction* correction, const bool isSpaceProximity,
		static int calcFreqForSplitMultipleWords(const int freqArray, const int wordLengthArray,
		const int wordCount, const Correction* correction, const bool isSpaceProximity,
		const unsigned short *word);
		static double calcNormalizedScore(const unsigned short* before, const int beforeLength,
		const unsigned short* after, const int afterLength, const int score);

native/src/defines.h

+5 −3

Original line number	Diff line number	Diff line
		@@ -208,6 +208,7 @@ static void prof_out(void) {
		#define ZERO_DISTANCE_PROMOTION_RATE 110
		#define NEUTRAL_SCORE_SQUARED_RADIUS 8.0f
		#define HALF_SCORE_SQUARED_RADIUS 32.0f
		#define MAX_FREQ 255

		// This must be greater than or equal to MAX_WORD_LENGTH defined in BinaryDictionary.java.
		// This is only used for array sizes; it must not be used in C functions.
		@@ -222,7 +223,9 @@ static void prof_out(void) {
		#define SUB_QUEUE_MAX_WORDS 1
		#define SUB_QUEUE_MAX_COUNT 10
		#define SUB_QUEUE_MIN_WORD_LENGTH 4
		#define MULTIPLE_WORDS_SUGGESTION_MAX_WORDS 2
		#define MULTIPLE_WORDS_SUGGESTION_MAX_WORDS 10
		#define MULTIPLE_WORDS_DEMOTION_RATE 80
		#define MIN_INPUT_LENGTH_FOR_THREE_OR_MORE_WORDS_CORRECTION 6

		#define TWO_WORDS_CORRECTION_WITH_OTHER_ERROR_THRESHOLD 0.39
		#define START_TWO_WORDS_CORRECTION_THRESHOLD 0.22
		@@ -230,7 +233,6 @@ static void prof_out(void) {
		#define MAX_DEPTH_MULTIPLIER 3

		#define FIRST_WORD_INDEX 0
		#define SECOND_WORD_INDEX 1

		// TODO: Reduce this constant if possible; check the maximum number of umlauts in the same German
		// word in the dictionary
		@@ -248,7 +250,7 @@ template<typename T> inline T max(T a, T b) { return a > b ? a : b; }
		#define NEUTRAL_AREA_RADIUS_RATIO 1.3f

		// DEBUG
		#define INPUTLENGTH_FOR_DEBUG 10
		#define INPUTLENGTH_FOR_DEBUG -1
		#define MIN_OUTPUT_INDEX_FOR_DEBUG -1

		#endif // LATINIME_DEFINES_H

native/src/unigram_dictionary.cpp

+42 −23

Original line number	Diff line number	Diff line
		@@ -224,7 +224,7 @@ void UnigramDictionary::getWordSuggestions(ProximityInfo *proximityInfo,
		// Multiple word suggestions
		if (SUGGEST_MULTIPLE_WORDS
		&& inputLength >= MIN_USER_TYPED_LENGTH_FOR_MULTIPLE_WORD_SUGGESTION) {
		getSplitTwoWordsSuggestions(proximityInfo, xcoordinates, ycoordinates, codes,
		getSplitMultipleWordsSuggestions(proximityInfo, xcoordinates, ycoordinates, codes,
		useFullEditDistance, inputLength, correction, queuePool,
		hasAutoCorrectionCandidate);
		}
		@@ -445,17 +445,18 @@ bool UnigramDictionary::getSubStringSuggestion(
		if (outputWordStartPos + nextWordLength >= MAX_WORD_LENGTH) {
		return false;
		}
		outputWord[outputWordStartPos + tempOutputWordLength] = SPACE;
		outputWord[tempOutputWordLength] = SPACE;
		if (outputWordLength) {
		++*outputWordLength;
		}
		} else if (currentWordIndex >= 1) {
		// TODO: Handle 3 or more words
		const int pairFreq = correction->getFreqForSplitTwoWords(
		freqArray, wordLengthArray, isSpaceProximity, outputWord);
		const int pairFreq = correction->getFreqForSplitMultipleWords(
		freqArray, wordLengthArray, currentWordIndex + 1, isSpaceProximity, outputWord);
		if (DEBUG_DICT) {
		AKLOGI("Split two words: %d, %d, %d, %d, (%d)", freqArray[0], freqArray[1], pairFreq,
		inputLength, wordLengthArray[0]);
		DUMP_WORD(outputWord, tempOutputWordLength);
		AKLOGI("Split two words: %d, %d, %d, %d, (%d) %d", freqArray[0], freqArray[1], pairFreq,
		inputLength, wordLengthArray[0], tempOutputWordLength);
		}
		addWord(outputWord, tempOutputWordLength, pairFreq, queuePool->getMasterQueue());
		}
		@@ -473,30 +474,46 @@ void UnigramDictionary::getMultiWordsSuggestionRec(ProximityInfo *proximityInfo,
		// Return if this is the last word index
		return;
		}
		for (int i = 1; i < inputLength; ++i) {
		int tempOutputWordLength = 0;
		// First word
		int inputWordStartPos = 0;
		int inputWordLength = i;
		if (startWordIndex >= 1
		&& (hasAutoCorrectionCandidate
		\|\| inputLength < MIN_INPUT_LENGTH_FOR_THREE_OR_MORE_WORDS_CORRECTION)) {
		// Do not suggest 3+ words if there is already an auto correction candidate
		// or the input is too short
		return;
		}
		for (int i = startInputPos + 1; i < inputLength; ++i) {
		if (DEBUG_CORRECTION_FREQ) {
		AKLOGI("Two words, %d", inputWordLength);
		AKLOGI("Multi words(%d), start in %d sep %d start out %d",
		startWordIndex, startInputPos, i, outputWordLength);
		DUMP_WORD(outputWord, outputWordLength);
		}
		int tempOutputWordLength = 0;
		// Current word
		int inputWordStartPos = startInputPos;
		int inputWordLength = i - startInputPos;
		if (!getSubStringSuggestion(proximityInfo, xcoordinates, ycoordinates, codes,
		useFullEditDistance, correction, queuePool, inputLength, hasAutoCorrectionCandidate,
		FIRST_WORD_INDEX, inputWordStartPos, inputWordLength, 0, true /* not used */,
		freqArray, wordLengthArray, outputWord, &tempOutputWordLength)) {
		startWordIndex, inputWordStartPos, inputWordLength, outputWordLength,
		true /* not used */, freqArray, wordLengthArray, outputWord,
		&tempOutputWordLength)) {
		continue;
		}

		// Second word
		if (DEBUG_CORRECTION_FREQ) {
		AKLOGI("Do missing space correction");
		}
		// Next word
		// Missing space
		inputWordStartPos = i;
		inputWordLength = inputLength - i;
		getSubStringSuggestion(proximityInfo, xcoordinates, ycoordinates, codes,
		if(!getSubStringSuggestion(proximityInfo, xcoordinates, ycoordinates, codes,
		useFullEditDistance, correction, queuePool, inputLength, hasAutoCorrectionCandidate,
		SECOND_WORD_INDEX, inputWordStartPos, inputWordLength, tempOutputWordLength,
		false /* missing space */, freqArray, wordLengthArray, outputWord,
		0);
		startWordIndex + 1, inputWordStartPos, inputWordLength, tempOutputWordLength,
		false /* missing space */, freqArray, wordLengthArray, outputWord, 0)) {
		getMultiWordsSuggestionRec(proximityInfo, xcoordinates, ycoordinates, codes,
		useFullEditDistance, inputLength, correction, queuePool,
		hasAutoCorrectionCandidate, inputWordStartPos, startWordIndex + 1,
		tempOutputWordLength, freqArray, wordLengthArray, outputWord);
		}

		// Mistyped space
		++inputWordStartPos;
		@@ -512,15 +529,17 @@ void UnigramDictionary::getMultiWordsSuggestionRec(ProximityInfo *proximityInfo,
		continue;
		}

		if (DEBUG_CORRECTION_FREQ) {
		AKLOGI("Do mistyped space correction");
		}
		getSubStringSuggestion(proximityInfo, xcoordinates, ycoordinates, codes,
		useFullEditDistance, correction, queuePool, inputLength, hasAutoCorrectionCandidate,
		SECOND_WORD_INDEX, inputWordStartPos, inputWordLength, tempOutputWordLength,
		true /* mistyped space */, freqArray, wordLengthArray, outputWord,
		0);
		startWordIndex + 1, inputWordStartPos, inputWordLength, tempOutputWordLength,
		true /* mistyped space */, freqArray, wordLengthArray, outputWord, 0);
		}
		}

		void UnigramDictionary::getSplitTwoWordsSuggestions(ProximityInfo *proximityInfo,
		void UnigramDictionary::getSplitMultipleWordsSuggestions(ProximityInfo *proximityInfo,
		const int xcoordinates, const int ycoordinates, const int *codes,
		const bool useFullEditDistance, const int inputLength,
		Correction correction, WordsPriorityQueuePool queuePool,

native/src/unigram_dictionary.h

+1 −1

Original line number	Diff line number	Diff line
		@@ -101,7 +101,7 @@ class UnigramDictionary {
		const bool useFullEditDistance, const int inputLength, Correction *correction,
		WordsPriorityQueuePool* queuePool, const bool doAutoCompletion, const int maxErrors,
		const int currentWordIndex);
		void getSplitTwoWordsSuggestions(ProximityInfo *proximityInfo,
		void getSplitMultipleWordsSuggestions(ProximityInfo *proximityInfo,
		const int xcoordinates, const int ycoordinates, const int *codes,
		const bool useFullEditDistance, const int inputLength,
		Correction correction, WordsPriorityQueuePool queuePool,