Merge "Prepair for advanced two words error correction" (ff020671) · Commits · e / os / android_packages_inputmethods_LatinIME

native/src/correction.cpp

+95 −1

Original line number	Diff line number	Diff line
		@@ -83,7 +83,7 @@ inline static void calcEditDistanceOneStep(int *editDistanceTable, const unsigne

		inline static int getCurrentEditDistance(
		int *editDistanceTable, const int inputLength, const int outputLength) {
		if (DEBUG_DICT) {
		if (DEBUG_EDIT_DISTANCE) {
		AKLOGI("getCurrentEditDistance %d, %d", inputLength, outputLength);
		}
		return editDistanceTable[(inputLength + 1) * (outputLength + 1) - 1];
		@@ -935,6 +935,100 @@ int Correction::RankingAlgorithm::calcFreqForSplitTwoWords(
		return totalFreq;
		}

		/* static */
		/**
		 * "Old" scoring path for a suggestion candidate made of two words.
		 *
		 * The split point is correction->mSpaceProximityPos when that is non-negative, otherwise
		 * correction->mMissingSpacePos; the DEBUG_DICT assert expects at most one of the two to be
		 * non-negative.
		 *
		 * @param firstFreq  frequency of the first word
		 * @param secondFreq frequency of the second word
		 * @param correction supplies the split positions, mInputLength and TYPED_LETTER_MULTIPLIER
		 * @param word       candidate buffer; word[0] and word[firstWordLength + 1] are read as the
		 *                   initial letters of the first and second word
		 * @return the combined frequency, or 0 when either word length is not positive
		 */
		int Correction::RankingAlgorithm::calcFreqForSplitTwoWordsOld(
		        const int firstFreq, const int secondFreq, const Correction* correction,
		        const unsigned short *word) {
		    const int spaceProximityPos = correction->mSpaceProximityPos;
		    const int missingSpacePos = correction->mMissingSpacePos;
		    if (DEBUG_DICT) {
		        int inputCount = 0;
		        if (spaceProximityPos >= 0) ++inputCount;
		        if (missingSpacePos >= 0) ++inputCount;
		        assert(inputCount <= 1);
		    }
		    const bool isSpaceProximity = spaceProximityPos >= 0;
		    const int inputLength = correction->mInputLength;
		    const int firstWordLength = isSpaceProximity ? spaceProximityPos : missingSpacePos;
		    const int secondWordLength = isSpaceProximity ? (inputLength - spaceProximityPos - 1)
		            : (inputLength - missingSpacePos);

		    // Reject degenerate splits before reading word[] or dividing by (length + 1).
		    // A length of -1 (for example when missingSpacePos is -1) would otherwise make the
		    // demotion-rate computation below divide by zero.
		    if (firstWordLength <= 0 || secondWordLength <= 0) {
		        return 0;
		    }

		    const int typedLetterMultiplier = correction->TYPED_LETTER_MULTIPLIER;

		    // Only words of two or more letters take part in the capitalization check.
		    bool firstCapitalizedWordDemotion = false;
		    if (firstWordLength >= 2) {
		        firstCapitalizedWordDemotion = isUpperCase(word[0]);
		    }

		    bool secondCapitalizedWordDemotion = false;
		    if (secondWordLength >= 2) {
		        secondCapitalizedWordDemotion = isUpperCase(word[firstWordLength + 1]);
		    }

		    // Demote only when exactly one of the two words is flagged as capitalized.
		    const bool capitalizedWordDemotion =
		            firstCapitalizedWordDemotion ^ secondCapitalizedWordDemotion;

		    if (DEBUG_DICT_FULL) {
		        AKLOGI("Two words: %c, %c, %d",
		                word[0], word[firstWordLength + 1], capitalizedWordDemotion);
		    }

		    const int firstDemotionRate = 100 - 100 / (firstWordLength + 1);
		    int tempFirstFreq = firstFreq;
		    multiplyRate(firstDemotionRate, &tempFirstFreq);

		    const int secondDemotionRate = 100 - 100 / (secondWordLength + 1);
		    int tempSecondFreq = secondFreq;
		    multiplyRate(secondDemotionRate, &tempSecondFreq);

		    const int totalLength = firstWordLength + secondWordLength;

		    // The pair frequency is the plain sum of the two demoted frequencies.
		    // NOTE(review): an earlier comment here spoke of promoting pairFreq by multiplying by 2;
		    // no such multiplication is performed.
		    int totalFreq = tempFirstFreq + tempSecondFreq;

		    // This is a workaround to try offsetting the not-enough-demotion which will be done in
		    // calcNormalizedScore in Utils.java.
		    // In calcNormalizedScore the score will be demoted by (1 - 1 / length)
		    // but we demoted only (1 - 1 / (length + 1)) so we will additionally adjust freq by
		    // (1 - 1 / length) / (1 - 1 / (length + 1)) = (1 - 1 / (length * length))
		    const int normalizedScoreNotEnoughDemotionAdjustment =
		            100 - 100 / (totalLength * totalLength);
		    multiplyRate(normalizedScoreNotEnoughDemotionAdjustment, &totalFreq);

		    // At this moment, totalFreq is calculated by the following formula:
		    // (firstFreq * (1 - 1 / (firstWordLength + 1))
		    //         + secondFreq * (1 - 1 / (secondWordLength + 1)))
		    // * (1 - 1 / totalLength) / (1 - 1 / (totalLength + 1))

		    multiplyIntCapped(powerIntCapped(typedLetterMultiplier, totalLength), &totalFreq);

		    // This is another workaround to offset the demotion which will be done in
		    // calcNormalizedScore in Utils.java.
		    // In calcNormalizedScore the score will be demoted by (1 - 1 / length) so we have to
		    // promote the same amount because we already have adjusted the synthetic freq of this
		    // "missing or mistyped space" suggestion candidate above in this method.
		    const int normalizedScoreDemotionRateOffset = (100 + 100 / totalLength);
		    multiplyRate(normalizedScoreDemotionRateOffset, &totalFreq);

		    if (isSpaceProximity) {
		        // A word pair with one space proximity correction
		        if (DEBUG_DICT) {
		            AKLOGI("Found a word pair with space proximity correction.");
		        }
		        multiplyIntCapped(typedLetterMultiplier, &totalFreq);
		        multiplyRate(WORDS_WITH_PROXIMITY_CHARACTER_DEMOTION_RATE, &totalFreq);
		    }

		    // Applied to every split candidate, whichever kind of split produced it.
		    multiplyRate(WORDS_WITH_MISSING_SPACE_CHARACTER_DEMOTION_RATE, &totalFreq);

		    if (capitalizedWordDemotion) {
		        multiplyRate(TWO_WORDS_CAPITALIZED_DEMOTION_RATE, &totalFreq);
		    }

		    return totalFreq;
		}

		/* Damerau-Levenshtein distance */
		inline static int editDistanceInternal(
		int* editDistanceTable, const unsigned short* before,

native/src/correction.h

+2 −0

Original line number	Diff line number	Diff line
		@@ -100,6 +100,8 @@ class Correction {
		const int freq, int editDistanceTable, const Correction correction);
		// Frequency for a candidate that splits the input into two words; the definition lives in
		// correction.cpp. Takes the two word frequencies, the Correction state and the candidate
		// buffer.
		static int calcFreqForSplitTwoWords(const int firstFreq, const int secondFreq,
		const Correction* correction, const unsigned short *word);
		// "Old" variant with the same parameter list as calcFreqForSplitTwoWords. Its definition in
		// correction.cpp reads the split position from correction->mSpaceProximityPos or
		// correction->mMissingSpacePos and returns 0 when either word length is 0.
		static int calcFreqForSplitTwoWordsOld(const int firstFreq, const int secondFreq,
		const Correction* correction, const unsigned short *word);
		// Turns a raw score for the pair (before, after) into a double. The debug dump in
		// unigram_dictionary.cpp compares the result against TWO_WORDS_CORRECTION_THRESHOLD.
		// NOTE(review): the formula is not visible here; comments in correction.cpp mention a
		// (1 - 1 / length) demotion in the Utils.java counterpart - confirm it matches.
		static double calcNormalizedScore(const unsigned short* before, const int beforeLength,
		const unsigned short* after, const int afterLength, const int score);
		static int editDistance(const unsigned short* before,

native/src/defines.h

+4 −2

Original line number	Diff line number	Diff line
		@@ -117,8 +117,8 @@ static void prof_out(void) {
		#define DEBUG_TRACE DEBUG_DICT_FULL
		#define DEBUG_PROXIMITY_INFO false
		#define DEBUG_CORRECTION false
		#define DEBUG_CORRECTION_FREQ true
		#define DEBUG_WORDS_PRIORITY_QUEUE true
		#define DEBUG_CORRECTION_FREQ false
		#define DEBUG_WORDS_PRIORITY_QUEUE false

		#else // FLAG_DBG

		@@ -213,6 +213,8 @@ static void prof_out(void) {
		#define SUB_QUEUE_MAX_WORDS 1
		#define SUB_QUEUE_MAX_COUNT 10

		// Cutoff compared against Correction::RankingAlgorithm::calcNormalizedScore() output.
		// NOTE(review): in the code visible in this change it is only used by the debug dump of the
		// sub-queue top suggestions; presumably it will gate two-word correction - confirm.
		#define TWO_WORDS_CORRECTION_THRESHOLD 0.22f

		#define MAX_DEPTH_MULTIPLIER 3

		// TODO: Reduce this constant if possible; check the maximum number of umlauts in the same German

native/src/unigram_dictionary.cpp

+91 −1

Original line number	Diff line number	Diff line
		@@ -241,8 +241,24 @@ void UnigramDictionary::getWordSuggestions(ProximityInfo *proximityInfo,
		}
		}
		PROF_END(6);
		if (DEBUG_WORDS_PRIORITY_QUEUE) {
		if (DEBUG_DICT) {
		queuePool->dumpSubQueue1TopSuggestions();
		for (int i = 0; i < SUB_QUEUE_MAX_COUNT; ++i) {
		WordsPriorityQueue* queue = queuePool->getSubQueue1(i);
		if (queue->size() > 0) {
		WordsPriorityQueue::SuggestedWord* sw = queue->top();
		const int score = sw->mScore;
		const unsigned short* word = sw->mWord;
		const int wordLength = sw->mWordLength;
		double ns = Correction::RankingAlgorithm::calcNormalizedScore(
		proximityInfo->getPrimaryInputWord(), i, word, wordLength, score);
		ns += 0;
		AKLOGI("--- TOP SUB WORDS for %d --- %d %f [%d]", i, score, ns,
		(ns > TWO_WORDS_CORRECTION_THRESHOLD));
		DUMP_WORD(proximityInfo->getPrimaryInputWord(), i);
		DUMP_WORD(word, wordLength);
		}
		}
		}
		}

		@@ -441,6 +457,80 @@ void UnigramDictionary::getSplitTwoWordsSuggestions(ProximityInfo *proximityInfo
		return;
		}

		void UnigramDictionary::getSplitTwoWordsSuggestionsOld(ProximityInfo *proximityInfo,
		const int xcoordinates, const int ycoordinates, const int *codes,
		const bool useFullEditDistance, const int inputLength, const int missingSpacePos,
		const int spaceProximityPos, Correction correction, WordsPriorityQueuePool queuePool) {
		WordsPriorityQueue *masterQueue = queuePool->getMasterQueue();

		if (DEBUG_DICT) {
		int inputCount = 0;
		if (spaceProximityPos >= 0) ++inputCount;
		if (missingSpacePos >= 0) ++inputCount;
		assert(inputCount <= 1);
		}
		const bool isSpaceProximity = spaceProximityPos >= 0;
		const int firstWordStartPos = 0;
		const int secondWordStartPos = isSpaceProximity ? (spaceProximityPos + 1) : missingSpacePos;
		const int firstWordLength = isSpaceProximity ? spaceProximityPos : missingSpacePos;
		const int secondWordLength = isSpaceProximity
		? (inputLength - spaceProximityPos - 1)
		: (inputLength - missingSpacePos);

		if (inputLength >= MAX_WORD_LENGTH) return;
		if (0 >= firstWordLength \|\| 0 >= secondWordLength \|\| firstWordStartPos >= secondWordStartPos
		\|\| firstWordStartPos < 0 \|\| secondWordStartPos + secondWordLength > inputLength)
		return;

		const int newWordLength = firstWordLength + secondWordLength + 1;


		// Space proximity preparation
		//WordsPriorityQueue *subQueue = queuePool->getSubQueue1();
		//initSuggestions(proximityInfo, xcoordinates, ycoordinates, codes, firstWordLength, subQueue,
		//correction);
		//getSuggestionCandidates(useFullEditDistance, firstWordLength, correction, subQueue, false,
		//MAX_ERRORS_FOR_TWO_WORDS);

		// Allocating variable length array on stack
		unsigned short word[newWordLength];
		const int firstFreq = getMostFrequentWordLike(
		firstWordStartPos, firstWordLength, proximityInfo, mWord);
		if (DEBUG_DICT) {
		AKLOGI("First freq: %d", firstFreq);
		}
		if (firstFreq <= 0) return;

		for (int i = 0; i < firstWordLength; ++i) {
		word[i] = mWord[i];
		}

		const int secondFreq = getMostFrequentWordLike(
		secondWordStartPos, secondWordLength, proximityInfo, mWord);
		if (DEBUG_DICT) {
		AKLOGI("Second freq: %d", secondFreq);
		}
		if (secondFreq <= 0) return;

		word[firstWordLength] = SPACE;
		for (int i = (firstWordLength + 1); i < newWordLength; ++i) {
		word[i] = mWord[i - firstWordLength - 1];
		}

		// TODO: Remove initSuggestions and correction->setCorrectionParams
		initSuggestions(proximityInfo, xcoordinates, ycoordinates, codes, inputLength, correction);

		correction->setCorrectionParams(-1 /* skipPos /, -1 / excessivePos */,
		-1 /* transposedPos */, spaceProximityPos, missingSpacePos,
		useFullEditDistance, false /* doAutoCompletion */, MAX_ERRORS_FOR_TWO_WORDS);
		const int pairFreq = correction->getFreqForSplitTwoWords(firstFreq, secondFreq, word);
		if (DEBUG_DICT) {
		AKLOGI("Split two words: %d, %d, %d, %d", firstFreq, secondFreq, pairFreq, inputLength);
		}
		addWord(word, newWordLength, pairFreq, masterQueue);
		return;
		}

		// Wrapper for getMostFrequentWordLikeInner, which matches it to the previous
		// interface.
		inline int UnigramDictionary::getMostFrequentWordLike(const int startInputIndex,

native/src/unigram_dictionary.h

+4 −0

Original line number	Diff line number	Diff line
		@@ -104,6 +104,10 @@ class UnigramDictionary {
		const int xcoordinates, const int ycoordinates, const int *codes,
		const bool useFullEditDistance, const int inputLength, const int spaceProximityPos,
		const int missingSpacePos, Correction correction, WordsPriorityQueuePool queuePool);
		void getSplitTwoWordsSuggestionsOld(ProximityInfo *proximityInfo,
		const int xcoordinates, const int ycoordinates, const int *codes,
		const bool useFullEditDistance, const int inputLength, const int spaceProximityPos,
		const int missingSpacePos, Correction correction, WordsPriorityQueuePool queuePool);
		void getMissingSpaceWords(ProximityInfo proximityInfo, const int xcoordinates,
		const int ycoordinates, const int codes, const bool useFullEditDistance,
		const int inputLength, const int missingSpacePos, Correction *correction,