Search bigrams for the lower case version of the word (A46) (e9a86e2c) · Commits · e / os / android_packages_inputmethods_LatinIME

java/src/com/android/inputmethod/latin/Suggest.java

+0 −10

Original line number	Diff line number	Diff line
		@@ -177,19 +177,9 @@ public class Suggest {
		if (wordComposer.size() <= 1 && isCorrectionEnabled) {
		// At first character typed, search only the bigrams
		if (!TextUtils.isEmpty(prevWordForBigram)) {
		final CharSequence lowerPrevWord;
		if (StringUtils.hasUpperCase(prevWordForBigram)) {
		// TODO: Must pay attention to locale when changing case.
		lowerPrevWord = prevWordForBigram.toString().toLowerCase();
		} else {
		lowerPrevWord = null;
		}
		for (final String key : mDictionaries.keySet()) {
		final Dictionary dictionary = mDictionaries.get(key);
		suggestionsSet.addAll(dictionary.getBigrams(wordComposer, prevWordForBigram));
		if (null != lowerPrevWord) {
		suggestionsSet.addAll(dictionary.getBigrams(wordComposer, lowerPrevWord));
		}
		}
		}
		} else if (wordComposer.size() > 1) {

java/src/com/android/inputmethod/latin/UserHistoryDictionaryBigramList.java

+5 −5

Original line number	Diff line number	Diff line
		@@ -98,11 +98,11 @@ public class UserHistoryDictionaryBigramList {
		}

		public HashMap<String, Byte> getBigrams(String word1) {
		if (!mBigramMap.containsKey(word1)) {
		if (mBigramMap.containsKey(word1)) return mBigramMap.get(word1);
		// TODO: lower case according to locale
		final String lowerWord1 = word1.toLowerCase();
		if (mBigramMap.containsKey(lowerWord1)) return mBigramMap.get(lowerWord1);
		return EMPTY_BIGRAM_MAP;
		} else {
		return mBigramMap.get(word1);
		}
		}

		public boolean removeBigram(String word1, String word2) {

native/jni/src/bigram_dictionary.cpp

+21 −6

Original line number	Diff line number	Diff line
		@@ -105,8 +105,15 @@ int BigramDictionary::getBigrams(const int32_t *prevWord, int prevWordLength, in
		// TODO: have "in" arguments before "out" ones, and make out args explicit in the name

		const uint8_t* const root = DICT;
		int pos = getBigramListPositionForWord(prevWord, prevWordLength);
		int pos = getBigramListPositionForWord(prevWord, prevWordLength,
		false /* forceLowerCaseSearch */);
		// getBigramListPositionForWord returns 0 if this word isn't in the dictionary or has no bigrams
		if (0 == pos) {
		// If no bigrams for this exact word, search again in lower case.
		pos = getBigramListPositionForWord(prevWord, prevWordLength,
		true /* forceLowerCaseSearch */);
		}
		// If still no bigrams, we really don't have them!
		if (0 == pos) return 0;
		int bigramFlags;
		int bigramCount = 0;
		@@ -141,10 +148,11 @@ int BigramDictionary::getBigrams(const int32_t *prevWord, int prevWordLength, in
		// Returns a pointer to the start of the bigram list.
		// If the word is not found or has no bigrams, this function returns 0.
		int BigramDictionary::getBigramListPositionForWord(const int32_t *prevWord,
		const int prevWordLength) const {
		const int prevWordLength, const bool forceLowerCaseSearch) const {
		if (0 >= prevWordLength) return 0;
		const uint8_t* const root = DICT;
		int pos = BinaryFormat::getTerminalPosition(root, prevWord, prevWordLength);
		int pos = BinaryFormat::getTerminalPosition(root, prevWord, prevWordLength,
		forceLowerCaseSearch);

		if (NOT_VALID_WORD == pos) return 0;
		const int flags = BinaryFormat::getFlagsAndForwardPointer(root, &pos);
		@@ -164,7 +172,13 @@ void BigramDictionary::fillBigramAddressToFrequencyMapAndFilter(const int32_t *p
		const int prevWordLength, std::map<int, int> map, uint8_t filter) const {
		memset(filter, 0, BIGRAM_FILTER_BYTE_SIZE);
		const uint8_t* const root = DICT;
		int pos = getBigramListPositionForWord(prevWord, prevWordLength);
		int pos = getBigramListPositionForWord(prevWord, prevWordLength,
		false /* forceLowerCaseSearch */);
		if (0 == pos) {
		// If no bigrams for this exact string, search again in lower case.
		pos = getBigramListPositionForWord(prevWord, prevWordLength,
		true /* forceLowerCaseSearch */);
		}
		if (0 == pos) return;

		int bigramFlags;
		@@ -197,10 +211,11 @@ bool BigramDictionary::checkFirstCharacter(unsigned short word, int inputCodes
		bool BigramDictionary::isValidBigram(const int32_t word1, int length1, const int32_t word2,
		int length2) const {
		const uint8_t* const root = DICT;
		int pos = getBigramListPositionForWord(word1, length1);
		int pos = getBigramListPositionForWord(word1, length1, false /* forceLowerCaseSearch */);
		// getBigramListPositionForWord returns 0 if this word isn't in the dictionary or has no bigrams
		if (0 == pos) return false;
		int nextWordPos = BinaryFormat::getTerminalPosition(root, word2, length2);
		int nextWordPos = BinaryFormat::getTerminalPosition(root, word2, length2,
		false /* forceLowerCaseSearch */);
		if (NOT_VALID_WORD == nextWordPos) return false;
		int bigramFlags;
		do {

native/jni/src/bigram_dictionary.h

+2 −1

Original line number	Diff line number	Diff line
		@@ -30,7 +30,8 @@ class BigramDictionary {
		BigramDictionary(const unsigned char *dict, int maxWordLength);
		int getBigrams(const int32_t word, int length, int inputCodes, int codesSize,
		unsigned short outWords, int frequencies, int maxWordLength, int maxBigrams) const;
		int getBigramListPositionForWord(const int32_t *prevWord, const int prevWordLength) const;
		int getBigramListPositionForWord(const int32_t *prevWord, const int prevWordLength,
		const bool forceLowerCaseSearch) const;
		void fillBigramAddressToFrequencyMapAndFilter(const int32_t *prevWord, const int prevWordLength,
		std::map<int, int> map, uint8_t filter) const;
		bool isValidBigram(const int32_t word1, int length1, const int32_t word2, int length2) const;

native/jni/src/binary_format.h

+4 −3

Original line number	Diff line number	Diff line
		@@ -19,6 +19,7 @@

		#include <limits>
		#include "bloom_filter.h"
		#include "char_utils.h"
		#include "unigram_dictionary.h"

		namespace latinime {
		@@ -65,7 +66,7 @@ class BinaryFormat {
		static int getAttributeAddressAndForwardPointer(const uint8_t* const dict, const uint8_t flags,
		int *pos);
		static int getTerminalPosition(const uint8_t* const root, const int32_t* const inWord,
		const int length);
		const int length, const bool forceLowerCaseSearch);
		static int getWordAtAddress(const uint8_t* const root, const int address, const int maxDepth,
		uint16_t* outWord, int* outUnigramFrequency);
		static int computeFrequencyForBigram(const int unigramFreq, const int bigramFreq);
		@@ -309,7 +310,7 @@ inline int BinaryFormat::getAttributeAddressAndForwardPointer(const uint8_t* con
		// This function gets the byte position of the last chargroup of the exact matching word in the
		// dictionary. If no match is found, it returns NOT_VALID_WORD.
		inline int BinaryFormat::getTerminalPosition(const uint8_t* const root,
		const int32_t* const inWord, const int length) {
		const int32_t* const inWord, const int length, const bool forceLowerCaseSearch) {
		int pos = 0;
		int wordPos = 0;

		@@ -318,7 +319,7 @@ inline int BinaryFormat::getTerminalPosition(const uint8_t* const root,
		// there was no match (or we would have found it).
		if (wordPos > length) return NOT_VALID_WORD;
		int charGroupCount = BinaryFormat::getGroupCountAndForwardPointer(root, &pos);
		const int32_t wChar = inWord[wordPos];
		const int32_t wChar = forceLowerCaseSearch ? toLowerCase(inWord[wordPos]) : inWord[wordPos];
		while (true) {
		// If there are no more character groups in this node, it means we could not
		// find a matching character for this depth, therefore there is no match.