Loading java/src/com/android/inputmethod/latin/BinaryDictionary.java +2 −2 Original line number Diff line number Diff line Loading @@ -91,7 +91,7 @@ public final class BinaryDictionary extends Dictionary { private static native long openNative(String sourceDir, long dictOffset, long dictSize); private static native void closeNative(long dict); private static native int getFrequencyNative(long dict, int[] word); private static native int getProbabilityNative(long dict, int[] word); private static native boolean isValidBigramNative(long dict, int[] word1, int[] word2); private static native int getSuggestionsNative(long dict, long proximityInfo, long traverseSession, int[] xCoordinates, int[] yCoordinates, int[] times, Loading Loading @@ -186,7 +186,7 @@ public final class BinaryDictionary extends Dictionary { public int getFrequency(final String word) { if (word == null) return -1; int[] codePoints = StringUtils.toCodePointArray(word); return getFrequencyNative(mNativeDict, codePoints); return getProbabilityNative(mNativeDict, codePoints); } // TODO: Add a batch process version (isValidBigramMultiple?) to avoid excessive numbers of jni Loading native/jni/com_android_inputmethod_latin_BinaryDictionary.cpp +4 −4 Original line number Diff line number Diff line Loading @@ -203,14 +203,14 @@ static int latinime_BinaryDictionary_getSuggestions(JNIEnv *env, jclass clazz, j return count; } static jint latinime_BinaryDictionary_getFrequency(JNIEnv *env, jclass clazz, jlong dict, static jint latinime_BinaryDictionary_getProbability(JNIEnv *env, jclass clazz, jlong dict, jintArray wordArray) { Dictionary *dictionary = reinterpret_cast<Dictionary *>(dict); if (!dictionary) return 0; const jsize codePointLength = env->GetArrayLength(wordArray); int codePoints[codePointLength]; env->GetIntArrayRegion(wordArray, 0, codePointLength, codePoints); return dictionary->getFrequency(codePoints, codePointLength); return dictionary->getProbability(codePoints, codePointLength); } static jboolean latinime_BinaryDictionary_isValidBigram(JNIEnv *env, jclass clazz, jlong dict, Loading Loading @@ -285,8 +285,8 @@ static JNINativeMethod sMethods[] = { {"closeNative", "(J)V", reinterpret_cast<void *>(latinime_BinaryDictionary_close)}, {"getSuggestionsNative", "(JJJ[I[I[I[I[IIIZ[IZ[I[I[I[I)I", reinterpret_cast<void *>(latinime_BinaryDictionary_getSuggestions)}, {"getFrequencyNative", "(J[I)I", reinterpret_cast<void *>(latinime_BinaryDictionary_getFrequency)}, {"getProbabilityNative", "(J[I)I", reinterpret_cast<void *>(latinime_BinaryDictionary_getProbability)}, {"isValidBigramNative", "(J[I[I)Z", reinterpret_cast<void *>(latinime_BinaryDictionary_isValidBigram)}, {"calcNormalizedScoreNative", "([I[II)F", Loading native/jni/src/bigram_dictionary.cpp +23 −22 Original line number Diff line number Diff line Loading @@ -36,21 +36,21 @@ BigramDictionary::BigramDictionary(const uint8_t *const streamStart) : DICT_ROOT BigramDictionary::~BigramDictionary() { } void BigramDictionary::addWordBigram(int *word, int length, int frequency, int *bigramFreq, void BigramDictionary::addWordBigram(int *word, int length, int probability, int *bigramProbability, int *bigramCodePoints, int *outputTypes) const { word[length] = 0; if (DEBUG_DICT) { #ifdef FLAG_DBG char s[length + 1]; for (int i = 0; i <= length; i++) s[i] = static_cast<char>(word[i]); AKLOGI("Bigram: Found word = %s, freq = %d :", s, frequency); AKLOGI("Bigram: Found word = %s, freq = %d :", s, probability); #endif } // Find the right insertion point int insertAt = 0; while (insertAt < MAX_RESULTS) { if (frequency > bigramFreq[insertAt] || (bigramFreq[insertAt] == frequency if (probability > bigramProbability[insertAt] || (bigramProbability[insertAt] == probability && length < getCodePointCount(MAX_WORD_LENGTH, bigramCodePoints + insertAt * MAX_WORD_LENGTH))) { break; Loading @@ -63,10 +63,10 @@ void BigramDictionary::addWordBigram(int *word, int length, int frequency, int * if (insertAt >= MAX_RESULTS) { return; } memmove(bigramFreq + (insertAt + 1), bigramFreq + insertAt, (MAX_RESULTS - insertAt - 1) * sizeof(bigramFreq[0])); bigramFreq[insertAt] = frequency; memmove(bigramProbability + (insertAt + 1), bigramProbability + insertAt, (MAX_RESULTS - insertAt - 1) * sizeof(bigramProbability[0])); bigramProbability[insertAt] = probability; outputTypes[insertAt] = Dictionary::KIND_PREDICTION; memmove(bigramCodePoints + (insertAt + 1) * MAX_WORD_LENGTH, bigramCodePoints + insertAt * MAX_WORD_LENGTH, Loading @@ -87,7 +87,7 @@ void BigramDictionary::addWordBigram(int *word, int length, int frequency, int * * inputCodePoints: what user typed, in the same format as for UnigramDictionary::getSuggestions. * inputSize: the size of the codes array. * bigramCodePoints: an array for output, at the same format as outwords for getSuggestions. * bigramFreq: an array to output frequencies. * bigramProbability: an array to output frequencies. * outputTypes: an array to output types. * This method returns the number of bigrams this word has, for backward compatibility. * Note: this is not the number of bigrams output in the array, which is the number of Loading @@ -98,7 +98,7 @@ void BigramDictionary::addWordBigram(int *word, int length, int frequency, int * * reduce their scope to the ones that match the first letter. */ int BigramDictionary::getBigrams(const int *prevWord, int prevWordLength, int *inputCodePoints, int inputSize, int *bigramCodePoints, int *bigramFreq, int *outputTypes) const { int inputSize, int *bigramCodePoints, int *bigramProbability, int *outputTypes) const { // TODO: remove unused arguments, and refrain from storing stuff in members of this class // TODO: have "in" arguments before "out" ones, and make out args explicit in the name Loading @@ -118,23 +118,24 @@ int BigramDictionary::getBigrams(const int *prevWord, int prevWordLength, int *i do { bigramFlags = BinaryFormat::getFlagsAndForwardPointer(root, &pos); int bigramBuffer[MAX_WORD_LENGTH]; int unigramFreq = 0; int unigramProbability = 0; const int bigramPos = BinaryFormat::getAttributeAddressAndForwardPointer(root, bigramFlags, &pos); const int length = BinaryFormat::getWordAtAddress(root, bigramPos, MAX_WORD_LENGTH, bigramBuffer, &unigramFreq); bigramBuffer, &unigramProbability); // inputSize == 0 means we are trying to find bigram predictions. if (inputSize < 1 || checkFirstCharacter(bigramBuffer, inputCodePoints)) { const int bigramFreqTemp = BinaryFormat::MASK_ATTRIBUTE_FREQUENCY & bigramFlags; // Due to space constraints, the frequency for bigrams is approximate - the lower the // unigram frequency, the worse the precision. The theoritical maximum error in // resulting frequency is 8 - although in the practice it's never bigger than 3 or 4 const int bigramProbabilityTemp = BinaryFormat::MASK_ATTRIBUTE_PROBABILITY & bigramFlags; // Due to space constraints, the probability for bigrams is approximate - the lower the // unigram probability, the worse the precision. The theoritical maximum error in // resulting probability is 8 - although in the practice it's never bigger than 3 or 4 // in very bad cases. This means that sometimes, we'll see some bigrams interverted // here, but it can't get too bad. const int frequency = BinaryFormat::computeFrequencyForBigram(unigramFreq, bigramFreqTemp); addWordBigram(bigramBuffer, length, frequency, bigramFreq, bigramCodePoints, const int probability = BinaryFormat::computeProbabilityForBigram( unigramProbability, bigramProbabilityTemp); addWordBigram(bigramBuffer, length, probability, bigramProbability, bigramCodePoints, outputTypes); ++bigramCount; } Loading @@ -159,13 +160,13 @@ int BigramDictionary::getBigramListPositionForWord(const int *prevWord, const in } else { pos = BinaryFormat::skipOtherCharacters(root, pos); } pos = BinaryFormat::skipFrequency(flags, pos); pos = BinaryFormat::skipProbability(flags, pos); pos = BinaryFormat::skipChildrenPosition(flags, pos); pos = BinaryFormat::skipShortcuts(root, flags, pos); return pos; } void BigramDictionary::fillBigramAddressToFrequencyMapAndFilter(const int *prevWord, void BigramDictionary::fillBigramAddressToProbabilityMapAndFilter(const int *prevWord, const int prevWordLength, std::map<int, int> *map, uint8_t *filter) const { memset(filter, 0, BIGRAM_FILTER_BYTE_SIZE); const uint8_t *const root = DICT_ROOT; Loading @@ -181,10 +182,10 @@ void BigramDictionary::fillBigramAddressToFrequencyMapAndFilter(const int *prevW uint8_t bigramFlags; do { bigramFlags = BinaryFormat::getFlagsAndForwardPointer(root, &pos); const int frequency = BinaryFormat::MASK_ATTRIBUTE_FREQUENCY & bigramFlags; const int probability = BinaryFormat::MASK_ATTRIBUTE_PROBABILITY & bigramFlags; const int bigramPos = BinaryFormat::getAttributeAddressAndForwardPointer(root, bigramFlags, &pos); (*map)[bigramPos] = frequency; (*map)[bigramPos] = probability; setInFilter(filter, bigramPos); } while (0 != (BinaryFormat::FLAG_ATTRIBUTE_HAS_NEXT & bigramFlags)); } Loading native/jni/src/bigram_dictionary.h +3 −3 Original line number Diff line number Diff line Loading @@ -29,14 +29,14 @@ class BigramDictionary { BigramDictionary(const uint8_t *const streamStart); int getBigrams(const int *word, int length, int *inputCodePoints, int inputSize, int *outWords, int *frequencies, int *outputTypes) const; void fillBigramAddressToFrequencyMapAndFilter(const int *prevWord, const int prevWordLength, void fillBigramAddressToProbabilityMapAndFilter(const int *prevWord, const int prevWordLength, std::map<int, int> *map, uint8_t *filter) const; bool isValidBigram(const int *word1, int length1, const int *word2, int length2) const; ~BigramDictionary(); private: DISALLOW_IMPLICIT_CONSTRUCTORS(BigramDictionary); void addWordBigram(int *word, int length, int frequency, int *bigramFreq, int *bigramCodePoints, int *outputTypes) const; void addWordBigram(int *word, int length, int probability, int *bigramProbability, int *bigramCodePoints, int *outputTypes) const; bool checkFirstCharacter(int *word, int *inputCodePoints) const; int getBigramListPositionForWord(const int *prevWord, const int prevWordLength, const bool forceLowerCaseSearch) const; Loading native/jni/src/binary_format.h +44 −40 Original line number Diff line number Diff line Loading @@ -52,10 +52,10 @@ class BinaryFormat { // Flag for sign of offset. If this flag is set, the offset value must be negated. static const int FLAG_ATTRIBUTE_OFFSET_NEGATIVE = 0x40; // Mask for attribute frequency, stored on 4 bits inside the flags byte. static const int MASK_ATTRIBUTE_FREQUENCY = 0x0F; // The numeric value of the shortcut frequency that means 'whitelist'. static const int WHITELIST_SHORTCUT_FREQUENCY = 15; // Mask for attribute probability, stored on 4 bits inside the flags byte. static const int MASK_ATTRIBUTE_PROBABILITY = 0x0F; // The numeric value of the shortcut probability that means 'whitelist'. static const int WHITELIST_SHORTCUT_PROBABILITY = 15; // Mask and flags for attribute address type selection. static const int MASK_ATTRIBUTE_ADDRESS_TYPE = 0x30; Loading @@ -72,10 +72,10 @@ class BinaryFormat { static int getGroupCountAndForwardPointer(const uint8_t *const dict, int *pos); static uint8_t getFlagsAndForwardPointer(const uint8_t *const dict, int *pos); static int getCodePointAndForwardPointer(const uint8_t *const dict, int *pos); static int readFrequencyWithoutMovingPointer(const uint8_t *const dict, const int pos); static int readProbabilityWithoutMovingPointer(const uint8_t *const dict, const int pos); static int skipOtherCharacters(const uint8_t *const dict, const int pos); static int skipChildrenPosition(const uint8_t flags, const int pos); static int skipFrequency(const uint8_t flags, const int pos); static int skipProbability(const uint8_t flags, const int pos); static int skipShortcuts(const uint8_t *const dict, const uint8_t flags, const int pos); static int skipChildrenPosAndAttributes(const uint8_t *const dict, const uint8_t flags, const int pos); Loading @@ -83,14 +83,15 @@ class BinaryFormat { static bool hasChildrenInFlags(const uint8_t flags); static int getAttributeAddressAndForwardPointer(const uint8_t *const dict, const uint8_t flags, int *pos); static int getAttributeFrequencyFromFlags(const int flags); static int getAttributeProbabilityFromFlags(const int flags); static int getTerminalPosition(const uint8_t *const root, const int *const inWord, const int length, const bool forceLowerCaseSearch); static int getWordAtAddress(const uint8_t *const root, const int address, const int maxDepth, int *outWord, int *outUnigramFrequency); static int computeFrequencyForBigram(const int unigramFreq, const int bigramFreq); int *outWord, int *outUnigramProbability); static int computeProbabilityForBigram( const int unigramProbability, const int bigramProbability); static int getProbability(const int position, const std::map<int, int> *bigramMap, const uint8_t *bigramFilter, const int unigramFreq); const uint8_t *bigramFilter, const int unigramProbability); // Flags for special processing // Those *must* match the flags in makedict (BinaryDictInputOutput#*_PROCESSING_FLAG) or Loading Loading @@ -264,7 +265,7 @@ AK_FORCE_INLINE int BinaryFormat::getCodePointAndForwardPointer(const uint8_t *c } } inline int BinaryFormat::readFrequencyWithoutMovingPointer(const uint8_t *const dict, inline int BinaryFormat::readProbabilityWithoutMovingPointer(const uint8_t *const dict, const int pos) { return dict[pos]; } Loading Loading @@ -320,7 +321,7 @@ inline int BinaryFormat::skipChildrenPosition(const uint8_t flags, const int pos return pos + childrenAddressSize(flags); } inline int BinaryFormat::skipFrequency(const uint8_t flags, const int pos) { inline int BinaryFormat::skipProbability(const uint8_t flags, const int pos) { return FLAG_IS_TERMINAL & flags ? pos + 1 : pos; } Loading Loading @@ -415,8 +416,8 @@ AK_FORCE_INLINE int BinaryFormat::getAttributeAddressAndForwardPointer(const uin } } inline int BinaryFormat::getAttributeFrequencyFromFlags(const int flags) { return flags & MASK_ATTRIBUTE_FREQUENCY; inline int BinaryFormat::getAttributeProbabilityFromFlags(const int flags) { return flags & MASK_ATTRIBUTE_PROBABILITY; } // This function gets the byte position of the last chargroup of the exact matching word in the Loading Loading @@ -466,7 +467,7 @@ AK_FORCE_INLINE int BinaryFormat::getTerminalPosition(const uint8_t *const root, if (wordPos == length) { return charGroupPos; } pos = BinaryFormat::skipFrequency(FLAG_IS_TERMINAL, pos); pos = BinaryFormat::skipProbability(FLAG_IS_TERMINAL, pos); } if (FLAG_GROUP_ADDRESS_TYPE_NOADDRESS == (MASK_GROUP_ADDRESS_TYPE & flags)) { return NOT_VALID_WORD; Loading @@ -481,7 +482,7 @@ AK_FORCE_INLINE int BinaryFormat::getTerminalPosition(const uint8_t *const root, if (FLAG_HAS_MULTIPLE_CHARS & flags) { pos = BinaryFormat::skipOtherCharacters(root, pos); } pos = BinaryFormat::skipFrequency(flags, pos); pos = BinaryFormat::skipProbability(flags, pos); pos = BinaryFormat::skipChildrenPosAndAttributes(root, flags, pos); } --charGroupCount; Loading @@ -504,11 +505,11 @@ AK_FORCE_INLINE int BinaryFormat::getTerminalPosition(const uint8_t *const root, * address: the byte position of the last chargroup of the word we are searching for (this is * what is stored as the "bigram address" in each bigram) * outword: an array to write the found word, with MAX_WORD_LENGTH size. * outUnigramFrequency: a pointer to an int to write the frequency into. * outUnigramProbability: a pointer to an int to write the probability into. * Return value : the length of the word, of 0 if the word was not found. */ AK_FORCE_INLINE int BinaryFormat::getWordAtAddress(const uint8_t *const root, const int address, const int maxDepth, int *outWord, int *outUnigramFrequency) { const int maxDepth, int *outWord, int *outUnigramProbability) { int pos = 0; int wordPos = 0; Loading Loading @@ -541,15 +542,15 @@ AK_FORCE_INLINE int BinaryFormat::getWordAtAddress(const uint8_t *const root, co nextChar = getCodePointAndForwardPointer(root, &pos); } } *outUnigramFrequency = readFrequencyWithoutMovingPointer(root, pos); *outUnigramProbability = readProbabilityWithoutMovingPointer(root, pos); return ++wordPos; } // We need to skip past this char group, so skip any remaining chars after the // first and possibly the frequency. // first and possibly the probability. if (FLAG_HAS_MULTIPLE_CHARS & flags) { pos = skipOtherCharacters(root, pos); } pos = skipFrequency(flags, pos); pos = skipProbability(flags, pos); // The fact that this group has children is very important. Since we already know // that this group does not match, if it has no children we know it is irrelevant Loading Loading @@ -604,9 +605,9 @@ AK_FORCE_INLINE int BinaryFormat::getWordAtAddress(const uint8_t *const root, co } } ++wordPos; // Now we only need to branch to the children address. Skip the frequency if // Now we only need to branch to the children address. Skip the probability if // it's there, read pos, and break to resume the search at pos. lastCandidateGroupPos = skipFrequency(lastFlags, lastCandidateGroupPos); lastCandidateGroupPos = skipProbability(lastFlags, lastCandidateGroupPos); pos = readChildrenPosition(root, lastFlags, lastCandidateGroupPos); break; } else { Loading Loading @@ -635,36 +636,39 @@ AK_FORCE_INLINE int BinaryFormat::getWordAtAddress(const uint8_t *const root, co return 0; } static inline int backoff(const int unigramFreq) { return unigramFreq; static inline int backoff(const int unigramProbability) { return unigramProbability; // For some reason, applying the backoff weight gives bad results in tests. To apply the // backoff weight, we divide the probability by 2, which in our storing format means // decreasing the score by 8. // TODO: figure out what's wrong with this. // return unigramFreq > 8 ? unigramFreq - 8 : (0 == unigramFreq ? 0 : 8); // return unigramProbability > 8 ? unigramProbability - 8 : (0 == unigramProbability ? 0 : 8); } inline int BinaryFormat::computeFrequencyForBigram(const int unigramFreq, const int bigramFreq) { // We divide the range [unigramFreq..255] in 16.5 steps - in other words, we want the // unigram frequency to be the median value of the 17th step from the top. A value of // 0 for the bigram frequency represents the middle of the 16th step from the top, inline int BinaryFormat::computeProbabilityForBigram( const int unigramProbability, const int bigramProbability) { // We divide the range [unigramProbability..255] in 16.5 steps - in other words, we want the // unigram probability to be the median value of the 17th step from the top. A value of // 0 for the bigram probability represents the middle of the 16th step from the top, // while a value of 15 represents the middle of the top step. // See makedict.BinaryDictInputOutput for details. const float stepSize = static_cast<float>(MAX_FREQ - unigramFreq) / (1.5f + MAX_BIGRAM_FREQ); return unigramFreq + static_cast<int>(static_cast<float>(bigramFreq + 1) * stepSize); const float stepSize = static_cast<float>(MAX_PROBABILITY - unigramProbability) / (1.5f + MAX_BIGRAM_ENCODED_PROBABILITY); return unigramProbability + static_cast<int>(static_cast<float>(bigramProbability + 1) * stepSize); } // This returns a probability in log space. inline int BinaryFormat::getProbability(const int position, const std::map<int, int> *bigramMap, const uint8_t *bigramFilter, const int unigramFreq) { if (!bigramMap || !bigramFilter) return backoff(unigramFreq); if (!isInFilter(bigramFilter, position)) return backoff(unigramFreq); const std::map<int, int>::const_iterator bigramFreqIt = bigramMap->find(position); if (bigramFreqIt != bigramMap->end()) { const int bigramFreq = bigramFreqIt->second; return computeFrequencyForBigram(unigramFreq, bigramFreq); } return backoff(unigramFreq); const uint8_t *bigramFilter, const int unigramProbability) { if (!bigramMap || !bigramFilter) return backoff(unigramProbability); if (!isInFilter(bigramFilter, position)) return backoff(unigramProbability); const std::map<int, int>::const_iterator bigramProbabilityIt = bigramMap->find(position); if (bigramProbabilityIt != bigramMap->end()) { const int bigramProbability = bigramProbabilityIt->second; return computeProbabilityForBigram(unigramProbability, bigramProbability); } return backoff(unigramProbability); } } // namespace latinime #endif // LATINIME_BINARY_FORMAT_H Loading
java/src/com/android/inputmethod/latin/BinaryDictionary.java +2 −2 Original line number Diff line number Diff line Loading @@ -91,7 +91,7 @@ public final class BinaryDictionary extends Dictionary { private static native long openNative(String sourceDir, long dictOffset, long dictSize); private static native void closeNative(long dict); private static native int getFrequencyNative(long dict, int[] word); private static native int getProbabilityNative(long dict, int[] word); private static native boolean isValidBigramNative(long dict, int[] word1, int[] word2); private static native int getSuggestionsNative(long dict, long proximityInfo, long traverseSession, int[] xCoordinates, int[] yCoordinates, int[] times, Loading Loading @@ -186,7 +186,7 @@ public final class BinaryDictionary extends Dictionary { public int getFrequency(final String word) { if (word == null) return -1; int[] codePoints = StringUtils.toCodePointArray(word); return getFrequencyNative(mNativeDict, codePoints); return getProbabilityNative(mNativeDict, codePoints); } // TODO: Add a batch process version (isValidBigramMultiple?) to avoid excessive numbers of jni Loading
native/jni/com_android_inputmethod_latin_BinaryDictionary.cpp +4 −4 Original line number Diff line number Diff line Loading @@ -203,14 +203,14 @@ static int latinime_BinaryDictionary_getSuggestions(JNIEnv *env, jclass clazz, j return count; } static jint latinime_BinaryDictionary_getFrequency(JNIEnv *env, jclass clazz, jlong dict, static jint latinime_BinaryDictionary_getProbability(JNIEnv *env, jclass clazz, jlong dict, jintArray wordArray) { Dictionary *dictionary = reinterpret_cast<Dictionary *>(dict); if (!dictionary) return 0; const jsize codePointLength = env->GetArrayLength(wordArray); int codePoints[codePointLength]; env->GetIntArrayRegion(wordArray, 0, codePointLength, codePoints); return dictionary->getFrequency(codePoints, codePointLength); return dictionary->getProbability(codePoints, codePointLength); } static jboolean latinime_BinaryDictionary_isValidBigram(JNIEnv *env, jclass clazz, jlong dict, Loading Loading @@ -285,8 +285,8 @@ static JNINativeMethod sMethods[] = { {"closeNative", "(J)V", reinterpret_cast<void *>(latinime_BinaryDictionary_close)}, {"getSuggestionsNative", "(JJJ[I[I[I[I[IIIZ[IZ[I[I[I[I)I", reinterpret_cast<void *>(latinime_BinaryDictionary_getSuggestions)}, {"getFrequencyNative", "(J[I)I", reinterpret_cast<void *>(latinime_BinaryDictionary_getFrequency)}, {"getProbabilityNative", "(J[I)I", reinterpret_cast<void *>(latinime_BinaryDictionary_getProbability)}, {"isValidBigramNative", "(J[I[I)Z", reinterpret_cast<void *>(latinime_BinaryDictionary_isValidBigram)}, {"calcNormalizedScoreNative", "([I[II)F", Loading
native/jni/src/bigram_dictionary.cpp +23 −22 Original line number Diff line number Diff line Loading @@ -36,21 +36,21 @@ BigramDictionary::BigramDictionary(const uint8_t *const streamStart) : DICT_ROOT BigramDictionary::~BigramDictionary() { } void BigramDictionary::addWordBigram(int *word, int length, int frequency, int *bigramFreq, void BigramDictionary::addWordBigram(int *word, int length, int probability, int *bigramProbability, int *bigramCodePoints, int *outputTypes) const { word[length] = 0; if (DEBUG_DICT) { #ifdef FLAG_DBG char s[length + 1]; for (int i = 0; i <= length; i++) s[i] = static_cast<char>(word[i]); AKLOGI("Bigram: Found word = %s, freq = %d :", s, frequency); AKLOGI("Bigram: Found word = %s, freq = %d :", s, probability); #endif } // Find the right insertion point int insertAt = 0; while (insertAt < MAX_RESULTS) { if (frequency > bigramFreq[insertAt] || (bigramFreq[insertAt] == frequency if (probability > bigramProbability[insertAt] || (bigramProbability[insertAt] == probability && length < getCodePointCount(MAX_WORD_LENGTH, bigramCodePoints + insertAt * MAX_WORD_LENGTH))) { break; Loading @@ -63,10 +63,10 @@ void BigramDictionary::addWordBigram(int *word, int length, int frequency, int * if (insertAt >= MAX_RESULTS) { return; } memmove(bigramFreq + (insertAt + 1), bigramFreq + insertAt, (MAX_RESULTS - insertAt - 1) * sizeof(bigramFreq[0])); bigramFreq[insertAt] = frequency; memmove(bigramProbability + (insertAt + 1), bigramProbability + insertAt, (MAX_RESULTS - insertAt - 1) * sizeof(bigramProbability[0])); bigramProbability[insertAt] = probability; outputTypes[insertAt] = Dictionary::KIND_PREDICTION; memmove(bigramCodePoints + (insertAt + 1) * MAX_WORD_LENGTH, bigramCodePoints + insertAt * MAX_WORD_LENGTH, Loading @@ -87,7 +87,7 @@ void BigramDictionary::addWordBigram(int *word, int length, int frequency, int * * inputCodePoints: what user typed, in the same format as for UnigramDictionary::getSuggestions. * inputSize: the size of the codes array. * bigramCodePoints: an array for output, at the same format as outwords for getSuggestions. * bigramFreq: an array to output frequencies. * bigramProbability: an array to output frequencies. * outputTypes: an array to output types. * This method returns the number of bigrams this word has, for backward compatibility. * Note: this is not the number of bigrams output in the array, which is the number of Loading @@ -98,7 +98,7 @@ void BigramDictionary::addWordBigram(int *word, int length, int frequency, int * * reduce their scope to the ones that match the first letter. */ int BigramDictionary::getBigrams(const int *prevWord, int prevWordLength, int *inputCodePoints, int inputSize, int *bigramCodePoints, int *bigramFreq, int *outputTypes) const { int inputSize, int *bigramCodePoints, int *bigramProbability, int *outputTypes) const { // TODO: remove unused arguments, and refrain from storing stuff in members of this class // TODO: have "in" arguments before "out" ones, and make out args explicit in the name Loading @@ -118,23 +118,24 @@ int BigramDictionary::getBigrams(const int *prevWord, int prevWordLength, int *i do { bigramFlags = BinaryFormat::getFlagsAndForwardPointer(root, &pos); int bigramBuffer[MAX_WORD_LENGTH]; int unigramFreq = 0; int unigramProbability = 0; const int bigramPos = BinaryFormat::getAttributeAddressAndForwardPointer(root, bigramFlags, &pos); const int length = BinaryFormat::getWordAtAddress(root, bigramPos, MAX_WORD_LENGTH, bigramBuffer, &unigramFreq); bigramBuffer, &unigramProbability); // inputSize == 0 means we are trying to find bigram predictions. if (inputSize < 1 || checkFirstCharacter(bigramBuffer, inputCodePoints)) { const int bigramFreqTemp = BinaryFormat::MASK_ATTRIBUTE_FREQUENCY & bigramFlags; // Due to space constraints, the frequency for bigrams is approximate - the lower the // unigram frequency, the worse the precision. The theoritical maximum error in // resulting frequency is 8 - although in the practice it's never bigger than 3 or 4 const int bigramProbabilityTemp = BinaryFormat::MASK_ATTRIBUTE_PROBABILITY & bigramFlags; // Due to space constraints, the probability for bigrams is approximate - the lower the // unigram probability, the worse the precision. The theoritical maximum error in // resulting probability is 8 - although in the practice it's never bigger than 3 or 4 // in very bad cases. This means that sometimes, we'll see some bigrams interverted // here, but it can't get too bad. const int frequency = BinaryFormat::computeFrequencyForBigram(unigramFreq, bigramFreqTemp); addWordBigram(bigramBuffer, length, frequency, bigramFreq, bigramCodePoints, const int probability = BinaryFormat::computeProbabilityForBigram( unigramProbability, bigramProbabilityTemp); addWordBigram(bigramBuffer, length, probability, bigramProbability, bigramCodePoints, outputTypes); ++bigramCount; } Loading @@ -159,13 +160,13 @@ int BigramDictionary::getBigramListPositionForWord(const int *prevWord, const in } else { pos = BinaryFormat::skipOtherCharacters(root, pos); } pos = BinaryFormat::skipFrequency(flags, pos); pos = BinaryFormat::skipProbability(flags, pos); pos = BinaryFormat::skipChildrenPosition(flags, pos); pos = BinaryFormat::skipShortcuts(root, flags, pos); return pos; } void BigramDictionary::fillBigramAddressToFrequencyMapAndFilter(const int *prevWord, void BigramDictionary::fillBigramAddressToProbabilityMapAndFilter(const int *prevWord, const int prevWordLength, std::map<int, int> *map, uint8_t *filter) const { memset(filter, 0, BIGRAM_FILTER_BYTE_SIZE); const uint8_t *const root = DICT_ROOT; Loading @@ -181,10 +182,10 @@ void BigramDictionary::fillBigramAddressToFrequencyMapAndFilter(const int *prevW uint8_t bigramFlags; do { bigramFlags = BinaryFormat::getFlagsAndForwardPointer(root, &pos); const int frequency = BinaryFormat::MASK_ATTRIBUTE_FREQUENCY & bigramFlags; const int probability = BinaryFormat::MASK_ATTRIBUTE_PROBABILITY & bigramFlags; const int bigramPos = BinaryFormat::getAttributeAddressAndForwardPointer(root, bigramFlags, &pos); (*map)[bigramPos] = frequency; (*map)[bigramPos] = probability; setInFilter(filter, bigramPos); } while (0 != (BinaryFormat::FLAG_ATTRIBUTE_HAS_NEXT & bigramFlags)); } Loading
native/jni/src/bigram_dictionary.h +3 −3 Original line number Diff line number Diff line Loading @@ -29,14 +29,14 @@ class BigramDictionary { BigramDictionary(const uint8_t *const streamStart); int getBigrams(const int *word, int length, int *inputCodePoints, int inputSize, int *outWords, int *frequencies, int *outputTypes) const; void fillBigramAddressToFrequencyMapAndFilter(const int *prevWord, const int prevWordLength, void fillBigramAddressToProbabilityMapAndFilter(const int *prevWord, const int prevWordLength, std::map<int, int> *map, uint8_t *filter) const; bool isValidBigram(const int *word1, int length1, const int *word2, int length2) const; ~BigramDictionary(); private: DISALLOW_IMPLICIT_CONSTRUCTORS(BigramDictionary); void addWordBigram(int *word, int length, int frequency, int *bigramFreq, int *bigramCodePoints, int *outputTypes) const; void addWordBigram(int *word, int length, int probability, int *bigramProbability, int *bigramCodePoints, int *outputTypes) const; bool checkFirstCharacter(int *word, int *inputCodePoints) const; int getBigramListPositionForWord(const int *prevWord, const int prevWordLength, const bool forceLowerCaseSearch) const; Loading
native/jni/src/binary_format.h +44 −40 Original line number Diff line number Diff line Loading @@ -52,10 +52,10 @@ class BinaryFormat { // Flag for sign of offset. If this flag is set, the offset value must be negated. static const int FLAG_ATTRIBUTE_OFFSET_NEGATIVE = 0x40; // Mask for attribute frequency, stored on 4 bits inside the flags byte. static const int MASK_ATTRIBUTE_FREQUENCY = 0x0F; // The numeric value of the shortcut frequency that means 'whitelist'. static const int WHITELIST_SHORTCUT_FREQUENCY = 15; // Mask for attribute probability, stored on 4 bits inside the flags byte. static const int MASK_ATTRIBUTE_PROBABILITY = 0x0F; // The numeric value of the shortcut probability that means 'whitelist'. static const int WHITELIST_SHORTCUT_PROBABILITY = 15; // Mask and flags for attribute address type selection. static const int MASK_ATTRIBUTE_ADDRESS_TYPE = 0x30; Loading @@ -72,10 +72,10 @@ class BinaryFormat { static int getGroupCountAndForwardPointer(const uint8_t *const dict, int *pos); static uint8_t getFlagsAndForwardPointer(const uint8_t *const dict, int *pos); static int getCodePointAndForwardPointer(const uint8_t *const dict, int *pos); static int readFrequencyWithoutMovingPointer(const uint8_t *const dict, const int pos); static int readProbabilityWithoutMovingPointer(const uint8_t *const dict, const int pos); static int skipOtherCharacters(const uint8_t *const dict, const int pos); static int skipChildrenPosition(const uint8_t flags, const int pos); static int skipFrequency(const uint8_t flags, const int pos); static int skipProbability(const uint8_t flags, const int pos); static int skipShortcuts(const uint8_t *const dict, const uint8_t flags, const int pos); static int skipChildrenPosAndAttributes(const uint8_t *const dict, const uint8_t flags, const int pos); Loading @@ -83,14 +83,15 @@ class BinaryFormat { static bool hasChildrenInFlags(const uint8_t flags); static int getAttributeAddressAndForwardPointer(const uint8_t *const dict, const uint8_t flags, int *pos); static int getAttributeFrequencyFromFlags(const int flags); static int getAttributeProbabilityFromFlags(const int flags); static int getTerminalPosition(const uint8_t *const root, const int *const inWord, const int length, const bool forceLowerCaseSearch); static int getWordAtAddress(const uint8_t *const root, const int address, const int maxDepth, int *outWord, int *outUnigramFrequency); static int computeFrequencyForBigram(const int unigramFreq, const int bigramFreq); int *outWord, int *outUnigramProbability); static int computeProbabilityForBigram( const int unigramProbability, const int bigramProbability); static int getProbability(const int position, const std::map<int, int> *bigramMap, const uint8_t *bigramFilter, const int unigramFreq); const uint8_t *bigramFilter, const int unigramProbability); // Flags for special processing // Those *must* match the flags in makedict (BinaryDictInputOutput#*_PROCESSING_FLAG) or Loading Loading @@ -264,7 +265,7 @@ AK_FORCE_INLINE int BinaryFormat::getCodePointAndForwardPointer(const uint8_t *c } } inline int BinaryFormat::readFrequencyWithoutMovingPointer(const uint8_t *const dict, inline int BinaryFormat::readProbabilityWithoutMovingPointer(const uint8_t *const dict, const int pos) { return dict[pos]; } Loading Loading @@ -320,7 +321,7 @@ inline int BinaryFormat::skipChildrenPosition(const uint8_t flags, const int pos return pos + childrenAddressSize(flags); } inline int BinaryFormat::skipFrequency(const uint8_t flags, const int pos) { inline int BinaryFormat::skipProbability(const uint8_t flags, const int pos) { return FLAG_IS_TERMINAL & flags ? pos + 1 : pos; } Loading Loading @@ -415,8 +416,8 @@ AK_FORCE_INLINE int BinaryFormat::getAttributeAddressAndForwardPointer(const uin } } inline int BinaryFormat::getAttributeFrequencyFromFlags(const int flags) { return flags & MASK_ATTRIBUTE_FREQUENCY; inline int BinaryFormat::getAttributeProbabilityFromFlags(const int flags) { return flags & MASK_ATTRIBUTE_PROBABILITY; } // This function gets the byte position of the last chargroup of the exact matching word in the Loading Loading @@ -466,7 +467,7 @@ AK_FORCE_INLINE int BinaryFormat::getTerminalPosition(const uint8_t *const root, if (wordPos == length) { return charGroupPos; } pos = BinaryFormat::skipFrequency(FLAG_IS_TERMINAL, pos); pos = BinaryFormat::skipProbability(FLAG_IS_TERMINAL, pos); } if (FLAG_GROUP_ADDRESS_TYPE_NOADDRESS == (MASK_GROUP_ADDRESS_TYPE & flags)) { return NOT_VALID_WORD; Loading @@ -481,7 +482,7 @@ AK_FORCE_INLINE int BinaryFormat::getTerminalPosition(const uint8_t *const root, if (FLAG_HAS_MULTIPLE_CHARS & flags) { pos = BinaryFormat::skipOtherCharacters(root, pos); } pos = BinaryFormat::skipFrequency(flags, pos); pos = BinaryFormat::skipProbability(flags, pos); pos = BinaryFormat::skipChildrenPosAndAttributes(root, flags, pos); } --charGroupCount; Loading @@ -504,11 +505,11 @@ AK_FORCE_INLINE int BinaryFormat::getTerminalPosition(const uint8_t *const root, * address: the byte position of the last chargroup of the word we are searching for (this is * what is stored as the "bigram address" in each bigram) * outword: an array to write the found word, with MAX_WORD_LENGTH size. * outUnigramFrequency: a pointer to an int to write the frequency into. * outUnigramProbability: a pointer to an int to write the probability into. * Return value : the length of the word, of 0 if the word was not found. */ AK_FORCE_INLINE int BinaryFormat::getWordAtAddress(const uint8_t *const root, const int address, const int maxDepth, int *outWord, int *outUnigramFrequency) { const int maxDepth, int *outWord, int *outUnigramProbability) { int pos = 0; int wordPos = 0; Loading Loading @@ -541,15 +542,15 @@ AK_FORCE_INLINE int BinaryFormat::getWordAtAddress(const uint8_t *const root, co nextChar = getCodePointAndForwardPointer(root, &pos); } } *outUnigramFrequency = readFrequencyWithoutMovingPointer(root, pos); *outUnigramProbability = readProbabilityWithoutMovingPointer(root, pos); return ++wordPos; } // We need to skip past this char group, so skip any remaining chars after the // first and possibly the frequency. // first and possibly the probability. if (FLAG_HAS_MULTIPLE_CHARS & flags) { pos = skipOtherCharacters(root, pos); } pos = skipFrequency(flags, pos); pos = skipProbability(flags, pos); // The fact that this group has children is very important. Since we already know // that this group does not match, if it has no children we know it is irrelevant Loading Loading @@ -604,9 +605,9 @@ AK_FORCE_INLINE int BinaryFormat::getWordAtAddress(const uint8_t *const root, co } } ++wordPos; // Now we only need to branch to the children address. Skip the frequency if // Now we only need to branch to the children address. Skip the probability if // it's there, read pos, and break to resume the search at pos. lastCandidateGroupPos = skipFrequency(lastFlags, lastCandidateGroupPos); lastCandidateGroupPos = skipProbability(lastFlags, lastCandidateGroupPos); pos = readChildrenPosition(root, lastFlags, lastCandidateGroupPos); break; } else { Loading Loading @@ -635,36 +636,39 @@ AK_FORCE_INLINE int BinaryFormat::getWordAtAddress(const uint8_t *const root, co return 0; } static inline int backoff(const int unigramFreq) { return unigramFreq; static inline int backoff(const int unigramProbability) { return unigramProbability; // For some reason, applying the backoff weight gives bad results in tests. To apply the // backoff weight, we divide the probability by 2, which in our storing format means // decreasing the score by 8. // TODO: figure out what's wrong with this. // return unigramFreq > 8 ? unigramFreq - 8 : (0 == unigramFreq ? 0 : 8); // return unigramProbability > 8 ? unigramProbability - 8 : (0 == unigramProbability ? 0 : 8); } inline int BinaryFormat::computeFrequencyForBigram(const int unigramFreq, const int bigramFreq) { // We divide the range [unigramFreq..255] in 16.5 steps - in other words, we want the // unigram frequency to be the median value of the 17th step from the top. A value of // 0 for the bigram frequency represents the middle of the 16th step from the top, inline int BinaryFormat::computeProbabilityForBigram( const int unigramProbability, const int bigramProbability) { // We divide the range [unigramProbability..255] in 16.5 steps - in other words, we want the // unigram probability to be the median value of the 17th step from the top. A value of // 0 for the bigram probability represents the middle of the 16th step from the top, // while a value of 15 represents the middle of the top step. // See makedict.BinaryDictInputOutput for details. const float stepSize = static_cast<float>(MAX_FREQ - unigramFreq) / (1.5f + MAX_BIGRAM_FREQ); return unigramFreq + static_cast<int>(static_cast<float>(bigramFreq + 1) * stepSize); const float stepSize = static_cast<float>(MAX_PROBABILITY - unigramProbability) / (1.5f + MAX_BIGRAM_ENCODED_PROBABILITY); return unigramProbability + static_cast<int>(static_cast<float>(bigramProbability + 1) * stepSize); } // This returns a probability in log space. inline int BinaryFormat::getProbability(const int position, const std::map<int, int> *bigramMap, const uint8_t *bigramFilter, const int unigramFreq) { if (!bigramMap || !bigramFilter) return backoff(unigramFreq); if (!isInFilter(bigramFilter, position)) return backoff(unigramFreq); const std::map<int, int>::const_iterator bigramFreqIt = bigramMap->find(position); if (bigramFreqIt != bigramMap->end()) { const int bigramFreq = bigramFreqIt->second; return computeFrequencyForBigram(unigramFreq, bigramFreq); } return backoff(unigramFreq); const uint8_t *bigramFilter, const int unigramProbability) { if (!bigramMap || !bigramFilter) return backoff(unigramProbability); if (!isInFilter(bigramFilter, position)) return backoff(unigramProbability); const std::map<int, int>::const_iterator bigramProbabilityIt = bigramMap->find(position); if (bigramProbabilityIt != bigramMap->end()) { const int bigramProbability = bigramProbabilityIt->second; return computeProbabilityForBigram(unigramProbability, bigramProbability); } return backoff(unigramProbability); } } // namespace latinime #endif // LATINIME_BINARY_FORMAT_H