Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit e9a86e2c authored by Jean Chalard
Browse files

Search bigrams for the lower case version of the word (A46)

...if there aren't any for the exact case version.

Bug: 6752830
Change-Id: I2737148b01ba04a64febe009ceb2ef53c265d224
parent ac3bd961
Loading
Loading
Loading
Loading
+0 −10
Original line number Diff line number Diff line
@@ -177,19 +177,9 @@ public class Suggest {
        if (wordComposer.size() <= 1 && isCorrectionEnabled) {
            // At first character typed, search only the bigrams
            if (!TextUtils.isEmpty(prevWordForBigram)) {
                final CharSequence lowerPrevWord;
                if (StringUtils.hasUpperCase(prevWordForBigram)) {
                    // TODO: Must pay attention to locale when changing case.
                    lowerPrevWord = prevWordForBigram.toString().toLowerCase();
                } else {
                    lowerPrevWord = null;
                }
                for (final String key : mDictionaries.keySet()) {
                    final Dictionary dictionary = mDictionaries.get(key);
                    suggestionsSet.addAll(dictionary.getBigrams(wordComposer, prevWordForBigram));
                    if (null != lowerPrevWord) {
                        suggestionsSet.addAll(dictionary.getBigrams(wordComposer, lowerPrevWord));
                    }
                }
            }
        } else if (wordComposer.size() > 1) {
+5 −5
Original line number Diff line number Diff line
@@ -98,11 +98,11 @@ public class UserHistoryDictionaryBigramList {
    }

    public HashMap<String, Byte> getBigrams(String word1) {
        if (!mBigramMap.containsKey(word1)) {
        if (mBigramMap.containsKey(word1)) return mBigramMap.get(word1);
        // TODO: lower case according to locale
        final String lowerWord1 = word1.toLowerCase();
        if (mBigramMap.containsKey(lowerWord1)) return mBigramMap.get(lowerWord1);
        return EMPTY_BIGRAM_MAP;
        } else {
            return mBigramMap.get(word1);
        }
    }

    public boolean removeBigram(String word1, String word2) {
+21 −6
Original line number Diff line number Diff line
@@ -105,8 +105,15 @@ int BigramDictionary::getBigrams(const int32_t *prevWord, int prevWordLength, in
    // TODO: have "in" arguments before "out" ones, and make out args explicit in the name

    const uint8_t* const root = DICT;
    int pos = getBigramListPositionForWord(prevWord, prevWordLength);
    int pos = getBigramListPositionForWord(prevWord, prevWordLength,
            false /* forceLowerCaseSearch */);
    // getBigramListPositionForWord returns 0 if this word isn't in the dictionary or has no bigrams
    if (0 == pos) {
        // If no bigrams for this exact word, search again in lower case.
        pos = getBigramListPositionForWord(prevWord, prevWordLength,
                true /* forceLowerCaseSearch */);
    }
    // If still no bigrams, we really don't have them!
    if (0 == pos) return 0;
    int bigramFlags;
    int bigramCount = 0;
@@ -141,10 +148,11 @@ int BigramDictionary::getBigrams(const int32_t *prevWord, int prevWordLength, in
// Returns a pointer to the start of the bigram list.
// If the word is not found or has no bigrams, this function returns 0.
int BigramDictionary::getBigramListPositionForWord(const int32_t *prevWord,
        const int prevWordLength) const {
        const int prevWordLength, const bool forceLowerCaseSearch) const {
    if (0 >= prevWordLength) return 0;
    const uint8_t* const root = DICT;
    int pos = BinaryFormat::getTerminalPosition(root, prevWord, prevWordLength);
    int pos = BinaryFormat::getTerminalPosition(root, prevWord, prevWordLength,
            forceLowerCaseSearch);

    if (NOT_VALID_WORD == pos) return 0;
    const int flags = BinaryFormat::getFlagsAndForwardPointer(root, &pos);
@@ -164,7 +172,13 @@ void BigramDictionary::fillBigramAddressToFrequencyMapAndFilter(const int32_t *p
        const int prevWordLength, std::map<int, int> *map, uint8_t *filter) const {
    memset(filter, 0, BIGRAM_FILTER_BYTE_SIZE);
    const uint8_t* const root = DICT;
    int pos = getBigramListPositionForWord(prevWord, prevWordLength);
    int pos = getBigramListPositionForWord(prevWord, prevWordLength,
            false /* forceLowerCaseSearch */);
    if (0 == pos) {
        // If no bigrams for this exact string, search again in lower case.
        pos = getBigramListPositionForWord(prevWord, prevWordLength,
                true /* forceLowerCaseSearch */);
    }
    if (0 == pos) return;

    int bigramFlags;
@@ -197,10 +211,11 @@ bool BigramDictionary::checkFirstCharacter(unsigned short *word, int *inputCodes
bool BigramDictionary::isValidBigram(const int32_t *word1, int length1, const int32_t *word2,
        int length2) const {
    const uint8_t* const root = DICT;
    int pos = getBigramListPositionForWord(word1, length1);
    int pos = getBigramListPositionForWord(word1, length1, false /* forceLowerCaseSearch */);
    // getBigramListPositionForWord returns 0 if this word isn't in the dictionary or has no bigrams
    if (0 == pos) return false;
    int nextWordPos = BinaryFormat::getTerminalPosition(root, word2, length2);
    int nextWordPos = BinaryFormat::getTerminalPosition(root, word2, length2,
            false /* forceLowerCaseSearch */);
    if (NOT_VALID_WORD == nextWordPos) return false;
    int bigramFlags;
    do {
+2 −1
Original line number Diff line number Diff line
@@ -30,7 +30,8 @@ class BigramDictionary {
    BigramDictionary(const unsigned char *dict, int maxWordLength);
    int getBigrams(const int32_t *word, int length, int *inputCodes, int codesSize,
            unsigned short *outWords, int *frequencies, int maxWordLength, int maxBigrams) const;
    int getBigramListPositionForWord(const int32_t *prevWord, const int prevWordLength) const;
    int getBigramListPositionForWord(const int32_t *prevWord, const int prevWordLength,
            const bool forceLowerCaseSearch) const;
    void fillBigramAddressToFrequencyMapAndFilter(const int32_t *prevWord, const int prevWordLength,
            std::map<int, int> *map, uint8_t *filter) const;
    bool isValidBigram(const int32_t *word1, int length1, const int32_t *word2, int length2) const;
+4 −3
Original line number Diff line number Diff line
@@ -19,6 +19,7 @@

#include <limits>
#include "bloom_filter.h"
#include "char_utils.h"
#include "unigram_dictionary.h"

namespace latinime {
@@ -65,7 +66,7 @@ class BinaryFormat {
    static int getAttributeAddressAndForwardPointer(const uint8_t* const dict, const uint8_t flags,
            int *pos);
    static int getTerminalPosition(const uint8_t* const root, const int32_t* const inWord,
            const int length);
            const int length, const bool forceLowerCaseSearch);
    static int getWordAtAddress(const uint8_t* const root, const int address, const int maxDepth,
            uint16_t* outWord, int* outUnigramFrequency);
    static int computeFrequencyForBigram(const int unigramFreq, const int bigramFreq);
@@ -309,7 +310,7 @@ inline int BinaryFormat::getAttributeAddressAndForwardPointer(const uint8_t* con
// This function gets the byte position of the last chargroup of the matching word in the
// dictionary. When forceLowerCaseSearch is true, each character of the input word is
// lower-cased before the lookup. If no match is found, it returns NOT_VALID_WORD.
inline int BinaryFormat::getTerminalPosition(const uint8_t* const root,
        const int32_t* const inWord, const int length) {
        const int32_t* const inWord, const int length, const bool forceLowerCaseSearch) {
    int pos = 0;
    int wordPos = 0;

@@ -318,7 +319,7 @@ inline int BinaryFormat::getTerminalPosition(const uint8_t* const root,
        // there was no match (or we would have found it).
        if (wordPos > length) return NOT_VALID_WORD;
        int charGroupCount = BinaryFormat::getGroupCountAndForwardPointer(root, &pos);
        const int32_t wChar = inWord[wordPos];
        const int32_t wChar = forceLowerCaseSearch ? toLowerCase(inWord[wordPos]) : inWord[wordPos];
        while (true) {
            // If there are no more character groups in this node, it means we could not
            // find a matching character for this depth, therefore there is no match.
Loading