Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit 9a933a74 authored by Jean Chalard's avatar Jean Chalard
Browse files

Read shortcuts as strings in the dictionary.

This has no impact on performance.
Before:
(0)  9.61 (0.01%)
(1)  57514.58 (56.70%)
(2)  10.55 (0.01%)
(3)  10.79 (0.01%)
(4)  133.20 (0.13%)
(5)  43553.87 (42.94%)
(6)  10.03 (0.01%)
(20) 47.20 (0.05%)
Total 101431.47 (sum of others 101289.84)

After:
(0)  10.52 (0.01%)
(1)  56311.16 (56.66%)
(2)  13.40 (0.01%)
(3)  10.98 (0.01%)
(4)  136.72 (0.14%)
(5)  42707.92 (42.97%)
(6)  9.79 (0.01%)
(20) 51.35 (0.05%)
Total 99390.76 (sum of others 99251.84)

The difference is not significant given the imprecision of the measurements.

Change-Id: I2e4f1ef7a5e99082e67dd27f56cf4fc432bb48fa
parent 7540fd00
Loading
Loading
Loading
Loading
+1 −0
Original line number Diff line number Diff line
@@ -123,6 +123,7 @@ int BigramDictionary::getBigrams(unsigned short *prevWord, int prevWordLength, i
    }
    pos = BinaryFormat::skipChildrenPosition(flags, pos);
    pos = BinaryFormat::skipFrequency(flags, pos);
    pos = BinaryFormat::skipShortcuts(root, flags, pos);
    int bigramFlags;
    int bigramCount = 0;
    do {
+31 −9
Original line number Diff line number Diff line
@@ -40,6 +40,9 @@ class BinaryFormat {
    // implementations. On this occasion, we made the magic number 32 bits long.
    const static uint32_t FORMAT_VERSION_2_MAGIC_NUMBER = 0x9BC13AFE;

    const static int CHARACTER_ARRAY_TERMINATOR_SIZE = 1;
    const static int SHORTCUT_LIST_SIZE_SIZE = 2;

    static int detectFormat(const uint8_t* const dict);
    static unsigned int getHeaderSize(const uint8_t* const dict);
    static int getGroupCountAndForwardPointer(const uint8_t* const dict, int* pos);
@@ -47,9 +50,10 @@ class BinaryFormat {
    static int32_t getCharCodeAndForwardPointer(const uint8_t* const dict, int* pos);
    static int readFrequencyWithoutMovingPointer(const uint8_t* const dict, const int pos);
    static int skipOtherCharacters(const uint8_t* const dict, const int pos);
    static int skipAttributes(const uint8_t* const dict, const int pos);
    static int skipChildrenPosition(const uint8_t flags, const int pos);
    static int skipFrequency(const uint8_t flags, const int pos);
    static int skipShortcuts(const uint8_t* const dict, const uint8_t flags, const int pos);
    static int skipBigrams(const uint8_t* const dict, const uint8_t flags, const int pos);
    static int skipAllAttributes(const uint8_t* const dict, const uint8_t flags, const int pos);
    static int skipChildrenPosAndAttributes(const uint8_t* const dict, const uint8_t flags,
            const int pos);
@@ -157,12 +161,12 @@ static inline int attributeAddressSize(const uint8_t flags) {
    */
}

inline int BinaryFormat::skipAttributes(const uint8_t* const dict, const int pos) {
static inline int skipExistingBigrams(const uint8_t* const dict, const int pos) {
    int currentPos = pos;
    uint8_t flags = getFlagsAndForwardPointer(dict, &currentPos);
    uint8_t flags = BinaryFormat::getFlagsAndForwardPointer(dict, &currentPos);
    while (flags & UnigramDictionary::FLAG_ATTRIBUTE_HAS_NEXT) {
        currentPos += attributeAddressSize(flags);
        flags = getFlagsAndForwardPointer(dict, &currentPos);
        flags = BinaryFormat::getFlagsAndForwardPointer(dict, &currentPos);
    }
    currentPos += attributeAddressSize(flags);
    return currentPos;
@@ -174,6 +178,10 @@ static inline int childrenAddressSize(const uint8_t flags) {
    /* See the note in attributeAddressSize. The same applies here */
}

// Decodes the two-byte, big-endian value stored at dict[pos] (high byte) and
// dict[pos + 1] (low byte) and returns it as an int in the range [0, 65535].
static inline int shortcutByteSize(const uint8_t* const dict, const int pos) {
    const int highByte = dict[pos];
    const int lowByte = dict[pos + 1];
    // The two bytes occupy disjoint bit ranges, so OR-ing them equals adding them.
    return (highByte << 8) | lowByte;
}

inline int BinaryFormat::skipChildrenPosition(const uint8_t flags, const int pos) {
    return pos + childrenAddressSize(flags);
}
@@ -182,16 +190,30 @@ inline int BinaryFormat::skipFrequency(const uint8_t flags, const int pos) {
    return UnigramDictionary::FLAG_IS_TERMINAL & flags ? pos + 1 : pos;
}

inline int BinaryFormat::skipAllAttributes(const uint8_t* const dict, const uint8_t flags,
// Returns the position that follows the shortcut list starting at pos.
// dict is the dictionary buffer, flags are the flags to test and pos is the position to
// skip from. When FLAG_HAS_SHORTCUT_TARGETS is not set in flags, pos is returned unchanged.
inline int BinaryFormat::skipShortcuts(const uint8_t* const dict, const uint8_t flags,
        const int pos) {
    // NOTE(review): newPos is assigned below but never returned; it and the skipAttributes()
    // call look like leftovers from an earlier version of this code - confirm and remove.
    int newPos = pos;
    if (UnigramDictionary::FLAG_HAS_SHORTCUT_TARGETS & flags) {
        newPos = skipAttributes(dict, newPos);
        // The two bytes at pos hold the value shortcutByteSize() decodes; adding it to pos
        // jumps over the shortcut list without walking its entries.
        return pos + shortcutByteSize(dict, pos);
    } else {
        return pos;
    }
}

// Returns the position that follows the bigram list starting at pos.
// dict is the dictionary buffer, flags are the flags to test and pos is the position to
// skip from. When FLAG_HAS_BIGRAMS is not set in flags, pos is returned unchanged.
inline int BinaryFormat::skipBigrams(const uint8_t* const dict, const uint8_t flags,
        const int pos) {
    if (UnigramDictionary::FLAG_HAS_BIGRAMS & flags) {
        // NOTE(review): newPos is not declared in this function; this line looks like a
        // leftover from an earlier version of the code - confirm and remove.
        newPos = skipAttributes(dict, newPos);
        // Bigram entries are walked one by one by skipExistingBigrams().
        return skipExistingBigrams(dict, pos);
    } else {
        return pos;
    }
}

// Skips every attribute list starting at pos: the shortcuts first, then the bigrams.
// Each helper returns its input position unchanged when the matching flag is not set in
// flags, so the result is the position that follows whichever lists are present.
inline int BinaryFormat::skipAllAttributes(const uint8_t* const dict, const uint8_t flags,
        const int pos) {
    const int afterShortcuts = skipShortcuts(dict, flags, pos);
    return skipBigrams(dict, flags, afterShortcuts);
}

+12 −8
Original line number Diff line number Diff line
@@ -45,13 +45,19 @@ class TerminalAttributes {

        // Gets the shortcut target itself as a uint16_t string. For parameters and return value
        // see BinaryFormat::getWordAtAddress.
        // TODO: make the output a uint32_t* to handle the whole Unicode range.
        // Reads the shortcut entry at mPos: consumes its flags byte, records whether another
        // entry follows, and writes the target into outWord. Returns the target's length.
        inline int getNextShortcutTarget(const int maxDepth, uint16_t* outWord) {
            const int shortcutFlags = BinaryFormat::getFlagsAndForwardPointer(mDict, &mPos);
            // FLAG_ATTRIBUTE_HAS_NEXT in the entry's flags says whether more entries follow.
            mHasNextShortcutTarget =
                    0 != (shortcutFlags & UnigramDictionary::FLAG_ATTRIBUTE_HAS_NEXT);
            // NOTE(review): two implementations appear back to back here. The address-based
            // lookup below returns unconditionally, which makes the inline-string loop after
            // it unreachable as written; maxDepth is only used by the address-based lookup.
            // Confirm which variant is intended and drop the other.
            int shortcutAddress =
                    BinaryFormat::getAttributeAddressAndForwardPointer(mDict, shortcutFlags, &mPos);
            return BinaryFormat::getWordAtAddress(mDict, shortcutAddress, maxDepth, outWord);
            // Inline-string variant: copy char codes into outWord until NOT_A_CHARACTER is
            // read or MAX_WORD_LENGTH_INTERNAL characters have been stored.
            unsigned int i;
            for (i = 0; i < MAX_WORD_LENGTH_INTERNAL; ++i) {
                const int charCode = BinaryFormat::getCharCodeAndForwardPointer(mDict, &mPos);
                if (NOT_A_CHARACTER == charCode) break;
                // Char codes are narrowed to 16 bits here.
                outWord[i] = (uint16_t)charCode;
            }
            // Presumably getCharCodeAndForwardPointer() leaves mPos on the terminator, hence
            // this extra step - TODO confirm. NOTE(review): the step is also taken when the
            // loop stopped on the length limit rather than on a terminator.
            mPos += BinaryFormat::CHARACTER_ARRAY_TERMINATOR_SIZE;
            return i;
        }
    };

@@ -65,12 +71,10 @@ class TerminalAttributes {
            mDict(dict), mFlags(flags), mStartPos(pos) {
    }

    // Tells whether FLAG_IS_SHORTCUT_ONLY is set in the flags stored in mFlags.
    inline bool isShortcutOnly() const {
        const int shortcutOnlyBit = mFlags & UnigramDictionary::FLAG_IS_SHORTCUT_ONLY;
        return shortcutOnlyBit != 0;
    }

    // Builds a ShortcutIterator over mDict for this object's flags.
    // NOTE(review): two return statements appear back to back; the first returns
    // unconditionally, so the second is unreachable as written. Confirm which start position
    // is intended and drop the other.
    inline ShortcutIterator getShortcutIterator() const {
        return ShortcutIterator(mDict, mStartPos, mFlags);
        // The shortcut list starts with a size field of SHORTCUT_LIST_SIZE_SIZE bytes, stored
        // so that the whole shortcut chunk can be skipped quickly. Iteration does not need
        // it, so the iterator starts just past it.
        return ShortcutIterator(mDict, mStartPos + BinaryFormat::SHORTCUT_LIST_SIZE_SIZE, mFlags);
    }
};
} // namespace latinime
+5 −5
Original line number Diff line number Diff line
@@ -366,10 +366,9 @@ inline void UnigramDictionary::onTerminal(const int freq,
        WordsPriorityQueue *masterQueue = queuePool->getMasterQueue();
        const int finalFreq = correction->getFinalFreq(freq, &wordPointer, &wordLength);
        if (finalFreq != NOT_A_FREQUENCY) {
            if (!terminalAttributes.isShortcutOnly()) {
            addWord(wordPointer, wordLength, finalFreq, masterQueue);
            }

            const int shortcutFreq = finalFreq > 0 ? finalFreq - 1 : 0;
            // Please note that the shortcut candidates will be added to the master queue only.
            TerminalAttributes::ShortcutIterator iterator =
                    terminalAttributes.getShortcutIterator();
@@ -379,11 +378,12 @@ inline void UnigramDictionary::onTerminal(const int freq,
                // We need to either modulate the frequency of each shortcut according
                // to its own shortcut frequency or to make the queue
                // so that the insert order is protected inside the queue for words
                // with the same score.
                // with the same score. For the moment we use -1 to make sure the shortcut will
                // never be in front of the word.
                uint16_t shortcutTarget[MAX_WORD_LENGTH_INTERNAL];
                const int shortcutTargetStringLength = iterator.getNextShortcutTarget(
                        MAX_WORD_LENGTH_INTERNAL, shortcutTarget);
                addWord(shortcutTarget, shortcutTargetStringLength, finalFreq, masterQueue);
                addWord(shortcutTarget, shortcutTargetStringLength, shortcutFreq, masterQueue);
            }
        }
    }
+0 −4
Original line number Diff line number Diff line
@@ -49,10 +49,6 @@ class UnigramDictionary {
    static const int FLAG_HAS_SHORTCUT_TARGETS = 0x08;
    // Flag for bigram presence
    static const int FLAG_HAS_BIGRAMS = 0x04;
    // Flag for shortcut-only words. Some words are shortcut-only, which means they match when
    // the user types them but they don't pop in the suggestion strip, only the words they are
    // shortcuts for do.
    static const int FLAG_IS_SHORTCUT_ONLY = 0x02;

    // Attribute (bigram/shortcut) related flags:
    // Flag for presence of more attributes