Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit d34dd5bb authored by Ken Wakasa's avatar Ken Wakasa Committed by Android (Google) Code Review
Browse files

Merge "Cosmetic fixes and a bug fix in...

Merge "Cosmetic fixes and a bug fix in UnigramDictionary::testCharGroupForContinuedLikeness()." into jb-mr1-dev
parents 2c0c1cc6 f2789819
Loading
Loading
Loading
Loading
+5 −5
Original line number Diff line number Diff line
@@ -68,9 +68,9 @@ static jlong latinime_BinaryDictionary_open(JNIEnv *env, jobject object,
        return 0;
    }
    int pagesize = getpagesize();
    adjust = dictOffset % pagesize;
    int adjDictOffset = dictOffset - adjust;
    int adjDictSize = dictSize + adjust;
    adjust = static_cast<int>(dictOffset) % pagesize;
    int adjDictOffset = static_cast<int>(dictOffset) - adjust;
    int adjDictSize = static_cast<int>(dictSize) + adjust;
    dictBuf = mmap(0, sizeof(char) * adjDictSize, PROT_READ, MAP_PRIVATE, fd, adjDictOffset);
    if (dictBuf == MAP_FAILED) {
        AKLOGE("DICT: Can't mmap dictionary. errno=%d", errno);
@@ -120,8 +120,8 @@ static jlong latinime_BinaryDictionary_open(JNIEnv *env, jobject object,
        releaseDictBuf(dictBuf, 0, 0);
#endif // USE_MMAP_FOR_DICTIONARY
    } else {
        dictionary = new Dictionary(dictBuf, dictSize, fd, adjust, typedLetterMultiplier,
                fullWordMultiplier, maxWordLength, maxWords, maxPredictions);
        dictionary = new Dictionary(dictBuf, static_cast<int>(dictSize), fd, adjust,
                typedLetterMultiplier, fullWordMultiplier, maxWordLength, maxWords, maxPredictions);
    }
    PROF_END(66);
    PROF_CLOSE;
+3 −1
Original line number Diff line number Diff line
@@ -14,6 +14,8 @@
 * limitations under the License.
 */

#include <stdint.h>

#include "char_utils.h"

namespace latinime {
@@ -24,7 +26,7 @@ namespace latinime {
 * if c is not a combined character, or the base character if it
 * is combined.
 */
const unsigned short BASE_CHARS[BASE_CHARS_SIZE] = {
const uint16_t BASE_CHARS[BASE_CHARS_SIZE] = {
    0x0000, 0x0001, 0x0002, 0x0003, 0x0004, 0x0005, 0x0006, 0x0007,
    0x0008, 0x0009, 0x000a, 0x000b, 0x000c, 0x000d, 0x000e, 0x000f,
    0x0010, 0x0011, 0x0012, 0x0013, 0x0014, 0x0015, 0x0016, 0x0017,
+1 −1
Original line number Diff line number Diff line
@@ -156,7 +156,7 @@ int BigramDictionary::getBigramListPositionForWord(const int32_t *prevWord,
    const int flags = BinaryFormat::getFlagsAndForwardPointer(root, &pos);
    if (0 == (flags & BinaryFormat::FLAG_HAS_BIGRAMS)) return 0;
    if (0 == (flags & BinaryFormat::FLAG_HAS_MULTIPLE_CHARS)) {
        BinaryFormat::getCharCodeAndForwardPointer(root, &pos);
        BinaryFormat::getCodePointAndForwardPointer(root, &pos);
    } else {
        pos = BinaryFormat::skipOtherCharacters(root, pos);
    }
+21 −21
Original line number Diff line number Diff line
@@ -84,7 +84,7 @@ class BinaryFormat {
    static unsigned int getFlags(const uint8_t *const dict);
    static int getGroupCountAndForwardPointer(const uint8_t *const dict, int *pos);
    static uint8_t getFlagsAndForwardPointer(const uint8_t *const dict, int *pos);
    static int32_t getCharCodeAndForwardPointer(const uint8_t *const dict, int *pos);
    static int32_t getCodePointAndForwardPointer(const uint8_t *const dict, int *pos);
    static int readFrequencyWithoutMovingPointer(const uint8_t *const dict, const int pos);
    static int skipOtherCharacters(const uint8_t *const dict, const int pos);
    static int skipChildrenPosition(const uint8_t flags, const int pos);
@@ -176,22 +176,22 @@ inline uint8_t BinaryFormat::getFlagsAndForwardPointer(const uint8_t *const dict
    return dict[(*pos)++];
}

inline int32_t BinaryFormat::getCharCodeAndForwardPointer(const uint8_t *const dict, int *pos) {
inline int32_t BinaryFormat::getCodePointAndForwardPointer(const uint8_t *const dict, int *pos) {
    const int origin = *pos;
    const int32_t character = dict[origin];
    if (character < MINIMAL_ONE_BYTE_CHARACTER_VALUE) {
        if (character == CHARACTER_ARRAY_TERMINATOR) {
    const int32_t codePoint = dict[origin];
    if (codePoint < MINIMAL_ONE_BYTE_CHARACTER_VALUE) {
        if (codePoint == CHARACTER_ARRAY_TERMINATOR) {
            *pos = origin + 1;
            return NOT_A_CHARACTER;
            return NOT_A_CODE_POINT;
        } else {
            *pos = origin + 3;
            const int32_t char_1 = character << 16;
            const int32_t char_1 = codePoint << 16;
            const int32_t char_2 = char_1 + (dict[origin + 1] << 8);
            return char_2 + dict[origin + 2];
        }
    } else {
        *pos = origin + 1;
        return character;
        return codePoint;
    }
}

@@ -369,15 +369,15 @@ inline int BinaryFormat::getTerminalPosition(const uint8_t *const root,
            if (0 >= charGroupCount) return NOT_VALID_WORD;
            const int charGroupPos = pos;
            const uint8_t flags = BinaryFormat::getFlagsAndForwardPointer(root, &pos);
            int32_t character = BinaryFormat::getCharCodeAndForwardPointer(root, &pos);
            int32_t character = BinaryFormat::getCodePointAndForwardPointer(root, &pos);
            if (character == wChar) {
                // This is the correct node. Only one character group may start with the same
                // char within a node, so either we found our match in this node, or there is
                // no match and we can return NOT_VALID_WORD. So we will check all the characters
                // in this character group indeed does match.
                if (FLAG_HAS_MULTIPLE_CHARS & flags) {
                    character = BinaryFormat::getCharCodeAndForwardPointer(root, &pos);
                    while (NOT_A_CHARACTER != character) {
                    character = BinaryFormat::getCodePointAndForwardPointer(root, &pos);
                    while (NOT_A_CODE_POINT != character) {
                        ++wordPos;
                        // If we shoot the length of the word we search for, or if we find a single
                        // character that does not match, as explained above, it means the word is
@@ -385,7 +385,7 @@ inline int BinaryFormat::getTerminalPosition(const uint8_t *const root,
                        // match the word on the first character, but not matching the whole word).
                        if (wordPos > length) return NOT_VALID_WORD;
                        if (inWord[wordPos] != character) return NOT_VALID_WORD;
                        character = BinaryFormat::getCharCodeAndForwardPointer(root, &pos);
                        character = BinaryFormat::getCodePointAndForwardPointer(root, &pos);
                    }
                }
                // If we come here we know that so far, we do match. Either we are on a terminal
@@ -457,19 +457,19 @@ inline int BinaryFormat::getWordAtAddress(const uint8_t *const root, const int a
                 --charGroupCount) {
            const int startPos = pos;
            const uint8_t flags = getFlagsAndForwardPointer(root, &pos);
            const int32_t character = getCharCodeAndForwardPointer(root, &pos);
            const int32_t character = getCodePointAndForwardPointer(root, &pos);
            if (address == startPos) {
                // We found the address. Copy the rest of the word in the buffer and return
                // the length.
                outWord[wordPos] = character;
                if (FLAG_HAS_MULTIPLE_CHARS & flags) {
                    int32_t nextChar = getCharCodeAndForwardPointer(root, &pos);
                    int32_t nextChar = getCodePointAndForwardPointer(root, &pos);
                    // We count chars in order to avoid infinite loops if the file is broken or
                    // if there is some other bug
                    int charCount = maxDepth;
                    while (NOT_A_CHARACTER != nextChar && --charCount > 0) {
                    while (NOT_A_CODE_POINT != nextChar && --charCount > 0) {
                        outWord[++wordPos] = nextChar;
                        nextChar = getCharCodeAndForwardPointer(root, &pos);
                        nextChar = getCodePointAndForwardPointer(root, &pos);
                    }
                }
                *outUnigramFrequency = readFrequencyWithoutMovingPointer(root, pos);
@@ -523,16 +523,16 @@ inline int BinaryFormat::getWordAtAddress(const uint8_t *const root, const int a
                    const uint8_t lastFlags =
                            getFlagsAndForwardPointer(root, &lastCandidateGroupPos);
                    const int32_t lastChar =
                            getCharCodeAndForwardPointer(root, &lastCandidateGroupPos);
                            getCodePointAndForwardPointer(root, &lastCandidateGroupPos);
                    // We copy all the characters in this group to the buffer
                    outWord[wordPos] = lastChar;
                    if (FLAG_HAS_MULTIPLE_CHARS & lastFlags) {
                        int32_t nextChar =
                                getCharCodeAndForwardPointer(root, &lastCandidateGroupPos);
                                getCodePointAndForwardPointer(root, &lastCandidateGroupPos);
                        int charCount = maxDepth;
                        while (-1 != nextChar && --charCount > 0) {
                            outWord[++wordPos] = nextChar;
                            nextChar = getCharCodeAndForwardPointer(root, &lastCandidateGroupPos);
                            nextChar = getCodePointAndForwardPointer(root, &lastCandidateGroupPos);
                        }
                    }
                    ++wordPos;
@@ -582,8 +582,8 @@ inline int BinaryFormat::computeFrequencyForBigram(const int unigramFreq, const
    // 0 for the bigram frequency represents the middle of the 16th step from the top,
    // while a value of 15 represents the middle of the top step.
    // See makedict.BinaryDictInputOutput for details.
    const float stepSize = (static_cast<float>(MAX_FREQ) - unigramFreq) / (1.5f + MAX_BIGRAM_FREQ);
    return static_cast<int>(unigramFreq + (bigramFreq + 1) * stepSize);
    const float stepSize = static_cast<float>(MAX_FREQ - unigramFreq) / (1.5f + MAX_BIGRAM_FREQ);
    return unigramFreq + static_cast<int>(static_cast<float>(bigramFreq + 1) * stepSize);
}

// This returns a probability in log space.
+8 −6
Original line number Diff line number Diff line
@@ -23,14 +23,16 @@

namespace latinime {

// Marks `position` as present in the bit-array bloom filter `filter`.
// The position is reduced modulo BIGRAM_FILTER_MODULO to pick a bucket; the
// bucket's upper bits select the byte and its low three bits select the bit
// within that byte.
// NOTE(review): a negative `position` would presumably produce a negative
// remainder and, after the cast, an out-of-range bucket - confirm that callers
// only pass non-negative values.
// TODO: uint32_t position
static inline void setInFilter(uint8_t *filter, const int32_t position) {
    const uint32_t bucket = static_cast<uint32_t>(position % BIGRAM_FILTER_MODULO);
    filter[bucket >> 3] |= static_cast<uint8_t>(1 << (bucket & 0x7));
}

// Returns true if the bit for `position` is set in the bloom filter `filter`.
// Uses the same bucket computation as setInFilter: position modulo
// BIGRAM_FILTER_MODULO, with the upper bits selecting the byte and the low
// three bits selecting the bit. As with any bloom filter, distinct positions
// that map to the same bucket are indistinguishable.
// NOTE(review): a negative `position` would presumably produce a negative
// remainder and, after the cast, an out-of-range bucket - confirm that callers
// only pass non-negative values.
// TODO: uint32_t position
static inline bool isInFilter(const uint8_t *filter, const int32_t position) {
    const uint32_t bucket = static_cast<uint32_t>(position % BIGRAM_FILTER_MODULO);
    return filter[bucket >> 3] & static_cast<uint8_t>(1 << (bucket & 0x7));
}
} // namespace latinime
#endif // LATINIME_BLOOM_FILTER_H
Loading