Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit 0ecfb942 authored by Keisuke Kuroyanagi's avatar Keisuke Kuroyanagi
Browse files

Use BinaryDictonaryInfo instead of raw pointers.

Bug: 6669677

Change-Id: I9792a872f1609de7c1ba0fc08d916047d6724c0b
parent d3ccd4bf
Loading
Loading
Loading
Loading
+2 −0
Original line number Diff line number Diff line
@@ -58,6 +58,8 @@ LATIN_IME_CORE_SRC_FILES := \
        dic_nodes_cache.cpp) \
    $(addprefix suggest/core/dictionary/, \
        char_utils.cpp \
        binary_dictionary_format.cpp \
        byte_array_utils.cpp \
        dictionary.cpp \
        digraph_utils.cpp) \
    $(addprefix suggest/core/layout/, \
+5 −4
Original line number Diff line number Diff line
@@ -35,7 +35,8 @@
#include "jni.h"
#include "jni_common.h"
#include "suggest_options.h"
#include "suggest/core/dictionary/binary_format.h"
#include "suggest/core/dictionary/binary_dictionary_format.h"
#include "suggest/core/dictionary/binary_dictionary_info.h"
#include "suggest/core/dictionary/dictionary.h"

namespace latinime {
@@ -110,8 +111,8 @@ static jlong latinime_BinaryDictionary_open(JNIEnv *env, jclass clazz, jstring s
        return 0;
    }
    Dictionary *dictionary = 0;
    if (BinaryFormat::UNKNOWN_FORMAT
            == BinaryFormat::detectFormat(static_cast<uint8_t *>(dictBuf),
    if (BinaryDictionaryFormat::UNKNOWN_VERSION
            == BinaryDictionaryFormat::detectFormatVersion(static_cast<uint8_t *>(dictBuf),
                    static_cast<int>(dictSize))) {
        AKLOGE("DICT: dictionary format is unknown, bad magic number");
#ifdef USE_MMAP_FOR_DICTIONARY
@@ -260,7 +261,7 @@ static jint latinime_BinaryDictionary_editDistance(JNIEnv *env, jclass clazz, ji
static void latinime_BinaryDictionary_close(JNIEnv *env, jclass clazz, jlong dict) {
    Dictionary *dictionary = reinterpret_cast<Dictionary *>(dict);
    if (!dictionary) return;
    const void *dictBuf = dictionary->getDict();
    const void *dictBuf = dictionary->getBinaryDictionaryInfo()->getDictBuf();
    if (!dictBuf) return;
#ifdef USE_MMAP_FOR_DICTIONARY
    releaseDictBuf(static_cast<const char *>(dictBuf) - dictionary->getDictBufAdjust(),
+7 −5
Original line number Diff line number Diff line
@@ -21,6 +21,7 @@
#include "bigram_dictionary.h"

#include "defines.h"
#include "suggest/core/dictionary/binary_dictionary_info.h"
#include "suggest/core/dictionary/binary_format.h"
#include "suggest/core/dictionary/bloom_filter.h"
#include "suggest/core/dictionary/char_utils.h"
@@ -28,7 +29,8 @@

namespace latinime {

BigramDictionary::BigramDictionary(const uint8_t *const streamStart) : DICT_ROOT(streamStart) {
BigramDictionary::BigramDictionary(const BinaryDictionaryInfo *const binaryDictionaryInfo)
        : mBinaryDictionaryInfo(binaryDictionaryInfo) {
    if (DEBUG_DICT) {
        AKLOGI("BigramDictionary - constructor");
    }
@@ -103,7 +105,7 @@ int BigramDictionary::getBigrams(const int *prevWord, int prevWordLength, int *i
    // TODO: remove unused arguments, and refrain from storing stuff in members of this class
    // TODO: have "in" arguments before "out" ones, and make out args explicit in the name

    const uint8_t *const root = DICT_ROOT;
    const uint8_t *const root = mBinaryDictionaryInfo->getDictRoot();
    int pos = getBigramListPositionForWord(prevWord, prevWordLength,
            false /* forceLowerCaseSearch */);
    // getBigramListPositionForWord returns 0 if this word isn't in the dictionary or has no bigrams
@@ -149,7 +151,7 @@ int BigramDictionary::getBigrams(const int *prevWord, int prevWordLength, int *i
int BigramDictionary::getBigramListPositionForWord(const int *prevWord, const int prevWordLength,
        const bool forceLowerCaseSearch) const {
    if (0 >= prevWordLength) return 0;
    const uint8_t *const root = DICT_ROOT;
    const uint8_t *const root = mBinaryDictionaryInfo->getDictRoot();
    int pos = BinaryFormat::getTerminalPosition(root, prevWord, prevWordLength,
            forceLowerCaseSearch);

@@ -170,7 +172,7 @@ int BigramDictionary::getBigramListPositionForWord(const int *prevWord, const in
void BigramDictionary::fillBigramAddressToProbabilityMapAndFilter(const int *prevWord,
        const int prevWordLength, std::map<int, int> *map, uint8_t *filter) const {
    memset(filter, 0, BIGRAM_FILTER_BYTE_SIZE);
    const uint8_t *const root = DICT_ROOT;
    const uint8_t *const root = mBinaryDictionaryInfo->getDictRoot();
    int pos = getBigramListPositionForWord(prevWord, prevWordLength,
            false /* forceLowerCaseSearch */);
    if (0 == pos) {
@@ -209,7 +211,7 @@ bool BigramDictionary::checkFirstCharacter(int *word, int *inputCodePoints) cons

bool BigramDictionary::isValidBigram(const int *word1, int length1, const int *word2,
        int length2) const {
    const uint8_t *const root = DICT_ROOT;
    const uint8_t *const root = mBinaryDictionaryInfo->getDictRoot();
    int pos = getBigramListPositionForWord(word1, length1, false /* forceLowerCaseSearch */);
    // getBigramListPositionForWord returns 0 if this word isn't in the dictionary or has no bigrams
    if (0 == pos) return false;
+6 −2
Original line number Diff line number Diff line
@@ -24,9 +24,12 @@

namespace latinime {

class BinaryDictionaryInfo;

class BigramDictionary {
 public:
    BigramDictionary(const uint8_t *const streamStart);
    BigramDictionary(const BinaryDictionaryInfo *const binaryDictionaryInfo);

    int getBigrams(const int *word, int length, int *inputCodePoints, int inputSize, int *outWords,
            int *frequencies, int *outputTypes) const;
    void fillBigramAddressToProbabilityMapAndFilter(const int *prevWord, const int prevWordLength,
@@ -35,13 +38,14 @@ class BigramDictionary {
    ~BigramDictionary();
 private:
    DISALLOW_IMPLICIT_CONSTRUCTORS(BigramDictionary);

    void addWordBigram(int *word, int length, int probability, int *bigramProbability,
            int *bigramCodePoints, int *outputTypes) const;
    bool checkFirstCharacter(int *word, int *inputCodePoints) const;
    int getBigramListPositionForWord(const int *prevWord, const int prevWordLength,
            const bool forceLowerCaseSearch) const;

    const uint8_t *const DICT_ROOT;
    const BinaryDictionaryInfo *const mBinaryDictionaryInfo;
    // TODO: Re-implement proximity correction for bigram correction
    static const int MAX_ALTERNATIVES = 1;
};
+54 −38
Original line number Diff line number Diff line
@@ -20,6 +20,7 @@
#include "suggest/core/dicnode/dic_node.h"
#include "suggest/core/dicnode/dic_node_utils.h"
#include "suggest/core/dicnode/dic_node_vector.h"
#include "suggest/core/dictionary/binary_dictionary_info.h"
#include "suggest/core/dictionary/binary_format.h"
#include "suggest/core/dictionary/char_utils.h"
#include "suggest/core/dictionary/multi_bigram_map.h"
@@ -32,20 +33,23 @@ namespace latinime {
// Node initialization utils //
///////////////////////////////

/* static */ void DicNodeUtils::initAsRoot(const int rootPos, const uint8_t *const dicRoot,
        const int prevWordNodePos, DicNode *newRootNode) {
    int curPos = rootPos;
/* static */ void DicNodeUtils::initAsRoot(const BinaryDictionaryInfo *const binaryDictionaryInfo,
        const int prevWordNodePos, DicNode *const newRootNode) {
    int curPos = binaryDictionaryInfo->getRootPosition();
    const int pos = curPos;
    const int childrenCount = BinaryFormat::getGroupCountAndForwardPointer(dicRoot, &curPos);
    const int childrenCount = BinaryFormat::getGroupCountAndForwardPointer(
            binaryDictionaryInfo->getDictRoot(), &curPos);
    const int childrenPos = curPos;
    newRootNode->initAsRoot(pos, childrenPos, childrenCount, prevWordNodePos);
}

/*static */ void DicNodeUtils::initAsRootWithPreviousWord(const int rootPos,
        const uint8_t *const dicRoot, DicNode *prevWordLastNode, DicNode *newRootNode) {
    int curPos = rootPos;
/*static */ void DicNodeUtils::initAsRootWithPreviousWord(
        const BinaryDictionaryInfo *const binaryDictionaryInfo,
        DicNode *const prevWordLastNode, DicNode *const newRootNode) {
    int curPos = binaryDictionaryInfo->getRootPosition();
    const int pos = curPos;
    const int childrenCount = BinaryFormat::getGroupCountAndForwardPointer(dicRoot, &curPos);
    const int childrenCount = BinaryFormat::getGroupCountAndForwardPointer(
            binaryDictionaryInfo->getDictRoot(), &curPos);
    const int childrenPos = curPos;
    newRootNode->initAsRootWithPreviousWord(prevWordLastNode, pos, childrenPos, childrenCount);
}
@@ -71,16 +75,19 @@ namespace latinime {
}

/* static */ int DicNodeUtils::createAndGetLeavingChildNode(DicNode *dicNode, int pos,
        const uint8_t *const dicRoot, const int terminalDepth, const ProximityInfoState *pInfoState,
        const int pointIndex, const bool exactOnly, const std::vector<int> *const codePointsFilter,
        const ProximityInfo *const pInfo, DicNodeVector *childDicNodes) {
        const BinaryDictionaryInfo *const binaryDictionaryInfo, const int terminalDepth,
        const ProximityInfoState *pInfoState, const int pointIndex, const bool exactOnly,
        const std::vector<int> *const codePointsFilter, const ProximityInfo *const pInfo,
        DicNodeVector *childDicNodes) {
    int nextPos = pos;
    const uint8_t flags = BinaryFormat::getFlagsAndForwardPointer(dicRoot, &pos);
    const uint8_t flags = BinaryFormat::getFlagsAndForwardPointer(
            binaryDictionaryInfo->getDictRoot(), &pos);
    const bool hasMultipleChars = (0 != (BinaryFormat::FLAG_HAS_MULTIPLE_CHARS & flags));
    const bool isTerminal = (0 != (BinaryFormat::FLAG_IS_TERMINAL & flags));
    const bool hasChildren = BinaryFormat::hasChildrenInFlags(flags);

    int codePoint = BinaryFormat::getCodePointAndForwardPointer(dicRoot, &pos);
    int codePoint = BinaryFormat::getCodePointAndForwardPointer(
            binaryDictionaryInfo->getDictRoot(), &pos);
    ASSERT(NOT_A_CODE_POINT != codePoint);
    const int nodeCodePoint = codePoint;
    // TODO: optimize this
@@ -90,7 +97,8 @@ namespace latinime {

    do {
        const int nextCodePoint = hasMultipleChars
                ? BinaryFormat::getCodePointAndForwardPointer(dicRoot, &pos) : NOT_A_CODE_POINT;
                ? BinaryFormat::getCodePointAndForwardPointer(
                        binaryDictionaryInfo->getDictRoot(), &pos) : NOT_A_CODE_POINT;
        const bool isLastChar = (NOT_A_CODE_POINT == nextCodePoint);
        if (!isLastChar) {
            additionalWordBuf[additionalSubwordLength++] = nextCodePoint;
@@ -98,12 +106,14 @@ namespace latinime {
        codePoint = nextCodePoint;
    } while (NOT_A_CODE_POINT != codePoint);

    const int probability =
            isTerminal ? BinaryFormat::readProbabilityWithoutMovingPointer(dicRoot, pos) : -1;
    const int probability = isTerminal ? BinaryFormat::readProbabilityWithoutMovingPointer(
            binaryDictionaryInfo->getDictRoot(), pos) : -1;
    pos = BinaryFormat::skipProbability(flags, pos);
    int childrenPos = hasChildren ? BinaryFormat::readChildrenPosition(dicRoot, flags, pos) : 0;
    int childrenPos = hasChildren ? BinaryFormat::readChildrenPosition(
            binaryDictionaryInfo->getDictRoot(), flags, pos) : 0;
    const int attributesPos = BinaryFormat::skipChildrenPosition(flags, pos);
    const int siblingPos = BinaryFormat::skipChildrenPosAndAttributes(dicRoot, flags, pos);
    const int siblingPos = BinaryFormat::skipChildrenPosAndAttributes(
            binaryDictionaryInfo->getDictRoot(), flags, pos);

    if (isDicNodeFilteredOut(nodeCodePoint, pInfo, codePointsFilter)) {
        return siblingPos;
@@ -111,8 +121,8 @@ namespace latinime {
    if (!isMatchedNodeCodePoint(pInfoState, pointIndex, exactOnly, nodeCodePoint)) {
        return siblingPos;
    }
    const int childrenCount = hasChildren
            ? BinaryFormat::getGroupCountAndForwardPointer(dicRoot, &childrenPos) : 0;
    const int childrenCount = hasChildren ? BinaryFormat::getGroupCountAndForwardPointer(
            binaryDictionaryInfo->getDictRoot(), &childrenPos) : 0;
    childDicNodes->pushLeavingChild(dicNode, nextPos, flags, childrenPos, attributesPos, siblingPos,
            nodeCodePoint, childrenCount, probability, -1 /* bigramProbability */, isTerminal,
            hasMultipleChars, hasChildren, additionalSubwordLength, additionalWordBuf);
@@ -148,16 +158,18 @@ namespace latinime {
}

/* static */ void DicNodeUtils::createAndGetAllLeavingChildNodes(DicNode *dicNode,
        const uint8_t *const dicRoot, const ProximityInfoState *pInfoState, const int pointIndex,
        const bool exactOnly, const std::vector<int> *const codePointsFilter,
        const ProximityInfo *const pInfo, DicNodeVector *childDicNodes) {
        const BinaryDictionaryInfo *const binaryDictionaryInfo,
        const ProximityInfoState *pInfoState, const int pointIndex, const bool exactOnly,
        const std::vector<int> *const codePointsFilter, const ProximityInfo *const pInfo,
        DicNodeVector *childDicNodes) {
    const int terminalDepth = dicNode->getLeavingDepth();
    const int childCount = dicNode->getChildrenCount();
    int nextPos = dicNode->getChildrenPos();
    for (int i = 0; i < childCount; i++) {
        const int filterSize = codePointsFilter ? codePointsFilter->size() : 0;
        nextPos = createAndGetLeavingChildNode(dicNode, nextPos, dicRoot, terminalDepth, pInfoState,
                pointIndex, exactOnly, codePointsFilter, pInfo, childDicNodes);
        nextPos = createAndGetLeavingChildNode(dicNode, nextPos, binaryDictionaryInfo,
                terminalDepth, pInfoState, pointIndex, exactOnly, codePointsFilter, pInfo,
                childDicNodes);
        if (!pInfo && filterSize > 0 && childDicNodes->exceeds(filterSize)) {
            // All code points have been found.
            break;
@@ -165,14 +177,15 @@ namespace latinime {
    }
}

/* static */ void DicNodeUtils::getAllChildDicNodes(DicNode *dicNode, const uint8_t *const dicRoot,
        DicNodeVector *childDicNodes) {
    getProximityChildDicNodes(dicNode, dicRoot, 0, 0, false, childDicNodes);
/* static */ void DicNodeUtils::getAllChildDicNodes(DicNode *dicNode,
        const BinaryDictionaryInfo *const binaryDictionaryInfo, DicNodeVector *childDicNodes) {
    getProximityChildDicNodes(dicNode, binaryDictionaryInfo, 0, 0, false, childDicNodes);
}

/* static */ void DicNodeUtils::getProximityChildDicNodes(DicNode *dicNode,
        const uint8_t *const dicRoot, const ProximityInfoState *pInfoState, const int pointIndex,
        bool exactOnly, DicNodeVector *childDicNodes) {
        const BinaryDictionaryInfo *const binaryDictionaryInfo,
        const ProximityInfoState *pInfoState, const int pointIndex, bool exactOnly,
        DicNodeVector *childDicNodes) {
    if (dicNode->isTotalInputSizeExceedingLimit()) {
        return;
    }
@@ -180,9 +193,9 @@ namespace latinime {
        DicNodeUtils::createAndGetPassingChildNode(dicNode, pInfoState, pointIndex, exactOnly,
                childDicNodes);
    } else {
        DicNodeUtils::createAndGetAllLeavingChildNodes(dicNode, dicRoot, pInfoState, pointIndex,
                exactOnly, 0 /* codePointsFilter */, 0 /* pInfo */,
                childDicNodes);
        DicNodeUtils::createAndGetAllLeavingChildNodes(
                dicNode, binaryDictionaryInfo, pInfoState, pointIndex, exactOnly,
                0 /* codePointsFilter */, 0 /* pInfo */, childDicNodes);
    }
}

@@ -192,19 +205,21 @@ namespace latinime {
/**
 * Computes the combined bigram / unigram cost for the given dicNode.
 */
/* static */ float DicNodeUtils::getBigramNodeImprobability(const uint8_t *const dicRoot,
/* static */ float DicNodeUtils::getBigramNodeImprobability(
        const BinaryDictionaryInfo *const binaryDictionaryInfo,
        const DicNode *const node, MultiBigramMap *multiBigramMap) {
    if (node->isImpossibleBigramWord()) {
        return static_cast<float>(MAX_VALUE_FOR_WEIGHTING);
    }
    const int probability = getBigramNodeProbability(dicRoot, node, multiBigramMap);
    const int probability = getBigramNodeProbability(binaryDictionaryInfo, node, multiBigramMap);
    // TODO: This equation to calculate the improbability looks unreasonable.  Investigate this.
    const float cost = static_cast<float>(MAX_PROBABILITY - probability)
            / static_cast<float>(MAX_PROBABILITY);
    return cost;
}

/* static */ int DicNodeUtils::getBigramNodeProbability(const uint8_t *const dicRoot,
/* static */ int DicNodeUtils::getBigramNodeProbability(
        const BinaryDictionaryInfo *const binaryDictionaryInfo,
        const DicNode *const node, MultiBigramMap *multiBigramMap) {
    const int unigramProbability = node->getProbability();
    const int wordPos = node->getPos();
@@ -215,9 +230,10 @@ namespace latinime {
    }
    if (multiBigramMap) {
        return multiBigramMap->getBigramProbability(
                dicRoot, prevWordPos, wordPos, unigramProbability);
                binaryDictionaryInfo, prevWordPos, wordPos, unigramProbability);
    }
    return BinaryFormat::getBigramProbability(dicRoot, prevWordPos, wordPos, unigramProbability);
    return BinaryFormat::getBigramProbability(
            binaryDictionaryInfo->getDictRoot(), prevWordPos, wordPos, unigramProbability);
}

///////////////////////////////////////
Loading