Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit 712fefd8 authored by Keisuke Kuroyanagi's avatar Keisuke Kuroyanagi Committed by Android (Google) Code Review
Browse files

Merge "Use BinaryDictionaryInfo instead of raw pointers."

parents 88ad30f4 0ecfb942
Loading
Loading
Loading
Loading
+2 −0
Original line number Diff line number Diff line
@@ -58,6 +58,8 @@ LATIN_IME_CORE_SRC_FILES := \
        dic_nodes_cache.cpp) \
    $(addprefix suggest/core/dictionary/, \
        char_utils.cpp \
        binary_dictionary_format.cpp \
        byte_array_utils.cpp \
        dictionary.cpp \
        digraph_utils.cpp) \
    $(addprefix suggest/core/layout/, \
+5 −4
Original line number Diff line number Diff line
@@ -35,7 +35,8 @@
#include "jni.h"
#include "jni_common.h"
#include "suggest_options.h"
#include "suggest/core/dictionary/binary_format.h"
#include "suggest/core/dictionary/binary_dictionary_format.h"
#include "suggest/core/dictionary/binary_dictionary_info.h"
#include "suggest/core/dictionary/dictionary.h"

namespace latinime {
@@ -110,8 +111,8 @@ static jlong latinime_BinaryDictionary_open(JNIEnv *env, jclass clazz, jstring s
        return 0;
    }
    Dictionary *dictionary = 0;
    if (BinaryFormat::UNKNOWN_FORMAT
            == BinaryFormat::detectFormat(static_cast<uint8_t *>(dictBuf),
    if (BinaryDictionaryFormat::UNKNOWN_VERSION
            == BinaryDictionaryFormat::detectFormatVersion(static_cast<uint8_t *>(dictBuf),
                    static_cast<int>(dictSize))) {
        AKLOGE("DICT: dictionary format is unknown, bad magic number");
#ifdef USE_MMAP_FOR_DICTIONARY
@@ -260,7 +261,7 @@ static jint latinime_BinaryDictionary_editDistance(JNIEnv *env, jclass clazz, ji
static void latinime_BinaryDictionary_close(JNIEnv *env, jclass clazz, jlong dict) {
    Dictionary *dictionary = reinterpret_cast<Dictionary *>(dict);
    if (!dictionary) return;
    const void *dictBuf = dictionary->getDict();
    const void *dictBuf = dictionary->getBinaryDictionaryInfo()->getDictBuf();
    if (!dictBuf) return;
#ifdef USE_MMAP_FOR_DICTIONARY
    releaseDictBuf(static_cast<const char *>(dictBuf) - dictionary->getDictBufAdjust(),
+7 −5
Original line number Diff line number Diff line
@@ -21,6 +21,7 @@
#include "bigram_dictionary.h"

#include "defines.h"
#include "suggest/core/dictionary/binary_dictionary_info.h"
#include "suggest/core/dictionary/binary_format.h"
#include "suggest/core/dictionary/bloom_filter.h"
#include "suggest/core/dictionary/char_utils.h"
@@ -29,7 +30,8 @@

namespace latinime {

BigramDictionary::BigramDictionary(const uint8_t *const streamStart) : DICT_ROOT(streamStart) {
BigramDictionary::BigramDictionary(const BinaryDictionaryInfo *const binaryDictionaryInfo)
        : mBinaryDictionaryInfo(binaryDictionaryInfo) {
    if (DEBUG_DICT) {
        AKLOGI("BigramDictionary - constructor");
    }
@@ -104,7 +106,7 @@ int BigramDictionary::getBigrams(const int *prevWord, int prevWordLength, int *i
    // TODO: remove unused arguments, and refrain from storing stuff in members of this class
    // TODO: have "in" arguments before "out" ones, and make out args explicit in the name

    const uint8_t *const root = DICT_ROOT;
    const uint8_t *const root = mBinaryDictionaryInfo->getDictRoot();
    int pos = getBigramListPositionForWord(prevWord, prevWordLength,
            false /* forceLowerCaseSearch */);
    // getBigramListPositionForWord returns 0 if this word isn't in the dictionary or has no bigrams
@@ -150,7 +152,7 @@ int BigramDictionary::getBigrams(const int *prevWord, int prevWordLength, int *i
int BigramDictionary::getBigramListPositionForWord(const int *prevWord, const int prevWordLength,
        const bool forceLowerCaseSearch) const {
    if (0 >= prevWordLength) return 0;
    const uint8_t *const root = DICT_ROOT;
    const uint8_t *const root = mBinaryDictionaryInfo->getDictRoot();
    int pos = BinaryFormat::getTerminalPosition(root, prevWord, prevWordLength,
            forceLowerCaseSearch);

@@ -171,7 +173,7 @@ int BigramDictionary::getBigramListPositionForWord(const int *prevWord, const in
void BigramDictionary::fillBigramAddressToProbabilityMapAndFilter(const int *prevWord,
        const int prevWordLength, std::map<int, int> *map, uint8_t *filter) const {
    memset(filter, 0, BIGRAM_FILTER_BYTE_SIZE);
    const uint8_t *const root = DICT_ROOT;
    const uint8_t *const root = mBinaryDictionaryInfo->getDictRoot();
    int pos = getBigramListPositionForWord(prevWord, prevWordLength,
            false /* forceLowerCaseSearch */);
    if (0 == pos) {
@@ -210,7 +212,7 @@ bool BigramDictionary::checkFirstCharacter(int *word, int *inputCodePoints) cons

bool BigramDictionary::isValidBigram(const int *word1, int length1, const int *word2,
        int length2) const {
    const uint8_t *const root = DICT_ROOT;
    const uint8_t *const root = mBinaryDictionaryInfo->getDictRoot();
    int pos = getBigramListPositionForWord(word1, length1, false /* forceLowerCaseSearch */);
    // getBigramListPositionForWord returns 0 if this word isn't in the dictionary or has no bigrams
    if (0 == pos) return false;
+6 −2
Original line number Diff line number Diff line
@@ -24,9 +24,12 @@

namespace latinime {

class BinaryDictionaryInfo;

class BigramDictionary {
 public:
    BigramDictionary(const uint8_t *const streamStart);
    BigramDictionary(const BinaryDictionaryInfo *const binaryDictionaryInfo);

    int getBigrams(const int *word, int length, int *inputCodePoints, int inputSize, int *outWords,
            int *frequencies, int *outputTypes) const;
    void fillBigramAddressToProbabilityMapAndFilter(const int *prevWord, const int prevWordLength,
@@ -35,13 +38,14 @@ class BigramDictionary {
    ~BigramDictionary();
 private:
    DISALLOW_IMPLICIT_CONSTRUCTORS(BigramDictionary);

    void addWordBigram(int *word, int length, int probability, int *bigramProbability,
            int *bigramCodePoints, int *outputTypes) const;
    bool checkFirstCharacter(int *word, int *inputCodePoints) const;
    int getBigramListPositionForWord(const int *prevWord, const int prevWordLength,
            const bool forceLowerCaseSearch) const;

    const uint8_t *const DICT_ROOT;
    const BinaryDictionaryInfo *const mBinaryDictionaryInfo;
    // TODO: Re-implement proximity correction for bigram correction
    static const int MAX_ALTERNATIVES = 1;
};
+54 −38
Original line number Diff line number Diff line
@@ -20,6 +20,7 @@
#include "suggest/core/dicnode/dic_node.h"
#include "suggest/core/dicnode/dic_node_utils.h"
#include "suggest/core/dicnode/dic_node_vector.h"
#include "suggest/core/dictionary/binary_dictionary_info.h"
#include "suggest/core/dictionary/binary_format.h"
#include "suggest/core/dictionary/char_utils.h"
#include "suggest/core/dictionary/multi_bigram_map.h"
@@ -33,20 +34,23 @@ namespace latinime {
// Node initialization utils //
///////////////////////////////

// Initializes newRootNode via DicNode::initAsRoot() using the dictionary's root position.
// NOTE(review): this is a rendered diff without +/- markers, so the removed lines (taking
// rootPos / dicRoot) and the added lines (taking binaryDictionaryInfo) appear back to back.
// In the added version the root position and the dictionary buffer both come from
// binaryDictionaryInfo rather than from separate arguments.
/* static */ void DicNodeUtils::initAsRoot(const int rootPos, const uint8_t *const dicRoot,
        const int prevWordNodePos, DicNode *newRootNode) {
    int curPos = rootPos;
/* static */ void DicNodeUtils::initAsRoot(const BinaryDictionaryInfo *const binaryDictionaryInfo,
        const int prevWordNodePos, DicNode *const newRootNode) {
    int curPos = binaryDictionaryInfo->getRootPosition();
    // Keep the starting position; curPos is handed to the call below by address.
    const int pos = curPos;
    const int childrenCount = BinaryFormat::getGroupCountAndForwardPointer(dicRoot, &curPos);
    const int childrenCount = BinaryFormat::getGroupCountAndForwardPointer(
            binaryDictionaryInfo->getDictRoot(), &curPos);
    // Presumably curPos now points just past the group count, i.e. at the first child
    // (inferred from the helper's name -- confirm against BinaryFormat).
    const int childrenPos = curPos;
    newRootNode->initAsRoot(pos, childrenPos, childrenCount, prevWordNodePos);
}

// Initializes newRootNode via DicNode::initAsRootWithPreviousWord(), passing prevWordLastNode
// together with the root position, the children position and the children count.
// NOTE(review): this is a rendered diff without +/- markers, so the removed lines (taking
// rootPos / dicRoot) and the added lines (taking binaryDictionaryInfo) appear back to back.
/*static */ void DicNodeUtils::initAsRootWithPreviousWord(const int rootPos,
        const uint8_t *const dicRoot, DicNode *prevWordLastNode, DicNode *newRootNode) {
    int curPos = rootPos;
/*static */ void DicNodeUtils::initAsRootWithPreviousWord(
        const BinaryDictionaryInfo *const binaryDictionaryInfo,
        DicNode *const prevWordLastNode, DicNode *const newRootNode) {
    int curPos = binaryDictionaryInfo->getRootPosition();
    // Keep the starting position; curPos is handed to the call below by address.
    const int pos = curPos;
    const int childrenCount = BinaryFormat::getGroupCountAndForwardPointer(dicRoot, &curPos);
    const int childrenCount = BinaryFormat::getGroupCountAndForwardPointer(
            binaryDictionaryInfo->getDictRoot(), &curPos);
    // Presumably curPos now points just past the group count (inferred from the helper's
    // name -- confirm against BinaryFormat).
    const int childrenPos = curPos;
    newRootNode->initAsRootWithPreviousWord(prevWordLastNode, pos, childrenPos, childrenCount);
}
@@ -72,16 +76,19 @@ namespace latinime {
}

/* static */ int DicNodeUtils::createAndGetLeavingChildNode(DicNode *dicNode, int pos,
        const uint8_t *const dicRoot, const int terminalDepth, const ProximityInfoState *pInfoState,
        const int pointIndex, const bool exactOnly, const std::vector<int> *const codePointsFilter,
        const ProximityInfo *const pInfo, DicNodeVector *childDicNodes) {
        const BinaryDictionaryInfo *const binaryDictionaryInfo, const int terminalDepth,
        const ProximityInfoState *pInfoState, const int pointIndex, const bool exactOnly,
        const std::vector<int> *const codePointsFilter, const ProximityInfo *const pInfo,
        DicNodeVector *childDicNodes) {
    int nextPos = pos;
    const uint8_t flags = BinaryFormat::getFlagsAndForwardPointer(dicRoot, &pos);
    const uint8_t flags = BinaryFormat::getFlagsAndForwardPointer(
            binaryDictionaryInfo->getDictRoot(), &pos);
    const bool hasMultipleChars = (0 != (BinaryFormat::FLAG_HAS_MULTIPLE_CHARS & flags));
    const bool isTerminal = (0 != (BinaryFormat::FLAG_IS_TERMINAL & flags));
    const bool hasChildren = BinaryFormat::hasChildrenInFlags(flags);

    int codePoint = BinaryFormat::getCodePointAndForwardPointer(dicRoot, &pos);
    int codePoint = BinaryFormat::getCodePointAndForwardPointer(
            binaryDictionaryInfo->getDictRoot(), &pos);
    ASSERT(NOT_A_CODE_POINT != codePoint);
    const int nodeCodePoint = codePoint;
    // TODO: optimize this
@@ -91,7 +98,8 @@ namespace latinime {

    do {
        const int nextCodePoint = hasMultipleChars
                ? BinaryFormat::getCodePointAndForwardPointer(dicRoot, &pos) : NOT_A_CODE_POINT;
                ? BinaryFormat::getCodePointAndForwardPointer(
                        binaryDictionaryInfo->getDictRoot(), &pos) : NOT_A_CODE_POINT;
        const bool isLastChar = (NOT_A_CODE_POINT == nextCodePoint);
        if (!isLastChar) {
            additionalWordBuf[additionalSubwordLength++] = nextCodePoint;
@@ -99,12 +107,14 @@ namespace latinime {
        codePoint = nextCodePoint;
    } while (NOT_A_CODE_POINT != codePoint);

    const int probability =
            isTerminal ? BinaryFormat::readProbabilityWithoutMovingPointer(dicRoot, pos) : -1;
    const int probability = isTerminal ? BinaryFormat::readProbabilityWithoutMovingPointer(
            binaryDictionaryInfo->getDictRoot(), pos) : -1;
    pos = BinaryFormat::skipProbability(flags, pos);
    int childrenPos = hasChildren ? BinaryFormat::readChildrenPosition(dicRoot, flags, pos) : 0;
    int childrenPos = hasChildren ? BinaryFormat::readChildrenPosition(
            binaryDictionaryInfo->getDictRoot(), flags, pos) : 0;
    const int attributesPos = BinaryFormat::skipChildrenPosition(flags, pos);
    const int siblingPos = BinaryFormat::skipChildrenPosAndAttributes(dicRoot, flags, pos);
    const int siblingPos = BinaryFormat::skipChildrenPosAndAttributes(
            binaryDictionaryInfo->getDictRoot(), flags, pos);

    if (isDicNodeFilteredOut(nodeCodePoint, pInfo, codePointsFilter)) {
        return siblingPos;
@@ -112,8 +122,8 @@ namespace latinime {
    if (!isMatchedNodeCodePoint(pInfoState, pointIndex, exactOnly, nodeCodePoint)) {
        return siblingPos;
    }
    const int childrenCount = hasChildren
            ? BinaryFormat::getGroupCountAndForwardPointer(dicRoot, &childrenPos) : 0;
    const int childrenCount = hasChildren ? BinaryFormat::getGroupCountAndForwardPointer(
            binaryDictionaryInfo->getDictRoot(), &childrenPos) : 0;
    childDicNodes->pushLeavingChild(dicNode, nextPos, flags, childrenPos, attributesPos, siblingPos,
            nodeCodePoint, childrenCount, probability, -1 /* bigramProbability */, isTerminal,
            hasMultipleChars, hasChildren, additionalSubwordLength, additionalWordBuf);
@@ -149,16 +159,18 @@ namespace latinime {
}

/* static */ void DicNodeUtils::createAndGetAllLeavingChildNodes(DicNode *dicNode,
        const uint8_t *const dicRoot, const ProximityInfoState *pInfoState, const int pointIndex,
        const bool exactOnly, const std::vector<int> *const codePointsFilter,
        const ProximityInfo *const pInfo, DicNodeVector *childDicNodes) {
        const BinaryDictionaryInfo *const binaryDictionaryInfo,
        const ProximityInfoState *pInfoState, const int pointIndex, const bool exactOnly,
        const std::vector<int> *const codePointsFilter, const ProximityInfo *const pInfo,
        DicNodeVector *childDicNodes) {
    const int terminalDepth = dicNode->getLeavingDepth();
    const int childCount = dicNode->getChildrenCount();
    int nextPos = dicNode->getChildrenPos();
    for (int i = 0; i < childCount; i++) {
        const int filterSize = codePointsFilter ? codePointsFilter->size() : 0;
        nextPos = createAndGetLeavingChildNode(dicNode, nextPos, dicRoot, terminalDepth, pInfoState,
                pointIndex, exactOnly, codePointsFilter, pInfo, childDicNodes);
        nextPos = createAndGetLeavingChildNode(dicNode, nextPos, binaryDictionaryInfo,
                terminalDepth, pInfoState, pointIndex, exactOnly, codePointsFilter, pInfo,
                childDicNodes);
        if (!pInfo && filterSize > 0 && childDicNodes->exceeds(filterSize)) {
            // All code points have been found.
            break;
@@ -166,14 +178,15 @@ namespace latinime {
    }
}

// Convenience wrapper: forwards to getProximityChildDicNodes() with a null pInfoState,
// pointIndex 0 and exactOnly == false, collecting the results into childDicNodes.
// NOTE(review): this is a rendered diff without +/- markers, so the removed lines (taking
// dicRoot) and the added lines (taking binaryDictionaryInfo) appear back to back.
/* static */ void DicNodeUtils::getAllChildDicNodes(DicNode *dicNode, const uint8_t *const dicRoot,
        DicNodeVector *childDicNodes) {
    getProximityChildDicNodes(dicNode, dicRoot, 0, 0, false, childDicNodes);
/* static */ void DicNodeUtils::getAllChildDicNodes(DicNode *dicNode,
        const BinaryDictionaryInfo *const binaryDictionaryInfo, DicNodeVector *childDicNodes) {
    getProximityChildDicNodes(dicNode, binaryDictionaryInfo, 0, 0, false, childDicNodes);
}

/* static */ void DicNodeUtils::getProximityChildDicNodes(DicNode *dicNode,
        const uint8_t *const dicRoot, const ProximityInfoState *pInfoState, const int pointIndex,
        bool exactOnly, DicNodeVector *childDicNodes) {
        const BinaryDictionaryInfo *const binaryDictionaryInfo,
        const ProximityInfoState *pInfoState, const int pointIndex, bool exactOnly,
        DicNodeVector *childDicNodes) {
    if (dicNode->isTotalInputSizeExceedingLimit()) {
        return;
    }
@@ -181,9 +194,9 @@ namespace latinime {
        DicNodeUtils::createAndGetPassingChildNode(dicNode, pInfoState, pointIndex, exactOnly,
                childDicNodes);
    } else {
        DicNodeUtils::createAndGetAllLeavingChildNodes(dicNode, dicRoot, pInfoState, pointIndex,
                exactOnly, 0 /* codePointsFilter */, 0 /* pInfo */,
                childDicNodes);
        DicNodeUtils::createAndGetAllLeavingChildNodes(
                dicNode, binaryDictionaryInfo, pInfoState, pointIndex, exactOnly,
                0 /* codePointsFilter */, 0 /* pInfo */, childDicNodes);
    }
}

@@ -193,19 +206,21 @@ namespace latinime {
/**
 * Computes the combined bigram / unigram cost for the given dicNode.
 *
 * Returns MAX_VALUE_FOR_WEIGHTING (as float) when node->isImpossibleBigramWord() is true.
 * Otherwise returns (MAX_PROBABILITY - probability) / MAX_PROBABILITY, where probability is
 * the value obtained from getBigramNodeProbability() for the same node and multiBigramMap.
 *
 * NOTE(review): this is a rendered diff without +/- markers, so the removed lines (taking
 * dicRoot) and the added lines (taking binaryDictionaryInfo) appear back to back.
 */
/* static */ float DicNodeUtils::getBigramNodeImprobability(const uint8_t *const dicRoot,
/* static */ float DicNodeUtils::getBigramNodeImprobability(
        const BinaryDictionaryInfo *const binaryDictionaryInfo,
        const DicNode *const node, MultiBigramMap *multiBigramMap) {
    if (node->isImpossibleBigramWord()) {
        return static_cast<float>(MAX_VALUE_FOR_WEIGHTING);
    }
    const int probability = getBigramNodeProbability(dicRoot, node, multiBigramMap);
    const int probability = getBigramNodeProbability(binaryDictionaryInfo, node, multiBigramMap);
    // TODO: This equation to calculate the improbability looks unreasonable.  Investigate this.
    // Both operands are cast to float before dividing, so the division is not integer division.
    const float cost = static_cast<float>(MAX_PROBABILITY - probability)
            / static_cast<float>(MAX_PROBABILITY);
    return cost;
}

/* static */ int DicNodeUtils::getBigramNodeProbability(const uint8_t *const dicRoot,
/* static */ int DicNodeUtils::getBigramNodeProbability(
        const BinaryDictionaryInfo *const binaryDictionaryInfo,
        const DicNode *const node, MultiBigramMap *multiBigramMap) {
    const int unigramProbability = node->getProbability();
    const int wordPos = node->getPos();
@@ -216,9 +231,10 @@ namespace latinime {
    }
    if (multiBigramMap) {
        return multiBigramMap->getBigramProbability(
                dicRoot, prevWordPos, wordPos, unigramProbability);
                binaryDictionaryInfo, prevWordPos, wordPos, unigramProbability);
    }
    return BinaryFormat::getBigramProbability(dicRoot, prevWordPos, wordPos, unigramProbability);
    return BinaryFormat::getBigramProbability(
            binaryDictionaryInfo->getDictRoot(), prevWordPos, wordPos, unigramProbability);
}

///////////////////////////////////////
Loading