Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit c1e86d3f authored by Keisuke Kuroyanagi's avatar Keisuke Kuroyanagi Committed by Android (Google) Code Review
Browse files

Merge "Skip PtNodes with non-Unicode code points for suggestion."

parents 83be21a5 79ba6334
Loading
Loading
Loading
Loading
+5 −0
Original line number Diff line number Diff line
@@ -23,6 +23,7 @@
#include "suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_reading_utils.h"
#include "suggest/policyimpl/dictionary/structure/pt_common/patricia_trie_reading_utils.h"
#include "suggest/policyimpl/dictionary/structure/v4/ver4_dict_constants.h"
#include "utils/char_utils.h"

namespace latinime {

@@ -158,6 +159,10 @@ class PtNodeParams {
        return PatriciaTrieReadingUtils::hasShortcutTargets(mFlags);
    }

    // Returns true when this PtNode has at least one code point and its first code point passes
    // CharUtils::isInUnicodeSpace(); returns false for a PtNode without code points.
    // NOTE(review): the name reads as the opposite of what is computed. The visible caller
    // negates the result before skipping a PtNode, so the combined behavior is "skip PtNodes
    // that do not start with a Unicode code point". Confirm before inverting either side.
    AK_FORCE_INLINE bool representsNonWordInfo() const {
        if (getCodePointCount() <= 0) {
            return false;
        }
        const int firstCodePoint = getCodePoints()[0];
        return CharUtils::isInUnicodeSpace(firstCodePoint);
    }

    // Parent node position
    AK_FORCE_INLINE int getParentPos() const {
        return mParentPos;
+10 −6
Original line number Diff line number Diff line
@@ -24,6 +24,7 @@
#include "suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_reading_helper.h"
#include "suggest/policyimpl/dictionary/structure/pt_common/patricia_trie_reading_utils.h"
#include "suggest/policyimpl/dictionary/utils/probability_utils.h"
#include "utils/char_utils.h"

namespace latinime {

@@ -318,12 +319,15 @@ int PatriciaTriePolicy::createAndGetLeavingChildNode(const DicNode *const dicNod
    PatriciaTrieReadingUtils::readPtNodeInfo(mDictRoot, ptNodePos, getShortcutsStructurePolicy(),
            getBigramsStructurePolicy(), &flags, &mergedNodeCodePointCount, mergedNodeCodePoints,
            &probability, &childrenPos, &shortcutPos, &bigramPos, &siblingPos);
    // Skip PtNodes that don't start with a Unicode code point because they represent non-word
    // information; only PtNodes whose first code point is in Unicode space are pushed.
    // NOTE(review): mergedNodeCodePoints[0] is read without consulting mergedNodeCodePointCount;
    // presumably readPtNodeInfo always yields at least one code point -- confirm.
    if (CharUtils::isInUnicodeSpace(mergedNodeCodePoints[0])) {
        childDicNodes->pushLeavingChild(dicNode, ptNodePos, childrenPos, probability,
                PatriciaTrieReadingUtils::isTerminal(flags),
                PatriciaTrieReadingUtils::hasChildrenInFlags(flags),
                PatriciaTrieReadingUtils::isBlacklisted(flags)
                        || PatriciaTrieReadingUtils::isNotAWord(flags),
                mergedNodeCodePointCount, mergedNodeCodePoints);
    }
    return siblingPos;
}

+5 −1
Original line number Diff line number Diff line
@@ -59,13 +59,17 @@ void Ver4PatriciaTriePolicy::createAndGetAllChildDicNodes(const DicNode *const d
            // valid terminal DicNode.
            isTerminal = ptNodeParams.getProbability() != NOT_A_PROBABILITY;
        }
        // Advance the reading helper exactly once per PtNode, and do it before the skip check
        // so that the `continue` below cannot bypass it. ptNodeParams is still used afterwards
        // to describe the current PtNode.
        readingHelper.readNextSiblingNode(ptNodeParams);
        if (!ptNodeParams.representsNonWordInfo()) {
            // Skip PtNodes that represent non-word information.
            // NOTE(review): representsNonWordInfo() currently returns true when the first code
            // point IS in Unicode space, so this negation skips PtNodes that have no code point
            // or do not start with a Unicode code point. Confirm before renaming or inverting.
            continue;
        }
        childDicNodes->pushLeavingChild(dicNode, ptNodeParams.getHeadPos(),
                ptNodeParams.getChildrenPos(), ptNodeParams.getProbability(), isTerminal,
                ptNodeParams.hasChildren(),
                ptNodeParams.isBlacklisted()
                        || ptNodeParams.isNotAWord() /* isBlacklistedOrNotAWord */,
                ptNodeParams.getCodePointCount(), ptNodeParams.getCodePoints());
    }
    if (readingHelper.isError()) {
        mIsCorrupted = true;
+3 −0
Original line number Diff line number Diff line
@@ -22,6 +22,9 @@

namespace latinime {

// Inclusive bounds used by CharUtils::isInUnicodeSpace(): code points from 0 through 0x10FFFF.
const int CharUtils::MIN_UNICODE_CODE_POINT = 0;
const int CharUtils::MAX_UNICODE_CODE_POINT = 0x10FFFF;

struct LatinCapitalSmallPair {
  unsigned short capital;
  unsigned short small;
+7 −0
Original line number Diff line number Diff line
@@ -86,12 +86,19 @@ class CharUtils {
        return spaceCount;
    }

    /**
     * Returns whether codePoint lies within [MIN_UNICODE_CODE_POINT, MAX_UNICODE_CODE_POINT],
     * both bounds inclusive.
     *
     * The result is a predicate, so it is returned as bool rather than int; callers that use it
     * in a boolean context are unaffected.
     */
    static AK_FORCE_INLINE bool isInUnicodeSpace(const int codePoint) {
        return codePoint >= MIN_UNICODE_CODE_POINT && codePoint <= MAX_UNICODE_CODE_POINT;
    }

    static unsigned short latin_tolower(const unsigned short c);
    static const std::vector<int> EMPTY_STRING;

 private:
    DISALLOW_IMPLICIT_CONSTRUCTORS(CharUtils);

    // Inclusive lower and upper bounds compared against in isInUnicodeSpace().
    static const int MIN_UNICODE_CODE_POINT;
    static const int MAX_UNICODE_CODE_POINT;

    /**
     * Table mapping most combined Latin, Greek, and Cyrillic characters
     * to their base characters.  If c is in range, BASE_CHARS[c] == c