Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit e8750d97 authored by Keisuke Kuroyanagi
Browse files

Introduce EntryCounters to count entries in a dictionary.

Bug: 14425059

Change-Id: Ic13ba827d96fa4a147485ba92fdb37e23e04e8e8
parent 1085fef8
Loading
Loading
Loading
Loading
+12 −8
Original line number Diff line number Diff line
@@ -30,6 +30,7 @@ const char *const HeaderPolicy::DATE_KEY = "date";
const char *const HeaderPolicy::LAST_DECAYED_TIME_KEY = "LAST_DECAYED_TIME";
const char *const HeaderPolicy::UNIGRAM_COUNT_KEY = "UNIGRAM_COUNT";
const char *const HeaderPolicy::BIGRAM_COUNT_KEY = "BIGRAM_COUNT";
const char *const HeaderPolicy::TRIGRAM_COUNT_KEY = "TRIGRAM_COUNT";
const char *const HeaderPolicy::EXTENDED_REGION_SIZE_KEY = "EXTENDED_REGION_SIZE";
// Historical info is information that is needed to support decaying such as timestamp, level and
// count.
@@ -94,12 +95,11 @@ bool HeaderPolicy::readRequiresGermanUmlautProcessing() const {
}

bool HeaderPolicy::fillInAndWriteHeaderToBuffer(const bool updatesLastDecayedTime,
        const int unigramCount, const int bigramCount,
        const int extendedRegionSize, BufferWithExtendableBuffer *const outBuffer) const {
        const EntryCounts &entryCounts, const int extendedRegionSize,
        BufferWithExtendableBuffer *const outBuffer) const {
    int writingPos = 0;
    DictionaryHeaderStructurePolicy::AttributeMap attributeMapToWrite(mAttributeMap);
    fillInHeader(updatesLastDecayedTime, unigramCount, bigramCount,
            extendedRegionSize, &attributeMapToWrite);
    fillInHeader(updatesLastDecayedTime, entryCounts, extendedRegionSize, &attributeMapToWrite);
    if (!HeaderReadWriteUtils::writeDictionaryVersion(outBuffer, mDictFormatVersion,
            &writingPos)) {
        return false;
@@ -126,11 +126,15 @@ bool HeaderPolicy::fillInAndWriteHeaderToBuffer(const bool updatesLastDecayedTim
    return true;
}

void HeaderPolicy::fillInHeader(const bool updatesLastDecayedTime, const int unigramCount,
        const int bigramCount, const int extendedRegionSize,
void HeaderPolicy::fillInHeader(const bool updatesLastDecayedTime,
        const EntryCounts &entryCounts, const int extendedRegionSize,
        DictionaryHeaderStructurePolicy::AttributeMap *outAttributeMap) const {
    HeaderReadWriteUtils::setIntAttribute(outAttributeMap, UNIGRAM_COUNT_KEY, unigramCount);
    HeaderReadWriteUtils::setIntAttribute(outAttributeMap, BIGRAM_COUNT_KEY, bigramCount);
    HeaderReadWriteUtils::setIntAttribute(outAttributeMap, UNIGRAM_COUNT_KEY,
            entryCounts.getUnigramCount());
    HeaderReadWriteUtils::setIntAttribute(outAttributeMap, BIGRAM_COUNT_KEY,
            entryCounts.getBigramCount());
    HeaderReadWriteUtils::setIntAttribute(outAttributeMap, TRIGRAM_COUNT_KEY,
            entryCounts.getTrigramCount());
    HeaderReadWriteUtils::setIntAttribute(outAttributeMap, EXTENDED_REGION_SIZE_KEY,
            extendedRegionSize);
    // Set the current time as the generation time.
+27 −7
Original line number Diff line number Diff line
@@ -22,6 +22,7 @@
#include "defines.h"
#include "suggest/core/policy/dictionary_header_structure_policy.h"
#include "suggest/policyimpl/dictionary/header/header_read_write_utils.h"
#include "suggest/policyimpl/dictionary/utils/entry_counters.h"
#include "suggest/policyimpl/dictionary/utils/format_utils.h"
#include "utils/char_utils.h"
#include "utils/time_keeper.h"
@@ -49,6 +50,8 @@ class HeaderPolicy : public DictionaryHeaderStructurePolicy {
                      UNIGRAM_COUNT_KEY, 0 /* defaultValue */)),
              mBigramCount(HeaderReadWriteUtils::readIntAttributeValue(&mAttributeMap,
                      BIGRAM_COUNT_KEY, 0 /* defaultValue */)),
              mTrigramCount(HeaderReadWriteUtils::readIntAttributeValue(&mAttributeMap,
                      TRIGRAM_COUNT_KEY, 0 /* defaultValue */)),
              mExtendedRegionSize(HeaderReadWriteUtils::readIntAttributeValue(&mAttributeMap,
                      EXTENDED_REGION_SIZE_KEY, 0 /* defaultValue */)),
              mHasHistoricalInfoOfWords(HeaderReadWriteUtils::readBoolAttributeValue(
@@ -60,6 +63,8 @@ class HeaderPolicy : public DictionaryHeaderStructurePolicy {
                      &mAttributeMap, MAX_UNIGRAM_COUNT_KEY, DEFAULT_MAX_UNIGRAM_COUNT)),
              mMaxBigramCount(HeaderReadWriteUtils::readIntAttributeValue(
                      &mAttributeMap, MAX_BIGRAM_COUNT_KEY, DEFAULT_MAX_BIGRAM_COUNT)),
              mMaxTrigramCount(HeaderReadWriteUtils::readIntAttributeValue(
                      &mAttributeMap, MAX_TRIGRAM_COUNT_KEY, DEFAULT_MAX_TRIGRAM_COUNT)),
              mCodePointTable(HeaderReadWriteUtils::readCodePointTable(&mAttributeMap)) {}

    // Constructs header information using an attribute map.
@@ -77,7 +82,7 @@ class HeaderPolicy : public DictionaryHeaderStructurePolicy {
                      DATE_KEY, TimeKeeper::peekCurrentTime() /* defaultValue */)),
              mLastDecayedTime(HeaderReadWriteUtils::readIntAttributeValue(&mAttributeMap,
                      DATE_KEY, TimeKeeper::peekCurrentTime() /* defaultValue */)),
              mUnigramCount(0), mBigramCount(0), mExtendedRegionSize(0),
              mUnigramCount(0), mBigramCount(0), mTrigramCount(0), mExtendedRegionSize(0),
              mHasHistoricalInfoOfWords(HeaderReadWriteUtils::readBoolAttributeValue(
                      &mAttributeMap, HAS_HISTORICAL_INFO_KEY, false /* defaultValue */)),
              mForgettingCurveProbabilityValuesTableId(HeaderReadWriteUtils::readIntAttributeValue(
@@ -87,6 +92,8 @@ class HeaderPolicy : public DictionaryHeaderStructurePolicy {
                      &mAttributeMap, MAX_UNIGRAM_COUNT_KEY, DEFAULT_MAX_UNIGRAM_COUNT)),
              mMaxBigramCount(HeaderReadWriteUtils::readIntAttributeValue(
                      &mAttributeMap, MAX_BIGRAM_COUNT_KEY, DEFAULT_MAX_BIGRAM_COUNT)),
              mMaxTrigramCount(HeaderReadWriteUtils::readIntAttributeValue(
                      &mAttributeMap, MAX_TRIGRAM_COUNT_KEY, DEFAULT_MAX_TRIGRAM_COUNT)),
              mCodePointTable(HeaderReadWriteUtils::readCodePointTable(&mAttributeMap)) {}

    // Copy header information
@@ -99,12 +106,14 @@ class HeaderPolicy : public DictionaryHeaderStructurePolicy {
              mIsDecayingDict(headerPolicy->mIsDecayingDict),
              mDate(headerPolicy->mDate), mLastDecayedTime(headerPolicy->mLastDecayedTime),
              mUnigramCount(headerPolicy->mUnigramCount), mBigramCount(headerPolicy->mBigramCount),
              mTrigramCount(headerPolicy->mTrigramCount),
              mExtendedRegionSize(headerPolicy->mExtendedRegionSize),
              mHasHistoricalInfoOfWords(headerPolicy->mHasHistoricalInfoOfWords),
              mForgettingCurveProbabilityValuesTableId(
                      headerPolicy->mForgettingCurveProbabilityValuesTableId),
              mMaxUnigramCount(headerPolicy->mMaxUnigramCount),
              mMaxBigramCount(headerPolicy->mMaxBigramCount),
              mMaxTrigramCount(headerPolicy->mMaxTrigramCount),
              mCodePointTable(headerPolicy->mCodePointTable) {}

    // Temporary dummy header.
@@ -112,10 +121,10 @@ class HeaderPolicy : public DictionaryHeaderStructurePolicy {
            : mDictFormatVersion(FormatUtils::UNKNOWN_VERSION), mDictionaryFlags(0), mSize(0),
              mAttributeMap(), mLocale(CharUtils::EMPTY_STRING), mMultiWordCostMultiplier(0.0f),
              mRequiresGermanUmlautProcessing(false), mIsDecayingDict(false),
              mDate(0), mLastDecayedTime(0), mUnigramCount(0), mBigramCount(0),
              mDate(0), mLastDecayedTime(0), mUnigramCount(0), mBigramCount(0), mTrigramCount(0),
              mExtendedRegionSize(0), mHasHistoricalInfoOfWords(false),
              mForgettingCurveProbabilityValuesTableId(0), mMaxUnigramCount(0), mMaxBigramCount(0),
              mCodePointTable(nullptr) {}
              mMaxTrigramCount(0), mCodePointTable(nullptr) {}

    ~HeaderPolicy() {}

@@ -183,6 +192,10 @@ class HeaderPolicy : public DictionaryHeaderStructurePolicy {
        return mBigramCount;
    }

    // Returns the trigram entry count held by this header (mTrigramCount).
    // The constructors shown here either read it from the TRIGRAM_COUNT_KEY
    // attribute (defaulting to 0), copy it from another HeaderPolicy, or set it to 0.
    AK_FORCE_INLINE int getTrigramCount() const {
        return mTrigramCount;
    }

    AK_FORCE_INLINE int getExtendedRegionSize() const {
        return mExtendedRegionSize;
    }
@@ -212,15 +225,19 @@ class HeaderPolicy : public DictionaryHeaderStructurePolicy {
        return mMaxBigramCount;
    }

    // Returns the maximum trigram count held by this header (mMaxTrigramCount).
    // The constructors shown here either read it from the MAX_TRIGRAM_COUNT_KEY
    // attribute (falling back to DEFAULT_MAX_TRIGRAM_COUNT), copy it from another
    // HeaderPolicy, or set it to 0 for the temporary dummy header.
    AK_FORCE_INLINE int getMaxTrigramCount() const {
        return mMaxTrigramCount;
    }

    void readHeaderValueOrQuestionMark(const char *const key,
            int *outValue, int outValueSize) const;

    bool fillInAndWriteHeaderToBuffer(const bool updatesLastDecayedTime,
            const int unigramCount, const int bigramCount,
            const int extendedRegionSize, BufferWithExtendableBuffer *const outBuffer) const;
            const EntryCounts &entryCounts, const int extendedRegionSize,
            BufferWithExtendableBuffer *const outBuffer) const;

    void fillInHeader(const bool updatesLastDecayedTime,
            const int unigramCount, const int bigramCount, const int extendedRegionSize,
    void fillInHeader(const bool updatesLastDecayedTime, const EntryCounts &entryCounts,
            const int extendedRegionSize,
            DictionaryHeaderStructurePolicy::AttributeMap *outAttributeMap) const;

    AK_FORCE_INLINE const std::vector<int> *getLocale() const {
@@ -245,6 +262,7 @@ class HeaderPolicy : public DictionaryHeaderStructurePolicy {
    static const char *const LAST_DECAYED_TIME_KEY;
    static const char *const UNIGRAM_COUNT_KEY;
    static const char *const BIGRAM_COUNT_KEY;
    static const char *const TRIGRAM_COUNT_KEY;
    static const char *const EXTENDED_REGION_SIZE_KEY;
    static const char *const HAS_HISTORICAL_INFO_KEY;
    static const char *const LOCALE_KEY;
@@ -273,11 +291,13 @@ class HeaderPolicy : public DictionaryHeaderStructurePolicy {
    const int mLastDecayedTime;
    const int mUnigramCount;
    const int mBigramCount;
    const int mTrigramCount;
    const int mExtendedRegionSize;
    const bool mHasHistoricalInfoOfWords;
    const int mForgettingCurveProbabilityValuesTableId;
    const int mMaxUnigramCount;
    const int mMaxBigramCount;
    const int mMaxTrigramCount;
    const int *const mCodePointTable;

    const std::vector<int> readLocale() const;
+9 −9
Original line number Diff line number Diff line
@@ -303,7 +303,7 @@ bool Ver4PatriciaTriePolicy::addUnigramEntry(const CodePointArrayView wordCodePo
    if (mUpdatingHelper.addUnigramWord(&readingHelper, codePointArrayView, unigramProperty,
            &addedNewUnigram)) {
        if (addedNewUnigram && !unigramProperty->representsBeginningOfSentence()) {
            mUnigramCount++;
            mEntryCounters.incrementUnigramCount();
        }
        if (unigramProperty->getShortcuts().size() > 0) {
            // Add shortcut target.
@@ -397,7 +397,7 @@ bool Ver4PatriciaTriePolicy::addNgramEntry(const NgramContext *const ngramContex
    if (mUpdatingHelper.addNgramEntry(PtNodePosArrayView::singleElementView(&prevWordPtNodePos),
            wordPos, ngramProperty, &addedNewBigram)) {
        if (addedNewBigram) {
            mBigramCount++;
            mEntryCounters.incrementBigramCount();
        }
        return true;
    } else {
@@ -438,7 +438,7 @@ bool Ver4PatriciaTriePolicy::removeNgramEntry(const NgramContext *const ngramCon
    const int prevWordPtNodePos = getTerminalPtNodePosFromWordId(prevWordIds[0]);
    if (mUpdatingHelper.removeNgramEntry(
            PtNodePosArrayView::singleElementView(&prevWordPtNodePos), wordPos)) {
        mBigramCount--;
        mEntryCounters.decrementBigramCount();
        return true;
    } else {
        return false;
@@ -477,7 +477,7 @@ bool Ver4PatriciaTriePolicy::flush(const char *const filePath) {
        AKLOGI("Warning: flush() is called for non-updatable dictionary. filePath: %s", filePath);
        return false;
    }
    if (!mWritingHelper.writeToDictFile(filePath, mUnigramCount, mBigramCount)) {
    if (!mWritingHelper.writeToDictFile(filePath, mEntryCounters.getEntryCounts())) {
        AKLOGE("Cannot flush the dictionary to file.");
        mIsCorrupted = true;
        return false;
@@ -515,7 +515,7 @@ bool Ver4PatriciaTriePolicy::needsToRunGC(const bool mindsBlockByGC) const {
        // Needs to reduce dictionary size.
        return true;
    } else if (mHeaderPolicy->isDecayingDict()) {
        return ForgettingCurveUtils::needsToDecay(mindsBlockByGC, mUnigramCount, mBigramCount,
        return ForgettingCurveUtils::needsToDecay(mindsBlockByGC, mEntryCounters.getEntryCounts(),
                mHeaderPolicy);
    }
    return false;
@@ -525,19 +525,19 @@ void Ver4PatriciaTriePolicy::getProperty(const char *const query, const int quer
        char *const outResult, const int maxResultLength) {
    const int compareLength = queryLength + 1 /* terminator */;
    if (strncmp(query, UNIGRAM_COUNT_QUERY, compareLength) == 0) {
        snprintf(outResult, maxResultLength, "%d", mUnigramCount);
        snprintf(outResult, maxResultLength, "%d", mEntryCounters.getUnigramCount());
    } else if (strncmp(query, BIGRAM_COUNT_QUERY, compareLength) == 0) {
        snprintf(outResult, maxResultLength, "%d", mBigramCount);
        snprintf(outResult, maxResultLength, "%d", mEntryCounters.getBigramCount());
    } else if (strncmp(query, MAX_UNIGRAM_COUNT_QUERY, compareLength) == 0) {
        snprintf(outResult, maxResultLength, "%d",
                mHeaderPolicy->isDecayingDict() ?
                        ForgettingCurveUtils::getUnigramCountHardLimit(
                        ForgettingCurveUtils::getEntryCountHardLimit(
                                mHeaderPolicy->getMaxUnigramCount()) :
                        static_cast<int>(Ver4DictConstants::MAX_DICTIONARY_SIZE));
    } else if (strncmp(query, MAX_BIGRAM_COUNT_QUERY, compareLength) == 0) {
        snprintf(outResult, maxResultLength, "%d",
                mHeaderPolicy->isDecayingDict() ?
                        ForgettingCurveUtils::getBigramCountHardLimit(
                        ForgettingCurveUtils::getEntryCountHardLimit(
                                mHeaderPolicy->getMaxBigramCount()) :
                        static_cast<int>(Ver4DictConstants::MAX_DICTIONARY_SIZE));
    }
+4 −4
Original line number Diff line number Diff line
@@ -41,6 +41,7 @@
#include "suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_writing_helper.h"
#include "suggest/policyimpl/dictionary/structure/backward/v402/ver4_pt_node_array_reader.h"
#include "suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.h"
#include "suggest/policyimpl/dictionary/utils/entry_counters.h"
#include "utils/int_array_view.h"

namespace latinime {
@@ -75,8 +76,8 @@ class Ver4PatriciaTriePolicy : public DictionaryStructureWithBufferPolicy {
                      &mPtNodeArrayReader, &mBigramPolicy, &mShortcutPolicy),
              mUpdatingHelper(mDictBuffer, &mNodeReader, &mNodeWriter),
              mWritingHelper(mBuffers.get()),
              mUnigramCount(mHeaderPolicy->getUnigramCount()),
              mBigramCount(mHeaderPolicy->getBigramCount()),
              mEntryCounters(mHeaderPolicy->getUnigramCount(), mHeaderPolicy->getBigramCount(),
                      mHeaderPolicy->getTrigramCount()),
              mTerminalPtNodePositionsForIteratingWords(), mIsCorrupted(false) {};

    virtual int getRootPosition() const {
@@ -163,8 +164,7 @@ class Ver4PatriciaTriePolicy : public DictionaryStructureWithBufferPolicy {
    Ver4PatriciaTrieNodeWriter mNodeWriter;
    DynamicPtUpdatingHelper mUpdatingHelper;
    Ver4PatriciaTrieWritingHelper mWritingHelper;
    int mUnigramCount;
    int mBigramCount;
    MutableEntryCounters mEntryCounters;
    std::vector<int> mTerminalPtNodePositionsForIteratingWords;
    mutable bool mIsCorrupted;

+6 −5
Original line number Diff line number Diff line
@@ -43,18 +43,18 @@ namespace backward {
namespace v402 {

bool Ver4PatriciaTrieWritingHelper::writeToDictFile(const char *const dictDirPath,
        const int unigramCount, const int bigramCount) const {
        const EntryCounts &entryCounts) const {
    const HeaderPolicy *const headerPolicy = mBuffers->getHeaderPolicy();
    BufferWithExtendableBuffer headerBuffer(
            BufferWithExtendableBuffer::DEFAULT_MAX_ADDITIONAL_BUFFER_SIZE);
    const int extendedRegionSize = headerPolicy->getExtendedRegionSize()
            + mBuffers->getTrieBuffer()->getUsedAdditionalBufferSize();
    if (!headerPolicy->fillInAndWriteHeaderToBuffer(false /* updatesLastDecayedTime */,
            unigramCount, bigramCount, extendedRegionSize, &headerBuffer)) {
            entryCounts, extendedRegionSize, &headerBuffer)) {
        AKLOGE("Cannot write header structure to buffer. "
                "updatesLastDecayedTime: %d, unigramCount: %d, bigramCount: %d, "
                "extendedRegionSize: %d", false, unigramCount, bigramCount,
                extendedRegionSize);
                "extendedRegionSize: %d", false, entryCounts.getUnigramCount(),
                entryCounts.getBigramCount(), extendedRegionSize);
        return false;
    }
    return mBuffers->flushHeaderAndDictBuffers(dictDirPath, &headerBuffer);
@@ -74,7 +74,8 @@ bool Ver4PatriciaTrieWritingHelper::writeToDictFileWithGC(const int rootPtNodeAr
    BufferWithExtendableBuffer headerBuffer(
            BufferWithExtendableBuffer::DEFAULT_MAX_ADDITIONAL_BUFFER_SIZE);
    if (!headerPolicy->fillInAndWriteHeaderToBuffer(true /* updatesLastDecayedTime */,
            unigramCount, bigramCount, 0 /* extendedRegionSize */, &headerBuffer)) {
            EntryCounts(unigramCount, bigramCount, 0 /* trigramCount */),
            0 /* extendedRegionSize */, &headerBuffer)) {
        return false;
    }
    return dictBuffers->flushHeaderAndDictBuffers(dictDirPath, &headerBuffer);
Loading