Loading native/jni/src/suggest/policyimpl/dictionary/header/header_policy.cpp +41 −17 Original line number Diff line number Diff line Loading @@ -18,6 +18,8 @@ #include <algorithm> #include "utils/ngram_utils.h" namespace latinime { // Note that these are corresponding definitions in Java side in DictionaryHeader. Loading @@ -28,9 +30,11 @@ const char *const HeaderPolicy::REQUIRES_GERMAN_UMLAUT_PROCESSING_KEY = const char *const HeaderPolicy::IS_DECAYING_DICT_KEY = "USES_FORGETTING_CURVE"; const char *const HeaderPolicy::DATE_KEY = "date"; const char *const HeaderPolicy::LAST_DECAYED_TIME_KEY = "LAST_DECAYED_TIME"; const char *const HeaderPolicy::UNIGRAM_COUNT_KEY = "UNIGRAM_COUNT"; const char *const HeaderPolicy::BIGRAM_COUNT_KEY = "BIGRAM_COUNT"; const char *const HeaderPolicy::TRIGRAM_COUNT_KEY = "TRIGRAM_COUNT"; const char *const HeaderPolicy::NGRAM_COUNT_KEYS[] = {"UNIGRAM_COUNT", "BIGRAM_COUNT", "TRIGRAM_COUNT"}; const char *const HeaderPolicy::MAX_NGRAM_COUNT_KEYS[] = {"MAX_UNIGRAM_ENTRY_COUNT", "MAX_BIGRAM_ENTRY_COUNT", "MAX_TRIGRAM_ENTRY_COUNT"}; const int HeaderPolicy::DEFAULT_MAX_NGRAM_COUNTS[] = {10000, 30000, 30000}; const char *const HeaderPolicy::EXTENDED_REGION_SIZE_KEY = "EXTENDED_REGION_SIZE"; // Historical info is information that is needed to support decaying such as timestamp, level and // count. Loading @@ -39,18 +43,10 @@ const char *const HeaderPolicy::LOCALE_KEY = "locale"; // match Java declaration const char *const HeaderPolicy::FORGETTING_CURVE_PROBABILITY_VALUES_TABLE_ID_KEY = "FORGETTING_CURVE_PROBABILITY_VALUES_TABLE_ID"; const char *const HeaderPolicy::MAX_UNIGRAM_COUNT_KEY = "MAX_UNIGRAM_ENTRY_COUNT"; const char *const HeaderPolicy::MAX_BIGRAM_COUNT_KEY = "MAX_BIGRAM_ENTRY_COUNT"; const char *const HeaderPolicy::MAX_TRIGRAM_COUNT_KEY = "MAX_TRIGRAM_ENTRY_COUNT"; const int HeaderPolicy::DEFAULT_MULTIPLE_WORDS_DEMOTION_RATE = 100; const float HeaderPolicy::MULTIPLE_WORD_COST_MULTIPLIER_SCALE = 100.0f; const int HeaderPolicy::DEFAULT_FORGETTING_CURVE_PROBABILITY_VALUES_TABLE_ID = 3; const int HeaderPolicy::DEFAULT_MAX_UNIGRAM_COUNT = 10000; const int HeaderPolicy::DEFAULT_MAX_BIGRAM_COUNT = 30000; const int HeaderPolicy::DEFAULT_MAX_TRIGRAM_COUNT = 30000; // Used for logging. Question mark is used to indicate that the key is not found. void HeaderPolicy::readHeaderValueOrQuestionMark(const char *const key, int *outValue, int outValueSize) const { Loading Loading @@ -126,15 +122,22 @@ bool HeaderPolicy::fillInAndWriteHeaderToBuffer(const bool updatesLastDecayedTim return true; } namespace { int getIndexFromNgramType(const NgramType ngramType) { return static_cast<int>(ngramType); } } // namespace void HeaderPolicy::fillInHeader(const bool updatesLastDecayedTime, const EntryCounts &entryCounts, const int extendedRegionSize, DictionaryHeaderStructurePolicy::AttributeMap *outAttributeMap) const { HeaderReadWriteUtils::setIntAttribute(outAttributeMap, UNIGRAM_COUNT_KEY, entryCounts.getUnigramCount()); HeaderReadWriteUtils::setIntAttribute(outAttributeMap, BIGRAM_COUNT_KEY, entryCounts.getBigramCount()); HeaderReadWriteUtils::setIntAttribute(outAttributeMap, TRIGRAM_COUNT_KEY, entryCounts.getTrigramCount()); for (const auto ngramType : AllNgramTypes::ASCENDING) { HeaderReadWriteUtils::setIntAttribute(outAttributeMap, NGRAM_COUNT_KEYS[getIndexFromNgramType(ngramType)], entryCounts.getNgramCount(ngramType)); } HeaderReadWriteUtils::setIntAttribute(outAttributeMap, EXTENDED_REGION_SIZE_KEY, extendedRegionSize); // Set the current time as the generation time. Loading @@ -155,4 +158,25 @@ void HeaderPolicy::fillInHeader(const bool updatesLastDecayedTime, return attributeMap; } /* static */ const EntryCounts HeaderPolicy::readNgramCounts() const { MutableEntryCounters entryCounters; for (const auto ngramType : AllNgramTypes::ASCENDING) { const int entryCount = HeaderReadWriteUtils::readIntAttributeValue(&mAttributeMap, NGRAM_COUNT_KEYS[getIndexFromNgramType(ngramType)], 0 /* defaultValue */); entryCounters.setNgramCount(ngramType, entryCount); } return entryCounters.getEntryCounts(); } /* static */ const EntryCounts HeaderPolicy::readMaxNgramCounts() const { MutableEntryCounters entryCounters; for (const auto ngramType : AllNgramTypes::ASCENDING) { const int index = getIndexFromNgramType(ngramType); const int maxEntryCount = HeaderReadWriteUtils::readIntAttributeValue(&mAttributeMap, MAX_NGRAM_COUNT_KEYS[index], DEFAULT_MAX_NGRAM_COUNTS[index]); entryCounters.setNgramCount(ngramType, maxEntryCount); } return entryCounters.getEntryCounts(); } } // namespace latinime native/jni/src/suggest/policyimpl/dictionary/header/header_policy.h +18 −63 Original line number Diff line number Diff line Loading @@ -46,12 +46,7 @@ class HeaderPolicy : public DictionaryHeaderStructurePolicy { DATE_KEY, TimeKeeper::peekCurrentTime() /* defaultValue */)), mLastDecayedTime(HeaderReadWriteUtils::readIntAttributeValue(&mAttributeMap, LAST_DECAYED_TIME_KEY, TimeKeeper::peekCurrentTime() /* defaultValue */)), mUnigramCount(HeaderReadWriteUtils::readIntAttributeValue(&mAttributeMap, UNIGRAM_COUNT_KEY, 0 /* defaultValue */)), mBigramCount(HeaderReadWriteUtils::readIntAttributeValue(&mAttributeMap, BIGRAM_COUNT_KEY, 0 /* defaultValue */)), mTrigramCount(HeaderReadWriteUtils::readIntAttributeValue(&mAttributeMap, TRIGRAM_COUNT_KEY, 0 /* defaultValue */)), mNgramCounts(readNgramCounts()), mMaxNgramCounts(readMaxNgramCounts()), mExtendedRegionSize(HeaderReadWriteUtils::readIntAttributeValue(&mAttributeMap, EXTENDED_REGION_SIZE_KEY, 0 /* defaultValue */)), mHasHistoricalInfoOfWords(HeaderReadWriteUtils::readBoolAttributeValue( Loading @@ -59,12 +54,6 @@ class HeaderPolicy : public DictionaryHeaderStructurePolicy { mForgettingCurveProbabilityValuesTableId(HeaderReadWriteUtils::readIntAttributeValue( &mAttributeMap, FORGETTING_CURVE_PROBABILITY_VALUES_TABLE_ID_KEY, DEFAULT_FORGETTING_CURVE_PROBABILITY_VALUES_TABLE_ID)), mMaxUnigramCount(HeaderReadWriteUtils::readIntAttributeValue( &mAttributeMap, MAX_UNIGRAM_COUNT_KEY, DEFAULT_MAX_UNIGRAM_COUNT)), mMaxBigramCount(HeaderReadWriteUtils::readIntAttributeValue( &mAttributeMap, MAX_BIGRAM_COUNT_KEY, DEFAULT_MAX_BIGRAM_COUNT)), mMaxTrigramCount(HeaderReadWriteUtils::readIntAttributeValue( &mAttributeMap, MAX_TRIGRAM_COUNT_KEY, DEFAULT_MAX_TRIGRAM_COUNT)), mCodePointTable(HeaderReadWriteUtils::readCodePointTable(&mAttributeMap)) {} // Constructs header information using an attribute map. Loading @@ -82,18 +71,13 @@ class HeaderPolicy : public DictionaryHeaderStructurePolicy { DATE_KEY, TimeKeeper::peekCurrentTime() /* defaultValue */)), mLastDecayedTime(HeaderReadWriteUtils::readIntAttributeValue(&mAttributeMap, DATE_KEY, TimeKeeper::peekCurrentTime() /* defaultValue */)), mUnigramCount(0), mBigramCount(0), mTrigramCount(0), mExtendedRegionSize(0), mNgramCounts(readNgramCounts()), mMaxNgramCounts(readMaxNgramCounts()), mExtendedRegionSize(0), mHasHistoricalInfoOfWords(HeaderReadWriteUtils::readBoolAttributeValue( &mAttributeMap, HAS_HISTORICAL_INFO_KEY, false /* defaultValue */)), mForgettingCurveProbabilityValuesTableId(HeaderReadWriteUtils::readIntAttributeValue( &mAttributeMap, FORGETTING_CURVE_PROBABILITY_VALUES_TABLE_ID_KEY, DEFAULT_FORGETTING_CURVE_PROBABILITY_VALUES_TABLE_ID)), mMaxUnigramCount(HeaderReadWriteUtils::readIntAttributeValue( &mAttributeMap, MAX_UNIGRAM_COUNT_KEY, DEFAULT_MAX_UNIGRAM_COUNT)), mMaxBigramCount(HeaderReadWriteUtils::readIntAttributeValue( &mAttributeMap, MAX_BIGRAM_COUNT_KEY, DEFAULT_MAX_BIGRAM_COUNT)), mMaxTrigramCount(HeaderReadWriteUtils::readIntAttributeValue( &mAttributeMap, MAX_TRIGRAM_COUNT_KEY, DEFAULT_MAX_TRIGRAM_COUNT)), mCodePointTable(HeaderReadWriteUtils::readCodePointTable(&mAttributeMap)) {} // Copy header information Loading @@ -105,15 +89,12 @@ class HeaderPolicy : public DictionaryHeaderStructurePolicy { mRequiresGermanUmlautProcessing(headerPolicy->mRequiresGermanUmlautProcessing), mIsDecayingDict(headerPolicy->mIsDecayingDict), mDate(headerPolicy->mDate), mLastDecayedTime(headerPolicy->mLastDecayedTime), mUnigramCount(headerPolicy->mUnigramCount), mBigramCount(headerPolicy->mBigramCount), mTrigramCount(headerPolicy->mTrigramCount), mNgramCounts(headerPolicy->mNgramCounts), mMaxNgramCounts(headerPolicy->mMaxNgramCounts), mExtendedRegionSize(headerPolicy->mExtendedRegionSize), mHasHistoricalInfoOfWords(headerPolicy->mHasHistoricalInfoOfWords), mForgettingCurveProbabilityValuesTableId( headerPolicy->mForgettingCurveProbabilityValuesTableId), mMaxUnigramCount(headerPolicy->mMaxUnigramCount), mMaxBigramCount(headerPolicy->mMaxBigramCount), mMaxTrigramCount(headerPolicy->mMaxTrigramCount), mCodePointTable(headerPolicy->mCodePointTable) {} // Temporary dummy header. Loading @@ -121,10 +102,9 @@ class HeaderPolicy : public DictionaryHeaderStructurePolicy { : mDictFormatVersion(FormatUtils::UNKNOWN_VERSION), mDictionaryFlags(0), mSize(0), mAttributeMap(), mLocale(CharUtils::EMPTY_STRING), mMultiWordCostMultiplier(0.0f), mRequiresGermanUmlautProcessing(false), mIsDecayingDict(false), mDate(0), mLastDecayedTime(0), mUnigramCount(0), mBigramCount(0), mTrigramCount(0), mDate(0), mLastDecayedTime(0), mNgramCounts(), mMaxNgramCounts(), mExtendedRegionSize(0), mHasHistoricalInfoOfWords(false), mForgettingCurveProbabilityValuesTableId(0), mMaxUnigramCount(0), mMaxBigramCount(0), mMaxTrigramCount(0), mCodePointTable(nullptr) {} mForgettingCurveProbabilityValuesTableId(0), mCodePointTable(nullptr) {} ~HeaderPolicy() {} Loading Loading @@ -186,16 +166,12 @@ class HeaderPolicy : public DictionaryHeaderStructurePolicy { return mLastDecayedTime; } AK_FORCE_INLINE int getUnigramCount() const { return mUnigramCount; AK_FORCE_INLINE const EntryCounts &getNgramCounts() const { return mNgramCounts; } AK_FORCE_INLINE int getBigramCount() const { return mBigramCount; } AK_FORCE_INLINE int getTrigramCount() const { return mTrigramCount; AK_FORCE_INLINE const EntryCounts getMaxNgramCounts() const { return mMaxNgramCounts; } AK_FORCE_INLINE int getExtendedRegionSize() const { Loading @@ -219,18 +195,6 @@ class HeaderPolicy : public DictionaryHeaderStructurePolicy { return mForgettingCurveProbabilityValuesTableId; } AK_FORCE_INLINE int getMaxUnigramCount() const { return mMaxUnigramCount; } AK_FORCE_INLINE int getMaxBigramCount() const { return mMaxBigramCount; } AK_FORCE_INLINE int getMaxTrigramCount() const { return mMaxTrigramCount; } void readHeaderValueOrQuestionMark(const char *const key, int *outValue, int outValueSize) const; Loading Loading @@ -262,24 +226,18 @@ class HeaderPolicy : public DictionaryHeaderStructurePolicy { static const char *const IS_DECAYING_DICT_KEY; static const char *const DATE_KEY; static const char *const LAST_DECAYED_TIME_KEY; static const char *const UNIGRAM_COUNT_KEY; static const char *const BIGRAM_COUNT_KEY; static const char *const TRIGRAM_COUNT_KEY; static const char *const NGRAM_COUNT_KEYS[]; static const char *const MAX_NGRAM_COUNT_KEYS[]; static const int DEFAULT_MAX_NGRAM_COUNTS[]; static const char *const EXTENDED_REGION_SIZE_KEY; static const char *const HAS_HISTORICAL_INFO_KEY; static const char *const LOCALE_KEY; static const char *const FORGETTING_CURVE_OCCURRENCES_TO_LEVEL_UP_KEY; static const char *const FORGETTING_CURVE_PROBABILITY_VALUES_TABLE_ID_KEY; static const char *const FORGETTING_CURVE_DURATION_TO_LEVEL_DOWN_IN_SECONDS_KEY; static const char *const MAX_UNIGRAM_COUNT_KEY; static const char *const MAX_BIGRAM_COUNT_KEY; static const char *const MAX_TRIGRAM_COUNT_KEY; static const int DEFAULT_MULTIPLE_WORDS_DEMOTION_RATE; static const float MULTIPLE_WORD_COST_MULTIPLIER_SCALE; static const int DEFAULT_FORGETTING_CURVE_PROBABILITY_VALUES_TABLE_ID; static const int DEFAULT_MAX_UNIGRAM_COUNT; static const int DEFAULT_MAX_BIGRAM_COUNT; static const int DEFAULT_MAX_TRIGRAM_COUNT; const FormatUtils::FORMAT_VERSION mDictFormatVersion; const HeaderReadWriteUtils::DictionaryFlags mDictionaryFlags; Loading @@ -291,21 +249,18 @@ class HeaderPolicy : public DictionaryHeaderStructurePolicy { const bool mIsDecayingDict; const int mDate; const int mLastDecayedTime; const int mUnigramCount; const int mBigramCount; const int mTrigramCount; const EntryCounts mNgramCounts; const EntryCounts mMaxNgramCounts; const int mExtendedRegionSize; const bool mHasHistoricalInfoOfWords; const int mForgettingCurveProbabilityValuesTableId; const int mMaxUnigramCount; const int mMaxBigramCount; const int mMaxTrigramCount; const int *const mCodePointTable; const std::vector<int> readLocale() const; float readMultipleWordCostMultiplier() const; bool readRequiresGermanUmlautProcessing() const; const EntryCounts readNgramCounts() const; const EntryCounts readMaxNgramCounts() const; static DictionaryHeaderStructurePolicy::AttributeMap createAttributeMapAndReadAllAttributes( const uint8_t *const dictBuf); }; Loading native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_policy.cpp +10 −7 Original line number Diff line number Diff line Loading @@ -303,7 +303,7 @@ bool Ver4PatriciaTriePolicy::addUnigramEntry(const CodePointArrayView wordCodePo if (mUpdatingHelper.addUnigramWord(&readingHelper, codePointArrayView, unigramProperty, &addedNewUnigram)) { if (addedNewUnigram && !unigramProperty->representsBeginningOfSentence()) { mEntryCounters.incrementUnigramCount(); mEntryCounters.incrementNgramCount(NgramType::Unigram); } if (unigramProperty->getShortcuts().size() > 0) { // Add shortcut target. Loading Loading @@ -397,7 +397,7 @@ bool Ver4PatriciaTriePolicy::addNgramEntry(const NgramProperty *const ngramPrope if (mUpdatingHelper.addNgramEntry(PtNodePosArrayView::singleElementView(&prevWordPtNodePos), wordPos, ngramProperty, &addedNewBigram)) { if (addedNewBigram) { mEntryCounters.incrementBigramCount(); mEntryCounters.incrementNgramCount(NgramType::Bigram); } return true; } else { Loading Loading @@ -438,7 +438,7 @@ bool Ver4PatriciaTriePolicy::removeNgramEntry(const NgramContext *const ngramCon const int prevWordPtNodePos = getTerminalPtNodePosFromWordId(prevWordIds[0]); if (mUpdatingHelper.removeNgramEntry( PtNodePosArrayView::singleElementView(&prevWordPtNodePos), wordPos)) { mEntryCounters.decrementBigramCount(); mEntryCounters.decrementNgramCount(NgramType::Bigram); return true; } else { return false; Loading Loading @@ -525,20 +525,23 @@ void Ver4PatriciaTriePolicy::getProperty(const char *const query, const int quer char *const outResult, const int maxResultLength) { const int compareLength = queryLength + 1 /* terminator */; if (strncmp(query, UNIGRAM_COUNT_QUERY, compareLength) == 0) { snprintf(outResult, maxResultLength, "%d", mEntryCounters.getUnigramCount()); snprintf(outResult, maxResultLength, "%d", mEntryCounters.getNgramCount(NgramType::Unigram)); } else if (strncmp(query, BIGRAM_COUNT_QUERY, compareLength) == 0) { snprintf(outResult, maxResultLength, "%d", mEntryCounters.getBigramCount()); snprintf(outResult, maxResultLength, "%d", mEntryCounters.getNgramCount(NgramType::Bigram)); } else if (strncmp(query, MAX_UNIGRAM_COUNT_QUERY, compareLength) == 0) { snprintf(outResult, maxResultLength, "%d", mHeaderPolicy->isDecayingDict() ? ForgettingCurveUtils::getEntryCountHardLimit( mHeaderPolicy->getMaxUnigramCount()) : mHeaderPolicy->getMaxNgramCounts().getNgramCount( NgramType::Unigram)) : static_cast<int>(Ver4DictConstants::MAX_DICTIONARY_SIZE)); } else if (strncmp(query, MAX_BIGRAM_COUNT_QUERY, compareLength) == 0) { snprintf(outResult, maxResultLength, "%d", mHeaderPolicy->isDecayingDict() ? ForgettingCurveUtils::getEntryCountHardLimit( mHeaderPolicy->getMaxBigramCount()) : mHeaderPolicy->getMaxNgramCounts().getNgramCount( NgramType::Bigram)) : static_cast<int>(Ver4DictConstants::MAX_DICTIONARY_SIZE)); } } Loading native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_policy.h +1 −2 Original line number Diff line number Diff line Loading @@ -76,8 +76,7 @@ class Ver4PatriciaTriePolicy : public DictionaryStructureWithBufferPolicy { &mPtNodeArrayReader, &mBigramPolicy, &mShortcutPolicy), mUpdatingHelper(mDictBuffer, &mNodeReader, &mNodeWriter), mWritingHelper(mBuffers.get()), mEntryCounters(mHeaderPolicy->getUnigramCount(), mHeaderPolicy->getBigramCount(), mHeaderPolicy->getTrigramCount()), mEntryCounters(mHeaderPolicy->getNgramCounts().getCountArray()), mTerminalPtNodePositionsForIteratingWords(), mIsCorrupted(false) {}; virtual int getRootPosition() const { Loading native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_writing_helper.cpp +8 −6 Original line number Diff line number Diff line Loading @@ -53,8 +53,8 @@ bool Ver4PatriciaTrieWritingHelper::writeToDictFile(const char *const dictDirPat entryCounts, extendedRegionSize, &headerBuffer)) { AKLOGE("Cannot write header structure to buffer. " "updatesLastDecayedTime: %d, unigramCount: %d, bigramCount: %d, " "extendedRegionSize: %d", false, entryCounts.getUnigramCount(), entryCounts.getBigramCount(), extendedRegionSize); "extendedRegionSize: %d", false, entryCounts.getNgramCount(NgramType::Unigram), entryCounts.getNgramCount(NgramType::Bigram), extendedRegionSize); return false; } return mBuffers->flushHeaderAndDictBuffers(dictDirPath, &headerBuffer); Loading @@ -73,9 +73,11 @@ bool Ver4PatriciaTrieWritingHelper::writeToDictFileWithGC(const int rootPtNodeAr } BufferWithExtendableBuffer headerBuffer( BufferWithExtendableBuffer::DEFAULT_MAX_ADDITIONAL_BUFFER_SIZE); MutableEntryCounters entryCounters; entryCounters.setNgramCount(NgramType::Unigram, unigramCount); entryCounters.setNgramCount(NgramType::Bigram, bigramCount); if (!headerPolicy->fillInAndWriteHeaderToBuffer(true /* updatesLastDecayedTime */, EntryCounts(unigramCount, bigramCount, 0 /* trigramCount */), 0 /* extendedRegionSize */, &headerBuffer)) { entryCounters.getEntryCounts(), 0 /* extendedRegionSize */, &headerBuffer)) { return false; } return dictBuffers->flushHeaderAndDictBuffers(dictDirPath, &headerBuffer); Loading Loading @@ -107,7 +109,7 @@ bool Ver4PatriciaTrieWritingHelper::runGC(const int rootPtNodeArrayPos, } const int unigramCount = traversePolicyToUpdateUnigramProbabilityAndMarkUselessPtNodesAsDeleted .getValidUnigramCount(); const int maxUnigramCount = headerPolicy->getMaxUnigramCount(); const int maxUnigramCount = headerPolicy->getMaxNgramCounts().getNgramCount(NgramType::Unigram); if (headerPolicy->isDecayingDict() && unigramCount > maxUnigramCount) { if (!truncateUnigrams(&ptNodeReader, &ptNodeWriter, maxUnigramCount)) { AKLOGE("Cannot remove unigrams. current: %d, max: %d", unigramCount, Loading @@ -124,7 +126,7 @@ bool Ver4PatriciaTrieWritingHelper::runGC(const int rootPtNodeArrayPos, return false; } const int bigramCount = traversePolicyToUpdateBigramProbability.getValidBigramEntryCount(); const int maxBigramCount = headerPolicy->getMaxBigramCount(); const int maxBigramCount = headerPolicy->getMaxNgramCounts().getNgramCount(NgramType::Bigram); if (headerPolicy->isDecayingDict() && bigramCount > maxBigramCount) { if (!truncateBigrams(maxBigramCount)) { AKLOGE("Cannot remove bigrams. current: %d, max: %d", bigramCount, maxBigramCount); Loading Loading
native/jni/src/suggest/policyimpl/dictionary/header/header_policy.cpp +41 −17 Original line number Diff line number Diff line Loading @@ -18,6 +18,8 @@ #include <algorithm> #include "utils/ngram_utils.h" namespace latinime { // Note that these are corresponding definitions in Java side in DictionaryHeader. Loading @@ -28,9 +30,11 @@ const char *const HeaderPolicy::REQUIRES_GERMAN_UMLAUT_PROCESSING_KEY = const char *const HeaderPolicy::IS_DECAYING_DICT_KEY = "USES_FORGETTING_CURVE"; const char *const HeaderPolicy::DATE_KEY = "date"; const char *const HeaderPolicy::LAST_DECAYED_TIME_KEY = "LAST_DECAYED_TIME"; const char *const HeaderPolicy::UNIGRAM_COUNT_KEY = "UNIGRAM_COUNT"; const char *const HeaderPolicy::BIGRAM_COUNT_KEY = "BIGRAM_COUNT"; const char *const HeaderPolicy::TRIGRAM_COUNT_KEY = "TRIGRAM_COUNT"; const char *const HeaderPolicy::NGRAM_COUNT_KEYS[] = {"UNIGRAM_COUNT", "BIGRAM_COUNT", "TRIGRAM_COUNT"}; const char *const HeaderPolicy::MAX_NGRAM_COUNT_KEYS[] = {"MAX_UNIGRAM_ENTRY_COUNT", "MAX_BIGRAM_ENTRY_COUNT", "MAX_TRIGRAM_ENTRY_COUNT"}; const int HeaderPolicy::DEFAULT_MAX_NGRAM_COUNTS[] = {10000, 30000, 30000}; const char *const HeaderPolicy::EXTENDED_REGION_SIZE_KEY = "EXTENDED_REGION_SIZE"; // Historical info is information that is needed to support decaying such as timestamp, level and // count. Loading @@ -39,18 +43,10 @@ const char *const HeaderPolicy::LOCALE_KEY = "locale"; // match Java declaration const char *const HeaderPolicy::FORGETTING_CURVE_PROBABILITY_VALUES_TABLE_ID_KEY = "FORGETTING_CURVE_PROBABILITY_VALUES_TABLE_ID"; const char *const HeaderPolicy::MAX_UNIGRAM_COUNT_KEY = "MAX_UNIGRAM_ENTRY_COUNT"; const char *const HeaderPolicy::MAX_BIGRAM_COUNT_KEY = "MAX_BIGRAM_ENTRY_COUNT"; const char *const HeaderPolicy::MAX_TRIGRAM_COUNT_KEY = "MAX_TRIGRAM_ENTRY_COUNT"; const int HeaderPolicy::DEFAULT_MULTIPLE_WORDS_DEMOTION_RATE = 100; const float HeaderPolicy::MULTIPLE_WORD_COST_MULTIPLIER_SCALE = 100.0f; const int HeaderPolicy::DEFAULT_FORGETTING_CURVE_PROBABILITY_VALUES_TABLE_ID = 3; const int HeaderPolicy::DEFAULT_MAX_UNIGRAM_COUNT = 10000; const int HeaderPolicy::DEFAULT_MAX_BIGRAM_COUNT = 30000; const int HeaderPolicy::DEFAULT_MAX_TRIGRAM_COUNT = 30000; // Used for logging. Question mark is used to indicate that the key is not found. void HeaderPolicy::readHeaderValueOrQuestionMark(const char *const key, int *outValue, int outValueSize) const { Loading Loading @@ -126,15 +122,22 @@ bool HeaderPolicy::fillInAndWriteHeaderToBuffer(const bool updatesLastDecayedTim return true; } namespace { int getIndexFromNgramType(const NgramType ngramType) { return static_cast<int>(ngramType); } } // namespace void HeaderPolicy::fillInHeader(const bool updatesLastDecayedTime, const EntryCounts &entryCounts, const int extendedRegionSize, DictionaryHeaderStructurePolicy::AttributeMap *outAttributeMap) const { HeaderReadWriteUtils::setIntAttribute(outAttributeMap, UNIGRAM_COUNT_KEY, entryCounts.getUnigramCount()); HeaderReadWriteUtils::setIntAttribute(outAttributeMap, BIGRAM_COUNT_KEY, entryCounts.getBigramCount()); HeaderReadWriteUtils::setIntAttribute(outAttributeMap, TRIGRAM_COUNT_KEY, entryCounts.getTrigramCount()); for (const auto ngramType : AllNgramTypes::ASCENDING) { HeaderReadWriteUtils::setIntAttribute(outAttributeMap, NGRAM_COUNT_KEYS[getIndexFromNgramType(ngramType)], entryCounts.getNgramCount(ngramType)); } HeaderReadWriteUtils::setIntAttribute(outAttributeMap, EXTENDED_REGION_SIZE_KEY, extendedRegionSize); // Set the current time as the generation time. Loading @@ -155,4 +158,25 @@ void HeaderPolicy::fillInHeader(const bool updatesLastDecayedTime, return attributeMap; } /* static */ const EntryCounts HeaderPolicy::readNgramCounts() const { MutableEntryCounters entryCounters; for (const auto ngramType : AllNgramTypes::ASCENDING) { const int entryCount = HeaderReadWriteUtils::readIntAttributeValue(&mAttributeMap, NGRAM_COUNT_KEYS[getIndexFromNgramType(ngramType)], 0 /* defaultValue */); entryCounters.setNgramCount(ngramType, entryCount); } return entryCounters.getEntryCounts(); } /* static */ const EntryCounts HeaderPolicy::readMaxNgramCounts() const { MutableEntryCounters entryCounters; for (const auto ngramType : AllNgramTypes::ASCENDING) { const int index = getIndexFromNgramType(ngramType); const int maxEntryCount = HeaderReadWriteUtils::readIntAttributeValue(&mAttributeMap, MAX_NGRAM_COUNT_KEYS[index], DEFAULT_MAX_NGRAM_COUNTS[index]); entryCounters.setNgramCount(ngramType, maxEntryCount); } return entryCounters.getEntryCounts(); } } // namespace latinime
native/jni/src/suggest/policyimpl/dictionary/header/header_policy.h +18 −63 Original line number Diff line number Diff line Loading @@ -46,12 +46,7 @@ class HeaderPolicy : public DictionaryHeaderStructurePolicy { DATE_KEY, TimeKeeper::peekCurrentTime() /* defaultValue */)), mLastDecayedTime(HeaderReadWriteUtils::readIntAttributeValue(&mAttributeMap, LAST_DECAYED_TIME_KEY, TimeKeeper::peekCurrentTime() /* defaultValue */)), mUnigramCount(HeaderReadWriteUtils::readIntAttributeValue(&mAttributeMap, UNIGRAM_COUNT_KEY, 0 /* defaultValue */)), mBigramCount(HeaderReadWriteUtils::readIntAttributeValue(&mAttributeMap, BIGRAM_COUNT_KEY, 0 /* defaultValue */)), mTrigramCount(HeaderReadWriteUtils::readIntAttributeValue(&mAttributeMap, TRIGRAM_COUNT_KEY, 0 /* defaultValue */)), mNgramCounts(readNgramCounts()), mMaxNgramCounts(readMaxNgramCounts()), mExtendedRegionSize(HeaderReadWriteUtils::readIntAttributeValue(&mAttributeMap, EXTENDED_REGION_SIZE_KEY, 0 /* defaultValue */)), mHasHistoricalInfoOfWords(HeaderReadWriteUtils::readBoolAttributeValue( Loading @@ -59,12 +54,6 @@ class HeaderPolicy : public DictionaryHeaderStructurePolicy { mForgettingCurveProbabilityValuesTableId(HeaderReadWriteUtils::readIntAttributeValue( &mAttributeMap, FORGETTING_CURVE_PROBABILITY_VALUES_TABLE_ID_KEY, DEFAULT_FORGETTING_CURVE_PROBABILITY_VALUES_TABLE_ID)), mMaxUnigramCount(HeaderReadWriteUtils::readIntAttributeValue( &mAttributeMap, MAX_UNIGRAM_COUNT_KEY, DEFAULT_MAX_UNIGRAM_COUNT)), mMaxBigramCount(HeaderReadWriteUtils::readIntAttributeValue( &mAttributeMap, MAX_BIGRAM_COUNT_KEY, DEFAULT_MAX_BIGRAM_COUNT)), mMaxTrigramCount(HeaderReadWriteUtils::readIntAttributeValue( &mAttributeMap, MAX_TRIGRAM_COUNT_KEY, DEFAULT_MAX_TRIGRAM_COUNT)), mCodePointTable(HeaderReadWriteUtils::readCodePointTable(&mAttributeMap)) {} // Constructs header information using an attribute map. Loading @@ -82,18 +71,13 @@ class HeaderPolicy : public DictionaryHeaderStructurePolicy { DATE_KEY, TimeKeeper::peekCurrentTime() /* defaultValue */)), mLastDecayedTime(HeaderReadWriteUtils::readIntAttributeValue(&mAttributeMap, DATE_KEY, TimeKeeper::peekCurrentTime() /* defaultValue */)), mUnigramCount(0), mBigramCount(0), mTrigramCount(0), mExtendedRegionSize(0), mNgramCounts(readNgramCounts()), mMaxNgramCounts(readMaxNgramCounts()), mExtendedRegionSize(0), mHasHistoricalInfoOfWords(HeaderReadWriteUtils::readBoolAttributeValue( &mAttributeMap, HAS_HISTORICAL_INFO_KEY, false /* defaultValue */)), mForgettingCurveProbabilityValuesTableId(HeaderReadWriteUtils::readIntAttributeValue( &mAttributeMap, FORGETTING_CURVE_PROBABILITY_VALUES_TABLE_ID_KEY, DEFAULT_FORGETTING_CURVE_PROBABILITY_VALUES_TABLE_ID)), mMaxUnigramCount(HeaderReadWriteUtils::readIntAttributeValue( &mAttributeMap, MAX_UNIGRAM_COUNT_KEY, DEFAULT_MAX_UNIGRAM_COUNT)), mMaxBigramCount(HeaderReadWriteUtils::readIntAttributeValue( &mAttributeMap, MAX_BIGRAM_COUNT_KEY, DEFAULT_MAX_BIGRAM_COUNT)), mMaxTrigramCount(HeaderReadWriteUtils::readIntAttributeValue( &mAttributeMap, MAX_TRIGRAM_COUNT_KEY, DEFAULT_MAX_TRIGRAM_COUNT)), mCodePointTable(HeaderReadWriteUtils::readCodePointTable(&mAttributeMap)) {} // Copy header information Loading @@ -105,15 +89,12 @@ class HeaderPolicy : public DictionaryHeaderStructurePolicy { mRequiresGermanUmlautProcessing(headerPolicy->mRequiresGermanUmlautProcessing), mIsDecayingDict(headerPolicy->mIsDecayingDict), mDate(headerPolicy->mDate), mLastDecayedTime(headerPolicy->mLastDecayedTime), mUnigramCount(headerPolicy->mUnigramCount), mBigramCount(headerPolicy->mBigramCount), mTrigramCount(headerPolicy->mTrigramCount), mNgramCounts(headerPolicy->mNgramCounts), mMaxNgramCounts(headerPolicy->mMaxNgramCounts), mExtendedRegionSize(headerPolicy->mExtendedRegionSize), mHasHistoricalInfoOfWords(headerPolicy->mHasHistoricalInfoOfWords), mForgettingCurveProbabilityValuesTableId( headerPolicy->mForgettingCurveProbabilityValuesTableId), mMaxUnigramCount(headerPolicy->mMaxUnigramCount), mMaxBigramCount(headerPolicy->mMaxBigramCount), mMaxTrigramCount(headerPolicy->mMaxTrigramCount), mCodePointTable(headerPolicy->mCodePointTable) {} // Temporary dummy header. Loading @@ -121,10 +102,9 @@ class HeaderPolicy : public DictionaryHeaderStructurePolicy { : mDictFormatVersion(FormatUtils::UNKNOWN_VERSION), mDictionaryFlags(0), mSize(0), mAttributeMap(), mLocale(CharUtils::EMPTY_STRING), mMultiWordCostMultiplier(0.0f), mRequiresGermanUmlautProcessing(false), mIsDecayingDict(false), mDate(0), mLastDecayedTime(0), mUnigramCount(0), mBigramCount(0), mTrigramCount(0), mDate(0), mLastDecayedTime(0), mNgramCounts(), mMaxNgramCounts(), mExtendedRegionSize(0), mHasHistoricalInfoOfWords(false), mForgettingCurveProbabilityValuesTableId(0), mMaxUnigramCount(0), mMaxBigramCount(0), mMaxTrigramCount(0), mCodePointTable(nullptr) {} mForgettingCurveProbabilityValuesTableId(0), mCodePointTable(nullptr) {} ~HeaderPolicy() {} Loading Loading @@ -186,16 +166,12 @@ class HeaderPolicy : public DictionaryHeaderStructurePolicy { return mLastDecayedTime; } AK_FORCE_INLINE int getUnigramCount() const { return mUnigramCount; AK_FORCE_INLINE const EntryCounts &getNgramCounts() const { return mNgramCounts; } AK_FORCE_INLINE int getBigramCount() const { return mBigramCount; } AK_FORCE_INLINE int getTrigramCount() const { return mTrigramCount; AK_FORCE_INLINE const EntryCounts getMaxNgramCounts() const { return mMaxNgramCounts; } AK_FORCE_INLINE int getExtendedRegionSize() const { Loading @@ -219,18 +195,6 @@ class HeaderPolicy : public DictionaryHeaderStructurePolicy { return mForgettingCurveProbabilityValuesTableId; } AK_FORCE_INLINE int getMaxUnigramCount() const { return mMaxUnigramCount; } AK_FORCE_INLINE int getMaxBigramCount() const { return mMaxBigramCount; } AK_FORCE_INLINE int getMaxTrigramCount() const { return mMaxTrigramCount; } void readHeaderValueOrQuestionMark(const char *const key, int *outValue, int outValueSize) const; Loading Loading @@ -262,24 +226,18 @@ class HeaderPolicy : public DictionaryHeaderStructurePolicy { static const char *const IS_DECAYING_DICT_KEY; static const char *const DATE_KEY; static const char *const LAST_DECAYED_TIME_KEY; static const char *const UNIGRAM_COUNT_KEY; static const char *const BIGRAM_COUNT_KEY; static const char *const TRIGRAM_COUNT_KEY; static const char *const NGRAM_COUNT_KEYS[]; static const char *const MAX_NGRAM_COUNT_KEYS[]; static const int DEFAULT_MAX_NGRAM_COUNTS[]; static const char *const EXTENDED_REGION_SIZE_KEY; static const char *const HAS_HISTORICAL_INFO_KEY; static const char *const LOCALE_KEY; static const char *const FORGETTING_CURVE_OCCURRENCES_TO_LEVEL_UP_KEY; static const char *const FORGETTING_CURVE_PROBABILITY_VALUES_TABLE_ID_KEY; static const char *const FORGETTING_CURVE_DURATION_TO_LEVEL_DOWN_IN_SECONDS_KEY; static const char *const MAX_UNIGRAM_COUNT_KEY; static const char *const MAX_BIGRAM_COUNT_KEY; static const char *const MAX_TRIGRAM_COUNT_KEY; static const int DEFAULT_MULTIPLE_WORDS_DEMOTION_RATE; static const float MULTIPLE_WORD_COST_MULTIPLIER_SCALE; static const int DEFAULT_FORGETTING_CURVE_PROBABILITY_VALUES_TABLE_ID; static const int DEFAULT_MAX_UNIGRAM_COUNT; static const int DEFAULT_MAX_BIGRAM_COUNT; static const int DEFAULT_MAX_TRIGRAM_COUNT; const FormatUtils::FORMAT_VERSION mDictFormatVersion; const HeaderReadWriteUtils::DictionaryFlags mDictionaryFlags; Loading @@ -291,21 +249,18 @@ class HeaderPolicy : public DictionaryHeaderStructurePolicy { const bool mIsDecayingDict; const int mDate; const int mLastDecayedTime; const int mUnigramCount; const int mBigramCount; const int mTrigramCount; const EntryCounts mNgramCounts; const EntryCounts mMaxNgramCounts; const int mExtendedRegionSize; const bool mHasHistoricalInfoOfWords; const int mForgettingCurveProbabilityValuesTableId; const int mMaxUnigramCount; const int mMaxBigramCount; const int mMaxTrigramCount; const int *const mCodePointTable; const std::vector<int> readLocale() const; float readMultipleWordCostMultiplier() const; bool readRequiresGermanUmlautProcessing() const; const EntryCounts readNgramCounts() const; const EntryCounts readMaxNgramCounts() const; static DictionaryHeaderStructurePolicy::AttributeMap createAttributeMapAndReadAllAttributes( const uint8_t *const dictBuf); }; Loading
native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_policy.cpp +10 −7 Original line number Diff line number Diff line Loading @@ -303,7 +303,7 @@ bool Ver4PatriciaTriePolicy::addUnigramEntry(const CodePointArrayView wordCodePo if (mUpdatingHelper.addUnigramWord(&readingHelper, codePointArrayView, unigramProperty, &addedNewUnigram)) { if (addedNewUnigram && !unigramProperty->representsBeginningOfSentence()) { mEntryCounters.incrementUnigramCount(); mEntryCounters.incrementNgramCount(NgramType::Unigram); } if (unigramProperty->getShortcuts().size() > 0) { // Add shortcut target. Loading Loading @@ -397,7 +397,7 @@ bool Ver4PatriciaTriePolicy::addNgramEntry(const NgramProperty *const ngramPrope if (mUpdatingHelper.addNgramEntry(PtNodePosArrayView::singleElementView(&prevWordPtNodePos), wordPos, ngramProperty, &addedNewBigram)) { if (addedNewBigram) { mEntryCounters.incrementBigramCount(); mEntryCounters.incrementNgramCount(NgramType::Bigram); } return true; } else { Loading Loading @@ -438,7 +438,7 @@ bool Ver4PatriciaTriePolicy::removeNgramEntry(const NgramContext *const ngramCon const int prevWordPtNodePos = getTerminalPtNodePosFromWordId(prevWordIds[0]); if (mUpdatingHelper.removeNgramEntry( PtNodePosArrayView::singleElementView(&prevWordPtNodePos), wordPos)) { mEntryCounters.decrementBigramCount(); mEntryCounters.decrementNgramCount(NgramType::Bigram); return true; } else { return false; Loading Loading @@ -525,20 +525,23 @@ void Ver4PatriciaTriePolicy::getProperty(const char *const query, const int quer char *const outResult, const int maxResultLength) { const int compareLength = queryLength + 1 /* terminator */; if (strncmp(query, UNIGRAM_COUNT_QUERY, compareLength) == 0) { snprintf(outResult, maxResultLength, "%d", mEntryCounters.getUnigramCount()); snprintf(outResult, maxResultLength, "%d", mEntryCounters.getNgramCount(NgramType::Unigram)); } else if (strncmp(query, BIGRAM_COUNT_QUERY, compareLength) == 0) { snprintf(outResult, maxResultLength, "%d", mEntryCounters.getBigramCount()); snprintf(outResult, maxResultLength, "%d", mEntryCounters.getNgramCount(NgramType::Bigram)); } else if (strncmp(query, MAX_UNIGRAM_COUNT_QUERY, compareLength) == 0) { snprintf(outResult, maxResultLength, "%d", mHeaderPolicy->isDecayingDict() ? ForgettingCurveUtils::getEntryCountHardLimit( mHeaderPolicy->getMaxUnigramCount()) : mHeaderPolicy->getMaxNgramCounts().getNgramCount( NgramType::Unigram)) : static_cast<int>(Ver4DictConstants::MAX_DICTIONARY_SIZE)); } else if (strncmp(query, MAX_BIGRAM_COUNT_QUERY, compareLength) == 0) { snprintf(outResult, maxResultLength, "%d", mHeaderPolicy->isDecayingDict() ? ForgettingCurveUtils::getEntryCountHardLimit( mHeaderPolicy->getMaxBigramCount()) : mHeaderPolicy->getMaxNgramCounts().getNgramCount( NgramType::Bigram)) : static_cast<int>(Ver4DictConstants::MAX_DICTIONARY_SIZE)); } } Loading
native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_policy.h +1 −2 Original line number Diff line number Diff line Loading @@ -76,8 +76,7 @@ class Ver4PatriciaTriePolicy : public DictionaryStructureWithBufferPolicy { &mPtNodeArrayReader, &mBigramPolicy, &mShortcutPolicy), mUpdatingHelper(mDictBuffer, &mNodeReader, &mNodeWriter), mWritingHelper(mBuffers.get()), mEntryCounters(mHeaderPolicy->getUnigramCount(), mHeaderPolicy->getBigramCount(), mHeaderPolicy->getTrigramCount()), mEntryCounters(mHeaderPolicy->getNgramCounts().getCountArray()), mTerminalPtNodePositionsForIteratingWords(), mIsCorrupted(false) {}; virtual int getRootPosition() const { Loading
native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_writing_helper.cpp +8 −6 Original line number Diff line number Diff line Loading @@ -53,8 +53,8 @@ bool Ver4PatriciaTrieWritingHelper::writeToDictFile(const char *const dictDirPat entryCounts, extendedRegionSize, &headerBuffer)) { AKLOGE("Cannot write header structure to buffer. " "updatesLastDecayedTime: %d, unigramCount: %d, bigramCount: %d, " "extendedRegionSize: %d", false, entryCounts.getUnigramCount(), entryCounts.getBigramCount(), extendedRegionSize); "extendedRegionSize: %d", false, entryCounts.getNgramCount(NgramType::Unigram), entryCounts.getNgramCount(NgramType::Bigram), extendedRegionSize); return false; } return mBuffers->flushHeaderAndDictBuffers(dictDirPath, &headerBuffer); Loading @@ -73,9 +73,11 @@ bool Ver4PatriciaTrieWritingHelper::writeToDictFileWithGC(const int rootPtNodeAr } BufferWithExtendableBuffer headerBuffer( BufferWithExtendableBuffer::DEFAULT_MAX_ADDITIONAL_BUFFER_SIZE); MutableEntryCounters entryCounters; entryCounters.setNgramCount(NgramType::Unigram, unigramCount); entryCounters.setNgramCount(NgramType::Bigram, bigramCount); if (!headerPolicy->fillInAndWriteHeaderToBuffer(true /* updatesLastDecayedTime */, EntryCounts(unigramCount, bigramCount, 0 /* trigramCount */), 0 /* extendedRegionSize */, &headerBuffer)) { entryCounters.getEntryCounts(), 0 /* extendedRegionSize */, &headerBuffer)) { return false; } return dictBuffers->flushHeaderAndDictBuffers(dictDirPath, &headerBuffer); Loading Loading @@ -107,7 +109,7 @@ bool Ver4PatriciaTrieWritingHelper::runGC(const int rootPtNodeArrayPos, } const int unigramCount = traversePolicyToUpdateUnigramProbabilityAndMarkUselessPtNodesAsDeleted .getValidUnigramCount(); const int maxUnigramCount = headerPolicy->getMaxUnigramCount(); const int maxUnigramCount = headerPolicy->getMaxNgramCounts().getNgramCount(NgramType::Unigram); if (headerPolicy->isDecayingDict() && unigramCount > maxUnigramCount) { if (!truncateUnigrams(&ptNodeReader, &ptNodeWriter, maxUnigramCount)) { AKLOGE("Cannot remove unigrams. current: %d, max: %d", unigramCount, Loading @@ -124,7 +126,7 @@ bool Ver4PatriciaTrieWritingHelper::runGC(const int rootPtNodeArrayPos, return false; } const int bigramCount = traversePolicyToUpdateBigramProbability.getValidBigramEntryCount(); const int maxBigramCount = headerPolicy->getMaxBigramCount(); const int maxBigramCount = headerPolicy->getMaxNgramCounts().getNgramCount(NgramType::Bigram); if (headerPolicy->isDecayingDict() && bigramCount > maxBigramCount) { if (!truncateBigrams(maxBigramCount)) { AKLOGE("Cannot remove bigrams. current: %d, max: %d", bigramCount, maxBigramCount); Loading