Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit 04a492cb authored by Keisuke Kuroyanagi's avatar Keisuke Kuroyanagi Committed by Android (Google) Code Review
Browse files

Merge "Add DynamicLanguageModelProbabilityUtils."

parents 77947c9c 660b0047
Loading
Loading
Loading
Loading
+1 −0
Original line number Diff line number Diff line
@@ -71,6 +71,7 @@ LATIN_IME_CORE_SRC_FILES := \
        ver4_patricia_trie_writing_helper.cpp \
        ver4_pt_node_array_reader.cpp) \
    $(addprefix suggest/policyimpl/dictionary/structure/v4/content/, \
        dynamic_language_model_probability_utils.cpp \
        language_model_dict_content.cpp \
        language_model_dict_content_global_counters.cpp \
        shortcut_dict_content.cpp \
+37 −0
Original line number Diff line number Diff line
/*
 * Copyright (C) 2014, The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "suggest/policyimpl/dictionary/structure/v4/content/dynamic_language_model_probability_utils.h"

namespace latinime {

// These counts are used to provide stable probabilities even if the user's input count is small.
const int DynamicLanguageModelProbabilityUtils::ASSUMED_MIN_COUNT_FOR_UNIGRAMS = 8192;
const int DynamicLanguageModelProbabilityUtils::ASSUMED_MIN_COUNT_FOR_BIGRAMS = 2;
const int DynamicLanguageModelProbabilityUtils::ASSUMED_MIN_COUNT_FOR_TRIGRAMS = 2;

// These are encoded backoff weights.
// Note that we give positive value for trigrams that means the weight is more than 1.
// TODO: Apply backoff for main dictionaries and quit giving a positive backoff weight.
const int DynamicLanguageModelProbabilityUtils::ENCODED_BACKOFF_WEIGHT_FOR_UNIGRAMS = -32;
const int DynamicLanguageModelProbabilityUtils::ENCODED_BACKOFF_WEIGHT_FOR_BIGRAMS = 0;
const int DynamicLanguageModelProbabilityUtils::ENCODED_BACKOFF_WEIGHT_FOR_TRIGRAMS = 8;

// This value is used to remove too old entries from the dictionary.
const int DynamicLanguageModelProbabilityUtils::DURATION_TO_DISCARD_ENTRY_IN_SECONDS =
        300 * 24 * 60 * 60; // 300 days

} // namespace latinime
+114 −0
Original line number Diff line number Diff line
/*
 * Copyright (C) 2014, The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef LATINIME_DYNAMIC_LANGUAGE_MODEL_PROBABILITY_UTILS_H
#define LATINIME_DYNAMIC_LANGUAGE_MODEL_PROBABILITY_UTILS_H

#include <algorithm>

#include "defines.h"
#include "suggest/core/dictionary/property/historical_info.h"
#include "utils/time_keeper.h"

namespace latinime {

class DynamicLanguageModelProbabilityUtils {
 public:
    static float computeRawProbabilityFromCounts(const int count, const int contextCount,
            const int matchedWordCountInContext) {
        int minCount = 0;
        switch (matchedWordCountInContext) {
            case 1:
                minCount = ASSUMED_MIN_COUNT_FOR_UNIGRAMS;
                break;
            case 2:
                minCount = ASSUMED_MIN_COUNT_FOR_BIGRAMS;
                break;
            case 3:
                minCount = ASSUMED_MIN_COUNT_FOR_TRIGRAMS;
                break;
            default:
                AKLOGE("computeRawProbabilityFromCounts is called with invalid "
                        "matchedWordCountInContext (%d).", matchedWordCountInContext);
                ASSERT(false);
                return 0.0f;
        }
        return static_cast<float>(count) / static_cast<float>(std::max(contextCount, minCount));
    }

    static float backoff(const int ngramProbability, const int matchedWordCountInContext) {
        int probability = NOT_A_PROBABILITY;

        switch (matchedWordCountInContext) {
            case 1:
                probability = ngramProbability + ENCODED_BACKOFF_WEIGHT_FOR_UNIGRAMS;
                break;
            case 2:
                probability = ngramProbability + ENCODED_BACKOFF_WEIGHT_FOR_BIGRAMS;
                break;
            case 3:
                probability = ngramProbability + ENCODED_BACKOFF_WEIGHT_FOR_TRIGRAMS;
                break;
            default:
                AKLOGE("backoff is called with invalid matchedWordCountInContext (%d).",
                        matchedWordCountInContext);
                ASSERT(false);
                return NOT_A_PROBABILITY;
        }
        return std::min(std::max(probability, NOT_A_PROBABILITY), MAX_PROBABILITY);
    }

    static int getDecayedProbability(const int probability, const HistoricalInfo historicalInfo) {
        const int elapsedTime = TimeKeeper::peekCurrentTime() - historicalInfo.getTimestamp();
        if (elapsedTime < 0) {
            AKLOGE("The elapsed time is negatime value. Timestamp overflow?");
            return NOT_A_PROBABILITY;
        }
        // TODO: Improve this logic.
        // We don't modify probability depending on the elapsed time.
        return probability;
    }

    static int shouldRemoveEntryDuringGC(const HistoricalInfo historicalInfo) {
        // TODO: Improve this logic.
        const int elapsedTime = TimeKeeper::peekCurrentTime() - historicalInfo.getTimestamp();
        return elapsedTime > DURATION_TO_DISCARD_ENTRY_IN_SECONDS;
    }

    static int getPriorityToPreventFromEviction(const HistoricalInfo historicalInfo) {
        // TODO: Improve this logic.
        // More recently input entries get higher priority.
        return historicalInfo.getTimestamp();
    }

private:
    DISALLOW_IMPLICIT_CONSTRUCTORS(DynamicLanguageModelProbabilityUtils);

    static_assert(MAX_PREV_WORD_COUNT_FOR_N_GRAM <= 2, "Max supported Ngram is Trigram.");

    static const int ASSUMED_MIN_COUNT_FOR_UNIGRAMS;
    static const int ASSUMED_MIN_COUNT_FOR_BIGRAMS;
    static const int ASSUMED_MIN_COUNT_FOR_TRIGRAMS;

    static const int ENCODED_BACKOFF_WEIGHT_FOR_UNIGRAMS;
    static const int ENCODED_BACKOFF_WEIGHT_FOR_BIGRAMS;
    static const int ENCODED_BACKOFF_WEIGHT_FOR_TRIGRAMS;

    static const int DURATION_TO_DISCARD_ENTRY_IN_SECONDS;
};

} // namespace latinime
#endif /* LATINIME_DYNAMIC_LANGUAGE_MODEL_PROBABILITY_UTILS_H */