Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit 9416c814 authored by Jean Chalard's avatar Jean Chalard
Browse files

Return the bigram frequency if available.

This concludes the work on bug#6313806.
Don't submit it before the dictionaries are suitably amended.

Bug: 6313806
Change-Id: Icfea45bd52bb9d8cc68ba2266f80640e3942bb7f
parent cfbb9d76
Loading
Loading
Loading
Loading
+2 −2
Original line number Diff line number Diff line
@@ -75,9 +75,9 @@
        <item></item>
        <!-- Modest : Suggestion whose normalized score is greater than this value
             will be subject to auto-correction. -->
        <item>0.22</item>
        <item>0.185</item>
        <!-- Aggressive -->
        <item>0.08</item>
        <item>0.067</item>
        <!-- Very Aggressive : Suggestion whose normalized score is greater than this value
             will be subject to auto-correction. -->
        <item>0</item>
+23 −9
Original line number Diff line number Diff line
@@ -520,19 +520,33 @@ inline int BinaryFormat::getWordAtAddress(const uint8_t* const root, const int a
    return 0;
}

// Score used for a word when only its unigram frequency is available. Ideally this would be a
// probability in log space; for now the unigram frequency is handed back untouched.
//
// Applying the real backoff weight means dividing the probability by 2, which in our storing
// format means decreasing the score by 8. For some reason that gives bad results in tests, so
// it stays disabled.
// TODO: figure out what's wrong with this, then switch to:
//     unigramFreq > 8 ? unigramFreq - 8 : (0 == unigramFreq ? 0 : 8)
static inline int backoff(const int unigramFreq) {
    const int unweightedScore = unigramFreq;
    return unweightedScore;
}

// This returns a probability in log space.
// NOTE(review): this span looks like a rendered diff whose +/- markers were stripped (see the
// "@@ -520,19 +520,33 @@" hunk header above it), so lines from the old and the new version of
// the body are interleaved and the braces do not balance as shown. Confirm against the commit
// before treating any line below as live code.
inline int BinaryFormat::getProbability(const int position, const std::map<int, int> *bigramMap,
        const uint8_t *bigramFilter, const int unigramFreq) {
    // Presumably the old body: every path returns the raw unigram frequency.
    if (!bigramMap || !bigramFilter) return unigramFreq;
    if (!isInFilter(bigramFilter, position)) return unigramFreq;
    const std::map<int, int>::const_iterator bigramFreq = bigramMap->find(position);
    if (bigramFreq != bigramMap->end()) {
        // TODO: return the frequency in bigramFreq->second
        return unigramFreq;
    // Presumably the new body: with no bigram map or filter, or when isInFilter() reports that
    // the position is not in the filter, the result is backoff(unigramFreq).
    if (!bigramMap || !bigramFilter) return backoff(unigramFreq);
    if (!isInFilter(bigramFilter, position)) return backoff(unigramFreq);
    const std::map<int, int>::const_iterator bigramFreqIt = bigramMap->find(position);
    if (bigramFreqIt != bigramMap->end()) {
        // A bigram entry exists for this position; its stored value scales the result.
        const int bigramFreq = bigramFreqIt->second;
        // We divide the range [unigramFreq..255] in 16.5 steps - in other words, we want the
        // unigram frequency to be the median value of the 17th step from the top. A value of
        // 0 for the bigram frequency represents the middle of the 16th step from the top,
        // while a value of 15 represents the middle of the top step.
        // See makedict.BinaryDictInputOutput for details.
        const float stepSize = ((float)MAX_FREQ - unigramFreq) / (1.5f + MAX_BIGRAM_FREQ);
        // The float sum is converted back to int by the cast, dropping any fractional part.
        return (int)(unigramFreq + bigramFreq * stepSize);
    } else {
        // NOTE(review): the two returns below look like the old and the new fallback line of
        // the same statement, not two consecutive statements - confirm.
        return unigramFreq;
        return backoff(unigramFreq);
    }
    // TODO: if the unigram frequency is used, compute the actual probability
}

} // namespace latinime
+3 −2
Original line number Diff line number Diff line
@@ -207,6 +207,7 @@ static inline void prof_out(void) {
#define NEUTRAL_SCORE_SQUARED_RADIUS 8.0f
#define HALF_SCORE_SQUARED_RADIUS 32.0f
// Top of the range [unigramFreq..MAX_FREQ] that getProbability splits into steps when it
// scales a bigram frequency.
#define MAX_FREQ 255
// Bigram frequency that getProbability's comment maps to the middle of the top step; also
// used, plus 1.5, as the divisor of the step size there.
#define MAX_BIGRAM_FREQ 15

// This must be greater than or equal to MAX_WORD_LENGTH defined in BinaryDictionary.java
// This is only used for the size of array. Not to be used in c functions.
@@ -225,8 +226,8 @@ static inline void prof_out(void) {
#define MULTIPLE_WORDS_DEMOTION_RATE 80
#define MIN_INPUT_LENGTH_FOR_THREE_OR_MORE_WORDS_CORRECTION 6

// NOTE(review): each macro below appears twice because this looks like a diff rendering with
// the +/- markers stripped; presumably the first pair holds the old values and the second pair
// the new ones - confirm against the commit. The 0.22 -> 0.185 pair matches the values shown
// for the "Modest" auto-correction threshold items in the XML hunk of the same commit.
#define TWO_WORDS_CORRECTION_WITH_OTHER_ERROR_THRESHOLD 0.39
#define START_TWO_WORDS_CORRECTION_THRESHOLD 0.22
#define TWO_WORDS_CORRECTION_WITH_OTHER_ERROR_THRESHOLD 0.35
#define START_TWO_WORDS_CORRECTION_THRESHOLD 0.185

#define MAX_DEPTH_MULTIPLIER 3