Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit 329c8d7b authored by Jean Chalard's avatar Jean Chalard Committed by Android (Google) Code Review
Browse files

Merge "Ignore bigrams that are not also listed as unigrams" into jb-dev

parents 604599c3 44c64f46
Loading
Loading
Loading
Loading
+1 −1
Original line number Diff line number Diff line
@@ -159,7 +159,7 @@ abstract public class ExpandableBinaryDictionary extends Dictionary {
    // TODO: Create "cache dictionary" to cache fresh words for frequently updated dictionaries,
    // considering performance regression.
    /**
     * Adds a word with the given frequency to {@code mFusionDictionary}.
     *
     * @param word the word to add.
     * @param frequency the frequency handed through to the dictionary unchanged.
     */
    // NOTE(review): the two add() calls below differ only in arity. They look like the
    // before/after sides of a diff hunk rendered without +/- markers; presumably only the
    // three-argument form belongs in the file. Confirm against the real source.
    protected void addWord(final String word, final int frequency) {
        mFusionDictionary.add(word, frequency, null, null);
        mFusionDictionary.add(word, frequency, null /* shortcutTargets */);
    }

    /**
+10 −2
Original line number Diff line number Diff line
@@ -1317,8 +1317,16 @@ public class BinaryDictInputOutput {
                        0 != (optionsFlags & GERMAN_UMLAUT_PROCESSING_FLAG),
                        0 != (optionsFlags & FRENCH_LIGATURE_PROCESSING_FLAG)));
        if (null != dict) {
            for (Word w : dict) {
                newDict.add(w.mWord, w.mFrequency, w.mShortcutTargets, w.mBigrams);
            for (final Word w : dict) {
                newDict.add(w.mWord, w.mFrequency, w.mShortcutTargets);
            }
            for (final Word w : dict) {
                // By construction a binary dictionary may not have bigrams pointing to
                // words that are not also registered as unigrams so we don't have to avoid
                // them explicitly here.
                for (final WeightedString bigram : w.mBigrams) {
                    newDict.setBigram(w.mWord, bigram.mWord, bigram.mFrequency);
                }
            }
        }

+18 −23
Original line number Diff line number Diff line
@@ -286,7 +286,7 @@ public class FusionDictionary implements Iterable<Word> {
            for (WeightedString word : words) {
                final CharGroup t = findWordInTree(mRoot, word.mWord);
                if (null == t) {
                    add(getCodePoints(word.mWord), 0, null, null);
                    add(getCodePoints(word.mWord), 0, null);
                }
            }
        }
@@ -305,12 +305,8 @@ public class FusionDictionary implements Iterable<Word> {
     * @param bigrams a list of bigrams, or null.
     */
    public void add(final String word, final int frequency,
            final ArrayList<WeightedString> shortcutTargets,
            final ArrayList<WeightedString> bigrams) {
        if (null != bigrams) {
            addNeutralWords(bigrams);
        }
        add(getCodePoints(word), frequency, shortcutTargets, bigrams);
            final ArrayList<WeightedString> shortcutTargets) {
        add(getCodePoints(word), frequency, shortcutTargets);
    }

    /**
@@ -344,7 +340,7 @@ public class FusionDictionary implements Iterable<Word> {
            final CharGroup charGroup2 = findWordInTree(mRoot, word2);
            if (charGroup2 == null) {
                // TODO: refactor with the identical code in addNeutralWords
                add(getCodePoints(word2), 0, null, null);
                add(getCodePoints(word2), 0, null);
            }
            charGroup.addBigram(word2, frequency);
        } else {
@@ -355,17 +351,15 @@ public class FusionDictionary implements Iterable<Word> {
    /**
     * Add a word to this dictionary.
     *
     * The shortcuts and bigrams, if any, have to be in the dictionary already. If they aren't,
     * The shortcuts, if any, have to be in the dictionary already. If they aren't,
     * an exception is thrown.
     *
     * @param word the word, as an int array.
     * @param frequency the frequency of the word, in the range [0..255].
     * @param shortcutTargets an optional list of shortcut targets for this word (null if none).
     * @param bigrams an optional list of bigrams for this word (null if none).
     */
    private void add(final int[] word, final int frequency,
            final ArrayList<WeightedString> shortcutTargets,
            final ArrayList<WeightedString> bigrams) {
            final ArrayList<WeightedString> shortcutTargets) {
        assert(frequency >= 0 && frequency <= 255);
        Node currentNode = mRoot;
        int charIndex = 0;
@@ -390,7 +384,7 @@ public class FusionDictionary implements Iterable<Word> {
            final int insertionIndex = findInsertionIndex(currentNode, word[charIndex]);
            final CharGroup newGroup = new CharGroup(
                    Arrays.copyOfRange(word, charIndex, word.length),
                    shortcutTargets, bigrams, frequency);
                    shortcutTargets, null /* bigrams */, frequency);
            currentNode.mData.add(insertionIndex, newGroup);
            checkStack(currentNode);
        } else {
@@ -400,21 +394,21 @@ public class FusionDictionary implements Iterable<Word> {
                    // The new word is a prefix of an existing word, but the node on which it
                    // should end already exists as is. Since the old CharGroup was not a
                    // terminal, make it one by filling in its frequency and other attributes.
                    currentGroup.update(frequency, shortcutTargets, bigrams);
                    currentGroup.update(frequency, shortcutTargets, null);
                } else {
                    // The new word matches the full old word and extends past it.
                    // We only have to create a new node and add it to the end of this.
                    final CharGroup newNode = new CharGroup(
                            Arrays.copyOfRange(word, charIndex + differentCharIndex, word.length),
                                    shortcutTargets, bigrams, frequency);
                                    shortcutTargets, null /* bigrams */, frequency);
                    currentGroup.mChildren = new Node();
                    currentGroup.mChildren.mData.add(newNode);
                }
            } else {
                if (0 == differentCharIndex) {
                    // Exact same word. Update the frequency if higher. This will also add the
                    // new bigrams to the existing bigram list if it already exists.
                    currentGroup.update(frequency, shortcutTargets, bigrams);
                    // new shortcuts to the existing shortcut list if it already exists.
                    currentGroup.update(frequency, shortcutTargets, null);
                } else {
                    // Partial prefix match only. We have to replace the current node with a node
                    // containing the current prefix and create two new ones for the tails.
@@ -429,14 +423,14 @@ public class FusionDictionary implements Iterable<Word> {
                    if (charIndex + differentCharIndex >= word.length) {
                        newParent = new CharGroup(
                                Arrays.copyOfRange(currentGroup.mChars, 0, differentCharIndex),
                                shortcutTargets, bigrams, frequency, newChildren);
                                shortcutTargets, null /* bigrams */, frequency, newChildren);
                    } else {
                        newParent = new CharGroup(
                                Arrays.copyOfRange(currentGroup.mChars, 0, differentCharIndex),
                                null, null, -1, newChildren);
                        final CharGroup newWord = new CharGroup(
                                Arrays.copyOfRange(word, charIndex + differentCharIndex,
                                        word.length), shortcutTargets, bigrams, frequency);
                                null /* shortcutTargets */, null /* bigrams */, -1, newChildren);
                        final CharGroup newWord = new CharGroup(Arrays.copyOfRange(word,
                                charIndex + differentCharIndex, word.length),
                                shortcutTargets, null /* bigrams */, frequency);
                        final int addIndex = word[charIndex + differentCharIndex]
                                > currentGroup.mChars[differentCharIndex] ? 1 : 0;
                        newChildren.mData.add(addIndex, newWord);
@@ -494,7 +488,8 @@ public class FusionDictionary implements Iterable<Word> {
     */
    private static int findInsertionIndex(final Node node, int character) {
        final ArrayList<CharGroup> data = node.mData;
        // Probe group holding only the character we are looking for. It carries no shortcuts
        // and no bigrams, and is only used as the search key for the comparator below.
        // (Declared once: a second declaration of the same local does not compile.)
        final CharGroup reference = new CharGroup(new int[] { character },
                null /* shortcutTargets */, null /* bigrams */, 0);
        final int result = Collections.binarySearch(data, reference, CHARGROUP_COMPARATOR);
        // On a miss, binarySearch returns (-(insertion point) - 1); undo that encoding so the
        // caller always gets an index: the match position on a hit, the insertion point otherwise.
        return result >= 0 ? result : -result - 1;
    }
+17 −10
Original line number Diff line number Diff line
@@ -72,19 +72,15 @@ public class XmlDictInputOutput {
        int mFreq; // the currently read freq
        String mWord; // the current word
        final HashMap<String, ArrayList<WeightedString>> mShortcutsMap;
        final HashMap<String, ArrayList<WeightedString>> mBigramsMap;

        /**
         * Create the handler.
         *
         * @param shortcuts the shortcuts as a map. This may be empty, but may not be null.
         * @param bigrams the bigrams as a map. This may be empty, but may not be null.
         */
        public UnigramHandler(final HashMap<String, ArrayList<WeightedString>> shortcuts,
                final HashMap<String, ArrayList<WeightedString>> bigrams) {
        public UnigramHandler(final HashMap<String, ArrayList<WeightedString>> shortcuts) {
            mDictionary = null;
            mShortcutsMap = shortcuts;
            mBigramsMap = bigrams;
            mWord = "";
            mState = START;
            mFreq = 0;
@@ -94,7 +90,6 @@ public class XmlDictInputOutput {
            final FusionDictionary dict = mDictionary;
            mDictionary = null;
            mShortcutsMap.clear();
            mBigramsMap.clear();
            mWord = "";
            mState = START;
            mFreq = 0;
@@ -143,7 +138,7 @@ public class XmlDictInputOutput {
        /**
         * Called when an element ends. If the handler is in the {@code WORD} state, the current
         * word is added to {@code mDictionary} with its frequency and whatever
         * {@code mShortcutsMap} holds for it, and the state goes back to {@code START}.
         * In any other state this does nothing.
         */
        @Override
        public void endElement(String uri, String localName, String qName) {
            if (WORD == mState) {
                // NOTE(review): the two add() calls below look like the before/after sides of a
                // diff hunk rendered without +/- markers; presumably only the second (without
                // the bigram list) belongs in the file. Confirm against the real source.
                mDictionary.add(mWord, mFreq, mShortcutsMap.get(mWord), mBigramsMap.get(mWord));
                mDictionary.add(mWord, mFreq, mShortcutsMap.get(mWord));
                mState = START;
            }
        }
@@ -191,6 +186,7 @@ public class XmlDictInputOutput {
            }
        }

        /**
         * Returns the association map held by this handler.
         *
         * This may return an empty map, but will never return null.
         *
         * @return the {@code mAssocMap} field itself, not a copy.
         */
        public HashMap<String, ArrayList<WeightedString>> getAssocMap() {
            return mAssocMap;
        }
@@ -211,6 +207,7 @@ public class XmlDictInputOutput {
                    BIGRAM_FREQ_ATTRIBUTE);
        }

        /**
         * Returns the bigram map, which is exactly what {@code getAssocMap()} returns.
         *
         * As per getAssocMap(), this never returns null.
         *
         * @return the result of {@code getAssocMap()}.
         */
        public HashMap<String, ArrayList<WeightedString>> getBigramMap() {
            return getAssocMap();
        }
@@ -231,6 +228,7 @@ public class XmlDictInputOutput {
                    TARGET_PRIORITY_ATTRIBUTE);
        }

        /**
         * Returns the shortcut map, which is exactly what {@code getAssocMap()} returns.
         *
         * As per getAssocMap(), this never returns null.
         *
         * @return the result of {@code getAssocMap()}.
         */
        public HashMap<String, ArrayList<WeightedString>> getShortcutMap() {
            return getAssocMap();
        }
@@ -260,10 +258,19 @@ public class XmlDictInputOutput {
        if (null != shortcuts) parser.parse(shortcuts, shortcutHandler);

        final UnigramHandler unigramHandler =
                new UnigramHandler(shortcutHandler.getShortcutMap(),
                        bigramHandler.getBigramMap());
                new UnigramHandler(shortcutHandler.getShortcutMap());
        parser.parse(unigrams, unigramHandler);
        return unigramHandler.getFinalDictionary();
        final FusionDictionary dict = unigramHandler.getFinalDictionary();
        final HashMap<String, ArrayList<WeightedString>> bigramMap = bigramHandler.getBigramMap();
        for (final String firstWord : bigramMap.keySet()) {
            if (!dict.hasWord(firstWord)) continue;
            final ArrayList<WeightedString> bigramList = bigramMap.get(firstWord);
            for (final WeightedString bigram : bigramList) {
                if (!dict.hasWord(bigram.mWord)) continue;
                dict.setBigram(firstWord, bigram.mWord, bigram.mFrequency);
            }
        }
        return dict;
    }

    /**
+5 −5
Original line number Diff line number Diff line
@@ -43,11 +43,11 @@ public class BinaryDictInputOutputTest extends TestCase {
        final FusionDictionary dict = new FusionDictionary(new Node(),
                new DictionaryOptions(new HashMap<String, String>(),
                        false /* germanUmlautProcessing */, false /* frenchLigatureProcessing */));
        dict.add("foo", 1, null, null);
        dict.add("fta", 1, null, null);
        dict.add("ftb", 1, null, null);
        dict.add("bar", 1, null, null);
        dict.add("fool", 1, null, null);
        dict.add("foo", 1, null);
        dict.add("fta", 1, null);
        dict.add("ftb", 1, null);
        dict.add("bar", 1, null);
        dict.add("fool", 1, null);
        final ArrayList<Node> result = BinaryDictInputOutput.flattenTree(dict.mRoot);
        assertEquals(4, result.size());
        while (!result.isEmpty()) {