Merge "Ignore bigrams that are not also listed as unigrams" into jb-dev (329c8d7b) · Commits · e / os / android_packages_inputmethods_LatinIME

java/src/com/android/inputmethod/latin/ExpandableBinaryDictionary.java

+1 −1

Original line number	Diff line number	Diff line
		@@ -159,7 +159,7 @@ abstract public class ExpandableBinaryDictionary extends Dictionary {
		// TODO: Create "cache dictionary" to cache fresh words for frequently updated dictionaries,
		// considering performance regression.
		protected void addWord(final String word, final int frequency) {
		mFusionDictionary.add(word, frequency, null, null);
		mFusionDictionary.add(word, frequency, null /* shortcutTargets */);
		}

		/**

java/src/com/android/inputmethod/latin/makedict/BinaryDictInputOutput.java

+10 −2

Original line number	Diff line number	Diff line
		@@ -1317,8 +1317,16 @@ public class BinaryDictInputOutput {
		0 != (optionsFlags & GERMAN_UMLAUT_PROCESSING_FLAG),
		0 != (optionsFlags & FRENCH_LIGATURE_PROCESSING_FLAG)));
		if (null != dict) {
		for (Word w : dict) {
		newDict.add(w.mWord, w.mFrequency, w.mShortcutTargets, w.mBigrams);
		for (final Word w : dict) {
		newDict.add(w.mWord, w.mFrequency, w.mShortcutTargets);
		}
		for (final Word w : dict) {
		// By construction a binary dictionary may not have bigrams pointing to
		// words that are not also registered as unigrams so we don't have to avoid
		// them explicitly here.
		for (final WeightedString bigram : w.mBigrams) {
		newDict.setBigram(w.mWord, bigram.mWord, bigram.mFrequency);
		}
		}
		}

java/src/com/android/inputmethod/latin/makedict/FusionDictionary.java

+18 −23

Original line number	Diff line number	Diff line
		@@ -286,7 +286,7 @@ public class FusionDictionary implements Iterable<Word> {
		for (WeightedString word : words) {
		final CharGroup t = findWordInTree(mRoot, word.mWord);
		if (null == t) {
		add(getCodePoints(word.mWord), 0, null, null);
		add(getCodePoints(word.mWord), 0, null);
		}
		}
		}
		@@ -305,12 +305,8 @@ public class FusionDictionary implements Iterable<Word> {
		* @param bigrams a list of bigrams, or null.
		*/
		public void add(final String word, final int frequency,
		final ArrayList<WeightedString> shortcutTargets,
		final ArrayList<WeightedString> bigrams) {
		if (null != bigrams) {
		addNeutralWords(bigrams);
		}
		add(getCodePoints(word), frequency, shortcutTargets, bigrams);
		final ArrayList<WeightedString> shortcutTargets) {
		add(getCodePoints(word), frequency, shortcutTargets);
		}

		/**
		@@ -344,7 +340,7 @@ public class FusionDictionary implements Iterable<Word> {
		final CharGroup charGroup2 = findWordInTree(mRoot, word2);
		if (charGroup2 == null) {
		// TODO: refactor with the identical code in addNeutralWords
		add(getCodePoints(word2), 0, null, null);
		add(getCodePoints(word2), 0, null);
		}
		charGroup.addBigram(word2, frequency);
		} else {
		@@ -355,17 +351,15 @@ public class FusionDictionary implements Iterable<Word> {
		/**
		* Add a word to this dictionary.
		*
		* The shortcuts and bigrams, if any, have to be in the dictionary already. If they aren't,
		* The shortcuts, if any, have to be in the dictionary already. If they aren't,
		* an exception is thrown.
		*
		* @param word the word, as an int array.
		* @param frequency the frequency of the word, in the range [0..255].
		* @param shortcutTargets an optional list of shortcut targets for this word (null if none).
		* @param bigrams an optional list of bigrams for this word (null if none).
		*/
		private void add(final int[] word, final int frequency,
		final ArrayList<WeightedString> shortcutTargets,
		final ArrayList<WeightedString> bigrams) {
		final ArrayList<WeightedString> shortcutTargets) {
		assert(frequency >= 0 && frequency <= 255);
		Node currentNode = mRoot;
		int charIndex = 0;
		@@ -390,7 +384,7 @@ public class FusionDictionary implements Iterable<Word> {
		final int insertionIndex = findInsertionIndex(currentNode, word[charIndex]);
		final CharGroup newGroup = new CharGroup(
		Arrays.copyOfRange(word, charIndex, word.length),
		shortcutTargets, bigrams, frequency);
		shortcutTargets, null /* bigrams */, frequency);
		currentNode.mData.add(insertionIndex, newGroup);
		checkStack(currentNode);
		} else {
		@@ -400,21 +394,21 @@ public class FusionDictionary implements Iterable<Word> {
		// The new word is a prefix of an existing word, but the node on which it
		// should end already exists as is. Since the old CharNode was not a terminal,
		// make it one by filling in its frequency and other attributes
		currentGroup.update(frequency, shortcutTargets, bigrams);
		currentGroup.update(frequency, shortcutTargets, null);
		} else {
		// The new word matches the full old word and extends past it.
		// We only have to create a new node and add it to the end of this.
		final CharGroup newNode = new CharGroup(
		Arrays.copyOfRange(word, charIndex + differentCharIndex, word.length),
		shortcutTargets, bigrams, frequency);
		shortcutTargets, null /* bigrams */, frequency);
		currentGroup.mChildren = new Node();
		currentGroup.mChildren.mData.add(newNode);
		}
		} else {
		if (0 == differentCharIndex) {
		// Exact same word. Update the frequency if higher. This will also add the
		// new bigrams to the existing bigram list if it already exists.
		currentGroup.update(frequency, shortcutTargets, bigrams);
		// new shortcuts to the existing shortcut list if it already exists.
		currentGroup.update(frequency, shortcutTargets, null);
		} else {
		// Partial prefix match only. We have to replace the current node with a node
		// containing the current prefix and create two new ones for the tails.
		@@ -429,14 +423,14 @@ public class FusionDictionary implements Iterable<Word> {
		if (charIndex + differentCharIndex >= word.length) {
		newParent = new CharGroup(
		Arrays.copyOfRange(currentGroup.mChars, 0, differentCharIndex),
		shortcutTargets, bigrams, frequency, newChildren);
		shortcutTargets, null /* bigrams */, frequency, newChildren);
		} else {
		newParent = new CharGroup(
		Arrays.copyOfRange(currentGroup.mChars, 0, differentCharIndex),
		null, null, -1, newChildren);
		final CharGroup newWord = new CharGroup(
		Arrays.copyOfRange(word, charIndex + differentCharIndex,
		word.length), shortcutTargets, bigrams, frequency);
		null /* shortcutTargets */, null /* bigrams */, -1, newChildren);
		final CharGroup newWord = new CharGroup(Arrays.copyOfRange(word,
		charIndex + differentCharIndex, word.length),
		shortcutTargets, null /* bigrams */, frequency);
		final int addIndex = word[charIndex + differentCharIndex]
		> currentGroup.mChars[differentCharIndex] ? 1 : 0;
		newChildren.mData.add(addIndex, newWord);
		@@ -494,7 +488,8 @@ public class FusionDictionary implements Iterable<Word> {
		*/
		private static int findInsertionIndex(final Node node, int character) {
		final ArrayList<CharGroup> data = node.mData;
		final CharGroup reference = new CharGroup(new int[] { character }, null, null, 0);
		final CharGroup reference = new CharGroup(new int[] { character },
		null /* shortcutTargets */, null /* bigrams */, 0);
		int result = Collections.binarySearch(data, reference, CHARGROUP_COMPARATOR);
		return result >= 0 ? result : -result - 1;
		}

tools/makedict/src/com/android/inputmethod/latin/makedict/XmlDictInputOutput.java

+17 −10

Original line number	Diff line number	Diff line
		@@ -72,19 +72,15 @@ public class XmlDictInputOutput {
		int mFreq; // the currently read freq
		String mWord; // the current word
		final HashMap<String, ArrayList<WeightedString>> mShortcutsMap;
		final HashMap<String, ArrayList<WeightedString>> mBigramsMap;

		/**
		* Create the handler.
		*
		* @param shortcuts the shortcuts as a map. This may be empty, but may not be null.
		* @param bigrams the bigrams as a map. This may be empty, but may not be null.
		*/
		public UnigramHandler(final HashMap<String, ArrayList<WeightedString>> shortcuts,
		final HashMap<String, ArrayList<WeightedString>> bigrams) {
		public UnigramHandler(final HashMap<String, ArrayList<WeightedString>> shortcuts) {
		mDictionary = null;
		mShortcutsMap = shortcuts;
		mBigramsMap = bigrams;
		mWord = "";
		mState = START;
		mFreq = 0;
		@@ -94,7 +90,6 @@ public class XmlDictInputOutput {
		final FusionDictionary dict = mDictionary;
		mDictionary = null;
		mShortcutsMap.clear();
		mBigramsMap.clear();
		mWord = "";
		mState = START;
		mFreq = 0;
		@@ -143,7 +138,7 @@ public class XmlDictInputOutput {
		@Override
		public void endElement(String uri, String localName, String qName) {
		if (WORD == mState) {
		mDictionary.add(mWord, mFreq, mShortcutsMap.get(mWord), mBigramsMap.get(mWord));
		mDictionary.add(mWord, mFreq, mShortcutsMap.get(mWord));
		mState = START;
		}
		}
		@@ -191,6 +186,7 @@ public class XmlDictInputOutput {
		}
		}

		// This may return an empty map, but will never return null.
		public HashMap<String, ArrayList<WeightedString>> getAssocMap() {
		return mAssocMap;
		}
		@@ -211,6 +207,7 @@ public class XmlDictInputOutput {
		BIGRAM_FREQ_ATTRIBUTE);
		}

		// As per getAssocMap(), this never returns null.
		public HashMap<String, ArrayList<WeightedString>> getBigramMap() {
		return getAssocMap();
		}
		@@ -231,6 +228,7 @@ public class XmlDictInputOutput {
		TARGET_PRIORITY_ATTRIBUTE);
		}

		// As per getAssocMap(), this never returns null.
		public HashMap<String, ArrayList<WeightedString>> getShortcutMap() {
		return getAssocMap();
		}
		@@ -260,10 +258,19 @@ public class XmlDictInputOutput {
		if (null != shortcuts) parser.parse(shortcuts, shortcutHandler);

		final UnigramHandler unigramHandler =
		new UnigramHandler(shortcutHandler.getShortcutMap(),
		bigramHandler.getBigramMap());
		new UnigramHandler(shortcutHandler.getShortcutMap());
		parser.parse(unigrams, unigramHandler);
		return unigramHandler.getFinalDictionary();
		final FusionDictionary dict = unigramHandler.getFinalDictionary();
		final HashMap<String, ArrayList<WeightedString>> bigramMap = bigramHandler.getBigramMap();
		for (final String firstWord : bigramMap.keySet()) {
		if (!dict.hasWord(firstWord)) continue;
		final ArrayList<WeightedString> bigramList = bigramMap.get(firstWord);
		for (final WeightedString bigram : bigramList) {
		if (!dict.hasWord(bigram.mWord)) continue;
		dict.setBigram(firstWord, bigram.mWord, bigram.mFrequency);
		}
		}
		return dict;
		}

		/**

tools/makedict/tests/com/android/inputmethod/latin/BinaryDictInputOutputTest.java

+5 −5

Original line number	Diff line number	Diff line
		@@ -43,11 +43,11 @@ public class BinaryDictInputOutputTest extends TestCase {
		final FusionDictionary dict = new FusionDictionary(new Node(),
		new DictionaryOptions(new HashMap<String, String>(),
		false /* germanUmlautProcessing */, false /* frenchLigatureProcessing */));
		dict.add("foo", 1, null, null);
		dict.add("fta", 1, null, null);
		dict.add("ftb", 1, null, null);
		dict.add("bar", 1, null, null);
		dict.add("fool", 1, null, null);
		dict.add("foo", 1, null);
		dict.add("fta", 1, null);
		dict.add("ftb", 1, null);
		dict.add("bar", 1, null);
		dict.add("fool", 1, null);
		final ArrayList<Node> result = BinaryDictInputOutput.flattenTree(dict.mRoot);
		assertEquals(4, result.size());
		while (!result.isEmpty()) {