Merge "Fix dicttool build" am: a92d0f9262 (6aed0523) · Commits · e / os / android_packages_inputmethods_LatinIME

java/src/com/android/inputmethod/latin/makedict/FormatSpec.java

+3 −0

Original line number	Diff line number	Diff line
		@@ -174,6 +174,9 @@ public final class FormatSpec {
		public static final int VERSION202 = 202;
		// format version for Fava Dictionaries.
		public static final int VERSION_DELIGHT3 = 86736212;
		// Lowest format version for which a code point table is supported.
		public static final int MINIMUM_SUPPORTED_VERSION_OF_CODE_POINT_TABLE = VERSION201;
		// Dictionary version used for testing.
		public static final int VERSION4_ONLY_FOR_TESTING = 399;
		// Revisions of the version 4 format.
		public static final int VERSION402 = 402;
		public static final int VERSION403 = 403;
		// Alias for the version 4 revision currently in use (VERSION403).
		public static final int VERSION4 = VERSION403;

tests/src/com/android/inputmethod/latin/makedict/BinaryDictDecoderEncoderTests.java

0 → 100644

+677 −0

File added.

Preview size limit exceeded, changes collapsed.

tests/src/com/android/inputmethod/latin/makedict/BinaryDictUtils.java

0 → 100644

+80 −0

Original line number	Diff line number	Diff line
		/*
		* Copyright (C) 2013 The Android Open Source Project
		*
		* Licensed under the Apache License, Version 2.0 (the "License");
		* you may not use this file except in compliance with the License.
		* You may obtain a copy of the License at
		*
		* http://www.apache.org/licenses/LICENSE-2.0
		*
		* Unless required by applicable law or agreed to in writing, software
		* distributed under the License is distributed on an "AS IS" BASIS,
		* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
		* See the License for the specific language governing permissions and
		* limitations under the License.
		*/

		package com.android.inputmethod.latin.makedict;

		import com.android.inputmethod.latin.makedict.FormatSpec.DictionaryOptions;
		import com.android.inputmethod.latin.makedict.FormatSpec.FormatOptions;

		import java.io.File;
		import java.util.HashMap;

		public class BinaryDictUtils {
		// Selector values. NOTE(review): presumably they choose how a test loads a dictionary
		// (byte array vs. byte buffer); they are not used in this file - confirm against callers.
		public static final int USE_BYTE_ARRAY = 1;
		public static final int USE_BYTE_BUFFER = 2;

		// Suffix appended by getDictFile() to the file name of version 2 family dictionaries.
		public static final String TEST_DICT_FILE_EXTENSION = ".testDict";

		// Format options for a version 202 dictionary.
		public static final FormatSpec.FormatOptions STATIC_OPTIONS =
		new FormatSpec.FormatOptions(FormatSpec.VERSION202);
		// Format options for a version 4 dictionary without timestamps.
		public static final FormatSpec.FormatOptions DYNAMIC_OPTIONS_WITHOUT_TIMESTAMP =
		new FormatSpec.FormatOptions(FormatSpec.VERSION4, false /* hasTimestamp */);
		// Format options for a version 4 dictionary with timestamps.
		public static final FormatSpec.FormatOptions DYNAMIC_OPTIONS_WITH_TIMESTAMP =
		new FormatSpec.FormatOptions(FormatSpec.VERSION4, true /* hasTimestamp */);

		/**
		 * Builds the header options of a test dictionary.
		 *
		 * The locale attribute is always "en_US"; the ID and version attributes are taken from the
		 * arguments. When the format options carry a timestamp, the historical-info and
		 * forgetting-curve attributes are additionally set to
		 * DictionaryHeader.ATTRIBUTE_VALUE_TRUE.
		 *
		 * @param id the dictionary ID to store in the attributes.
		 * @param version the dictionary version to store in the attributes.
		 * @param formatOptions the format options; only mHasTimestamp is consulted.
		 * @return the newly created dictionary options.
		 */
		public static DictionaryOptions makeDictionaryOptions(final String id, final String version,
		        final FormatSpec.FormatOptions formatOptions) {
		    final DictionaryOptions dictionaryOptions =
		            new DictionaryOptions(new HashMap<String, String>());
		    dictionaryOptions.mAttributes.put(DictionaryHeader.DICTIONARY_LOCALE_KEY, "en_US");
		    dictionaryOptions.mAttributes.put(DictionaryHeader.DICTIONARY_ID_KEY, id);
		    dictionaryOptions.mAttributes.put(DictionaryHeader.DICTIONARY_VERSION_KEY, version);
		    if (!formatOptions.mHasTimestamp) {
		        // Static dictionaries need nothing more than the three basic attributes.
		        return dictionaryOptions;
		    }
		    dictionaryOptions.mAttributes.put(DictionaryHeader.HAS_HISTORICAL_INFO_KEY,
		            DictionaryHeader.ATTRIBUTE_VALUE_TRUE);
		    dictionaryOptions.mAttributes.put(DictionaryHeader.USES_FORGETTING_CURVE_KEY,
		            DictionaryHeader.ATTRIBUTE_VALUE_TRUE);
		    return dictionaryOptions;
		}

		public static File getDictFile(final String name, final String version,
		final FormatOptions formatOptions, final File directory) {
		if (formatOptions.mVersion == FormatSpec.VERSION2
		\|\| formatOptions.mVersion == FormatSpec.VERSION201
		\|\| formatOptions.mVersion == FormatSpec.VERSION202) {
		return new File(directory, name + "." + version + TEST_DICT_FILE_EXTENSION);
		} else if (formatOptions.mVersion == FormatSpec.VERSION4) {
		return new File(directory, name + "." + version);
		} else {
		throw new RuntimeException("the format option has a wrong version : "
		+ formatOptions.mVersion);
		}
		}

		/**
		 * Creates the encoder matching the version of the given format options.
		 *
		 * For version 4 the given file is used as a directory and is created with mkdir() when it
		 * is not already a directory. For version 202 a Ver2DictEncoder with the code point table
		 * turned off is returned.
		 *
		 * @param file the file (version 202) or directory (version 4) to encode into.
		 * @param formatOptions the format options; only mVersion is consulted.
		 * @return the encoder for the requested version.
		 * @throws RuntimeException if the version is neither VERSION4 nor VERSION202.
		 */
		public static DictEncoder getDictEncoder(final File file, final FormatOptions formatOptions) {
		    final int formatVersion = formatOptions.mVersion;
		    if (formatVersion == FormatSpec.VERSION202) {
		        return new Ver2DictEncoder(file, Ver2DictEncoder.CODE_POINT_TABLE_OFF);
		    }
		    if (formatVersion != FormatSpec.VERSION4) {
		        throw new RuntimeException("The format option has a wrong version : "
		                + formatVersion);
		    }
		    // The result of mkdir() is deliberately not checked here.
		    if (!file.isDirectory()) {
		        file.mkdir();
		    }
		    return new Ver4DictEncoder(file);
		}
		}

tests/src/com/android/inputmethod/latin/makedict/Ver2DictEncoder.java

0 → 100644

+279 −0

Original line number	Diff line number	Diff line
		/*
		* Copyright (C) 2013 The Android Open Source Project
		*
		* Licensed under the Apache License, Version 2.0 (the "License");
		* you may not use this file except in compliance with the License.
		* You may obtain a copy of the License at
		*
		* http://www.apache.org/licenses/LICENSE-2.0
		*
		* Unless required by applicable law or agreed to in writing, software
		* distributed under the License is distributed on an "AS IS" BASIS,
		* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
		* See the License for the specific language governing permissions and
		* limitations under the License.
		*/

		package com.android.inputmethod.latin.makedict;

		import com.android.inputmethod.annotations.UsedForTesting;
		import com.android.inputmethod.latin.makedict.BinaryDictDecoderUtils.CharEncoding;
		import com.android.inputmethod.latin.makedict.BinaryDictEncoderUtils.CodePointTable;
		import com.android.inputmethod.latin.makedict.FormatSpec.FormatOptions;
		import com.android.inputmethod.latin.makedict.FusionDictionary.PtNode;
		import com.android.inputmethod.latin.makedict.FusionDictionary.PtNodeArray;

		import java.io.File;
		import java.io.FileNotFoundException;
		import java.io.FileOutputStream;
		import java.io.IOException;
		import java.io.OutputStream;
		import java.util.ArrayList;
		import java.util.Collections;
		import java.util.Comparator;
		import java.util.HashMap;
		import java.util.Iterator;
		import java.util.Map.Entry;

		/**
		* An implementation of DictEncoder for version 2 binary dictionary.
		*/
		@UsedForTesting
		public class Ver2DictEncoder implements DictEncoder {

		// Destination file; null when the encoder was created from an OutputStream.
		private final File mDictFile;
		// Stream the encoded dictionary is written to; set back to null by close().
		private OutputStream mOutStream;
		// Buffer in which the PtNode arrays are assembled; allocated by writeDictionary().
		private byte[] mBuffer;
		// Current write offset into mBuffer.
		private int mPosition;
		// Either CODE_POINT_TABLE_OFF or CODE_POINT_TABLE_ON.
		private final int mCodePointTableMode;
		public static final int CODE_POINT_TABLE_OFF = 0;
		public static final int CODE_POINT_TABLE_ON = 1;

		/**
		 * Creates an encoder that writes to the given file.
		 *
		 * No stream is opened here; writeDictionary() opens one when needed.
		 *
		 * @param dictFile the file to write the dictionary to.
		 * @param codePointTableMode CODE_POINT_TABLE_ON or CODE_POINT_TABLE_OFF.
		 */
		@UsedForTesting
		public Ver2DictEncoder(final File dictFile, final int codePointTableMode) {
		    this.mDictFile = dictFile;
		    this.mCodePointTableMode = codePointTableMode;
		    this.mOutStream = null;
		    this.mBuffer = null;
		}

		/**
		 * Creates an encoder that writes to an already opened stream, with the code point table
		 * turned off.
		 *
		 * This constructor is used only by BinaryDictOffdeviceUtilsTests. Before using it in
		 * production code, consider keeping the interface consistent with Ver3DictDecoder by going
		 * through a factory.
		 *
		 * @param outStream the stream to write the dictionary to.
		 */
		@UsedForTesting
		public Ver2DictEncoder(final OutputStream outStream) {
		    this.mOutStream = outStream;
		    this.mDictFile = null;
		    this.mCodePointTableMode = CODE_POINT_TABLE_OFF;
		}

		// Opens a FileOutputStream on mDictFile and stores it in mOutStream.
		// NOTE(review): mDictFile is null when the OutputStream constructor was used; this method is
		// only called when mOutStream is null, so confirm that combination cannot occur in practice.
		private void openStream() throws FileNotFoundException {
		mOutStream = new FileOutputStream(mDictFile);
		}

		// Closes the output stream, if any, and forgets it so that it is not closed twice.
		private void close() throws IOException {
		    if (mOutStream == null) {
		        return;
		    }
		    mOutStream.close();
		    mOutStream = null;
		}

		/**
		 * Builds the code point table of a dictionary: the code points used by its words, ordered by
		 * descending number of occurrences (ties broken by descending code point value), together
		 * with a map assigning a one-byte code to the most frequent ones.
		 *
		 * One-byte codes are handed out from FormatSpec.MINIMAL_ONE_BYTE_CHARACTER_VALUE up to
		 * FormatSpec.MAXIMAL_ONE_BYTE_CHARACTER_VALUE; code points beyond that range get no entry in
		 * the map but are still present in the occurrence array.
		 *
		 * Package-private for testing.
		 *
		 * @param dict the dictionary whose words are scanned.
		 * @return the table holding the code point to one-byte code map (used to write the trie) and
		 *         the sorted occurrence array (used to write the header).
		 */
		static CodePointTable makeCodePointTable(final FusionDictionary dict) {
		    final HashMap<Integer, Integer> codePointOccurrenceCounts = new HashMap<>();
		    for (final WordProperty word : dict) {
		        // Store per code point occurrence
		        final String wordString = word.mWord;
		        final int length = wordString.length();
		        // Advance by whole code points so that the low surrogate of a supplementary
		        // character is not counted as a code point of its own.
		        for (int i = 0; i < length; ) {
		            final int codePoint = wordString.codePointAt(i);
		            final Integer count = codePointOccurrenceCounts.get(codePoint);
		            codePointOccurrenceCounts.put(codePoint, count == null ? 1 : count + 1);
		            i += Character.charCount(codePoint);
		        }
		    }
		    final ArrayList<Entry<Integer, Integer>> codePointOccurrenceArray =
		            new ArrayList<>(codePointOccurrenceCounts.entrySet());
		    // Descending order sort by occurrence (value side)
		    Collections.sort(codePointOccurrenceArray, new Comparator<Entry<Integer, Integer>>() {
		        @Override
		        public int compare(final Entry<Integer, Integer> a, final Entry<Integer, Integer> b) {
		            // Compare by value, not by reference: boxed Integers holding the same count are
		            // not guaranteed to be the same object.
		            final int byOccurrence = b.getValue().compareTo(a.getValue());
		            if (byOccurrence != 0) {
		                return byOccurrence;
		            }
		            return b.getKey().compareTo(a.getKey());
		        }
		    });
		    int currentCodePointTableIndex = FormatSpec.MINIMAL_ONE_BYTE_CHARACTER_VALUE;
		    // Temporary map for writing of nodes
		    final HashMap<Integer, Integer> codePointToOneByteCodeMap = new HashMap<>();
		    for (final Entry<Integer, Integer> entry : codePointOccurrenceArray) {
		        // Put a relation from the original code point to the one byte code.
		        codePointToOneByteCodeMap.put(entry.getKey(), currentCodePointTableIndex);
		        if (FormatSpec.MAXIMAL_ONE_BYTE_CHARACTER_VALUE < ++currentCodePointTableIndex) {
		            break;
		        }
		    }
		    // codePointToOneByteCodeMap for writing the trie
		    // codePointOccurrenceArray for writing the header
		    return new CodePointTable(codePointToOneByteCodeMap, codePointOccurrenceArray);
		}

		@Override
		public void writeDictionary(final FusionDictionary dict, final FormatOptions formatOptions)
		throws IOException, UnsupportedFormatException {
		// We no longer support anything but the latest version of v2.
		if (formatOptions.mVersion != FormatSpec.VERSION202) {
		throw new UnsupportedFormatException(
		"The given format options has wrong version number : "
		+ formatOptions.mVersion);
		}

		if (mOutStream == null) {
		openStream();
		}

		// Make code point conversion table ordered by occurrence of code points
		// Version 201 or later have codePointTable
		final CodePointTable codePointTable;
		if (mCodePointTableMode == CODE_POINT_TABLE_OFF \|\| formatOptions.mVersion
		< FormatSpec.MINIMUM_SUPPORTED_VERSION_OF_CODE_POINT_TABLE) {
		codePointTable = new CodePointTable();
		} else {
		codePointTable = makeCodePointTable(dict);
		}

		BinaryDictEncoderUtils.writeDictionaryHeader(mOutStream, dict, formatOptions,
		codePointTable.mCodePointOccurrenceArray);

		// Addresses are limited to 3 bytes, but since addresses can be relative to each node
		// array, the structure itself is not limited to 16MB. However, if it is over 16MB deciding
		// the order of the PtNode arrays becomes a quite complicated problem, because though the
		// dictionary itself does not have a size limit, each node array must still be within 16MB
		// of all its children and parents. As long as this is ensured, the dictionary file may
		// grow to any size.

		// Leave the choice of the optimal node order to the flattenTree function.
		MakedictLog.i("Flattening the tree...");
		ArrayList<PtNodeArray> flatNodes = BinaryDictEncoderUtils.flattenTree(dict.mRootNodeArray);

		MakedictLog.i("Computing addresses...");
		BinaryDictEncoderUtils.computeAddresses(dict, flatNodes,
		codePointTable.mCodePointToOneByteCodeMap);
		MakedictLog.i("Checking PtNode array...");
		if (MakedictLog.DBG) BinaryDictEncoderUtils.checkFlatPtNodeArrayList(flatNodes);

		// Create a buffer that matches the final dictionary size.
		final PtNodeArray lastNodeArray = flatNodes.get(flatNodes.size() - 1);
		final int bufferSize = lastNodeArray.mCachedAddressAfterUpdate + lastNodeArray.mCachedSize;
		mBuffer = new byte[bufferSize];

		MakedictLog.i("Writing file...");

		for (PtNodeArray nodeArray : flatNodes) {
		BinaryDictEncoderUtils.writePlacedPtNodeArray(dict, this, nodeArray,
		codePointTable.mCodePointToOneByteCodeMap);
		}
		if (MakedictLog.DBG) BinaryDictEncoderUtils.showStatistics(flatNodes);
		mOutStream.write(mBuffer, 0, mPosition);

		MakedictLog.i("Done");
		close();
		}

		@Override
		public void setPosition(final int position) {
		if (mBuffer == null \|\| position < 0 \|\| position >= mBuffer.length) return;
		mPosition = position;
		}

		// Returns the current write offset into the buffer.
		@Override
		public int getPosition() {
		return mPosition;
		}

		@Override
		public void writePtNodeCount(final int ptNodeCount) {
		final int countSize = BinaryDictIOUtils.getPtNodeCountSize(ptNodeCount);
		if (countSize != 1 && countSize != 2) {
		throw new RuntimeException("Strange size from getGroupCountSize : " + countSize);
		}
		final int encodedPtNodeCount = (countSize == 2) ?
		(ptNodeCount \| FormatSpec.LARGE_PTNODE_ARRAY_SIZE_FIELD_SIZE_FLAG) : ptNodeCount;
		mPosition = BinaryDictEncoderUtils.writeUIntToBuffer(mBuffer, mPosition, encodedPtNodeCount,
		countSize);
		}

		// Writes the flags of the given PtNode at the current position and moves the position to the
		// value returned by writeUIntToBuffer. The flags are derived from the node and from the
		// position of its children.
		private void writePtNodeFlags(final PtNode ptNode,
		        final HashMap<Integer, Integer> codePointToOneByteCodeMap) {
		    final int childrenPosition =
		            BinaryDictEncoderUtils.getChildrenPosition(ptNode, codePointToOneByteCodeMap);
		    final int flags = BinaryDictEncoderUtils.makePtNodeFlags(ptNode, childrenPosition);
		    mPosition = BinaryDictEncoderUtils.writeUIntToBuffer(
		            mBuffer, mPosition, flags, FormatSpec.PTNODE_FLAGS_SIZE);
		}

		// Writes the code points of a PtNode at the current position, followed by
		// FormatSpec.PTNODE_CHARACTERS_TERMINATOR when the node holds several characters.
		private void writeCharacters(final int[] codePoints, final boolean hasSeveralChars,
		        final HashMap<Integer, Integer> codePointToOneByteCodeMap) {
		    mPosition = CharEncoding.writeCharArray(
		            codePoints, mBuffer, mPosition, codePointToOneByteCodeMap);
		    if (!hasSeveralChars) {
		        // A single character needs no terminator.
		        return;
		    }
		    mBuffer[mPosition++] = FormatSpec.PTNODE_CHARACTERS_TERMINATOR;
		}

		// Writes the frequency at the current position; a negative frequency writes nothing.
		private void writeFrequency(final int frequency) {
		    if (frequency < 0) {
		        return;
		    }
		    mPosition = BinaryDictEncoderUtils.writeUIntToBuffer(
		            mBuffer, mPosition, frequency, FormatSpec.PTNODE_FREQUENCY_SIZE);
		}

		// Writes the position of the children of the given PtNode at the current position and
		// advances the position by the value BinaryDictEncoderUtils.writeChildrenPosition returns.
		private void writeChildrenPosition(final PtNode ptNode,
		        final HashMap<Integer, Integer> codePointToOneByteCodeMap) {
		    mPosition += BinaryDictEncoderUtils.writeChildrenPosition(mBuffer, mPosition,
		            BinaryDictEncoderUtils.getChildrenPosition(ptNode, codePointToOneByteCodeMap));
		}

		/**
		 * Writes a list of bigram attributes to mBuffer, starting at the current position.
		 *
		 * Each bigram is written as a flags field followed by the absolute value of the offset
		 * between the target word and the end of that flags field.
		 *
		 * @param bigrams the bigram attributes list; nothing is written when it is null.
		 * @param dict the dictionary the node array is a part of (for relative offsets).
		 */
		private void writeBigrams(final ArrayList<WeightedString> bigrams,
		        final FusionDictionary dict) {
		    if (bigrams == null) {
		        return;
		    }

		    final int bigramCount = bigrams.size();
		    for (int index = 0; index < bigramCount; ++index) {
		        final WeightedString bigram = bigrams.get(index);
		        // The flags record whether another bigram follows this one.
		        final boolean hasMoreBigrams = index + 1 < bigramCount;
		        final PtNode targetNode =
		                FusionDictionary.findWordInTree(dict.mRootNodeArray, bigram.mWord);
		        final int targetAddress = targetNode.mCachedAddressAfterUpdate;
		        final int targetUnigramFrequency = targetNode.getProbability();
		        // The offset is relative to the byte right after the flags field.
		        final int offset =
		                targetAddress - (mPosition + FormatSpec.PTNODE_ATTRIBUTE_FLAGS_SIZE);
		        final int bigramFlags = BinaryDictEncoderUtils.makeBigramFlags(hasMoreBigrams,
		                offset, bigram.getProbability(), targetUnigramFrequency, bigram.mWord);
		        mPosition = BinaryDictEncoderUtils.writeUIntToBuffer(mBuffer, mPosition, bigramFlags,
		                FormatSpec.PTNODE_ATTRIBUTE_FLAGS_SIZE);
		        mPosition += BinaryDictEncoderUtils.writeChildrenPosition(mBuffer, mPosition,
		                Math.abs(offset));
		    }
		}

		// Writes one PtNode at the current position. The fields are emitted in this fixed order:
		// flags, characters, frequency, children position, bigrams. Each helper advances mPosition,
		// so the order of the calls must not be changed.
		@Override
		public void writePtNode(final PtNode ptNode, final FusionDictionary dict,
		final HashMap<Integer, Integer> codePointToOneByteCodeMap) {
		writePtNodeFlags(ptNode, codePointToOneByteCodeMap);
		writeCharacters(ptNode.mChars, ptNode.hasSeveralChars(), codePointToOneByteCodeMap);
		// writeFrequency() writes nothing when the probability is negative.
		writeFrequency(ptNode.getProbability());
		writeChildrenPosition(ptNode, codePointToOneByteCodeMap);
		// writeBigrams() writes nothing when the node has no bigram list.
		writeBigrams(ptNode.mBigrams, dict);
		}
		}

tests/src/com/android/inputmethod/latin/makedict/Ver4DictEncoder.java

0 → 100644

+133 −0

Original line number	Diff line number	Diff line
		/*
		* Copyright (C) 2013 The Android Open Source Project
		*
		* Licensed under the Apache License, Version 2.0 (the "License");
		* you may not use this file except in compliance with the License.
		* You may obtain a copy of the License at
		*
		* http://www.apache.org/licenses/LICENSE-2.0
		*
		* Unless required by applicable law or agreed to in writing, software
		* distributed under the License is distributed on an "AS IS" BASIS,
		* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
		* See the License for the specific language governing permissions and
		* limitations under the License.
		*/

		package com.android.inputmethod.latin.makedict;

		import com.android.inputmethod.annotations.UsedForTesting;
		import com.android.inputmethod.latin.BinaryDictionary;
		import com.android.inputmethod.latin.Dictionary;
		import com.android.inputmethod.latin.NgramContext;
		import com.android.inputmethod.latin.common.LocaleUtils;
		import com.android.inputmethod.latin.makedict.FormatSpec.FormatOptions;
		import com.android.inputmethod.latin.makedict.FusionDictionary.PtNode;
		import com.android.inputmethod.latin.utils.BinaryDictionaryUtils;

		import java.io.File;
		import java.io.IOException;
		import java.util.HashMap;

		/**
		* An implementation of DictEncoder for version 4 binary dictionary.
		*/
		@UsedForTesting
		public class Ver4DictEncoder implements DictEncoder {
		// Directory the version 4 dictionary is created in; writeDictionary() requires it to exist.
		private final File mDictPlacedDir;

		/**
		 * Creates an encoder that places the dictionary in the given directory.
		 *
		 * @param dictPlacedDir the directory to create the dictionary in; it is only stored here
		 *        and is checked when writeDictionary() is called.
		 */
		@UsedForTesting
		public Ver4DictEncoder(final File dictPlacedDir) {
		    this.mDictPlacedDir = dictPlacedDir;
		}

		// TODO: This builds a FusionDictionary first and iterates it to add words to the binary
		// dictionary. However, it is possible to just add words directly to the binary dictionary
		// instead.
		// In the long run, when we stop supporting version 2, FusionDictionary will become deprecated
		// and we can remove it. Then we'll be able to just call BinaryDictionary directly.
		/**
		 * Creates an empty version 4 dictionary in the directory given at construction time and
		 * fills it with the unigrams and bigrams of the given dictionary.
		 *
		 * Once the binary dictionary has been opened, failures to add an n-gram entry or to flush
		 * are logged and make this method return early without throwing; a failure to add a
		 * unigram entry is only logged. The binary dictionary is closed in every case.
		 *
		 * @param dict the dictionary whose words are written.
		 * @param formatOptions the format options; the version must be FormatSpec.VERSION4.
		 * @throws IOException if the dictionary file cannot be created or is not valid.
		 * @throws UnsupportedFormatException if the version is not VERSION4 or the target path is
		 *         not a directory.
		 */
		@Override
		public void writeDictionary(FusionDictionary dict, FormatOptions formatOptions)
		        throws IOException, UnsupportedFormatException {
		    if (formatOptions.mVersion != FormatSpec.VERSION4) {
		        throw new UnsupportedFormatException("File header has a wrong version number : "
		                + formatOptions.mVersion);
		    }
		    if (!mDictPlacedDir.isDirectory()) {
		        throw new UnsupportedFormatException("Given path is not a directory.");
		    }
		    final String dictPath = mDictPlacedDir.getAbsolutePath();
		    final String localeString =
		            dict.mOptions.mAttributes.get(DictionaryHeader.DICTIONARY_LOCALE_KEY);
		    if (!BinaryDictionaryUtils.createEmptyDictFile(dictPath,
		            FormatSpec.VERSION4, LocaleUtils.constructLocaleFromString(localeString),
		            dict.mOptions.mAttributes)) {
		        throw new IOException("Cannot create dictionary file : " + dictPath);
		    }
		    final BinaryDictionary binaryDict = new BinaryDictionary(dictPath,
		            0L, mDictPlacedDir.length(), true /* useFullEditDistance */,
		            LocaleUtils.constructLocaleFromString(localeString),
		            Dictionary.TYPE_USER /* Dictionary type. Does not matter for us */,
		            true /* isUpdatable */);
		    if (!binaryDict.isValidDictionary()) {
		        // Somehow createEmptyDictFile returned true, but the file was not created correctly
		        throw new IOException("Cannot create dictionary file");
		    }
		    try {
		        for (final WordProperty wordProperty : dict) {
		            if (!binaryDict.addUnigramEntry(wordProperty.mWord, wordProperty.getProbability(),
		                    wordProperty.mIsBeginningOfSentence, wordProperty.mIsNotAWord,
		                    wordProperty.mIsPossiblyOffensive, 0 /* timestamp */)) {
		                MakedictLog.e("Cannot add unigram entry for " + wordProperty.mWord);
		            }
		            if (!flushIfGcNeeded(binaryDict)) {
		                return;
		            }
		        }
		        for (final WordProperty word0Property : dict) {
		            if (!word0Property.mHasNgrams) continue;
		            // TODO: Support ngram.
		            for (final WeightedString word1 : word0Property.getBigrams()) {
		                final NgramContext ngramContext =
		                        new NgramContext(new NgramContext.WordInfo(word0Property.mWord));
		                if (!binaryDict.addNgramEntry(ngramContext, word1.mWord,
		                        word1.getProbability(), 0 /* timestamp */)) {
		                    MakedictLog.e("Cannot add n-gram entry for "
		                            + ngramContext + " -> " + word1.mWord);
		                    return;
		                }
		                if (!flushIfGcNeeded(binaryDict)) {
		                    return;
		                }
		            }
		        }
		        if (!binaryDict.flushWithGC()) {
		            MakedictLog.e("Cannot flush dict with GC.");
		        }
		    } finally {
		        // Close on every path, including the early returns above and exceptions, so that
		        // the dictionary is not left open.
		        binaryDict.close();
		    }
		}

		// Flushes the dictionary with GC when it reports that GC is needed. Returns false, after
		// logging, only when such a flush was attempted and failed.
		private static boolean flushIfGcNeeded(final BinaryDictionary binaryDict) {
		    if (binaryDict.needsToRunGC(true /* mindsBlockByGC */) && !binaryDict.flushWithGC()) {
		        MakedictLog.e("Cannot flush dict with GC.");
		        return false;
		    }
		    return true;
		}

		// Intentionally empty: this encoder keeps no buffer position, writeDictionary() goes through
		// BinaryDictionary instead.
		@Override
		public void setPosition(int position) {
		}

		// This encoder keeps no buffer position, so this always returns 0.
		@Override
		public int getPosition() {
		return 0;
		}

		// Intentionally empty: writeDictionary() adds words through BinaryDictionary and never
		// writes PtNode arrays itself.
		@Override
		public void writePtNodeCount(int ptNodeCount) {
		}

		// Intentionally empty: writeDictionary() adds words through BinaryDictionary and never
		// writes PtNodes itself.
		@Override
		public void writePtNode(PtNode ptNode, FusionDictionary dict,
		HashMap<Integer, Integer> codePointToOneByteCodeMap) {
		}
		}