Loading java/src/com/android/inputmethod/latin/makedict/DictionaryHeader.java +1 −0 Original line number Diff line number Diff line Loading @@ -47,6 +47,7 @@ public final class DictionaryHeader { public static final String MAX_UNIGRAM_COUNT_KEY = "MAX_UNIGRAM_COUNT"; public static final String MAX_BIGRAM_COUNT_KEY = "MAX_BIGRAM_COUNT"; public static final String ATTRIBUTE_VALUE_TRUE = "1"; public static final String CODE_POINT_TABLE_KEY = "codePointTable"; public DictionaryHeader(final int headerSize, final DictionaryOptions dictionaryOptions, final FormatOptions formatOptions) throws UnsupportedFormatException { Loading java/src/com/android/inputmethod/latin/makedict/FormatSpec.java +2 −0 Original line number Diff line number Diff line Loading @@ -237,6 +237,8 @@ public final class FormatSpec { static final int UINT16_MAX = 0xFFFF; static final int UINT24_MAX = 0xFFFFFF; static final int MSB8 = 0x80; static final int MINIMAL_ONE_BYTE_CHARACTER_VALUE = 0x20; static final int MAXIMAL_ONE_BYTE_CHARACTER_VALUE = 0xFF; /** * Options about file format. Loading tests/src/com/android/inputmethod/latin/makedict/BinaryDictEncoderUtils.java +19 −1 Original line number Diff line number Diff line Loading @@ -27,6 +27,8 @@ import java.io.ByteArrayOutputStream; import java.io.IOException; import java.io.OutputStream; import java.util.ArrayList; import java.util.HashMap; import java.util.Map.Entry; /** * Encodes binary files for a FusionDictionary. Loading Loading @@ -791,10 +793,12 @@ public class BinaryDictEncoderUtils { * @param destination the stream to write the file header to. * @param dict the dictionary to write. * @param formatOptions file format options. * @param codePointOccurrenceArray code points ordered by occurrence count. * @return the size of the header. */ /* package */ static int writeDictionaryHeader(final OutputStream destination, final FusionDictionary dict, final FormatOptions formatOptions) final FusionDictionary dict, final FormatOptions formatOptions, final ArrayList<Entry<Integer, Integer>> codePointOccurrenceArray) throws IOException, UnsupportedFormatException { final int version = formatOptions.mVersion; if (version < FormatSpec.MINIMUM_SUPPORTED_VERSION Loading Loading @@ -833,6 +837,9 @@ public class BinaryDictEncoderUtils { CharEncoding.writeString(headerBuffer, key); CharEncoding.writeString(headerBuffer, value); } // TODO: Write out the code point table. final int size = headerBuffer.size(); final byte[] bytes = headerBuffer.toByteArray(); // Write out the header size. Loading @@ -845,4 +852,15 @@ public class BinaryDictEncoderUtils { headerBuffer.close(); return size; } static final class CodePointTable { final HashMap<Integer, Integer> mCodePointToOneByteCodeMap; final ArrayList<Entry<Integer, Integer>> mCodePointOccurrenceArray; CodePointTable(final HashMap<Integer, Integer> codePointToOneByteCodeMap, final ArrayList<Entry<Integer, Integer>> codePointOccurrenceArray) { mCodePointToOneByteCodeMap = codePointToOneByteCodeMap; mCodePointOccurrenceArray = codePointOccurrenceArray; } } } tests/src/com/android/inputmethod/latin/makedict/Ver2DictEncoder.java +51 −1 Original line number Diff line number Diff line Loading @@ -18,6 +18,7 @@ package com.android.inputmethod.latin.makedict; import com.android.inputmethod.annotations.UsedForTesting; import com.android.inputmethod.latin.makedict.BinaryDictDecoderUtils.CharEncoding; import com.android.inputmethod.latin.makedict.BinaryDictEncoderUtils.CodePointTable; import com.android.inputmethod.latin.makedict.FormatSpec.FormatOptions; import com.android.inputmethod.latin.makedict.FusionDictionary.PtNode; import com.android.inputmethod.latin.makedict.FusionDictionary.PtNodeArray; Loading @@ -28,7 +29,11 @@ import java.io.FileOutputStream; import java.io.IOException; import java.io.OutputStream; import java.util.ArrayList; import java.util.Collections; import java.util.Comparator; import java.util.HashMap; import java.util.Iterator; import java.util.Map.Entry; /** * An implementation of DictEncoder for version 2 binary dictionary. Loading Loading @@ -73,6 +78,46 @@ public class Ver2DictEncoder implements DictEncoder { } } // Package for testing static CodePointTable makeCodePointTable(final FusionDictionary dict) { final HashMap<Integer, Integer> codePointOccurrenceCounts = new HashMap<>(); for (final WordProperty word : dict) { // Store per code point occurrence final String wordString = word.mWord; for (int i = 0; i < wordString.length(); ++i) { final int codePoint = Character.codePointAt(wordString, i); if (codePointOccurrenceCounts.containsKey(codePoint)) { codePointOccurrenceCounts.put(codePoint, codePointOccurrenceCounts.get(codePoint) + 1); } else { codePointOccurrenceCounts.put(codePoint, 1); } } } final ArrayList<Entry<Integer, Integer>> codePointOccurrenceArray = new ArrayList<>(codePointOccurrenceCounts.entrySet()); // Descending order sort by occurrence (value side) Collections.sort(codePointOccurrenceArray, new Comparator<Entry<Integer, Integer>>() { @Override public int compare(final Entry<Integer, Integer> a, final Entry<Integer, Integer> b) { return b.getValue().compareTo(a.getValue()); } }); int currentCodePointTableIndex = FormatSpec.MINIMAL_ONE_BYTE_CHARACTER_VALUE; // Temporary map for writing of nodes final HashMap<Integer, Integer> codePointToOneByteCodeMap = new HashMap<>(); for (final Entry<Integer, Integer> entry : codePointOccurrenceArray) { // Put a relation from the original code point to the one byte code. codePointToOneByteCodeMap.put(entry.getKey(), currentCodePointTableIndex); if (FormatSpec.MAXIMAL_ONE_BYTE_CHARACTER_VALUE < ++currentCodePointTableIndex) { break; } } // codePointToOneByteCodeMap for writing the trie // codePointOccurrenceArray for writing the header return new CodePointTable(codePointToOneByteCodeMap, codePointOccurrenceArray); } @Override public void writeDictionary(final FusionDictionary dict, final FormatOptions formatOptions) throws IOException, UnsupportedFormatException { Loading @@ -85,7 +130,12 @@ public class Ver2DictEncoder implements DictEncoder { if (mOutStream == null) { openStream(); } BinaryDictEncoderUtils.writeDictionaryHeader(mOutStream, dict, formatOptions); // Make code point conversion table ordered by occurrence of code points final CodePointTable codePointTable = makeCodePointTable(dict); BinaryDictEncoderUtils.writeDictionaryHeader(mOutStream, dict, formatOptions, codePointTable.mCodePointOccurrenceArray); // Addresses are limited to 3 bytes, but since addresses can be relative to each node // array, the structure itself is not limited to 16MB. However, if it is over 16MB deciding Loading tests/src/com/android/inputmethod/latin/makedict/Ver2DictEncoderTests.java 0 → 100644 +91 −0 Original line number Diff line number Diff line /* * Copyright (C) 2014 The Android Open Source Project * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.android.inputmethod.latin.makedict; import java.util.ArrayList; import java.util.Arrays; import java.util.HashMap; import java.util.List; import java.util.Map.Entry; import com.android.inputmethod.latin.makedict.BinaryDictEncoderUtils.CodePointTable; import com.android.inputmethod.latin.makedict.FusionDictionary.PtNodeArray; import android.test.AndroidTestCase; import android.test.suitebuilder.annotation.LargeTest; import android.util.Log; /** * Unit tests for Ver2DictEncoder */ @LargeTest public class Ver2DictEncoderTests extends AndroidTestCase { private static final String TAG = Ver2DictEncoderTests.class.getSimpleName(); private static final int UNIGRAM_FREQ = 10; public void testCodePointTable() { final String[] wordSource = {"words", "used", "for", "testing", "a", "code point", "table"}; final List<String> words = Arrays.asList(wordSource); final String correctCodePointTable = "eotdsanirfg bclwup"; final String correctCodePointOccurrenceArrayString = "10141164111411531003110297210521142103111911171108198199132111211021"; final String correctCodePointExpectedMapString = "323433363538373940494147454644424348"; final String dictName = "codePointTableTest"; final String dictVersion = Long.toString(System.currentTimeMillis()); final FormatSpec.FormatOptions formatOptions = new FormatSpec.FormatOptions(FormatSpec.VERSION2); final FusionDictionary sourcedict = new FusionDictionary(new PtNodeArray(), BinaryDictUtils.makeDictionaryOptions(dictName, dictVersion, formatOptions)); addUnigrams(sourcedict, words, null /* shortcutMap */); final CodePointTable codePointTable = Ver2DictEncoder.makeCodePointTable(sourcedict); // Check if mCodePointOccurrenceArray is correct final StringBuilder codePointOccurrenceArrayString = new StringBuilder(); for (Entry<Integer, Integer> entry : codePointTable.mCodePointOccurrenceArray) { codePointOccurrenceArrayString.append(entry.getKey()); codePointOccurrenceArrayString.append(entry.getValue()); } assertEquals(codePointOccurrenceArrayString.toString(), correctCodePointOccurrenceArrayString); // Check if mCodePointToOneByteCodeMap is correct final StringBuilder codePointExpectedMapString = new StringBuilder(); for (int i = 0; i < correctCodePointTable.length(); ++i) { codePointExpectedMapString.append(codePointTable.mCodePointToOneByteCodeMap.get( correctCodePointTable.codePointAt(i))); } assertEquals(codePointExpectedMapString.toString(), correctCodePointExpectedMapString); } /** * Adds unigrams to the dictionary. */ private void addUnigrams(final FusionDictionary dict, final List<String> words, final HashMap<String, List<String>> shortcutMap) { for (final String word : words) { final ArrayList<WeightedString> shortcuts = new ArrayList<>(); if (shortcutMap != null && shortcutMap.containsKey(word)) { for (final String shortcut : shortcutMap.get(word)) { shortcuts.add(new WeightedString(shortcut, UNIGRAM_FREQ)); } } dict.add(word, new ProbabilityInfo(UNIGRAM_FREQ), (shortcutMap == null) ? null : shortcuts, false /* isNotAWord */); } } } Loading
java/src/com/android/inputmethod/latin/makedict/DictionaryHeader.java +1 −0 Original line number Diff line number Diff line Loading @@ -47,6 +47,7 @@ public final class DictionaryHeader { public static final String MAX_UNIGRAM_COUNT_KEY = "MAX_UNIGRAM_COUNT"; public static final String MAX_BIGRAM_COUNT_KEY = "MAX_BIGRAM_COUNT"; public static final String ATTRIBUTE_VALUE_TRUE = "1"; public static final String CODE_POINT_TABLE_KEY = "codePointTable"; public DictionaryHeader(final int headerSize, final DictionaryOptions dictionaryOptions, final FormatOptions formatOptions) throws UnsupportedFormatException { Loading
java/src/com/android/inputmethod/latin/makedict/FormatSpec.java +2 −0 Original line number Diff line number Diff line Loading @@ -237,6 +237,8 @@ public final class FormatSpec { static final int UINT16_MAX = 0xFFFF; static final int UINT24_MAX = 0xFFFFFF; static final int MSB8 = 0x80; static final int MINIMAL_ONE_BYTE_CHARACTER_VALUE = 0x20; static final int MAXIMAL_ONE_BYTE_CHARACTER_VALUE = 0xFF; /** * Options about file format. Loading
tests/src/com/android/inputmethod/latin/makedict/BinaryDictEncoderUtils.java +19 −1 Original line number Diff line number Diff line Loading @@ -27,6 +27,8 @@ import java.io.ByteArrayOutputStream; import java.io.IOException; import java.io.OutputStream; import java.util.ArrayList; import java.util.HashMap; import java.util.Map.Entry; /** * Encodes binary files for a FusionDictionary. Loading Loading @@ -791,10 +793,12 @@ public class BinaryDictEncoderUtils { * @param destination the stream to write the file header to. * @param dict the dictionary to write. * @param formatOptions file format options. * @param codePointOccurrenceArray code points ordered by occurrence count. * @return the size of the header. */ /* package */ static int writeDictionaryHeader(final OutputStream destination, final FusionDictionary dict, final FormatOptions formatOptions) final FusionDictionary dict, final FormatOptions formatOptions, final ArrayList<Entry<Integer, Integer>> codePointOccurrenceArray) throws IOException, UnsupportedFormatException { final int version = formatOptions.mVersion; if (version < FormatSpec.MINIMUM_SUPPORTED_VERSION Loading Loading @@ -833,6 +837,9 @@ public class BinaryDictEncoderUtils { CharEncoding.writeString(headerBuffer, key); CharEncoding.writeString(headerBuffer, value); } // TODO: Write out the code point table. final int size = headerBuffer.size(); final byte[] bytes = headerBuffer.toByteArray(); // Write out the header size. Loading @@ -845,4 +852,15 @@ public class BinaryDictEncoderUtils { headerBuffer.close(); return size; } static final class CodePointTable { final HashMap<Integer, Integer> mCodePointToOneByteCodeMap; final ArrayList<Entry<Integer, Integer>> mCodePointOccurrenceArray; CodePointTable(final HashMap<Integer, Integer> codePointToOneByteCodeMap, final ArrayList<Entry<Integer, Integer>> codePointOccurrenceArray) { mCodePointToOneByteCodeMap = codePointToOneByteCodeMap; mCodePointOccurrenceArray = codePointOccurrenceArray; } } }
tests/src/com/android/inputmethod/latin/makedict/Ver2DictEncoder.java +51 −1 Original line number Diff line number Diff line Loading @@ -18,6 +18,7 @@ package com.android.inputmethod.latin.makedict; import com.android.inputmethod.annotations.UsedForTesting; import com.android.inputmethod.latin.makedict.BinaryDictDecoderUtils.CharEncoding; import com.android.inputmethod.latin.makedict.BinaryDictEncoderUtils.CodePointTable; import com.android.inputmethod.latin.makedict.FormatSpec.FormatOptions; import com.android.inputmethod.latin.makedict.FusionDictionary.PtNode; import com.android.inputmethod.latin.makedict.FusionDictionary.PtNodeArray; Loading @@ -28,7 +29,11 @@ import java.io.FileOutputStream; import java.io.IOException; import java.io.OutputStream; import java.util.ArrayList; import java.util.Collections; import java.util.Comparator; import java.util.HashMap; import java.util.Iterator; import java.util.Map.Entry; /** * An implementation of DictEncoder for version 2 binary dictionary. Loading Loading @@ -73,6 +78,46 @@ public class Ver2DictEncoder implements DictEncoder { } } // Package for testing static CodePointTable makeCodePointTable(final FusionDictionary dict) { final HashMap<Integer, Integer> codePointOccurrenceCounts = new HashMap<>(); for (final WordProperty word : dict) { // Store per code point occurrence final String wordString = word.mWord; for (int i = 0; i < wordString.length(); ++i) { final int codePoint = Character.codePointAt(wordString, i); if (codePointOccurrenceCounts.containsKey(codePoint)) { codePointOccurrenceCounts.put(codePoint, codePointOccurrenceCounts.get(codePoint) + 1); } else { codePointOccurrenceCounts.put(codePoint, 1); } } } final ArrayList<Entry<Integer, Integer>> codePointOccurrenceArray = new ArrayList<>(codePointOccurrenceCounts.entrySet()); // Descending order sort by occurrence (value side) Collections.sort(codePointOccurrenceArray, new Comparator<Entry<Integer, Integer>>() { @Override public int compare(final Entry<Integer, Integer> a, final Entry<Integer, Integer> b) { return b.getValue().compareTo(a.getValue()); } }); int currentCodePointTableIndex = FormatSpec.MINIMAL_ONE_BYTE_CHARACTER_VALUE; // Temporary map for writing of nodes final HashMap<Integer, Integer> codePointToOneByteCodeMap = new HashMap<>(); for (final Entry<Integer, Integer> entry : codePointOccurrenceArray) { // Put a relation from the original code point to the one byte code. codePointToOneByteCodeMap.put(entry.getKey(), currentCodePointTableIndex); if (FormatSpec.MAXIMAL_ONE_BYTE_CHARACTER_VALUE < ++currentCodePointTableIndex) { break; } } // codePointToOneByteCodeMap for writing the trie // codePointOccurrenceArray for writing the header return new CodePointTable(codePointToOneByteCodeMap, codePointOccurrenceArray); } @Override public void writeDictionary(final FusionDictionary dict, final FormatOptions formatOptions) throws IOException, UnsupportedFormatException { Loading @@ -85,7 +130,12 @@ public class Ver2DictEncoder implements DictEncoder { if (mOutStream == null) { openStream(); } BinaryDictEncoderUtils.writeDictionaryHeader(mOutStream, dict, formatOptions); // Make code point conversion table ordered by occurrence of code points final CodePointTable codePointTable = makeCodePointTable(dict); BinaryDictEncoderUtils.writeDictionaryHeader(mOutStream, dict, formatOptions, codePointTable.mCodePointOccurrenceArray); // Addresses are limited to 3 bytes, but since addresses can be relative to each node // array, the structure itself is not limited to 16MB. However, if it is over 16MB deciding Loading
tests/src/com/android/inputmethod/latin/makedict/Ver2DictEncoderTests.java 0 → 100644 +91 −0 Original line number Diff line number Diff line /* * Copyright (C) 2014 The Android Open Source Project * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.android.inputmethod.latin.makedict; import java.util.ArrayList; import java.util.Arrays; import java.util.HashMap; import java.util.List; import java.util.Map.Entry; import com.android.inputmethod.latin.makedict.BinaryDictEncoderUtils.CodePointTable; import com.android.inputmethod.latin.makedict.FusionDictionary.PtNodeArray; import android.test.AndroidTestCase; import android.test.suitebuilder.annotation.LargeTest; import android.util.Log; /** * Unit tests for Ver2DictEncoder */ @LargeTest public class Ver2DictEncoderTests extends AndroidTestCase { private static final String TAG = Ver2DictEncoderTests.class.getSimpleName(); private static final int UNIGRAM_FREQ = 10; public void testCodePointTable() { final String[] wordSource = {"words", "used", "for", "testing", "a", "code point", "table"}; final List<String> words = Arrays.asList(wordSource); final String correctCodePointTable = "eotdsanirfg bclwup"; final String correctCodePointOccurrenceArrayString = "10141164111411531003110297210521142103111911171108198199132111211021"; final String correctCodePointExpectedMapString = "323433363538373940494147454644424348"; final String dictName = "codePointTableTest"; final String dictVersion = Long.toString(System.currentTimeMillis()); final FormatSpec.FormatOptions formatOptions = new FormatSpec.FormatOptions(FormatSpec.VERSION2); final FusionDictionary sourcedict = new FusionDictionary(new PtNodeArray(), BinaryDictUtils.makeDictionaryOptions(dictName, dictVersion, formatOptions)); addUnigrams(sourcedict, words, null /* shortcutMap */); final CodePointTable codePointTable = Ver2DictEncoder.makeCodePointTable(sourcedict); // Check if mCodePointOccurrenceArray is correct final StringBuilder codePointOccurrenceArrayString = new StringBuilder(); for (Entry<Integer, Integer> entry : codePointTable.mCodePointOccurrenceArray) { codePointOccurrenceArrayString.append(entry.getKey()); codePointOccurrenceArrayString.append(entry.getValue()); } assertEquals(codePointOccurrenceArrayString.toString(), correctCodePointOccurrenceArrayString); // Check if mCodePointToOneByteCodeMap is correct final StringBuilder codePointExpectedMapString = new StringBuilder(); for (int i = 0; i < correctCodePointTable.length(); ++i) { codePointExpectedMapString.append(codePointTable.mCodePointToOneByteCodeMap.get( correctCodePointTable.codePointAt(i))); } assertEquals(codePointExpectedMapString.toString(), correctCodePointExpectedMapString); } /** * Adds unigrams to the dictionary. */ private void addUnigrams(final FusionDictionary dict, final List<String> words, final HashMap<String, List<String>> shortcutMap) { for (final String word : words) { final ArrayList<WeightedString> shortcuts = new ArrayList<>(); if (shortcutMap != null && shortcutMap.containsKey(word)) { for (final String shortcut : shortcutMap.get(word)) { shortcuts.add(new WeightedString(shortcut, UNIGRAM_FREQ)); } } dict.add(word, new ProbabilityInfo(UNIGRAM_FREQ), (shortcutMap == null) ? null : shortcuts, false /* isNotAWord */); } } }