Loading java/src/com/android/inputmethod/latin/makedict/FormatSpec.java +3 −1 Original line number Diff line number Diff line Loading @@ -163,13 +163,15 @@ public final class FormatSpec { static final int NOT_A_VERSION_NUMBER = -1; // These MUST have the same values as the relevant constants in format_utils.h. // From version 4 on, we use version * 100 + revision as a version number. That allows // From version 2.01 on, we use version * 100 + revision as a version number. That allows // us to change the format during development while having testing devices remove // older files with each upgrade, while still having a readable versioning scheme. // When we bump up the dictionary format version, we should update // ExpandableDictionary.needsToMigrateDictionary() and // ExpandableDictionary.matchesExpectedBinaryDictFormatVersionForThisType(). public static final int VERSION2 = 2; public static final int VERSION201 = 201; public static final int MINIMUM_SUPPORTED_VERSION_OF_CODE_POINT_TABLE = VERSION201; // Dictionary version used for testing. public static final int VERSION4_ONLY_FOR_TESTING = 399; public static final int VERSION401 = 401; Loading tests/src/com/android/inputmethod/latin/makedict/BinaryDictDecoderEncoderTests.java +1 −1 Original line number Diff line number Diff line Loading @@ -312,7 +312,7 @@ public class BinaryDictDecoderEncoderTests extends AndroidTestCase { final DictBuffer dictBuffer = new ByteArrayDictBuffer(buffer); for (final String word : sWords) { Arrays.fill(buffer, (byte) 0); CharEncoding.writeString(buffer, 0, word); CharEncoding.writeString(buffer, 0, word, null); dictBuffer.position(0); final String str = CharEncoding.readString(dictBuffer); assertEquals(word, str); Loading tests/src/com/android/inputmethod/latin/makedict/BinaryDictDecoderUtils.java +40 −18 Original line number Diff line number Diff line Loading @@ -17,11 +17,11 @@ package com.android.inputmethod.latin.makedict; import com.android.inputmethod.annotations.UsedForTesting; import java.io.File; import java.io.IOException; import java.io.OutputStream; import java.nio.ByteBuffer; import java.util.HashMap; /** * Decodes binary files for a FusionDictionary. Loading Loading @@ -109,15 +109,19 @@ public final class BinaryDictDecoderUtils { * A class grouping utility function for our specific character encoding. */ static final class CharEncoding { private static final int MINIMAL_ONE_BYTE_CHARACTER_VALUE = 0x20; private static final int MAXIMAL_ONE_BYTE_CHARACTER_VALUE = 0xFF; /** * Helper method to find out whether this code fits on one byte */ private static boolean fitsOnOneByte(final int character) { return character >= MINIMAL_ONE_BYTE_CHARACTER_VALUE && character <= MAXIMAL_ONE_BYTE_CHARACTER_VALUE; private static boolean fitsOnOneByte(int character, final HashMap<Integer, Integer> codePointToOneByteCodeMap) { if (codePointToOneByteCodeMap != null) { if (codePointToOneByteCodeMap.containsKey(character)) { character = codePointToOneByteCodeMap.get(character); } } return character >= FormatSpec.MINIMAL_ONE_BYTE_CHARACTER_VALUE && character <= FormatSpec.MAXIMAL_ONE_BYTE_CHARACTER_VALUE; } /** Loading @@ -137,9 +141,10 @@ public final class BinaryDictDecoderUtils { * @param character the character code. * @return the size in binary encoded-form, either 1 or 3 bytes. */ static int getCharSize(final int character) { static int getCharSize(final int character, final HashMap<Integer, Integer> codePointToOneByteCodeMap) { // See char encoding in FusionDictionary.java if (fitsOnOneByte(character)) return 1; if (fitsOnOneByte(character, codePointToOneByteCodeMap)) return 1; if (FormatSpec.INVALID_CHARACTER == character) return 1; return 3; } Loading @@ -147,9 +152,10 @@ public final class BinaryDictDecoderUtils { /** * Compute the byte size of a character array. */ static int getCharArraySize(final int[] chars) { static int getCharArraySize(final int[] chars, final HashMap<Integer, Integer> codePointToOneByteCodeMap) { int size = 0; for (int character : chars) size += getCharSize(character); for (int character : chars) size += getCharSize(character, codePointToOneByteCodeMap); return size; } Loading @@ -159,11 +165,19 @@ public final class BinaryDictDecoderUtils { * @param codePoints the code point array to write. * @param buffer the byte buffer to write to. * @param index the index in buffer to write the character array to. * @param codePointToOneByteCodeMap the map to convert the code point. * @return the index after the last character. */ static int writeCharArray(final int[] codePoints, final byte[] buffer, int index) { static int writeCharArray(final int[] codePoints, final byte[] buffer, int index, final HashMap<Integer, Integer> codePointToOneByteCodeMap) { for (int codePoint : codePoints) { if (1 == getCharSize(codePoint)) { if (codePointToOneByteCodeMap != null) { if (codePointToOneByteCodeMap.containsKey(codePoint)) { // Convert code points codePoint = codePointToOneByteCodeMap.get(codePoint); } } if (1 == getCharSize(codePoint, codePointToOneByteCodeMap)) { buffer[index++] = (byte)codePoint; } else { buffer[index++] = (byte)(0xFF & (codePoint >> 16)); Loading @@ -184,12 +198,19 @@ public final class BinaryDictDecoderUtils { * @param word the string to write. * @return the size written, in bytes. */ static int writeString(final byte[] buffer, final int origin, final String word) { static int writeString(final byte[] buffer, final int origin, final String word, final HashMap<Integer, Integer> codePointToOneByteCodeMap) { final int length = word.length(); int index = origin; for (int i = 0; i < length; i = word.offsetByCodePoints(i, 1)) { final int codePoint = word.codePointAt(i); if (1 == getCharSize(codePoint)) { int codePoint = word.codePointAt(i); if (codePointToOneByteCodeMap != null) { if (codePointToOneByteCodeMap.containsKey(codePoint)) { // Convert code points codePoint = codePointToOneByteCodeMap.get(codePoint); } } if (1 == getCharSize(codePoint, codePointToOneByteCodeMap)) { buffer[index++] = (byte)codePoint; } else { buffer[index++] = (byte)(0xFF & (codePoint >> 16)); Loading @@ -210,12 +231,13 @@ public final class BinaryDictDecoderUtils { * @param word the string to write. * @return the size written, in bytes. */ static int writeString(final OutputStream stream, final String word) throws IOException { static int writeString(final OutputStream stream, final String word, final HashMap<Integer, Integer> codePointToOneByteCodeMap) throws IOException { final int length = word.length(); int written = 0; for (int i = 0; i < length; i = word.offsetByCodePoints(i, 1)) { final int codePoint = word.codePointAt(i); final int charSize = getCharSize(codePoint); final int charSize = getCharSize(codePoint, codePointToOneByteCodeMap); if (1 == charSize) { stream.write((byte) codePoint); } else { Loading Loading @@ -253,7 +275,7 @@ public final class BinaryDictDecoderUtils { */ static int readChar(final DictBuffer dictBuffer) { int character = dictBuffer.readUnsignedByte(); if (!fitsOnOneByte(character)) { if (!fitsOnOneByte(character, null)) { if (FormatSpec.PTNODE_CHARACTERS_TERMINATOR == character) { return FormatSpec.INVALID_CHARACTER; } Loading tests/src/com/android/inputmethod/latin/makedict/BinaryDictEncoderUtils.java +71 −30 Original line number Diff line number Diff line Loading @@ -61,8 +61,9 @@ public class BinaryDictEncoderUtils { * @param characters the character array * @return the size of the char array, including the terminator if any */ static int getPtNodeCharactersSize(final int[] characters) { int size = CharEncoding.getCharArraySize(characters); static int getPtNodeCharactersSize(final int[] characters, final HashMap<Integer, Integer> codePointToOneByteCodeMap) { int size = CharEncoding.getCharArraySize(characters, codePointToOneByteCodeMap); if (characters.length > 1) size += FormatSpec.PTNODE_TERMINATOR_SIZE; return size; } Loading @@ -76,8 +77,9 @@ public class BinaryDictEncoderUtils { * @param ptNode the PtNode * @return the size of the char array, including the terminator if any */ private static int getPtNodeCharactersSize(final PtNode ptNode) { return getPtNodeCharactersSize(ptNode.mChars); private static int getPtNodeCharactersSize(final PtNode ptNode, final HashMap<Integer, Integer> codePointToOneByteCodeMap) { return getPtNodeCharactersSize(ptNode.mChars, codePointToOneByteCodeMap); } /** Loading @@ -92,13 +94,14 @@ public class BinaryDictEncoderUtils { /** * Compute the size of a shortcut in bytes. */ private static int getShortcutSize(final WeightedString shortcut) { private static int getShortcutSize(final WeightedString shortcut, final HashMap<Integer, Integer> codePointToOneByteCodeMap) { int size = FormatSpec.PTNODE_ATTRIBUTE_FLAGS_SIZE; final String word = shortcut.mWord; final int length = word.length(); for (int i = 0; i < length; i = word.offsetByCodePoints(i, 1)) { final int codePoint = word.codePointAt(i); size += CharEncoding.getCharSize(codePoint); size += CharEncoding.getCharSize(codePoint, codePointToOneByteCodeMap); } size += FormatSpec.PTNODE_TERMINATOR_SIZE; return size; Loading @@ -110,11 +113,12 @@ public class BinaryDictEncoderUtils { * This is known in advance and does not change according to position in the file * like address lists do. */ static int getShortcutListSize(final ArrayList<WeightedString> shortcutList) { static int getShortcutListSize(final ArrayList<WeightedString> shortcutList, final HashMap<Integer, Integer> codePointToOneByteCodeMap) { if (null == shortcutList || shortcutList.isEmpty()) return 0; int size = FormatSpec.PTNODE_SHORTCUT_LIST_SIZE_SIZE; for (final WeightedString shortcut : shortcutList) { size += getShortcutSize(shortcut); size += getShortcutSize(shortcut, codePointToOneByteCodeMap); } return size; } Loading @@ -125,14 +129,16 @@ public class BinaryDictEncoderUtils { * @param ptNode the PtNode to compute the size of. * @return the maximum size of the PtNode. */ private static int getPtNodeMaximumSize(final PtNode ptNode) { int size = getNodeHeaderSize(ptNode); private static int getPtNodeMaximumSize(final PtNode ptNode, final HashMap<Integer, Integer> codePointToOneByteCodeMap) { int size = getNodeHeaderSize(ptNode, codePointToOneByteCodeMap); if (ptNode.isTerminal()) { // If terminal, one byte for the frequency. size += FormatSpec.PTNODE_FREQUENCY_SIZE; } size += FormatSpec.PTNODE_MAX_ADDRESS_SIZE; // For children address size += getShortcutListSize(ptNode.mShortcutTargets); // TODO: Use codePointToOneByteCodeMap for shortcuts. size += getShortcutListSize(ptNode.mShortcutTargets, null /* codePointToOneByteCodeMap */); if (null != ptNode.mBigrams) { size += (FormatSpec.PTNODE_ATTRIBUTE_FLAGS_SIZE + FormatSpec.PTNODE_ATTRIBUTE_MAX_ADDRESS_SIZE) Loading @@ -148,10 +154,11 @@ public class BinaryDictEncoderUtils { * * @param ptNodeArray the node array to compute the maximum size of. */ private static void calculatePtNodeArrayMaximumSize(final PtNodeArray ptNodeArray) { private static void calculatePtNodeArrayMaximumSize(final PtNodeArray ptNodeArray, final HashMap<Integer, Integer> codePointToOneByteCodeMap) { int size = getPtNodeCountSize(ptNodeArray); for (PtNode node : ptNodeArray.mData) { final int nodeSize = getPtNodeMaximumSize(node); final int nodeSize = getPtNodeMaximumSize(node, codePointToOneByteCodeMap); node.mCachedSize = nodeSize; size += nodeSize; } Loading @@ -163,8 +170,10 @@ public class BinaryDictEncoderUtils { * * @param ptNode the PtNode of which to compute the size of the header */ private static int getNodeHeaderSize(final PtNode ptNode) { return FormatSpec.PTNODE_FLAGS_SIZE + getPtNodeCharactersSize(ptNode); private static int getNodeHeaderSize(final PtNode ptNode, final HashMap<Integer, Integer> codePointToOneByteCodeMap) { return FormatSpec.PTNODE_FLAGS_SIZE + getPtNodeCharactersSize(ptNode, codePointToOneByteCodeMap); } /** Loading Loading @@ -367,7 +376,8 @@ public class BinaryDictEncoderUtils { * @return false if none of the cached addresses inside the node array changed, true otherwise. */ private static boolean computeActualPtNodeArraySize(final PtNodeArray ptNodeArray, final FusionDictionary dict) { final FusionDictionary dict, final HashMap<Integer, Integer> codePointToOneByteCodeMap) { boolean changed = false; int size = getPtNodeCountSize(ptNodeArray); for (PtNode ptNode : ptNodeArray.mData) { Loading @@ -375,7 +385,7 @@ public class BinaryDictEncoderUtils { if (ptNode.mCachedAddressAfterUpdate != ptNode.mCachedAddressBeforeUpdate) { changed = true; } int nodeSize = getNodeHeaderSize(ptNode); int nodeSize = getNodeHeaderSize(ptNode, codePointToOneByteCodeMap); if (ptNode.isTerminal()) { nodeSize += FormatSpec.PTNODE_FREQUENCY_SIZE; } Loading @@ -383,7 +393,9 @@ public class BinaryDictEncoderUtils { nodeSize += getByteSize(getOffsetToTargetNodeArrayDuringUpdate(ptNodeArray, nodeSize + size, ptNode.mChildren)); } nodeSize += getShortcutListSize(ptNode.mShortcutTargets); // TODO: Use codePointToOneByteCodeMap for shortcuts. nodeSize += getShortcutListSize(ptNode.mShortcutTargets, null /* codePointToOneByteCodeMap */); if (null != ptNode.mBigrams) { for (WeightedString bigram : ptNode.mBigrams) { final int offset = getOffsetToTargetPtNodeDuringUpdate(ptNodeArray, Loading Loading @@ -454,10 +466,11 @@ public class BinaryDictEncoderUtils { * @return the same array it was passed. The nodes have been updated for address and size. */ /* package */ static ArrayList<PtNodeArray> computeAddresses(final FusionDictionary dict, final ArrayList<PtNodeArray> flatNodes) { final ArrayList<PtNodeArray> flatNodes, final HashMap<Integer, Integer> codePointToOneByteCodeMap) { // First get the worst possible sizes and offsets for (final PtNodeArray n : flatNodes) { calculatePtNodeArrayMaximumSize(n); calculatePtNodeArrayMaximumSize(n, codePointToOneByteCodeMap); } final int offset = initializePtNodeArraysCachedAddresses(flatNodes); Loading @@ -472,7 +485,8 @@ public class BinaryDictEncoderUtils { for (final PtNodeArray ptNodeArray : flatNodes) { ptNodeArray.mCachedAddressAfterUpdate = ptNodeArrayStartOffset; final int oldNodeArraySize = ptNodeArray.mCachedSize; final boolean changed = computeActualPtNodeArraySize(ptNodeArray, dict); final boolean changed = computeActualPtNodeArraySize(ptNodeArray, dict, codePointToOneByteCodeMap); final int newNodeArraySize = ptNodeArray.mCachedSize; if (oldNodeArraySize < newNodeArraySize) { throw new RuntimeException("Increased size ?!"); Loading Loading @@ -686,9 +700,10 @@ public class BinaryDictEncoderUtils { + (frequency & FormatSpec.FLAG_BIGRAM_SHORTCUT_ATTR_FREQUENCY); } /* package */ static final int getChildrenPosition(final PtNode ptNode) { /* package */ static final int getChildrenPosition(final PtNode ptNode, final HashMap<Integer, Integer> codePointToOneByteCodeMap) { int positionOfChildrenPosField = ptNode.mCachedAddressAfterUpdate + getNodeHeaderSize(ptNode); + getNodeHeaderSize(ptNode, codePointToOneByteCodeMap); if (ptNode.isTerminal()) { // A terminal node has the frequency. // If positionOfChildrenPosField is incorrect, we may crash when jumping to the children Loading @@ -705,10 +720,12 @@ public class BinaryDictEncoderUtils { * @param dict the dictionary the node array is a part of (for relative offsets). * @param dictEncoder the dictionary encoder. * @param ptNodeArray the node array to write. * @param codePointToOneByteCodeMap the map to convert the code points. */ @SuppressWarnings("unused") /* package */ static void writePlacedPtNodeArray(final FusionDictionary dict, final DictEncoder dictEncoder, final PtNodeArray ptNodeArray) { final DictEncoder dictEncoder, final PtNodeArray ptNodeArray, final HashMap<Integer, Integer> codePointToOneByteCodeMap) { // TODO: Make the code in common with BinaryDictIOUtils#writePtNode dictEncoder.setPosition(ptNodeArray.mCachedAddressAfterUpdate); Loading @@ -727,7 +744,7 @@ public class BinaryDictEncoderUtils { + FormatSpec.MAX_TERMINAL_FREQUENCY + " : " + ptNode.mProbabilityInfo.toString()); } dictEncoder.writePtNode(ptNode, dict); dictEncoder.writePtNode(ptNode, dict, codePointToOneByteCodeMap); } if (dictEncoder.getPosition() != ptNodeArray.mCachedAddressAfterUpdate + ptNodeArray.mCachedSize) { Loading Loading @@ -834,12 +851,16 @@ public class BinaryDictEncoderUtils { // Write out the options. for (final String key : dict.mOptions.mAttributes.keySet()) { final String value = dict.mOptions.mAttributes.get(key); CharEncoding.writeString(headerBuffer, key); CharEncoding.writeString(headerBuffer, value); CharEncoding.writeString(headerBuffer, key, null); CharEncoding.writeString(headerBuffer, value, null); } // Write out the codePointTable if there is codePointOccurrenceArray. if (codePointOccurrenceArray != null) { final String codePointTableString = encodeCodePointTable(codePointOccurrenceArray); CharEncoding.writeString(headerBuffer, DictionaryHeader.CODE_POINT_TABLE_KEY, null); CharEncoding.writeString(headerBuffer, codePointTableString, null); } // TODO: Write out the code point table. final int size = headerBuffer.size(); final byte[] bytes = headerBuffer.toByteArray(); // Write out the header size. Loading @@ -857,10 +878,30 @@ public class BinaryDictEncoderUtils { final HashMap<Integer, Integer> mCodePointToOneByteCodeMap; final ArrayList<Entry<Integer, Integer>> mCodePointOccurrenceArray; // Let code point table empty for version 200 dictionary which used in test CodePointTable() { mCodePointToOneByteCodeMap = null; mCodePointOccurrenceArray = null; } CodePointTable(final HashMap<Integer, Integer> codePointToOneByteCodeMap, final ArrayList<Entry<Integer, Integer>> codePointOccurrenceArray) { mCodePointToOneByteCodeMap = codePointToOneByteCodeMap; mCodePointOccurrenceArray = codePointOccurrenceArray; } } private static String encodeCodePointTable( final ArrayList<Entry<Integer, Integer>> codePointOccurrenceArray) { final StringBuilder codePointTableString = new StringBuilder(); int currentCodePointTableIndex = FormatSpec.MINIMAL_ONE_BYTE_CHARACTER_VALUE; for (final Entry<Integer, Integer> entry : codePointOccurrenceArray) { // Native reads the table as a string codePointTableString.appendCodePoint(entry.getKey()); if (FormatSpec.MAXIMAL_ONE_BYTE_CHARACTER_VALUE < ++currentCodePointTableIndex) { break; } } return codePointTableString.toString(); } } tests/src/com/android/inputmethod/latin/makedict/DictEncoder.java +3 −1 Original line number Diff line number Diff line Loading @@ -21,6 +21,7 @@ import com.android.inputmethod.latin.makedict.FormatSpec.FormatOptions; import com.android.inputmethod.latin.makedict.FusionDictionary.PtNode; import java.io.IOException; import java.util.HashMap; /** * An interface of binary dictionary encoder. Loading @@ -33,5 +34,6 @@ public interface DictEncoder { public void setPosition(final int position); public int getPosition(); public void writePtNodeCount(final int ptNodeCount); public void writePtNode(final PtNode ptNode, final FusionDictionary dict); public void writePtNode(final PtNode ptNode, final FusionDictionary dict, final HashMap<Integer, Integer> codePointToOneByteCodeMap); } Loading
java/src/com/android/inputmethod/latin/makedict/FormatSpec.java +3 −1 Original line number Diff line number Diff line Loading @@ -163,13 +163,15 @@ public final class FormatSpec { static final int NOT_A_VERSION_NUMBER = -1; // These MUST have the same values as the relevant constants in format_utils.h. // From version 4 on, we use version * 100 + revision as a version number. That allows // From version 2.01 on, we use version * 100 + revision as a version number. That allows // us to change the format during development while having testing devices remove // older files with each upgrade, while still having a readable versioning scheme. // When we bump up the dictionary format version, we should update // ExpandableDictionary.needsToMigrateDictionary() and // ExpandableDictionary.matchesExpectedBinaryDictFormatVersionForThisType(). public static final int VERSION2 = 2; public static final int VERSION201 = 201; public static final int MINIMUM_SUPPORTED_VERSION_OF_CODE_POINT_TABLE = VERSION201; // Dictionary version used for testing. public static final int VERSION4_ONLY_FOR_TESTING = 399; public static final int VERSION401 = 401; Loading
tests/src/com/android/inputmethod/latin/makedict/BinaryDictDecoderEncoderTests.java +1 −1 Original line number Diff line number Diff line Loading @@ -312,7 +312,7 @@ public class BinaryDictDecoderEncoderTests extends AndroidTestCase { final DictBuffer dictBuffer = new ByteArrayDictBuffer(buffer); for (final String word : sWords) { Arrays.fill(buffer, (byte) 0); CharEncoding.writeString(buffer, 0, word); CharEncoding.writeString(buffer, 0, word, null); dictBuffer.position(0); final String str = CharEncoding.readString(dictBuffer); assertEquals(word, str); Loading
tests/src/com/android/inputmethod/latin/makedict/BinaryDictDecoderUtils.java +40 −18 Original line number Diff line number Diff line Loading @@ -17,11 +17,11 @@ package com.android.inputmethod.latin.makedict; import com.android.inputmethod.annotations.UsedForTesting; import java.io.File; import java.io.IOException; import java.io.OutputStream; import java.nio.ByteBuffer; import java.util.HashMap; /** * Decodes binary files for a FusionDictionary. Loading Loading @@ -109,15 +109,19 @@ public final class BinaryDictDecoderUtils { * A class grouping utility function for our specific character encoding. */ static final class CharEncoding { private static final int MINIMAL_ONE_BYTE_CHARACTER_VALUE = 0x20; private static final int MAXIMAL_ONE_BYTE_CHARACTER_VALUE = 0xFF; /** * Helper method to find out whether this code fits on one byte */ private static boolean fitsOnOneByte(final int character) { return character >= MINIMAL_ONE_BYTE_CHARACTER_VALUE && character <= MAXIMAL_ONE_BYTE_CHARACTER_VALUE; private static boolean fitsOnOneByte(int character, final HashMap<Integer, Integer> codePointToOneByteCodeMap) { if (codePointToOneByteCodeMap != null) { if (codePointToOneByteCodeMap.containsKey(character)) { character = codePointToOneByteCodeMap.get(character); } } return character >= FormatSpec.MINIMAL_ONE_BYTE_CHARACTER_VALUE && character <= FormatSpec.MAXIMAL_ONE_BYTE_CHARACTER_VALUE; } /** Loading @@ -137,9 +141,10 @@ public final class BinaryDictDecoderUtils { * @param character the character code. * @return the size in binary encoded-form, either 1 or 3 bytes. */ static int getCharSize(final int character) { static int getCharSize(final int character, final HashMap<Integer, Integer> codePointToOneByteCodeMap) { // See char encoding in FusionDictionary.java if (fitsOnOneByte(character)) return 1; if (fitsOnOneByte(character, codePointToOneByteCodeMap)) return 1; if (FormatSpec.INVALID_CHARACTER == character) return 1; return 3; } Loading @@ -147,9 +152,10 @@ public final class BinaryDictDecoderUtils { /** * Compute the byte size of a character array. */ static int getCharArraySize(final int[] chars) { static int getCharArraySize(final int[] chars, final HashMap<Integer, Integer> codePointToOneByteCodeMap) { int size = 0; for (int character : chars) size += getCharSize(character); for (int character : chars) size += getCharSize(character, codePointToOneByteCodeMap); return size; } Loading @@ -159,11 +165,19 @@ public final class BinaryDictDecoderUtils { * @param codePoints the code point array to write. * @param buffer the byte buffer to write to. * @param index the index in buffer to write the character array to. * @param codePointToOneByteCodeMap the map to convert the code point. * @return the index after the last character. */ static int writeCharArray(final int[] codePoints, final byte[] buffer, int index) { static int writeCharArray(final int[] codePoints, final byte[] buffer, int index, final HashMap<Integer, Integer> codePointToOneByteCodeMap) { for (int codePoint : codePoints) { if (1 == getCharSize(codePoint)) { if (codePointToOneByteCodeMap != null) { if (codePointToOneByteCodeMap.containsKey(codePoint)) { // Convert code points codePoint = codePointToOneByteCodeMap.get(codePoint); } } if (1 == getCharSize(codePoint, codePointToOneByteCodeMap)) { buffer[index++] = (byte)codePoint; } else { buffer[index++] = (byte)(0xFF & (codePoint >> 16)); Loading @@ -184,12 +198,19 @@ public final class BinaryDictDecoderUtils { * @param word the string to write. * @return the size written, in bytes. */ static int writeString(final byte[] buffer, final int origin, final String word) { static int writeString(final byte[] buffer, final int origin, final String word, final HashMap<Integer, Integer> codePointToOneByteCodeMap) { final int length = word.length(); int index = origin; for (int i = 0; i < length; i = word.offsetByCodePoints(i, 1)) { final int codePoint = word.codePointAt(i); if (1 == getCharSize(codePoint)) { int codePoint = word.codePointAt(i); if (codePointToOneByteCodeMap != null) { if (codePointToOneByteCodeMap.containsKey(codePoint)) { // Convert code points codePoint = codePointToOneByteCodeMap.get(codePoint); } } if (1 == getCharSize(codePoint, codePointToOneByteCodeMap)) { buffer[index++] = (byte)codePoint; } else { buffer[index++] = (byte)(0xFF & (codePoint >> 16)); Loading @@ -210,12 +231,13 @@ public final class BinaryDictDecoderUtils { * @param word the string to write. * @return the size written, in bytes. */ static int writeString(final OutputStream stream, final String word) throws IOException { static int writeString(final OutputStream stream, final String word, final HashMap<Integer, Integer> codePointToOneByteCodeMap) throws IOException { final int length = word.length(); int written = 0; for (int i = 0; i < length; i = word.offsetByCodePoints(i, 1)) { final int codePoint = word.codePointAt(i); final int charSize = getCharSize(codePoint); final int charSize = getCharSize(codePoint, codePointToOneByteCodeMap); if (1 == charSize) { stream.write((byte) codePoint); } else { Loading Loading @@ -253,7 +275,7 @@ public final class BinaryDictDecoderUtils { */ static int readChar(final DictBuffer dictBuffer) { int character = dictBuffer.readUnsignedByte(); if (!fitsOnOneByte(character)) { if (!fitsOnOneByte(character, null)) { if (FormatSpec.PTNODE_CHARACTERS_TERMINATOR == character) { return FormatSpec.INVALID_CHARACTER; } Loading
tests/src/com/android/inputmethod/latin/makedict/BinaryDictEncoderUtils.java +71 −30 Original line number Diff line number Diff line Loading @@ -61,8 +61,9 @@ public class BinaryDictEncoderUtils { * @param characters the character array * @return the size of the char array, including the terminator if any */ static int getPtNodeCharactersSize(final int[] characters) { int size = CharEncoding.getCharArraySize(characters); static int getPtNodeCharactersSize(final int[] characters, final HashMap<Integer, Integer> codePointToOneByteCodeMap) { int size = CharEncoding.getCharArraySize(characters, codePointToOneByteCodeMap); if (characters.length > 1) size += FormatSpec.PTNODE_TERMINATOR_SIZE; return size; } Loading @@ -76,8 +77,9 @@ public class BinaryDictEncoderUtils { * @param ptNode the PtNode * @return the size of the char array, including the terminator if any */ private static int getPtNodeCharactersSize(final PtNode ptNode) { return getPtNodeCharactersSize(ptNode.mChars); private static int getPtNodeCharactersSize(final PtNode ptNode, final HashMap<Integer, Integer> codePointToOneByteCodeMap) { return getPtNodeCharactersSize(ptNode.mChars, codePointToOneByteCodeMap); } /** Loading @@ -92,13 +94,14 @@ public class BinaryDictEncoderUtils { /** * Compute the size of a shortcut in bytes. */ private static int getShortcutSize(final WeightedString shortcut) { private static int getShortcutSize(final WeightedString shortcut, final HashMap<Integer, Integer> codePointToOneByteCodeMap) { int size = FormatSpec.PTNODE_ATTRIBUTE_FLAGS_SIZE; final String word = shortcut.mWord; final int length = word.length(); for (int i = 0; i < length; i = word.offsetByCodePoints(i, 1)) { final int codePoint = word.codePointAt(i); size += CharEncoding.getCharSize(codePoint); size += CharEncoding.getCharSize(codePoint, codePointToOneByteCodeMap); } size += FormatSpec.PTNODE_TERMINATOR_SIZE; return size; Loading @@ -110,11 +113,12 @@ public class BinaryDictEncoderUtils { * This is known in advance and does not change according to position in the file * like address lists do. */ static int getShortcutListSize(final ArrayList<WeightedString> shortcutList) { static int getShortcutListSize(final ArrayList<WeightedString> shortcutList, final HashMap<Integer, Integer> codePointToOneByteCodeMap) { if (null == shortcutList || shortcutList.isEmpty()) return 0; int size = FormatSpec.PTNODE_SHORTCUT_LIST_SIZE_SIZE; for (final WeightedString shortcut : shortcutList) { size += getShortcutSize(shortcut); size += getShortcutSize(shortcut, codePointToOneByteCodeMap); } return size; } Loading @@ -125,14 +129,16 @@ public class BinaryDictEncoderUtils { * @param ptNode the PtNode to compute the size of. * @return the maximum size of the PtNode. */ private static int getPtNodeMaximumSize(final PtNode ptNode) { int size = getNodeHeaderSize(ptNode); private static int getPtNodeMaximumSize(final PtNode ptNode, final HashMap<Integer, Integer> codePointToOneByteCodeMap) { int size = getNodeHeaderSize(ptNode, codePointToOneByteCodeMap); if (ptNode.isTerminal()) { // If terminal, one byte for the frequency. size += FormatSpec.PTNODE_FREQUENCY_SIZE; } size += FormatSpec.PTNODE_MAX_ADDRESS_SIZE; // For children address size += getShortcutListSize(ptNode.mShortcutTargets); // TODO: Use codePointToOneByteCodeMap for shortcuts. size += getShortcutListSize(ptNode.mShortcutTargets, null /* codePointToOneByteCodeMap */); if (null != ptNode.mBigrams) { size += (FormatSpec.PTNODE_ATTRIBUTE_FLAGS_SIZE + FormatSpec.PTNODE_ATTRIBUTE_MAX_ADDRESS_SIZE) Loading @@ -148,10 +154,11 @@ public class BinaryDictEncoderUtils { * * @param ptNodeArray the node array to compute the maximum size of. */ private static void calculatePtNodeArrayMaximumSize(final PtNodeArray ptNodeArray) { private static void calculatePtNodeArrayMaximumSize(final PtNodeArray ptNodeArray, final HashMap<Integer, Integer> codePointToOneByteCodeMap) { int size = getPtNodeCountSize(ptNodeArray); for (PtNode node : ptNodeArray.mData) { final int nodeSize = getPtNodeMaximumSize(node); final int nodeSize = getPtNodeMaximumSize(node, codePointToOneByteCodeMap); node.mCachedSize = nodeSize; size += nodeSize; } Loading @@ -163,8 +170,10 @@ public class BinaryDictEncoderUtils { * * @param ptNode the PtNode of which to compute the size of the header */ private static int getNodeHeaderSize(final PtNode ptNode) { return FormatSpec.PTNODE_FLAGS_SIZE + getPtNodeCharactersSize(ptNode); private static int getNodeHeaderSize(final PtNode ptNode, final HashMap<Integer, Integer> codePointToOneByteCodeMap) { return FormatSpec.PTNODE_FLAGS_SIZE + getPtNodeCharactersSize(ptNode, codePointToOneByteCodeMap); } /** Loading Loading @@ -367,7 +376,8 @@ public class BinaryDictEncoderUtils { * @return false if none of the cached addresses inside the node array changed, true otherwise. */ private static boolean computeActualPtNodeArraySize(final PtNodeArray ptNodeArray, final FusionDictionary dict) { final FusionDictionary dict, final HashMap<Integer, Integer> codePointToOneByteCodeMap) { boolean changed = false; int size = getPtNodeCountSize(ptNodeArray); for (PtNode ptNode : ptNodeArray.mData) { Loading @@ -375,7 +385,7 @@ public class BinaryDictEncoderUtils { if (ptNode.mCachedAddressAfterUpdate != ptNode.mCachedAddressBeforeUpdate) { changed = true; } int nodeSize = getNodeHeaderSize(ptNode); int nodeSize = getNodeHeaderSize(ptNode, codePointToOneByteCodeMap); if (ptNode.isTerminal()) { nodeSize += FormatSpec.PTNODE_FREQUENCY_SIZE; } Loading @@ -383,7 +393,9 @@ public class BinaryDictEncoderUtils { nodeSize += getByteSize(getOffsetToTargetNodeArrayDuringUpdate(ptNodeArray, nodeSize + size, ptNode.mChildren)); } nodeSize += getShortcutListSize(ptNode.mShortcutTargets); // TODO: Use codePointToOneByteCodeMap for shortcuts. nodeSize += getShortcutListSize(ptNode.mShortcutTargets, null /* codePointToOneByteCodeMap */); if (null != ptNode.mBigrams) { for (WeightedString bigram : ptNode.mBigrams) { final int offset = getOffsetToTargetPtNodeDuringUpdate(ptNodeArray, Loading Loading @@ -454,10 +466,11 @@ public class BinaryDictEncoderUtils { * @return the same array it was passed. The nodes have been updated for address and size. */ /* package */ static ArrayList<PtNodeArray> computeAddresses(final FusionDictionary dict, final ArrayList<PtNodeArray> flatNodes) { final ArrayList<PtNodeArray> flatNodes, final HashMap<Integer, Integer> codePointToOneByteCodeMap) { // First get the worst possible sizes and offsets for (final PtNodeArray n : flatNodes) { calculatePtNodeArrayMaximumSize(n); calculatePtNodeArrayMaximumSize(n, codePointToOneByteCodeMap); } final int offset = initializePtNodeArraysCachedAddresses(flatNodes); Loading @@ -472,7 +485,8 @@ public class BinaryDictEncoderUtils { for (final PtNodeArray ptNodeArray : flatNodes) { ptNodeArray.mCachedAddressAfterUpdate = ptNodeArrayStartOffset; final int oldNodeArraySize = ptNodeArray.mCachedSize; final boolean changed = computeActualPtNodeArraySize(ptNodeArray, dict); final boolean changed = computeActualPtNodeArraySize(ptNodeArray, dict, codePointToOneByteCodeMap); final int newNodeArraySize = ptNodeArray.mCachedSize; if (oldNodeArraySize < newNodeArraySize) { throw new RuntimeException("Increased size ?!"); Loading Loading @@ -686,9 +700,10 @@ public class BinaryDictEncoderUtils { + (frequency & FormatSpec.FLAG_BIGRAM_SHORTCUT_ATTR_FREQUENCY); } /* package */ static final int getChildrenPosition(final PtNode ptNode) { /* package */ static final int getChildrenPosition(final PtNode ptNode, final HashMap<Integer, Integer> codePointToOneByteCodeMap) { int positionOfChildrenPosField = ptNode.mCachedAddressAfterUpdate + getNodeHeaderSize(ptNode); + getNodeHeaderSize(ptNode, codePointToOneByteCodeMap); if (ptNode.isTerminal()) { // A terminal node has the frequency. // If positionOfChildrenPosField is incorrect, we may crash when jumping to the children Loading @@ -705,10 +720,12 @@ public class BinaryDictEncoderUtils { * @param dict the dictionary the node array is a part of (for relative offsets). * @param dictEncoder the dictionary encoder. * @param ptNodeArray the node array to write. * @param codePointToOneByteCodeMap the map to convert the code points. */ @SuppressWarnings("unused") /* package */ static void writePlacedPtNodeArray(final FusionDictionary dict, final DictEncoder dictEncoder, final PtNodeArray ptNodeArray) { final DictEncoder dictEncoder, final PtNodeArray ptNodeArray, final HashMap<Integer, Integer> codePointToOneByteCodeMap) { // TODO: Make the code in common with BinaryDictIOUtils#writePtNode dictEncoder.setPosition(ptNodeArray.mCachedAddressAfterUpdate); Loading @@ -727,7 +744,7 @@ public class BinaryDictEncoderUtils { + FormatSpec.MAX_TERMINAL_FREQUENCY + " : " + ptNode.mProbabilityInfo.toString()); } dictEncoder.writePtNode(ptNode, dict); dictEncoder.writePtNode(ptNode, dict, codePointToOneByteCodeMap); } if (dictEncoder.getPosition() != ptNodeArray.mCachedAddressAfterUpdate + ptNodeArray.mCachedSize) { Loading Loading @@ -834,12 +851,16 @@ public class BinaryDictEncoderUtils { // Write out the options. for (final String key : dict.mOptions.mAttributes.keySet()) { final String value = dict.mOptions.mAttributes.get(key); CharEncoding.writeString(headerBuffer, key); CharEncoding.writeString(headerBuffer, value); CharEncoding.writeString(headerBuffer, key, null); CharEncoding.writeString(headerBuffer, value, null); } // Write out the codePointTable if there is codePointOccurrenceArray. if (codePointOccurrenceArray != null) { final String codePointTableString = encodeCodePointTable(codePointOccurrenceArray); CharEncoding.writeString(headerBuffer, DictionaryHeader.CODE_POINT_TABLE_KEY, null); CharEncoding.writeString(headerBuffer, codePointTableString, null); } // TODO: Write out the code point table. final int size = headerBuffer.size(); final byte[] bytes = headerBuffer.toByteArray(); // Write out the header size. Loading @@ -857,10 +878,30 @@ public class BinaryDictEncoderUtils { final HashMap<Integer, Integer> mCodePointToOneByteCodeMap; final ArrayList<Entry<Integer, Integer>> mCodePointOccurrenceArray; // Let code point table empty for version 200 dictionary which used in test CodePointTable() { mCodePointToOneByteCodeMap = null; mCodePointOccurrenceArray = null; } CodePointTable(final HashMap<Integer, Integer> codePointToOneByteCodeMap, final ArrayList<Entry<Integer, Integer>> codePointOccurrenceArray) { mCodePointToOneByteCodeMap = codePointToOneByteCodeMap; mCodePointOccurrenceArray = codePointOccurrenceArray; } } private static String encodeCodePointTable( final ArrayList<Entry<Integer, Integer>> codePointOccurrenceArray) { final StringBuilder codePointTableString = new StringBuilder(); int currentCodePointTableIndex = FormatSpec.MINIMAL_ONE_BYTE_CHARACTER_VALUE; for (final Entry<Integer, Integer> entry : codePointOccurrenceArray) { // Native reads the table as a string codePointTableString.appendCodePoint(entry.getKey()); if (FormatSpec.MAXIMAL_ONE_BYTE_CHARACTER_VALUE < ++currentCodePointTableIndex) { break; } } return codePointTableString.toString(); } }
tests/src/com/android/inputmethod/latin/makedict/DictEncoder.java +3 −1 Original line number Diff line number Diff line Loading @@ -21,6 +21,7 @@ import com.android.inputmethod.latin.makedict.FormatSpec.FormatOptions; import com.android.inputmethod.latin.makedict.FusionDictionary.PtNode; import java.io.IOException; import java.util.HashMap; /** * An interface of binary dictionary encoder. Loading @@ -33,5 +34,6 @@ public interface DictEncoder { public void setPosition(final int position); public int getPosition(); public void writePtNodeCount(final int ptNodeCount); public void writePtNode(final PtNode ptNode, final FusionDictionary dict); public void writePtNode(final PtNode ptNode, final FusionDictionary dict, final HashMap<Integer, Integer> codePointToOneByteCodeMap); }