Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit 7e561452, authored by Akifumi Yoshimoto, committed by Android (Google) Code Review
Browse files

Merge "Include a code point table in the binary dictionary."

parents 1a7da2ec 9168ab60
Loading
Loading
Loading
Loading
+3 −1
Original line number Diff line number Diff line
@@ -163,13 +163,15 @@ public final class FormatSpec {
    static final int NOT_A_VERSION_NUMBER = -1;

    // These MUST have the same values as the relevant constants in format_utils.h.
    // From version 4 on, we use version * 100 + revision as a version number. That allows
    // From version 2.01 on, we use version * 100 + revision as a version number. That allows
    // us to change the format during development while having testing devices remove
    // older files with each upgrade, while still having a readable versioning scheme.
    // When we bump up the dictionary format version, we should update
    // ExpandableDictionary.needsToMigrateDictionary() and
    // ExpandableDictionary.matchesExpectedBinaryDictFormatVersionForThisType().
    public static final int VERSION2 = 2;
    public static final int VERSION201 = 201;
    public static final int MINIMUM_SUPPORTED_VERSION_OF_CODE_POINT_TABLE = VERSION201;
    // Dictionary version used for testing.
    public static final int VERSION4_ONLY_FOR_TESTING = 399;
    public static final int VERSION401 = 401;
+1 −1
Original line number Diff line number Diff line
@@ -312,7 +312,7 @@ public class BinaryDictDecoderEncoderTests extends AndroidTestCase {
        final DictBuffer dictBuffer = new ByteArrayDictBuffer(buffer);
        for (final String word : sWords) {
            Arrays.fill(buffer, (byte) 0);
            CharEncoding.writeString(buffer, 0, word);
            CharEncoding.writeString(buffer, 0, word, null);
            dictBuffer.position(0);
            final String str = CharEncoding.readString(dictBuffer);
            assertEquals(word, str);
+40 −18
Original line number Diff line number Diff line
@@ -17,11 +17,11 @@
package com.android.inputmethod.latin.makedict;

import com.android.inputmethod.annotations.UsedForTesting;

import java.io.File;
import java.io.IOException;
import java.io.OutputStream;
import java.nio.ByteBuffer;
import java.util.HashMap;

/**
 * Decodes binary files for a FusionDictionary.
@@ -109,15 +109,19 @@ public final class BinaryDictDecoderUtils {
     * A class grouping utility function for our specific character encoding.
     */
    static final class CharEncoding {
        private static final int MINIMAL_ONE_BYTE_CHARACTER_VALUE = 0x20;
        private static final int MAXIMAL_ONE_BYTE_CHARACTER_VALUE = 0xFF;

        /**
         * Helper method to find out whether this code fits on one byte
         */
        private static boolean fitsOnOneByte(final int character) {
            return character >= MINIMAL_ONE_BYTE_CHARACTER_VALUE
                    && character <= MAXIMAL_ONE_BYTE_CHARACTER_VALUE;
        private static boolean fitsOnOneByte(int character,
                final HashMap<Integer, Integer> codePointToOneByteCodeMap) {
            if (codePointToOneByteCodeMap != null) {
                if (codePointToOneByteCodeMap.containsKey(character)) {
                    character = codePointToOneByteCodeMap.get(character);
                }
            }
            return character >= FormatSpec.MINIMAL_ONE_BYTE_CHARACTER_VALUE
                    && character <= FormatSpec.MAXIMAL_ONE_BYTE_CHARACTER_VALUE;
        }

        /**
@@ -137,9 +141,10 @@ public final class BinaryDictDecoderUtils {
         * @param character the character code.
         * @return the size in binary encoded-form, either 1 or 3 bytes.
         */
        static int getCharSize(final int character) {
        static int getCharSize(final int character,
                final HashMap<Integer, Integer> codePointToOneByteCodeMap) {
            // See char encoding in FusionDictionary.java
            if (fitsOnOneByte(character)) return 1;
            if (fitsOnOneByte(character, codePointToOneByteCodeMap)) return 1;
            if (FormatSpec.INVALID_CHARACTER == character) return 1;
            return 3;
        }
@@ -147,9 +152,10 @@ public final class BinaryDictDecoderUtils {
        /**
         * Compute the byte size of a character array.
         */
        static int getCharArraySize(final int[] chars) {
        static int getCharArraySize(final int[] chars,
                final HashMap<Integer, Integer> codePointToOneByteCodeMap) {
            int size = 0;
            for (int character : chars) size += getCharSize(character);
            for (int character : chars) size += getCharSize(character, codePointToOneByteCodeMap);
            return size;
        }

@@ -159,11 +165,19 @@ public final class BinaryDictDecoderUtils {
         * @param codePoints the code point array to write.
         * @param buffer the byte buffer to write to.
         * @param index the index in buffer to write the character array to.
         * @param codePointToOneByteCodeMap the map to convert the code point.
         * @return the index after the last character.
         */
        static int writeCharArray(final int[] codePoints, final byte[] buffer, int index) {
        static int writeCharArray(final int[] codePoints, final byte[] buffer, int index,
                final HashMap<Integer, Integer> codePointToOneByteCodeMap) {
            for (int codePoint : codePoints) {
                if (1 == getCharSize(codePoint)) {
                if (codePointToOneByteCodeMap != null) {
                    if (codePointToOneByteCodeMap.containsKey(codePoint)) {
                        // Convert code points
                        codePoint = codePointToOneByteCodeMap.get(codePoint);
                    }
                }
                if (1 == getCharSize(codePoint, codePointToOneByteCodeMap)) {
                    buffer[index++] = (byte)codePoint;
                } else {
                    buffer[index++] = (byte)(0xFF & (codePoint >> 16));
@@ -184,12 +198,19 @@ public final class BinaryDictDecoderUtils {
         * @param word the string to write.
         * @return the size written, in bytes.
         */
        static int writeString(final byte[] buffer, final int origin, final String word) {
        static int writeString(final byte[] buffer, final int origin, final String word,
                final HashMap<Integer, Integer> codePointToOneByteCodeMap) {
            final int length = word.length();
            int index = origin;
            for (int i = 0; i < length; i = word.offsetByCodePoints(i, 1)) {
                final int codePoint = word.codePointAt(i);
                if (1 == getCharSize(codePoint)) {
                int codePoint = word.codePointAt(i);
                if (codePointToOneByteCodeMap != null) {
                    if (codePointToOneByteCodeMap.containsKey(codePoint)) {
                        // Convert code points
                        codePoint = codePointToOneByteCodeMap.get(codePoint);
                    }
                }
                if (1 == getCharSize(codePoint, codePointToOneByteCodeMap)) {
                    buffer[index++] = (byte)codePoint;
                } else {
                    buffer[index++] = (byte)(0xFF & (codePoint >> 16));
@@ -210,12 +231,13 @@ public final class BinaryDictDecoderUtils {
         * @param word the string to write.
         * @return the size written, in bytes.
         */
        static int writeString(final OutputStream stream, final String word) throws IOException {
        static int writeString(final OutputStream stream, final String word,
                final HashMap<Integer, Integer> codePointToOneByteCodeMap) throws IOException {
            final int length = word.length();
            int written = 0;
            for (int i = 0; i < length; i = word.offsetByCodePoints(i, 1)) {
                final int codePoint = word.codePointAt(i);
                final int charSize = getCharSize(codePoint);
                final int charSize = getCharSize(codePoint, codePointToOneByteCodeMap);
                if (1 == charSize) {
                    stream.write((byte) codePoint);
                } else {
@@ -253,7 +275,7 @@ public final class BinaryDictDecoderUtils {
         */
        static int readChar(final DictBuffer dictBuffer) {
            int character = dictBuffer.readUnsignedByte();
            if (!fitsOnOneByte(character)) {
            if (!fitsOnOneByte(character, null)) {
                if (FormatSpec.PTNODE_CHARACTERS_TERMINATOR == character) {
                    return FormatSpec.INVALID_CHARACTER;
                }
+71 −30
Original line number Diff line number Diff line
@@ -61,8 +61,9 @@ public class BinaryDictEncoderUtils {
     * @param characters the character array
     * @return the size of the char array, including the terminator if any
     */
    static int getPtNodeCharactersSize(final int[] characters) {
        int size = CharEncoding.getCharArraySize(characters);
    static int getPtNodeCharactersSize(final int[] characters,
            final HashMap<Integer, Integer> codePointToOneByteCodeMap) {
        int size = CharEncoding.getCharArraySize(characters, codePointToOneByteCodeMap);
        if (characters.length > 1) size += FormatSpec.PTNODE_TERMINATOR_SIZE;
        return size;
    }
@@ -76,8 +77,9 @@ public class BinaryDictEncoderUtils {
     * @param ptNode the PtNode
     * @return the size of the char array, including the terminator if any
     */
    private static int getPtNodeCharactersSize(final PtNode ptNode) {
        return getPtNodeCharactersSize(ptNode.mChars);
    private static int getPtNodeCharactersSize(final PtNode ptNode,
            final HashMap<Integer, Integer> codePointToOneByteCodeMap) {
        return getPtNodeCharactersSize(ptNode.mChars, codePointToOneByteCodeMap);
    }

    /**
@@ -92,13 +94,14 @@ public class BinaryDictEncoderUtils {
    /**
     * Compute the size of a shortcut in bytes.
     */
    private static int getShortcutSize(final WeightedString shortcut) {
    private static int getShortcutSize(final WeightedString shortcut,
            final HashMap<Integer, Integer> codePointToOneByteCodeMap) {
        int size = FormatSpec.PTNODE_ATTRIBUTE_FLAGS_SIZE;
        final String word = shortcut.mWord;
        final int length = word.length();
        for (int i = 0; i < length; i = word.offsetByCodePoints(i, 1)) {
            final int codePoint = word.codePointAt(i);
            size += CharEncoding.getCharSize(codePoint);
            size += CharEncoding.getCharSize(codePoint, codePointToOneByteCodeMap);
        }
        size += FormatSpec.PTNODE_TERMINATOR_SIZE;
        return size;
@@ -110,11 +113,12 @@ public class BinaryDictEncoderUtils {
     * This is known in advance and does not change according to position in the file
     * like address lists do.
     */
    static int getShortcutListSize(final ArrayList<WeightedString> shortcutList) {
    static int getShortcutListSize(final ArrayList<WeightedString> shortcutList,
            final HashMap<Integer, Integer> codePointToOneByteCodeMap) {
        if (null == shortcutList || shortcutList.isEmpty()) return 0;
        int size = FormatSpec.PTNODE_SHORTCUT_LIST_SIZE_SIZE;
        for (final WeightedString shortcut : shortcutList) {
            size += getShortcutSize(shortcut);
            size += getShortcutSize(shortcut, codePointToOneByteCodeMap);
        }
        return size;
    }
@@ -125,14 +129,16 @@ public class BinaryDictEncoderUtils {
     * @param ptNode the PtNode to compute the size of.
     * @return the maximum size of the PtNode.
     */
    private static int getPtNodeMaximumSize(final PtNode ptNode) {
        int size = getNodeHeaderSize(ptNode);
    private static int getPtNodeMaximumSize(final PtNode ptNode,
            final HashMap<Integer, Integer> codePointToOneByteCodeMap) {
        int size = getNodeHeaderSize(ptNode, codePointToOneByteCodeMap);
        if (ptNode.isTerminal()) {
            // If terminal, one byte for the frequency.
            size += FormatSpec.PTNODE_FREQUENCY_SIZE;
        }
        size += FormatSpec.PTNODE_MAX_ADDRESS_SIZE; // For children address
        size += getShortcutListSize(ptNode.mShortcutTargets);
        // TODO: Use codePointToOneByteCodeMap for shortcuts.
        size += getShortcutListSize(ptNode.mShortcutTargets, null /* codePointToOneByteCodeMap */);
        if (null != ptNode.mBigrams) {
            size += (FormatSpec.PTNODE_ATTRIBUTE_FLAGS_SIZE
                    + FormatSpec.PTNODE_ATTRIBUTE_MAX_ADDRESS_SIZE)
@@ -148,10 +154,11 @@ public class BinaryDictEncoderUtils {
     *
     * @param ptNodeArray the node array to compute the maximum size of.
     */
    private static void calculatePtNodeArrayMaximumSize(final PtNodeArray ptNodeArray) {
    private static void calculatePtNodeArrayMaximumSize(final PtNodeArray ptNodeArray,
            final HashMap<Integer, Integer> codePointToOneByteCodeMap) {
        int size = getPtNodeCountSize(ptNodeArray);
        for (PtNode node : ptNodeArray.mData) {
            final int nodeSize = getPtNodeMaximumSize(node);
            final int nodeSize = getPtNodeMaximumSize(node, codePointToOneByteCodeMap);
            node.mCachedSize = nodeSize;
            size += nodeSize;
        }
@@ -163,8 +170,10 @@ public class BinaryDictEncoderUtils {
     *
     * @param ptNode the PtNode of which to compute the size of the header
     */
    private static int getNodeHeaderSize(final PtNode ptNode) {
        return FormatSpec.PTNODE_FLAGS_SIZE + getPtNodeCharactersSize(ptNode);
    private static int getNodeHeaderSize(final PtNode ptNode,
            final HashMap<Integer, Integer> codePointToOneByteCodeMap) {
        return FormatSpec.PTNODE_FLAGS_SIZE + getPtNodeCharactersSize(ptNode,
                codePointToOneByteCodeMap);
    }

    /**
@@ -367,7 +376,8 @@ public class BinaryDictEncoderUtils {
     * @return false if none of the cached addresses inside the node array changed, true otherwise.
     */
    private static boolean computeActualPtNodeArraySize(final PtNodeArray ptNodeArray,
            final FusionDictionary dict) {
            final FusionDictionary dict,
            final HashMap<Integer, Integer> codePointToOneByteCodeMap) {
        boolean changed = false;
        int size = getPtNodeCountSize(ptNodeArray);
        for (PtNode ptNode : ptNodeArray.mData) {
@@ -375,7 +385,7 @@ public class BinaryDictEncoderUtils {
            if (ptNode.mCachedAddressAfterUpdate != ptNode.mCachedAddressBeforeUpdate) {
                changed = true;
            }
            int nodeSize = getNodeHeaderSize(ptNode);
            int nodeSize = getNodeHeaderSize(ptNode, codePointToOneByteCodeMap);
            if (ptNode.isTerminal()) {
                nodeSize += FormatSpec.PTNODE_FREQUENCY_SIZE;
            }
@@ -383,7 +393,9 @@ public class BinaryDictEncoderUtils {
                nodeSize += getByteSize(getOffsetToTargetNodeArrayDuringUpdate(ptNodeArray,
                        nodeSize + size, ptNode.mChildren));
            }
            nodeSize += getShortcutListSize(ptNode.mShortcutTargets);
            // TODO: Use codePointToOneByteCodeMap for shortcuts.
            nodeSize += getShortcutListSize(ptNode.mShortcutTargets,
                    null /* codePointToOneByteCodeMap */);
            if (null != ptNode.mBigrams) {
                for (WeightedString bigram : ptNode.mBigrams) {
                    final int offset = getOffsetToTargetPtNodeDuringUpdate(ptNodeArray,
@@ -454,10 +466,11 @@ public class BinaryDictEncoderUtils {
     * @return the same array it was passed. The nodes have been updated for address and size.
     */
    /* package */ static ArrayList<PtNodeArray> computeAddresses(final FusionDictionary dict,
            final ArrayList<PtNodeArray> flatNodes) {
            final ArrayList<PtNodeArray> flatNodes,
            final HashMap<Integer, Integer> codePointToOneByteCodeMap) {
        // First get the worst possible sizes and offsets
        for (final PtNodeArray n : flatNodes) {
            calculatePtNodeArrayMaximumSize(n);
            calculatePtNodeArrayMaximumSize(n, codePointToOneByteCodeMap);
        }
        final int offset = initializePtNodeArraysCachedAddresses(flatNodes);

@@ -472,7 +485,8 @@ public class BinaryDictEncoderUtils {
            for (final PtNodeArray ptNodeArray : flatNodes) {
                ptNodeArray.mCachedAddressAfterUpdate = ptNodeArrayStartOffset;
                final int oldNodeArraySize = ptNodeArray.mCachedSize;
                final boolean changed = computeActualPtNodeArraySize(ptNodeArray, dict);
                final boolean changed = computeActualPtNodeArraySize(ptNodeArray, dict,
                        codePointToOneByteCodeMap);
                final int newNodeArraySize = ptNodeArray.mCachedSize;
                if (oldNodeArraySize < newNodeArraySize) {
                    throw new RuntimeException("Increased size ?!");
@@ -686,9 +700,10 @@ public class BinaryDictEncoderUtils {
                + (frequency & FormatSpec.FLAG_BIGRAM_SHORTCUT_ATTR_FREQUENCY);
    }

    /* package */ static final int getChildrenPosition(final PtNode ptNode) {
    /* package */ static final int getChildrenPosition(final PtNode ptNode,
            final HashMap<Integer, Integer> codePointToOneByteCodeMap) {
        int positionOfChildrenPosField = ptNode.mCachedAddressAfterUpdate
                + getNodeHeaderSize(ptNode);
                + getNodeHeaderSize(ptNode, codePointToOneByteCodeMap);
        if (ptNode.isTerminal()) {
            // A terminal node has the frequency.
            // If positionOfChildrenPosField is incorrect, we may crash when jumping to the children
@@ -705,10 +720,12 @@ public class BinaryDictEncoderUtils {
     * @param dict the dictionary the node array is a part of (for relative offsets).
     * @param dictEncoder the dictionary encoder.
     * @param ptNodeArray the node array to write.
     * @param codePointToOneByteCodeMap the map to convert the code points.
     */
    @SuppressWarnings("unused")
    /* package */ static void writePlacedPtNodeArray(final FusionDictionary dict,
            final DictEncoder dictEncoder, final PtNodeArray ptNodeArray) {
            final DictEncoder dictEncoder, final PtNodeArray ptNodeArray,
            final HashMap<Integer, Integer> codePointToOneByteCodeMap) {
        // TODO: Make the code in common with BinaryDictIOUtils#writePtNode
        dictEncoder.setPosition(ptNodeArray.mCachedAddressAfterUpdate);

@@ -727,7 +744,7 @@ public class BinaryDictEncoderUtils {
                        + FormatSpec.MAX_TERMINAL_FREQUENCY
                        + " : " + ptNode.mProbabilityInfo.toString());
            }
            dictEncoder.writePtNode(ptNode, dict);
            dictEncoder.writePtNode(ptNode, dict, codePointToOneByteCodeMap);
        }
        if (dictEncoder.getPosition() != ptNodeArray.mCachedAddressAfterUpdate
                + ptNodeArray.mCachedSize) {
@@ -834,12 +851,16 @@ public class BinaryDictEncoderUtils {
        // Write out the options.
        for (final String key : dict.mOptions.mAttributes.keySet()) {
            final String value = dict.mOptions.mAttributes.get(key);
            CharEncoding.writeString(headerBuffer, key);
            CharEncoding.writeString(headerBuffer, value);
            CharEncoding.writeString(headerBuffer, key, null);
            CharEncoding.writeString(headerBuffer, value, null);
        }
        // Write out the codePointTable if there is codePointOccurrenceArray.
        if (codePointOccurrenceArray != null) {
            final String codePointTableString =
                    encodeCodePointTable(codePointOccurrenceArray);
            CharEncoding.writeString(headerBuffer, DictionaryHeader.CODE_POINT_TABLE_KEY, null);
            CharEncoding.writeString(headerBuffer, codePointTableString, null);
        }

        // TODO: Write out the code point table.

        final int size = headerBuffer.size();
        final byte[] bytes = headerBuffer.toByteArray();
        // Write out the header size.
@@ -857,10 +878,30 @@ public class BinaryDictEncoderUtils {
        final HashMap<Integer, Integer> mCodePointToOneByteCodeMap;
        final ArrayList<Entry<Integer, Integer>> mCodePointOccurrenceArray;

        /**
         * Creates an empty code point table: both the code-point-to-one-byte-code map and
         * the code point occurrence array are left {@code null}.
         * Intended for version 200 dictionaries, which are used in tests.
         */
        CodePointTable() {
            this(null /* codePointToOneByteCodeMap */, null /* codePointOccurrenceArray */);
        }

        /**
         * Creates a code point table holding the given map and occurrence array.
         * Both references are stored as passed in; no copy is made.
         *
         * @param codePointToOneByteCodeMap the map to convert code points.
         * @param codePointOccurrenceArray the code point occurrence entries.
         */
        CodePointTable(final HashMap<Integer, Integer> codePointToOneByteCodeMap,
                final ArrayList<Entry<Integer, Integer>> codePointOccurrenceArray) {
            this.mCodePointOccurrenceArray = codePointOccurrenceArray;
            this.mCodePointToOneByteCodeMap = codePointToOneByteCodeMap;
        }
    }

    /**
     * Builds the string form of the code point table.
     *
     * The keys of the given entries are appended, in list order, as code points. One
     * slot is consumed per entry, counting up from
     * {@link FormatSpec#MINIMAL_ONE_BYTE_CHARACTER_VALUE}; encoding stops as soon as the
     * slot counter goes past {@link FormatSpec#MAXIMAL_ONE_BYTE_CHARACTER_VALUE}, so any
     * remaining entries are not included.
     *
     * @param codePointOccurrenceArray the entries whose keys are the code points to encode.
     * @return the code point table as a string.
     */
    private static String encodeCodePointTable(
            final ArrayList<Entry<Integer, Integer>> codePointOccurrenceArray) {
        final StringBuilder table = new StringBuilder();
        final int entryCount = codePointOccurrenceArray.size();
        int nextSlot = FormatSpec.MINIMAL_ONE_BYTE_CHARACTER_VALUE;
        for (int i = 0; i < entryCount; i++) {
            // The table is emitted as a string because the native side reads it as one.
            table.appendCodePoint(codePointOccurrenceArray.get(i).getKey());
            nextSlot++;
            if (nextSlot > FormatSpec.MAXIMAL_ONE_BYTE_CHARACTER_VALUE) {
                // Every one-byte slot has been assigned.
                break;
            }
        }
        return table.toString();
    }
}
+3 −1
Original line number Diff line number Diff line
@@ -21,6 +21,7 @@ import com.android.inputmethod.latin.makedict.FormatSpec.FormatOptions;
import com.android.inputmethod.latin.makedict.FusionDictionary.PtNode;

import java.io.IOException;
import java.util.HashMap;

/**
 * An interface of binary dictionary encoder.
@@ -33,5 +34,6 @@ public interface DictEncoder {
    public void setPosition(final int position);
    public int getPosition();
    public void writePtNodeCount(final int ptNodeCount);
    public void writePtNode(final PtNode ptNode, final FusionDictionary dict);
    public void writePtNode(final PtNode ptNode, final FusionDictionary dict,
            final HashMap<Integer, Integer> codePointToOneByteCodeMap);
}
Loading