Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit 78409cd9 authored by Ken Wakasa's avatar Ken Wakasa Committed by Android (Google) Code Review
Browse files

Merge "Separate shortcut targets from the trie file."

parents 0b626214 73b9d3b8
Loading
Loading
Loading
Loading
+4 −4
Original line number Diff line number Diff line
@@ -23,11 +23,11 @@ import com.android.inputmethod.latin.makedict.FusionDictionary.PtNode;
import com.android.inputmethod.latin.makedict.FusionDictionary.PtNodeArray;
import com.android.inputmethod.latin.makedict.FusionDictionary.WeightedString;

import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.OutputStream;
import java.nio.ByteBuffer;
import java.nio.channels.FileChannel;
import java.util.ArrayList;
@@ -219,14 +219,14 @@ public final class BinaryDictDecoderUtils {
        }

        /**
         * Writes a string with our character format to a ByteArrayOutputStream.
         * Writes a string with our character format to an OutputStream.
         *
         * This will also write the terminator byte.
         *
         * @param buffer the ByteArrayOutputStream to write to.
         * @param buffer the OutputStream to write to.
         * @param word the string to write.
         */
        static void writeString(final ByteArrayOutputStream buffer, final String word) {
        static void writeString(final OutputStream buffer, final String word) throws IOException {
            final int length = word.length();
            for (int i = 0; i < length; i = word.offsetByCodePoints(i, 1)) {
                final int codePoint = word.codePointAt(i);
+1 −1
Original line number Diff line number Diff line
@@ -383,8 +383,8 @@ public class BinaryDictEncoderUtils {
                nodeSize += getByteSize(getOffsetToTargetNodeArrayDuringUpdate(ptNodeArray,
                        nodeSize + size, ptNode.mChildren));
            }
            nodeSize += getShortcutListSize(ptNode.mShortcutTargets);
            if (formatOptions.mVersion < FormatSpec.FIRST_VERSION_WITH_TERMINAL_ID) {
                nodeSize += getShortcutListSize(ptNode.mShortcutTargets);
                if (null != ptNode.mBigrams) {
                    for (WeightedString bigram : ptNode.mBigrams) {
                        final int offset = getOffsetToTargetPtNodeDuringUpdate(ptNodeArray,
+13 −0
Original line number Diff line number Diff line
@@ -266,15 +266,28 @@ public final class FormatSpec {
    // tat = Terminal Address Table
    static final String TERMINAL_ADDRESS_TABLE_FILE_EXTENSION = ".tat";
    static final String BIGRAM_FILE_EXTENSION = ".bigram";
    static final String SHORTCUT_FILE_EXTENSION = ".shortcut";
    static final String LOOKUP_TABLE_FILE_SUFFIX = "_lookup";
    static final String CONTENT_TABLE_FILE_SUFFIX = "_index";
    static final int FREQUENCY_AND_FLAGS_SIZE = 2;
    static final int TERMINAL_ADDRESS_TABLE_ADDRESS_SIZE = 3;

    // With the English main dictionary as of October 2013, the size of bigram address table
    // is 584KB with the block size being 4.
    // This is 91% of that of full address table.
    static final int BIGRAM_ADDRESS_TABLE_BLOCK_SIZE = 4;
    static final int BIGRAM_CONTENT_COUNT = 1;
    static final int BIGRAM_FREQ_CONTENT_INDEX = 0;
    static final String BIGRAM_FREQ_CONTENT_ID = "_freq";

    static final int SHORTCUT_CONTENT_COUNT = 1;
    static final int SHORTCUT_CONTENT_INDEX = 0;
    // With the English main dictionary as of October 2013, the size of shortcut address table is
    // 29KB with the block size being 64.
    // This is only 4.4% of that of full address table.
    static final int SHORTCUT_ADDRESS_TABLE_BLOCK_SIZE = 64;
    static final String SHORTCUT_CONTENT_ID = "_shortcut";

    static final int NO_CHILDREN_ADDRESS = Integer.MIN_VALUE;
    static final int NO_PARENT_ADDRESS = 0;
    static final int NO_FORWARD_LINK_ADDRESS = 0;
+40 −8
Original line number Diff line number Diff line
@@ -23,6 +23,7 @@ import com.android.inputmethod.latin.makedict.FormatSpec.FileHeader;
import com.android.inputmethod.latin.makedict.FormatSpec.FormatOptions;
import com.android.inputmethod.latin.makedict.FusionDictionary.PtNode;
import com.android.inputmethod.latin.makedict.FusionDictionary.WeightedString;
import com.android.inputmethod.latin.utils.CollectionUtils;

import android.util.Log;

@@ -43,6 +44,7 @@ public class Ver4DictDecoder extends DictDecoder {
    private static final int FILETYPE_FREQUENCY = 2;
    private static final int FILETYPE_TERMINAL_ADDRESS_TABLE = 3;
    private static final int FILETYPE_BIGRAM_FREQ = 4;
    private static final int FILETYPE_SHORTCUT = 5;

    private final File mDictDirectory;
    private final DictionaryBufferFactory mBufferFactory;
@@ -50,7 +52,9 @@ public class Ver4DictDecoder extends DictDecoder {
    private DictBuffer mFrequencyBuffer;
    private DictBuffer mTerminalAddressTableBuffer;
    private DictBuffer mBigramBuffer;
    private DictBuffer mShortcutBuffer;
    private SparseTable mBigramAddressTable;
    private SparseTable mShortcutAddressTable;

    @UsedForTesting
    /* package */ Ver4DictDecoder(final File dictDirectory, final int factoryFlag) {
@@ -89,6 +93,10 @@ public class Ver4DictDecoder extends DictDecoder {
            return new File(mDictDirectory,
                    mDictDirectory.getName() + FormatSpec.BIGRAM_FILE_EXTENSION
                            + FormatSpec.BIGRAM_FREQ_CONTENT_ID);
        } else if (fileType == FILETYPE_SHORTCUT) {
            return new File(mDictDirectory,
                    mDictDirectory.getName() + FormatSpec.SHORTCUT_FILE_EXTENSION
                            + FormatSpec.SHORTCUT_CONTENT_ID);
        } else {
            throw new RuntimeException("Unsupported kind of file : " + fileType);
        }
@@ -102,6 +110,8 @@ public class Ver4DictDecoder extends DictDecoder {
                getFile(FILETYPE_TERMINAL_ADDRESS_TABLE));
        mBigramBuffer = mBufferFactory.getDictionaryBuffer(getFile(FILETYPE_BIGRAM_FREQ));
        loadBigramAddressSparseTable();
        mShortcutBuffer = mBufferFactory.getDictionaryBuffer(getFile(FILETYPE_SHORTCUT));
        loadShortcutAddressSparseTable();
    }

    @Override
@@ -136,6 +146,18 @@ public class Ver4DictDecoder extends DictDecoder {
                FormatSpec.BIGRAM_ADDRESS_TABLE_BLOCK_SIZE);
    }

    // TODO: Let's have something like SparseTableContentsReader in this class.
    /**
     * Loads the shortcut address table into {@code mShortcutAddressTable}.
     *
     * The table is read from one lookup file and one content file, both located in the
     * dictionary directory and both named after that directory.
     *
     * @throws IOException propagated from {@code SparseTable.readFromFiles}.
     */
    private void loadShortcutAddressSparseTable() throws IOException {
        // Both file names start with "<directory name>" + the shortcut file extension.
        final String shortcutFilePrefix =
                mDictDirectory.getName() + FormatSpec.SHORTCUT_FILE_EXTENSION;
        final File lookupTable = new File(mDictDirectory,
                shortcutFilePrefix + FormatSpec.LOOKUP_TABLE_FILE_SUFFIX);
        final File shortcutContent = new File(mDictDirectory,
                shortcutFilePrefix + FormatSpec.CONTENT_TABLE_FILE_SUFFIX
                        + FormatSpec.SHORTCUT_CONTENT_ID);
        final File[] contentFiles = { shortcutContent };
        mShortcutAddressTable = SparseTable.readFromFiles(lookupTable, contentFiles,
                FormatSpec.SHORTCUT_ADDRESS_TABLE_BLOCK_SIZE);
    }


    protected static class PtNodeReader extends DictDecoder.PtNodeReader {
        protected static int readFrequency(final DictBuffer frequencyBuffer, final int terminalId) {
            frequencyBuffer.position(terminalId * FormatSpec.FREQUENCY_AND_FLAGS_SIZE + 1);
@@ -147,6 +169,23 @@ public class Ver4DictDecoder extends DictDecoder {
        }
    }

    /**
     * Reads the shortcut targets recorded for a terminal from the shortcut buffer.
     *
     * The position of the list is looked up in the shortcut address table. Each entry of the
     * list is one unsigned flags byte followed by the target string; the list ends with the
     * first entry whose flags do not have FLAG_BIGRAM_SHORTCUT_ATTR_HAS_NEXT set.
     *
     * Note that this moves the position of {@code mShortcutBuffer}.
     *
     * @param terminalId the terminal ID to look up in the shortcut address table.
     * @return the list of shortcut targets, or null if the address table has no entry for
     *         this terminal ID.
     */
    private ArrayList<WeightedString> readShortcuts(final int terminalId) {
        // Look the position up once, through the named content index rather than a literal 0,
        // so the existence check and the actual read can never disagree.
        final int posOfShortcuts = mShortcutAddressTable.get(FormatSpec.SHORTCUT_CONTENT_INDEX,
                terminalId);
        if (posOfShortcuts == SparseTable.NOT_EXIST) return null;

        final ArrayList<WeightedString> ret = CollectionUtils.newArrayList();
        mShortcutBuffer.position(posOfShortcuts);
        int flags;
        do {
            flags = mShortcutBuffer.readUnsignedByte();
            final String word = CharEncoding.readString(mShortcutBuffer);
            // The frequency is stored in the flags byte, under the frequency mask.
            ret.add(new WeightedString(word,
                    flags & FormatSpec.FLAG_BIGRAM_SHORTCUT_ATTR_FREQUENCY));
        } while (0 != (flags & FormatSpec.FLAG_BIGRAM_SHORTCUT_ATTR_HAS_NEXT));
        return ret;
    }

    // TODO: Make this buffer thread safe.
    // TODO: Support words longer than FormatSpec.MAX_WORD_LENGTH.
    private final int[] mCharacterBuffer = new int[FormatSpec.MAX_WORD_LENGTH];
@@ -197,14 +236,7 @@ public class Ver4DictDecoder extends DictDecoder {
            childrenAddress += addressPointer;
        }
        addressPointer += BinaryDictIOUtils.getChildrenAddressSize(flags, options);
        final ArrayList<WeightedString> shortcutTargets;
        if (0 != (flags & FormatSpec.FLAG_HAS_SHORTCUT_TARGETS)) {
            // readShortcut will add shortcuts to shortcutTargets.
            shortcutTargets = new ArrayList<WeightedString>();
            addressPointer += PtNodeReader.readShortcut(mDictBuffer, shortcutTargets);
        } else {
            shortcutTargets = null;
        }
        final ArrayList<WeightedString> shortcutTargets = readShortcuts(terminalId);

        final ArrayList<PendingAttribute> bigrams;
        if (0 != (flags & FormatSpec.FLAG_HAS_BIGRAMS)) {
+49 −24
Original line number Diff line number Diff line
@@ -49,6 +49,7 @@ public class Ver4DictEncoder implements DictEncoder {
    private File mDictDir;
    private String mBaseFilename;
    private BigramContentWriter mBigramWriter;
    private ShortcutContentWriter mShortcutWriter;

    @UsedForTesting
    public Ver4DictEncoder(final File dictPlacedDir) {
@@ -152,6 +153,39 @@ public class Ver4DictEncoder implements DictEncoder {
        }
    }

    /**
     * A SparseTableContentWriter configured for shortcut targets.
     *
     * The parent writer is set up with the shortcut file extension, content count, address
     * table block size and content ID defined in FormatSpec.
     */
    private static class ShortcutContentWriter extends SparseTableContentWriter {
        /**
         * @param name the base name; the shortcut file extension is appended to it.
         * @param initialCapacity passed through to the parent writer.
         * @param baseDir passed through to the parent writer.
         */
        public ShortcutContentWriter(final String name, final int initialCapacity,
                final File baseDir) {
            super(name + FormatSpec.SHORTCUT_FILE_EXTENSION, FormatSpec.SHORTCUT_CONTENT_COUNT,
                    initialCapacity, FormatSpec.SHORTCUT_ADDRESS_TABLE_BLOCK_SIZE, baseDir,
                    new String[] { name + FormatSpec.SHORTCUT_FILE_EXTENSION },
                    new String[] { FormatSpec.SHORTCUT_CONTENT_ID });
        }

        /**
         * Writes all the shortcut targets of one word as the shortcut content of a terminal.
         *
         * @param terminalId the terminal ID the shortcut list is recorded for.
         * @param shortcutIterator the iterator over the shortcut targets to write.
         */
        public void writeShortcutForOneWord(final int terminalId,
                final Iterator<WeightedString> shortcutIterator) throws IOException {
            // The parent writer hands us the stream to fill through this callback.
            final SparseTableContentWriterInterface shortcutListWriter =
                    new SparseTableContentWriterInterface() {
                        @Override
                        public void write(final OutputStream stream) throws IOException {
                            writeShortcutForOneWordInternal(stream, shortcutIterator);
                        }
                    };
            write(FormatSpec.SHORTCUT_CONTENT_INDEX, terminalId, shortcutListWriter);
        }

        /**
         * Writes each remaining shortcut target as a flags field followed by the target string.
         *
         * @param outStream the stream to write to.
         * @param shortcutIterator the iterator over the shortcut targets to write.
         */
        private void writeShortcutForOneWordInternal(final OutputStream outStream,
                final Iterator<WeightedString> shortcutIterator) throws IOException {
            while (shortcutIterator.hasNext()) {
                final WeightedString shortcut = shortcutIterator.next();
                // Queried after next(), so this tells whether another target follows this one.
                final boolean hasMore = shortcutIterator.hasNext();
                final int flags = BinaryDictEncoderUtils.makeShortcutFlags(hasMore,
                        shortcut.mFrequency);
                BinaryDictEncoderUtils.writeUIntToStream(outStream, flags,
                        FormatSpec.PTNODE_ATTRIBUTE_FLAGS_SIZE);
                CharEncoding.writeString(outStream, shortcut.mWord);
            }
        }
    }

    private void openStreams(final FormatOptions formatOptions, final DictionaryOptions dictOptions)
            throws FileNotFoundException, IOException {
        final FileHeader header = new FileHeader(0, dictOptions, formatOptions);
@@ -225,6 +259,8 @@ public class Ver4DictEncoder implements DictEncoder {
        writeTerminalData(flatNodes, terminalCount);
        mBigramWriter = new BigramContentWriter(mBaseFilename, terminalCount, mDictDir);
        writeBigrams(flatNodes, dict);
        mShortcutWriter = new ShortcutContentWriter(mBaseFilename, terminalCount, mDictDir);
        writeShortcuts(flatNodes);

        final PtNodeArray lastNodeArray = flatNodes.get(flatNodes.size() - 1);
        final int bufferSize = lastNodeArray.mCachedAddressAfterUpdate + lastNodeArray.mCachedSize;
@@ -306,29 +342,6 @@ public class Ver4DictEncoder implements DictEncoder {
        }
    }

    /**
     * Writes a shortcut list into the trie buffer at the current trie position.
     *
     * The list is laid out as a size field of FormatSpec.PTNODE_SHORTCUT_LIST_SIZE_SIZE bytes
     * followed by one entry per target, each entry being a flags byte and then the target
     * string. Nothing is written when the list is null or empty.
     *
     * @param shortcuts the shortcut targets to write; may be null.
     * @throws RuntimeException if the written list, size field included, is larger than
     *         FormatSpec.MAX_SHORTCUT_LIST_SIZE_IN_A_PTNODE.
     */
    private void writeShortcuts(ArrayList<WeightedString> shortcuts) {
        if (null == shortcuts || shortcuts.isEmpty()) return;

        // Remember where the size field goes and skip over it; it is filled in at the end,
        // once the total size is known.
        final int indexOfShortcutByteSize = mTriePos;
        mTriePos += FormatSpec.PTNODE_SHORTCUT_LIST_SIZE_SIZE;
        final Iterator<WeightedString> shortcutIterator = shortcuts.iterator();
        while (shortcutIterator.hasNext()) {
            final WeightedString target = shortcutIterator.next();
            // hasNext() is evaluated after next(), so it tells whether another target follows.
            final int shortcutFlags = BinaryDictEncoderUtils.makeShortcutFlags(
                    shortcutIterator.hasNext(), target.mFrequency);
            mTrieBuf[mTriePos++] = (byte)shortcutFlags;
            // writeString returns the number of bytes by which to advance the trie position.
            final int shortcutShift = CharEncoding.writeString(mTrieBuf, mTriePos,
                    target.mWord);
            mTriePos += shortcutShift;
        }
        // Measured from the start of the size field, so the size field itself is counted.
        final int shortcutByteSize = mTriePos - indexOfShortcutByteSize;
        if (shortcutByteSize > FormatSpec.MAX_SHORTCUT_LIST_SIZE_IN_A_PTNODE) {
            throw new RuntimeException("Shortcut list too large : " + shortcutByteSize);
        }
        // Back-fill the size field reserved at the beginning.
        BinaryDictEncoderUtils.writeUIntToBuffer(mTrieBuf, indexOfShortcutByteSize,
                shortcutByteSize, FormatSpec.PTNODE_SHORTCUT_LIST_SIZE_SIZE);
    }

    private void writeBigrams(final ArrayList<PtNodeArray> flatNodes, final FusionDictionary dict)
            throws IOException {
        mBigramWriter.openStreams();
@@ -343,6 +356,19 @@ public class Ver4DictEncoder implements DictEncoder {
        mBigramWriter.closeStreams();
    }

    /**
     * Writes the shortcut targets of all PtNodes through {@code mShortcutWriter}.
     *
     * The writer's streams are opened first and closed once every node array has been
     * visited. A shortcut list is written, keyed by the node's terminal ID, only for nodes
     * that have at least one shortcut target.
     *
     * @param flatNodes the node arrays whose PtNodes are to be scanned.
     */
    private void writeShortcuts(final ArrayList<PtNodeArray> flatNodes) throws IOException {
        mShortcutWriter.openStreams();
        for (final PtNodeArray nodeArray : flatNodes) {
            for (final PtNode ptNode : nodeArray.mData) {
                final ArrayList<WeightedString> targets = ptNode.mShortcutTargets;
                // Nodes without shortcut targets are not written to the shortcut content.
                if (targets == null || targets.isEmpty()) continue;
                mShortcutWriter.writeShortcutForOneWord(ptNode.mTerminalId, targets.iterator());
            }
        }
        mShortcutWriter.closeStreams();
    }

    @Override
    public void writeForwardLinkAddress(int forwardLinkAddress) {
        mTriePos = BinaryDictEncoderUtils.writeUIntToBuffer(mTrieBuf, mTriePos,
@@ -359,7 +385,6 @@ public class Ver4DictEncoder implements DictEncoder {
            writeTerminalId(ptNode.mTerminalId);
        }
        writeChildrenPosition(ptNode, formatOptions);
        writeShortcuts(ptNode.mShortcutTargets);
    }

    private void writeTerminalData(final ArrayList<PtNodeArray> flatNodes,