Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit a141d8ef authored by Yuichiro Hanada's avatar Yuichiro Hanada
Browse files

Add Ver4DictEncoder.

Bug: 9618601
Change-Id: I161d2845906f07c1251deb8005fdffe49c5b7940
parent 610f3eb4
Loading
Loading
Loading
Loading
+15 −4
Original line number Diff line number Diff line
@@ -126,8 +126,14 @@ public class BinaryDictEncoderUtils {
     */
    private static int getPtNodeMaximumSize(final PtNode ptNode, final FormatOptions options) {
        int size = getNodeHeaderSize(ptNode, options);
        // If terminal, one byte for the frequency
        if (ptNode.isTerminal()) size += FormatSpec.PTNODE_FREQUENCY_SIZE;
        if (ptNode.isTerminal()) {
            // If terminal, one byte for the frequency or four bytes for the terminal id.
            if (options.mHasTerminalId) {
                size += FormatSpec.PTNODE_TERMINAL_ID_SIZE;
            } else {
                size += FormatSpec.PTNODE_FREQUENCY_SIZE;
            }
        }
        size += FormatSpec.PTNODE_MAX_ADDRESS_SIZE; // For children address
        size += getShortcutListSize(ptNode.mShortcutTargets);
        if (null != ptNode.mBigrams) {
@@ -345,7 +351,13 @@ public class BinaryDictEncoderUtils {
                changed = true;
            }
            int nodeSize = getNodeHeaderSize(ptNode, formatOptions);
            if (ptNode.isTerminal()) nodeSize += FormatSpec.PTNODE_FREQUENCY_SIZE;
            if (ptNode.isTerminal()) {
                if (formatOptions.mHasTerminalId) {
                    nodeSize += FormatSpec.PTNODE_TERMINAL_ID_SIZE;
                } else {
                    nodeSize += FormatSpec.PTNODE_FREQUENCY_SIZE;
                }
            }
            if (formatOptions.mSupportsDynamicUpdate) {
                nodeSize += FormatSpec.SIGNED_CHILDREN_ADDRESS_SIZE;
            } else if (null != ptNode.mChildren) {
@@ -787,7 +799,6 @@ public class BinaryDictEncoderUtils {
                        + FormatSpec.MAX_TERMINAL_FREQUENCY
                        + " : " + ptNode.mFrequency);
            }

            dictEncoder.writePtNode(ptNode, parentPosition, formatOptions, dict);
        }
        if (formatOptions.mSupportsDynamicUpdate) {
+13 −1
Original line number Diff line number Diff line
@@ -198,9 +198,12 @@ public final class FormatSpec {

    public static final int MAGIC_NUMBER = 0x9BC13AFE;
    static final int MINIMUM_SUPPORTED_VERSION = 2;
    static final int MAXIMUM_SUPPORTED_VERSION = 3;
    static final int MAXIMUM_SUPPORTED_VERSION = 4;
    static final int NOT_A_VERSION_NUMBER = -1;
    static final int FIRST_VERSION_WITH_DYNAMIC_UPDATE = 3;
    static final int FIRST_VERSION_WITH_TERMINAL_ID = 4;
    static final int VERSION3 = 3;
    static final int VERSION4 = 4;

    // These options need to be the same numeric values as the one in the native reading code.
    static final int GERMAN_UMLAUT_PROCESSING_FLAG = 0x1;
@@ -251,11 +254,17 @@ public final class FormatSpec {
    static final int PTNODE_TERMINATOR_SIZE = 1;
    static final int PTNODE_FLAGS_SIZE = 1;
    static final int PTNODE_FREQUENCY_SIZE = 1;
    static final int PTNODE_TERMINAL_ID_SIZE = 4;
    static final int PTNODE_MAX_ADDRESS_SIZE = 3;
    static final int PTNODE_ATTRIBUTE_FLAGS_SIZE = 1;
    static final int PTNODE_ATTRIBUTE_MAX_ADDRESS_SIZE = 3;
    static final int PTNODE_SHORTCUT_LIST_SIZE_SIZE = 2;

    // These values are used only by version 4 or later.
    static final String TRIE_FILE_EXTENSION = ".trie";
    static final String FREQ_FILE_EXTENSION = ".freq";
    static final int FREQUENCY_AND_FLAGS_SIZE = 2;

    static final int NO_CHILDREN_ADDRESS = Integer.MIN_VALUE;
    static final int NO_PARENT_ADDRESS = 0;
    static final int NO_FORWARD_LINK_ADDRESS = 0;
@@ -264,6 +273,7 @@ public final class FormatSpec {
    static final int MAX_PTNODES_FOR_ONE_BYTE_PTNODE_COUNT = 0x7F; // 127
    static final int MAX_PTNODES_IN_A_PT_NODE_ARRAY = 0x7FFF; // 32767
    static final int MAX_BIGRAMS_IN_A_PTNODE = 10000;
    static final int MAX_SHORTCUT_LIST_SIZE_IN_A_PTNODE = 0xFFFF;

    static final int MAX_TERMINAL_FREQUENCY = 255;
    static final int MAX_BIGRAM_FREQUENCY = 15;
@@ -287,6 +297,7 @@ public final class FormatSpec {
    public static final class FormatOptions {
        public final int mVersion;
        public final boolean mSupportsDynamicUpdate;
        public final boolean mHasTerminalId;
        @UsedForTesting
        public FormatOptions(final int version) {
            this(version, false);
@@ -300,6 +311,7 @@ public final class FormatSpec {
                        + FIRST_VERSION_WITH_DYNAMIC_UPDATE + " and ulterior.");
            }
            mSupportsDynamicUpdate = supportsDynamicUpdate;
            mHasTerminalId = (version >= FIRST_VERSION_WITH_TERMINAL_ID);
        }
    }

+6 −0
Original line number Diff line number Diff line
@@ -111,6 +111,7 @@ public final class FusionDictionary implements Iterable<Word> {
        ArrayList<WeightedString> mShortcutTargets;
        ArrayList<WeightedString> mBigrams;
        int mFrequency; // NOT_A_TERMINAL == mFrequency indicates this is not a terminal.
        int mTerminalId; // NOT_A_TERMINAL == mTerminalId indicates this is not a terminal.
        PtNodeArray mChildren;
        boolean mIsNotAWord; // Only a shortcut
        boolean mIsBlacklistEntry;
@@ -129,6 +130,7 @@ public final class FusionDictionary implements Iterable<Word> {
                final boolean isNotAWord, final boolean isBlacklistEntry) {
            mChars = chars;
            mFrequency = frequency;
            mTerminalId = frequency;
            mShortcutTargets = shortcutTargets;
            mBigrams = bigrams;
            mChildren = null;
@@ -156,6 +158,10 @@ public final class FusionDictionary implements Iterable<Word> {
            mChildren.mData.add(n);
        }

        public int getTerminalId() {
            return mTerminalId;
        }

        public boolean isTerminal() {
            return NOT_A_TERMINAL != mFrequency;
        }
+2 −2
Original line number Diff line number Diff line
@@ -68,7 +68,7 @@ public class Ver3DictEncoder implements DictEncoder {
    @Override
    public void writeDictionary(final FusionDictionary dict, final FormatOptions formatOptions)
            throws IOException, UnsupportedFormatException {
        if (formatOptions.mVersion > 3) {
        if (formatOptions.mVersion > FormatSpec.VERSION3) {
            throw new UnsupportedFormatException(
                    "The given format options has wrong version number : "
                    + formatOptions.mVersion);
@@ -200,7 +200,7 @@ public class Ver3DictEncoder implements DictEncoder {
            mPosition += shortcutShift;
        }
        final int shortcutByteSize = mPosition - indexOfShortcutByteSize;
        if (shortcutByteSize > 0xFFFF) {
        if (shortcutByteSize > FormatSpec.MAX_SHORTCUT_LIST_SIZE_IN_A_PTNODE) {
            throw new RuntimeException("Shortcut list too large");
        }
        BinaryDictEncoderUtils.writeUIntToBuffer(mBuffer, indexOfShortcutByteSize, shortcutByteSize,
+269 −0
Original line number Diff line number Diff line
/*
/*
 * Copyright (C) 2013 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.android.inputmethod.latin.makedict;

import com.android.inputmethod.annotations.UsedForTesting;
import com.android.inputmethod.latin.makedict.BinaryDictDecoderUtils.CharEncoding;
import com.android.inputmethod.latin.makedict.FormatSpec.FileHeader;
import com.android.inputmethod.latin.makedict.FormatSpec.FormatOptions;
import com.android.inputmethod.latin.makedict.FusionDictionary.DictionaryOptions;
import com.android.inputmethod.latin.makedict.FusionDictionary.PtNode;
import com.android.inputmethod.latin.makedict.FusionDictionary.PtNodeArray;
import com.android.inputmethod.latin.makedict.FusionDictionary.WeightedString;

import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.util.ArrayList;
import java.util.Iterator;

/**
 * An implementation of DictEncoder for version 4 binary dictionary.
 */
@UsedForTesting
public class Ver4DictEncoder implements DictEncoder {
    private final File mDictPlacedDir;
    private byte[] mTrieBuf;
    private byte[] mFreqBuf;
    private int mTriePos;
    private OutputStream mTrieOutStream;
    private OutputStream mFreqOutStream;

    @UsedForTesting
    public Ver4DictEncoder(final File dictPlacedDir) {
        mDictPlacedDir = dictPlacedDir;
    }

    private void openStreams(final FormatOptions formatOptions, final DictionaryOptions dictOptions)
            throws FileNotFoundException, IOException {
        final FileHeader header = new FileHeader(0, dictOptions, formatOptions);
        final String filename = header.getId() + "." + header.getVersion();
        final File mDictDir = new File(mDictPlacedDir, filename);
        final File trieFile = new File(mDictDir, filename + FormatSpec.TRIE_FILE_EXTENSION);
        final File freqFile = new File(mDictDir, filename + FormatSpec.FREQ_FILE_EXTENSION);
        if (!mDictDir.isDirectory()) {
            if (mDictDir.exists()) mDictDir.delete();
            mDictDir.mkdirs();
        }
        if (!trieFile.exists()) trieFile.createNewFile();
        if (!freqFile.exists()) freqFile.createNewFile();
        mTrieOutStream = new FileOutputStream(trieFile);
        mFreqOutStream = new FileOutputStream(freqFile);
    }

    private void close() throws IOException {
        try {
            if (mTrieOutStream != null) {
                mTrieOutStream.close();
            }
            if (mFreqOutStream != null) {
                mFreqOutStream.close();
            }
        } finally {
            mTrieOutStream = null;
            mFreqOutStream = null;
        }
    }

    @Override
    public void writeDictionary(final FusionDictionary dict, final FormatOptions formatOptions)
            throws IOException, UnsupportedFormatException {
        if (formatOptions.mVersion != FormatSpec.VERSION4) {
            throw new UnsupportedFormatException("File header has a wrong version number : "
                    + formatOptions.mVersion);
        }
        if (!mDictPlacedDir.isDirectory()) {
            throw new UnsupportedFormatException("Given path is not a directory.");
        }

        if (mTrieOutStream == null) {
            openStreams(formatOptions, dict.mOptions);
        }

        BinaryDictEncoderUtils.writeDictionaryHeader(mTrieOutStream, dict, formatOptions);

        MakedictLog.i("Flattening the tree...");
        ArrayList<PtNodeArray> flatNodes = BinaryDictEncoderUtils.flattenTree(dict.mRootNodeArray);
        int terminalCount = 0;
        for (final PtNodeArray array : flatNodes) {
            for (final PtNode node : array.mData) {
                if (node.isTerminal()) node.mTerminalId = terminalCount++;
            }
        }

        MakedictLog.i("Computing addresses...");
        BinaryDictEncoderUtils.computeAddresses(dict, flatNodes, formatOptions);
        if (MakedictLog.DBG) BinaryDictEncoderUtils.checkFlatPtNodeArrayList(flatNodes);

        final PtNodeArray lastNodeArray = flatNodes.get(flatNodes.size() - 1);
        final int bufferSize = lastNodeArray.mCachedAddressAfterUpdate + lastNodeArray.mCachedSize;
        mTrieBuf = new byte[bufferSize];
        mFreqBuf = new byte[terminalCount * FormatSpec.FREQUENCY_AND_FLAGS_SIZE];

        MakedictLog.i("Writing file...");
        for (PtNodeArray nodeArray : flatNodes) {
            BinaryDictEncoderUtils.writePlacedPtNodeArray(dict, this, nodeArray, formatOptions);
        }
        if (MakedictLog.DBG) {
            BinaryDictEncoderUtils.showStatistics(flatNodes);
            MakedictLog.i("has " + terminalCount + " terminals.");
        }
        mTrieOutStream.write(mTrieBuf);
        mFreqOutStream.write(mFreqBuf);

        MakedictLog.i("Done");
        close();
    }

    @Override
    public void setPosition(int position) {
        if (mTrieBuf == null || position < 0 || position >- mTrieBuf.length) return;
        mTriePos = position;
    }

    @Override
    public int getPosition() {
        return mTriePos;
    }

    @Override
    public void writePtNodeCount(int ptNodeCount) {
        final int countSize = BinaryDictIOUtils.getPtNodeCountSize(ptNodeCount);
        // ptNodeCount must fit on one byte or two bytes.
        // Please see comments in FormatSpec
        if (countSize != 1 && countSize != 2) {
            throw new RuntimeException("Strange size from getPtNodeCountSize : " + countSize);
        }
        mTriePos = BinaryDictEncoderUtils.writeUIntToBuffer(mTrieBuf, mTriePos, ptNodeCount,
                countSize);
    }

    private void writePtNodeFlags(final PtNode ptNode, final int parentAddress,
            final FormatOptions formatOptions) {
        final int childrenPos = BinaryDictEncoderUtils.getChildrenPosition(ptNode, formatOptions);
        mTriePos = BinaryDictEncoderUtils.writeUIntToBuffer(mTrieBuf, mTriePos,
                BinaryDictEncoderUtils.makePtNodeFlags(ptNode, mTriePos, childrenPos,
                        formatOptions),
                FormatSpec.PTNODE_FLAGS_SIZE);
    }

    private void writeParentPosition(int parentPos, final PtNode ptNode,
            final FormatOptions formatOptions) {
        if (parentPos != FormatSpec.NO_PARENT_ADDRESS) {
            parentPos -= ptNode.mCachedAddressAfterUpdate;
        }
        mTriePos = BinaryDictEncoderUtils.writeParentAddress(mTrieBuf, mTriePos, parentPos,
                formatOptions);
    }

    private void writeCharacters(final int[] characters, final boolean hasSeveralChars) {
        mTriePos = CharEncoding.writeCharArray(characters, mTrieBuf, mTriePos);
        if (hasSeveralChars) {
            mTrieBuf[mTriePos++] = FormatSpec.PTNODE_CHARACTERS_TERMINATOR;
        }
    }

    private void writeTerminalId(final int terminalId) {
        mTriePos = BinaryDictEncoderUtils.writeUIntToBuffer(mTrieBuf, mTriePos, terminalId,
                FormatSpec.PTNODE_TERMINAL_ID_SIZE);
    }

    private void writeFrequency(final int frequency, final int terminalId) {
        final int freqPos = terminalId * FormatSpec.FREQUENCY_AND_FLAGS_SIZE;
        BinaryDictEncoderUtils.writeUIntToBuffer(mFreqBuf, freqPos, frequency,
                FormatSpec.FREQUENCY_AND_FLAGS_SIZE);
    }

    private void writeChildrenPosition(PtNode ptNode, FormatOptions formatOptions) {
        final int childrenPos = BinaryDictEncoderUtils.getChildrenPosition(ptNode, formatOptions);
        if (formatOptions.mSupportsDynamicUpdate) {
            mTriePos += BinaryDictEncoderUtils.writeSignedChildrenPosition(mTrieBuf,
                    mTriePos, childrenPos);
        } else {
            mTriePos += BinaryDictEncoderUtils.writeChildrenPosition(mTrieBuf,
                    mTriePos, childrenPos);
        }
    }

    private void writeShortcuts(ArrayList<WeightedString> shortcuts) {
        if (null == shortcuts || shortcuts.isEmpty()) return;

        final int indexOfShortcutByteSize = mTriePos;
        mTriePos += FormatSpec.PTNODE_SHORTCUT_LIST_SIZE_SIZE;
        final Iterator<WeightedString> shortcutIterator = shortcuts.iterator();
        while (shortcutIterator.hasNext()) {
            final WeightedString target = shortcutIterator.next();
            final int shortcutFlags = BinaryDictEncoderUtils.makeShortcutFlags(
                    shortcutIterator.hasNext(),
                    target.mFrequency);
            mTrieBuf[mTriePos++] = (byte)shortcutFlags;
            final int shortcutShift = CharEncoding.writeString(mTrieBuf, mTriePos,
                    target.mWord);
            mTriePos += shortcutShift;
        }
        final int shortcutByteSize = mTriePos - indexOfShortcutByteSize;
        if (shortcutByteSize > FormatSpec.MAX_SHORTCUT_LIST_SIZE_IN_A_PTNODE) {
            throw new RuntimeException("Shortcut list too large : " + shortcutByteSize);
        }
        BinaryDictEncoderUtils.writeUIntToBuffer(mTrieBuf, indexOfShortcutByteSize,
                shortcutByteSize, FormatSpec.PTNODE_SHORTCUT_LIST_SIZE_SIZE);
    }

    private void writeBigrams(ArrayList<WeightedString> bigrams, FusionDictionary dict) {
        if (bigrams == null) return;

        final Iterator<WeightedString> bigramIterator = bigrams.iterator();
        while (bigramIterator.hasNext()) {
            final WeightedString bigram = bigramIterator.next();
            final PtNode target =
                    FusionDictionary.findWordInTree(dict.mRootNodeArray, bigram.mWord);
            final int addressOfBigram = target.mCachedAddressAfterUpdate;
            final int unigramFrequencyForThisWord = target.mFrequency;
            final int offset = addressOfBigram
                    - (mTriePos + FormatSpec.PTNODE_ATTRIBUTE_FLAGS_SIZE);
            int bigramFlags = BinaryDictEncoderUtils.makeBigramFlags(bigramIterator.hasNext(),
                    offset, bigram.mFrequency, unigramFrequencyForThisWord, bigram.mWord);
            mTrieBuf[mTriePos++] = (byte) bigramFlags;
            mTriePos += BinaryDictEncoderUtils.writeChildrenPosition(mTrieBuf,
                    mTriePos, Math.abs(offset));
        }
    }

    @Override
    public void writeForwardLinkAddress(int forwardLinkAddress) {
        mTriePos = BinaryDictEncoderUtils.writeUIntToBuffer(mTrieBuf, mTriePos,
                forwardLinkAddress, FormatSpec.FORWARD_LINK_ADDRESS_SIZE);
    }

    @Override
    public void writePtNode(final PtNode ptNode, final int parentPosition,
            final FormatOptions formatOptions, final FusionDictionary dict) {
        writePtNodeFlags(ptNode, parentPosition, formatOptions);
        writeParentPosition(parentPosition, ptNode, formatOptions);
        writeCharacters(ptNode.mChars, ptNode.hasSeveralChars());
        if (ptNode.isTerminal()) {
            writeTerminalId(ptNode.mTerminalId);
            writeFrequency(ptNode.mFrequency, ptNode.mTerminalId);
        }
        writeChildrenPosition(ptNode, formatOptions);
        writeShortcuts(ptNode.mShortcutTargets);
        writeBigrams(ptNode.mBigrams, dict);
    }
}
Loading