Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit fbc7e61a authored by Ken Wakasa's avatar Ken Wakasa Committed by Android (Google) Code Review
Browse files

Merge "Add Ver4DictEncoder."

parents ffebc9cf a141d8ef
Loading
Loading
Loading
Loading
+15 −4
Original line number Diff line number Diff line
@@ -126,8 +126,14 @@ public class BinaryDictEncoderUtils {
     */
    private static int getPtNodeMaximumSize(final PtNode ptNode, final FormatOptions options) {
        int size = getNodeHeaderSize(ptNode, options);
        // If terminal, one byte for the frequency
        if (ptNode.isTerminal()) size += FormatSpec.PTNODE_FREQUENCY_SIZE;
        if (ptNode.isTerminal()) {
            // If terminal, one byte for the frequency or four bytes for the terminal id.
            if (options.mHasTerminalId) {
                size += FormatSpec.PTNODE_TERMINAL_ID_SIZE;
            } else {
                size += FormatSpec.PTNODE_FREQUENCY_SIZE;
            }
        }
        size += FormatSpec.PTNODE_MAX_ADDRESS_SIZE; // For children address
        size += getShortcutListSize(ptNode.mShortcutTargets);
        if (null != ptNode.mBigrams) {
@@ -345,7 +351,13 @@ public class BinaryDictEncoderUtils {
                changed = true;
            }
            int nodeSize = getNodeHeaderSize(ptNode, formatOptions);
            if (ptNode.isTerminal()) nodeSize += FormatSpec.PTNODE_FREQUENCY_SIZE;
            if (ptNode.isTerminal()) {
                if (formatOptions.mHasTerminalId) {
                    nodeSize += FormatSpec.PTNODE_TERMINAL_ID_SIZE;
                } else {
                    nodeSize += FormatSpec.PTNODE_FREQUENCY_SIZE;
                }
            }
            if (formatOptions.mSupportsDynamicUpdate) {
                nodeSize += FormatSpec.SIGNED_CHILDREN_ADDRESS_SIZE;
            } else if (null != ptNode.mChildren) {
@@ -787,7 +799,6 @@ public class BinaryDictEncoderUtils {
                        + FormatSpec.MAX_TERMINAL_FREQUENCY
                        + " : " + ptNode.mFrequency);
            }

            dictEncoder.writePtNode(ptNode, parentPosition, formatOptions, dict);
        }
        if (formatOptions.mSupportsDynamicUpdate) {
+13 −1
Original line number Diff line number Diff line
@@ -198,9 +198,12 @@ public final class FormatSpec {

    public static final int MAGIC_NUMBER = 0x9BC13AFE;
    static final int MINIMUM_SUPPORTED_VERSION = 2;
    static final int MAXIMUM_SUPPORTED_VERSION = 3;
    static final int MAXIMUM_SUPPORTED_VERSION = 4;
    static final int NOT_A_VERSION_NUMBER = -1;
    static final int FIRST_VERSION_WITH_DYNAMIC_UPDATE = 3;
    static final int FIRST_VERSION_WITH_TERMINAL_ID = 4;
    static final int VERSION3 = 3;
    static final int VERSION4 = 4;

    // These options need to be the same numeric values as the one in the native reading code.
    static final int GERMAN_UMLAUT_PROCESSING_FLAG = 0x1;
@@ -251,11 +254,17 @@ public final class FormatSpec {
    static final int PTNODE_TERMINATOR_SIZE = 1;
    static final int PTNODE_FLAGS_SIZE = 1;
    static final int PTNODE_FREQUENCY_SIZE = 1;
    static final int PTNODE_TERMINAL_ID_SIZE = 4;
    static final int PTNODE_MAX_ADDRESS_SIZE = 3;
    static final int PTNODE_ATTRIBUTE_FLAGS_SIZE = 1;
    static final int PTNODE_ATTRIBUTE_MAX_ADDRESS_SIZE = 3;
    static final int PTNODE_SHORTCUT_LIST_SIZE_SIZE = 2;

    // These values are used only by version 4 or later.
    static final String TRIE_FILE_EXTENSION = ".trie";
    static final String FREQ_FILE_EXTENSION = ".freq";
    static final int FREQUENCY_AND_FLAGS_SIZE = 2;

    static final int NO_CHILDREN_ADDRESS = Integer.MIN_VALUE;
    static final int NO_PARENT_ADDRESS = 0;
    static final int NO_FORWARD_LINK_ADDRESS = 0;
@@ -264,6 +273,7 @@ public final class FormatSpec {
    static final int MAX_PTNODES_FOR_ONE_BYTE_PTNODE_COUNT = 0x7F; // 127
    static final int MAX_PTNODES_IN_A_PT_NODE_ARRAY = 0x7FFF; // 32767
    static final int MAX_BIGRAMS_IN_A_PTNODE = 10000;
    static final int MAX_SHORTCUT_LIST_SIZE_IN_A_PTNODE = 0xFFFF;

    static final int MAX_TERMINAL_FREQUENCY = 255;
    static final int MAX_BIGRAM_FREQUENCY = 15;
@@ -287,6 +297,7 @@ public final class FormatSpec {
    public static final class FormatOptions {
        public final int mVersion;
        public final boolean mSupportsDynamicUpdate;
        public final boolean mHasTerminalId;
        @UsedForTesting
        public FormatOptions(final int version) {
            this(version, false);
@@ -300,6 +311,7 @@ public final class FormatSpec {
                        + FIRST_VERSION_WITH_DYNAMIC_UPDATE + " and ulterior.");
            }
            mSupportsDynamicUpdate = supportsDynamicUpdate;
            mHasTerminalId = (version >= FIRST_VERSION_WITH_TERMINAL_ID);
        }
    }

+6 −0
Original line number Diff line number Diff line
@@ -111,6 +111,7 @@ public final class FusionDictionary implements Iterable<Word> {
        ArrayList<WeightedString> mShortcutTargets;
        ArrayList<WeightedString> mBigrams;
        int mFrequency; // NOT_A_TERMINAL == mFrequency indicates this is not a terminal.
        int mTerminalId; // NOT_A_TERMINAL == mTerminalId indicates this is not a terminal.
        PtNodeArray mChildren;
        boolean mIsNotAWord; // Only a shortcut
        boolean mIsBlacklistEntry;
@@ -129,6 +130,7 @@ public final class FusionDictionary implements Iterable<Word> {
                final boolean isNotAWord, final boolean isBlacklistEntry) {
            mChars = chars;
            mFrequency = frequency;
            mTerminalId = frequency;
            mShortcutTargets = shortcutTargets;
            mBigrams = bigrams;
            mChildren = null;
@@ -156,6 +158,10 @@ public final class FusionDictionary implements Iterable<Word> {
            mChildren.mData.add(n);
        }

        public int getTerminalId() {
            return mTerminalId;
        }

        public boolean isTerminal() {
            return NOT_A_TERMINAL != mFrequency;
        }
+2 −2
Original line number Diff line number Diff line
@@ -68,7 +68,7 @@ public class Ver3DictEncoder implements DictEncoder {
    @Override
    public void writeDictionary(final FusionDictionary dict, final FormatOptions formatOptions)
            throws IOException, UnsupportedFormatException {
        if (formatOptions.mVersion > 3) {
        if (formatOptions.mVersion > FormatSpec.VERSION3) {
            throw new UnsupportedFormatException(
                    "The given format options has wrong version number : "
                    + formatOptions.mVersion);
@@ -200,7 +200,7 @@ public class Ver3DictEncoder implements DictEncoder {
            mPosition += shortcutShift;
        }
        final int shortcutByteSize = mPosition - indexOfShortcutByteSize;
        if (shortcutByteSize > 0xFFFF) {
        if (shortcutByteSize > FormatSpec.MAX_SHORTCUT_LIST_SIZE_IN_A_PTNODE) {
            throw new RuntimeException("Shortcut list too large");
        }
        BinaryDictEncoderUtils.writeUIntToBuffer(mBuffer, indexOfShortcutByteSize, shortcutByteSize,
+269 −0
Original line number Diff line number Diff line
/*
 * Copyright (C) 2013 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.android.inputmethod.latin.makedict;

import com.android.inputmethod.annotations.UsedForTesting;
import com.android.inputmethod.latin.makedict.BinaryDictDecoderUtils.CharEncoding;
import com.android.inputmethod.latin.makedict.FormatSpec.FileHeader;
import com.android.inputmethod.latin.makedict.FormatSpec.FormatOptions;
import com.android.inputmethod.latin.makedict.FusionDictionary.DictionaryOptions;
import com.android.inputmethod.latin.makedict.FusionDictionary.PtNode;
import com.android.inputmethod.latin.makedict.FusionDictionary.PtNodeArray;
import com.android.inputmethod.latin.makedict.FusionDictionary.WeightedString;

import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.util.ArrayList;
import java.util.Iterator;

/**
 * An implementation of DictEncoder for version 4 binary dictionary.
 *
 * A version 4 dictionary is written as a directory named "id.version" (taken from the file
 * header) inside the directory given to the constructor. That directory receives two files:
 * a trie file holding the dictionary header followed by the PtNode arrays, and a frequency
 * file holding FormatSpec.FREQUENCY_AND_FLAGS_SIZE bytes per terminal, indexed by terminal id.
 */
@UsedForTesting
public class Ver4DictEncoder implements DictEncoder {
    /** Directory under which the dictionary directory is created. */
    private final File mDictPlacedDir;
    /** In-memory image of the PtNode arrays; flushed to the trie file once complete. */
    private byte[] mTrieBuf;
    /** In-memory image of the per-terminal frequencies; flushed to the frequency file. */
    private byte[] mFreqBuf;
    /** Current write position inside mTrieBuf. */
    private int mTriePos;
    private OutputStream mTrieOutStream;
    private OutputStream mFreqOutStream;

    /**
     * @param dictPlacedDir the existing directory in which the dictionary directory is created.
     */
    @UsedForTesting
    public Ver4DictEncoder(final File dictPlacedDir) {
        mDictPlacedDir = dictPlacedDir;
    }

    /**
     * Creates the dictionary directory and its trie/frequency files if needed, and opens an
     * output stream on each of them.
     *
     * @throws IOException if the directory or the files cannot be created or opened.
     */
    private void openStreams(final FormatOptions formatOptions, final DictionaryOptions dictOptions)
            throws FileNotFoundException, IOException {
        final FileHeader header = new FileHeader(0, dictOptions, formatOptions);
        final String filename = header.getId() + "." + header.getVersion();
        final File dictDir = new File(mDictPlacedDir, filename);
        final File trieFile = new File(dictDir, filename + FormatSpec.TRIE_FILE_EXTENSION);
        final File freqFile = new File(dictDir, filename + FormatSpec.FREQ_FILE_EXTENSION);
        if (!dictDir.isDirectory()) {
            // A regular file may be squatting on the directory name; replace it.
            if (dictDir.exists()) dictDir.delete();
            if (!dictDir.mkdirs() && !dictDir.isDirectory()) {
                throw new IOException("Cannot create the dictionary directory : " + dictDir);
            }
        }
        if (!trieFile.exists()) trieFile.createNewFile();
        if (!freqFile.exists()) freqFile.createNewFile();

        // Only publish the streams to the fields once both are open, and do not leak the first
        // stream if opening the second one fails.
        final OutputStream trieOutStream = new FileOutputStream(trieFile);
        boolean succeeded = false;
        try {
            mFreqOutStream = new FileOutputStream(freqFile);
            mTrieOutStream = trieOutStream;
            succeeded = true;
        } finally {
            if (!succeeded) trieOutStream.close();
        }
    }

    /**
     * Closes both output streams and forgets them. The frequency stream is closed even when
     * closing the trie stream fails.
     */
    private void close() throws IOException {
        try {
            if (mTrieOutStream != null) {
                mTrieOutStream.close();
            }
        } finally {
            try {
                if (mFreqOutStream != null) {
                    mFreqOutStream.close();
                }
            } finally {
                mTrieOutStream = null;
                mFreqOutStream = null;
            }
        }
    }

    /**
     * Encodes the given dictionary in version 4 format and writes it to disk.
     *
     * @param dict the dictionary to write.
     * @param formatOptions the format options; the version must be FormatSpec.VERSION4.
     * @throws IOException if the files cannot be created or written.
     * @throws UnsupportedFormatException if the version is not 4, or if the path given to the
     *         constructor is not a directory.
     */
    @Override
    public void writeDictionary(final FusionDictionary dict, final FormatOptions formatOptions)
            throws IOException, UnsupportedFormatException {
        if (formatOptions.mVersion != FormatSpec.VERSION4) {
            throw new UnsupportedFormatException("File header has a wrong version number : "
                    + formatOptions.mVersion);
        }
        if (!mDictPlacedDir.isDirectory()) {
            throw new UnsupportedFormatException("Given path is not a directory.");
        }

        if (mTrieOutStream == null) {
            openStreams(formatOptions, dict.mOptions);
        }

        // From here on the streams are open: make sure they get closed whatever happens.
        try {
            BinaryDictEncoderUtils.writeDictionaryHeader(mTrieOutStream, dict, formatOptions);

            MakedictLog.i("Flattening the tree...");
            final ArrayList<PtNodeArray> flatNodes =
                    BinaryDictEncoderUtils.flattenTree(dict.mRootNodeArray);
            // Terminal ids are handed out sequentially, in flattened order. They double as the
            // index of the terminal's entry in the frequency buffer.
            int terminalCount = 0;
            for (final PtNodeArray array : flatNodes) {
                for (final PtNode node : array.mData) {
                    if (node.isTerminal()) node.mTerminalId = terminalCount++;
                }
            }

            MakedictLog.i("Computing addresses...");
            BinaryDictEncoderUtils.computeAddresses(dict, flatNodes, formatOptions);
            if (MakedictLog.DBG) BinaryDictEncoderUtils.checkFlatPtNodeArrayList(flatNodes);

            // The trie buffer ends where the last PtNode array ends.
            final PtNodeArray lastNodeArray = flatNodes.get(flatNodes.size() - 1);
            final int bufferSize =
                    lastNodeArray.mCachedAddressAfterUpdate + lastNodeArray.mCachedSize;
            mTrieBuf = new byte[bufferSize];
            mFreqBuf = new byte[terminalCount * FormatSpec.FREQUENCY_AND_FLAGS_SIZE];
            // Start from the beginning of the fresh buffer, even if this encoder was used before.
            mTriePos = 0;

            MakedictLog.i("Writing file...");
            for (final PtNodeArray nodeArray : flatNodes) {
                BinaryDictEncoderUtils.writePlacedPtNodeArray(dict, this, nodeArray,
                        formatOptions);
            }
            if (MakedictLog.DBG) {
                BinaryDictEncoderUtils.showStatistics(flatNodes);
                MakedictLog.i("has " + terminalCount + " terminals.");
            }
            mTrieOutStream.write(mTrieBuf);
            mFreqOutStream.write(mFreqBuf);

            MakedictLog.i("Done");
        } finally {
            close();
        }
    }

    /**
     * Moves the write position inside the trie buffer. The request is ignored when there is no
     * buffer yet or when the position is outside of the buffer.
     */
    @Override
    public void setPosition(int position) {
        if (mTrieBuf == null || position < 0 || position >= mTrieBuf.length) return;
        mTriePos = position;
    }

    /** Returns the current write position inside the trie buffer. */
    @Override
    public int getPosition() {
        return mTriePos;
    }

    /**
     * Writes the number of PtNodes of a PtNode array at the current position.
     *
     * @throws RuntimeException if the count cannot be encoded on one or two bytes.
     */
    @Override
    public void writePtNodeCount(int ptNodeCount) {
        final int countSize = BinaryDictIOUtils.getPtNodeCountSize(ptNodeCount);
        // ptNodeCount must fit on one byte or two bytes.
        // Please see comments in FormatSpec
        if (countSize != 1 && countSize != 2) {
            throw new RuntimeException("Strange size from getPtNodeCountSize : " + countSize);
        }
        mTriePos = BinaryDictEncoderUtils.writeUIntToBuffer(mTrieBuf, mTriePos, ptNodeCount,
                countSize);
    }

    /** Writes the flags byte of the given PtNode at the current position. */
    private void writePtNodeFlags(final PtNode ptNode, final FormatOptions formatOptions) {
        final int childrenPos = BinaryDictEncoderUtils.getChildrenPosition(ptNode, formatOptions);
        final int flags = BinaryDictEncoderUtils.makePtNodeFlags(ptNode, mTriePos, childrenPos,
                formatOptions);
        mTriePos = BinaryDictEncoderUtils.writeUIntToBuffer(mTrieBuf, mTriePos, flags,
                FormatSpec.PTNODE_FLAGS_SIZE);
    }

    /**
     * Writes the parent position of the given PtNode. Unless there is no parent, the position
     * is made relative to the address of the PtNode itself before being written.
     */
    private void writeParentPosition(int parentPos, final PtNode ptNode,
            final FormatOptions formatOptions) {
        if (parentPos != FormatSpec.NO_PARENT_ADDRESS) {
            parentPos -= ptNode.mCachedAddressAfterUpdate;
        }
        mTriePos = BinaryDictEncoderUtils.writeParentAddress(mTrieBuf, mTriePos, parentPos,
                formatOptions);
    }

    /** Writes the characters of a PtNode, followed by a terminator if there are several. */
    private void writeCharacters(final int[] characters, final boolean hasSeveralChars) {
        mTriePos = CharEncoding.writeCharArray(characters, mTrieBuf, mTriePos);
        if (hasSeveralChars) {
            mTrieBuf[mTriePos++] = FormatSpec.PTNODE_CHARACTERS_TERMINATOR;
        }
    }

    /** Writes a terminal id on FormatSpec.PTNODE_TERMINAL_ID_SIZE bytes in the trie buffer. */
    private void writeTerminalId(final int terminalId) {
        mTriePos = BinaryDictEncoderUtils.writeUIntToBuffer(mTrieBuf, mTriePos, terminalId,
                FormatSpec.PTNODE_TERMINAL_ID_SIZE);
    }

    /**
     * Writes the frequency of a terminal in the frequency buffer, in the slot selected by its
     * terminal id. The position in the trie buffer is not affected.
     */
    private void writeFrequency(final int frequency, final int terminalId) {
        final int freqPos = terminalId * FormatSpec.FREQUENCY_AND_FLAGS_SIZE;
        BinaryDictEncoderUtils.writeUIntToBuffer(mFreqBuf, freqPos, frequency,
                FormatSpec.FREQUENCY_AND_FLAGS_SIZE);
    }

    /** Writes the children position of the given PtNode, signed for dynamic-update formats. */
    private void writeChildrenPosition(PtNode ptNode, FormatOptions formatOptions) {
        final int childrenPos = BinaryDictEncoderUtils.getChildrenPosition(ptNode, formatOptions);
        // NOTE(review): unlike writeUIntToBuffer, these helpers are used as if they returned the
        // number of bytes written rather than the new position - confirm against their contract.
        if (formatOptions.mSupportsDynamicUpdate) {
            mTriePos += BinaryDictEncoderUtils.writeSignedChildrenPosition(mTrieBuf,
                    mTriePos, childrenPos);
        } else {
            mTriePos += BinaryDictEncoderUtils.writeChildrenPosition(mTrieBuf,
                    mTriePos, childrenPos);
        }
    }

    /**
     * Writes the shortcut list of a PtNode: a size field followed by one flags byte and one
     * string per shortcut. Nothing is written for a null or empty list.
     *
     * @throws RuntimeException if the encoded list is larger than
     *         FormatSpec.MAX_SHORTCUT_LIST_SIZE_IN_A_PTNODE.
     */
    private void writeShortcuts(ArrayList<WeightedString> shortcuts) {
        if (null == shortcuts || shortcuts.isEmpty()) return;

        // Reserve room for the size field; it is filled in once the list has been written.
        final int indexOfShortcutByteSize = mTriePos;
        mTriePos += FormatSpec.PTNODE_SHORTCUT_LIST_SIZE_SIZE;
        final Iterator<WeightedString> shortcutIterator = shortcuts.iterator();
        while (shortcutIterator.hasNext()) {
            final WeightedString target = shortcutIterator.next();
            final int shortcutFlags = BinaryDictEncoderUtils.makeShortcutFlags(
                    shortcutIterator.hasNext(),
                    target.mFrequency);
            mTrieBuf[mTriePos++] = (byte)shortcutFlags;
            final int shortcutShift = CharEncoding.writeString(mTrieBuf, mTriePos,
                    target.mWord);
            mTriePos += shortcutShift;
        }
        // The recorded size includes the size field itself.
        final int shortcutByteSize = mTriePos - indexOfShortcutByteSize;
        if (shortcutByteSize > FormatSpec.MAX_SHORTCUT_LIST_SIZE_IN_A_PTNODE) {
            throw new RuntimeException("Shortcut list too large : " + shortcutByteSize);
        }
        BinaryDictEncoderUtils.writeUIntToBuffer(mTrieBuf, indexOfShortcutByteSize,
                shortcutByteSize, FormatSpec.PTNODE_SHORTCUT_LIST_SIZE_SIZE);
    }

    /**
     * Writes the bigram list of a PtNode: for each bigram, one flags byte followed by the
     * absolute value of the offset to the target word. Nothing is written for a null list.
     *
     * @throws RuntimeException if a bigram target cannot be found in the dictionary.
     */
    private void writeBigrams(ArrayList<WeightedString> bigrams, FusionDictionary dict) {
        if (bigrams == null) return;

        final Iterator<WeightedString> bigramIterator = bigrams.iterator();
        while (bigramIterator.hasNext()) {
            final WeightedString bigram = bigramIterator.next();
            final PtNode target =
                    FusionDictionary.findWordInTree(dict.mRootNodeArray, bigram.mWord);
            // Fail with an explicit message rather than a bare NullPointerException should the
            // lookup come back empty.
            if (target == null) {
                throw new RuntimeException("Bigram target not found in the dictionary : "
                        + bigram.mWord);
            }
            final int addressOfBigram = target.mCachedAddressAfterUpdate;
            final int unigramFrequencyForThisWord = target.mFrequency;
            // The offset is relative to the position right after the flags byte.
            final int offset = addressOfBigram
                    - (mTriePos + FormatSpec.PTNODE_ATTRIBUTE_FLAGS_SIZE);
            final int bigramFlags = BinaryDictEncoderUtils.makeBigramFlags(
                    bigramIterator.hasNext(), offset, bigram.mFrequency,
                    unigramFrequencyForThisWord, bigram.mWord);
            mTrieBuf[mTriePos++] = (byte) bigramFlags;
            mTriePos += BinaryDictEncoderUtils.writeChildrenPosition(mTrieBuf,
                    mTriePos, Math.abs(offset));
        }
    }

    /** Writes a forward link address at the current position. */
    @Override
    public void writeForwardLinkAddress(int forwardLinkAddress) {
        mTriePos = BinaryDictEncoderUtils.writeUIntToBuffer(mTrieBuf, mTriePos,
                forwardLinkAddress, FormatSpec.FORWARD_LINK_ADDRESS_SIZE);
    }

    /**
     * Writes a whole PtNode at the current position: flags, parent position, characters,
     * terminal id (terminals only), children position, shortcuts and bigrams, in this order.
     * For terminals the frequency is written to the frequency buffer instead of the trie.
     */
    @Override
    public void writePtNode(final PtNode ptNode, final int parentPosition,
            final FormatOptions formatOptions, final FusionDictionary dict) {
        writePtNodeFlags(ptNode, formatOptions);
        writeParentPosition(parentPosition, ptNode, formatOptions);
        writeCharacters(ptNode.mChars, ptNode.hasSeveralChars());
        if (ptNode.isTerminal()) {
            writeTerminalId(ptNode.mTerminalId);
            writeFrequency(ptNode.mFrequency, ptNode.mTerminalId);
        }
        writeChildrenPosition(ptNode, formatOptions);
        writeShortcuts(ptNode.mShortcutTargets);
        writeBigrams(ptNode.mBigrams, dict);
    }
}
Loading