Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit a92d0f92 authored by Treehugger Robot's avatar Treehugger Robot Committed by Gerrit Code Review
Browse files

Merge "Fix dicttool build"

parents c42acc53 6a1b3735
Loading
Loading
Loading
Loading
+3 −0
Original line number Diff line number Diff line
@@ -174,6 +174,9 @@ public final class FormatSpec {
    public static final int VERSION202 = 202;
    // Format version for Fava dictionaries.
    public static final int VERSION_DELIGHT3 = 86736212;
    // Oldest format version for which a code point table is emitted; encoders compare the
    // requested format version against this value before building the table.
    public static final int MINIMUM_SUPPORTED_VERSION_OF_CODE_POINT_TABLE = VERSION201;
    // Dictionary version used for testing.
    public static final int VERSION4_ONLY_FOR_TESTING = 399;
    public static final int VERSION402 = 402;
    public static final int VERSION403 = 403;
    // Alias for the version 4 revision that "version 4" currently resolves to (403).
    public static final int VERSION4 = VERSION403;
+677 −0

File added.

Preview size limit exceeded, changes collapsed.

+80 −0
Original line number Diff line number Diff line
/*
 * Copyright (C) 2013 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.android.inputmethod.latin.makedict;

import com.android.inputmethod.latin.makedict.FormatSpec.DictionaryOptions;
import com.android.inputmethod.latin.makedict.FormatSpec.FormatOptions;

import java.io.File;
import java.util.HashMap;

/**
 * Static helpers for building dictionary options, locating dictionary files and picking the
 * {@link DictEncoder} that matches a given binary dictionary format version.
 */
public class BinaryDictUtils {
    public static final int USE_BYTE_ARRAY = 1;
    public static final int USE_BYTE_BUFFER = 2;

    public static final String TEST_DICT_FILE_EXTENSION = ".testDict";

    public static final FormatSpec.FormatOptions STATIC_OPTIONS =
            new FormatSpec.FormatOptions(FormatSpec.VERSION202);
    public static final FormatSpec.FormatOptions DYNAMIC_OPTIONS_WITHOUT_TIMESTAMP =
            new FormatSpec.FormatOptions(FormatSpec.VERSION4, false /* hasTimestamp */);
    public static final FormatSpec.FormatOptions DYNAMIC_OPTIONS_WITH_TIMESTAMP =
            new FormatSpec.FormatOptions(FormatSpec.VERSION4, true /* hasTimestamp */);

    /**
     * Builds dictionary options for an "en_US" dictionary with the given id and version.
     *
     * When the format options carry a timestamp, the historical-info and forgetting-curve
     * attributes are additionally set to true.
     *
     * @param id the value stored under the dictionary id key.
     * @param version the value stored under the dictionary version key.
     * @param formatOptions the format options; only the timestamp flag is consulted.
     * @return the freshly created options.
     */
    public static DictionaryOptions makeDictionaryOptions(final String id, final String version,
            final FormatSpec.FormatOptions formatOptions) {
        final DictionaryOptions result = new DictionaryOptions(new HashMap<String, String>());
        result.mAttributes.put(DictionaryHeader.DICTIONARY_LOCALE_KEY, "en_US");
        result.mAttributes.put(DictionaryHeader.DICTIONARY_ID_KEY, id);
        result.mAttributes.put(DictionaryHeader.DICTIONARY_VERSION_KEY, version);
        if (!formatOptions.mHasTimestamp) {
            // Static dictionaries need nothing beyond the three basic attributes.
            return result;
        }
        result.mAttributes.put(DictionaryHeader.HAS_HISTORICAL_INFO_KEY,
                DictionaryHeader.ATTRIBUTE_VALUE_TRUE);
        result.mAttributes.put(DictionaryHeader.USES_FORGETTING_CURVE_KEY,
                DictionaryHeader.ATTRIBUTE_VALUE_TRUE);
        return result;
    }

    /**
     * Returns the location of a dictionary inside {@code directory}.
     *
     * Version 2 family dictionaries (2, 201, 202) are named
     * {@code name.version + TEST_DICT_FILE_EXTENSION}; version 4 dictionaries are named
     * {@code name.version} without the extension.
     *
     * @param name the base name of the dictionary.
     * @param version the dictionary version string appended to the name.
     * @param formatOptions the format options whose version selects the naming scheme.
     * @param directory the parent directory.
     * @return the dictionary location.
     * @throws RuntimeException if the format version is none of the supported ones.
     */
    public static File getDictFile(final String name, final String version,
            final FormatOptions formatOptions, final File directory) {
        final int formatVersion = formatOptions.mVersion;
        final String versionedName = name + "." + version;
        if (formatVersion == FormatSpec.VERSION4) {
            return new File(directory, versionedName);
        }
        final boolean isVersion2Family = formatVersion == FormatSpec.VERSION2
                || formatVersion == FormatSpec.VERSION201
                || formatVersion == FormatSpec.VERSION202;
        if (!isVersion2Family) {
            throw new RuntimeException("the format option has a wrong version : "
                    + formatOptions.mVersion);
        }
        return new File(directory, versionedName + TEST_DICT_FILE_EXTENSION);
    }

    /**
     * Creates the encoder matching the format version.
     *
     * For version 4 the target is created as a directory when it is not one already; the
     * outcome of that attempt is not checked here. For version 202 a version 2 encoder with
     * the code point table switched off is returned.
     *
     * @param file the file (version 202) or directory (version 4) to encode into.
     * @param formatOptions the format options whose version selects the encoder.
     * @return the encoder.
     * @throws RuntimeException if the format version is neither version 4 nor version 202.
     */
    public static DictEncoder getDictEncoder(final File file, final FormatOptions formatOptions) {
        final int formatVersion = formatOptions.mVersion;
        if (formatVersion == FormatSpec.VERSION4) {
            final boolean alreadyDirectory = file.isDirectory();
            if (!alreadyDirectory) {
                file.mkdir();
            }
            return new Ver4DictEncoder(file);
        }
        if (formatVersion == FormatSpec.VERSION202) {
            return new Ver2DictEncoder(file, Ver2DictEncoder.CODE_POINT_TABLE_OFF);
        }
        throw new RuntimeException("The format option has a wrong version : "
                + formatOptions.mVersion);
    }
}
+279 −0
Original line number Diff line number Diff line
/*
 * Copyright (C) 2013 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.android.inputmethod.latin.makedict;

import com.android.inputmethod.annotations.UsedForTesting;
import com.android.inputmethod.latin.makedict.BinaryDictDecoderUtils.CharEncoding;
import com.android.inputmethod.latin.makedict.BinaryDictEncoderUtils.CodePointTable;
import com.android.inputmethod.latin.makedict.FormatSpec.FormatOptions;
import com.android.inputmethod.latin.makedict.FusionDictionary.PtNode;
import com.android.inputmethod.latin.makedict.FusionDictionary.PtNodeArray;

import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map.Entry;

/**
 * An implementation of DictEncoder for version 2 binary dictionary.
 *
 * The whole node structure is first laid out in an in-memory buffer and then flushed to the
 * output stream in one write. Only {@link FormatSpec#VERSION202} is accepted.
 */
@UsedForTesting
public class Ver2DictEncoder implements DictEncoder {

    private final File mDictFile;
    private OutputStream mOutStream;
    private byte[] mBuffer;
    private int mPosition;
    private final int mCodePointTableMode;
    public static final int CODE_POINT_TABLE_OFF = 0;
    public static final int CODE_POINT_TABLE_ON = 1;

    /**
     * Creates an encoder that writes to a file; the file is opened lazily by
     * {@link #writeDictionary}.
     *
     * @param dictFile the destination file.
     * @param codePointTableMode {@link #CODE_POINT_TABLE_ON} or {@link #CODE_POINT_TABLE_OFF}.
     */
    @UsedForTesting
    public Ver2DictEncoder(final File dictFile, final int codePointTableMode) {
        mDictFile = dictFile;
        mOutStream = null;
        mBuffer = null;
        mCodePointTableMode = codePointTableMode;
    }

    // This constructor is used only by BinaryDictOffdeviceUtilsTests.
    // If you want to use this in the production code, you should consider keeping consistency of
    // the interface of Ver3DictDecoder by using factory.
    // Note that writeDictionary() closes the supplied stream when it finishes.
    @UsedForTesting
    public Ver2DictEncoder(final OutputStream outStream) {
        mDictFile = null;
        mOutStream = outStream;
        mCodePointTableMode = CODE_POINT_TABLE_OFF;
    }

    /** Opens an output stream on the destination file. */
    private void openStream() throws FileNotFoundException {
        mOutStream = new FileOutputStream(mDictFile);
    }

    /** Closes the output stream, if any, and forgets it. */
    private void close() throws IOException {
        if (mOutStream != null) {
            mOutStream.close();
            mOutStream = null;
        }
    }

    /**
     * Builds the code point table of a dictionary.
     *
     * Code points are counted over all words, sorted by descending occurrence count (ties are
     * broken by descending code point), and the most frequent ones are assigned consecutive
     * one-byte codes from {@link FormatSpec#MINIMAL_ONE_BYTE_CHARACTER_VALUE} up to
     * {@link FormatSpec#MAXIMAL_ONE_BYTE_CHARACTER_VALUE}.
     *
     * @param dict the dictionary whose words are scanned.
     * @return the table holding both the code point to one-byte-code map and the sorted
     *         occurrence list.
     */
    // Package for testing
    static CodePointTable makeCodePointTable(final FusionDictionary dict) {
        final HashMap<Integer, Integer> codePointOccurrenceCounts = new HashMap<>();
        for (final WordProperty word : dict) {
            // Store per code point occurrence
            final String wordString = word.mWord;
            final int length = wordString.length();
            int i = 0;
            while (i < length) {
                final int codePoint = Character.codePointAt(wordString, i);
                final Integer previousCount = codePointOccurrenceCounts.get(codePoint);
                codePointOccurrenceCounts.put(codePoint,
                        previousCount == null ? 1 : previousCount + 1);
                // Advance by the number of chars this code point occupies, so that the low
                // surrogate of a supplementary character is not counted as a code point of
                // its own.
                i += Character.charCount(codePoint);
            }
        }
        final ArrayList<Entry<Integer, Integer>> codePointOccurrenceArray =
                new ArrayList<>(codePointOccurrenceCounts.entrySet());
        // Descending order sort by occurrence (value side)
        Collections.sort(codePointOccurrenceArray, new Comparator<Entry<Integer, Integer>>() {
            @Override
            public int compare(final Entry<Integer, Integer> a, final Entry<Integer, Integer> b) {
                // Compare by value, never by reference: boxed Integers outside the small
                // cached range are distinct objects even when they hold the same number.
                final int byOccurrence = b.getValue().compareTo(a.getValue());
                if (byOccurrence != 0) {
                    return byOccurrence;
                }
                return b.getKey().compareTo(a.getKey());
            }
        });
        int currentCodePointTableIndex = FormatSpec.MINIMAL_ONE_BYTE_CHARACTER_VALUE;
        // Temporary map for writing of nodes
        final HashMap<Integer, Integer> codePointToOneByteCodeMap = new HashMap<>();
        for (final Entry<Integer, Integer> entry : codePointOccurrenceArray) {
            // Put a relation from the original code point to the one byte code.
            codePointToOneByteCodeMap.put(entry.getKey(), currentCodePointTableIndex);
            if (FormatSpec.MAXIMAL_ONE_BYTE_CHARACTER_VALUE < ++currentCodePointTableIndex) {
                break;
            }
        }
        // codePointToOneByteCodeMap for writing the trie
        // codePointOccurrenceArray for writing the header
        return new CodePointTable(codePointToOneByteCodeMap, codePointOccurrenceArray);
    }

    /**
     * Writes the dictionary header and body to the output stream, then closes the stream.
     *
     * The stream is closed whether or not writing succeeds.
     *
     * @param dict the dictionary to write.
     * @param formatOptions the format options; the version must be
     *        {@link FormatSpec#VERSION202}.
     * @throws IOException if opening, writing or closing the stream fails.
     * @throws UnsupportedFormatException if the format version is not supported.
     */
    @Override
    public void writeDictionary(final FusionDictionary dict, final FormatOptions formatOptions)
            throws IOException, UnsupportedFormatException {
        // We no longer support anything but the latest version of v2.
        if (formatOptions.mVersion != FormatSpec.VERSION202) {
            throw new UnsupportedFormatException(
                    "The given format options has wrong version number : "
                    + formatOptions.mVersion);
        }

        if (mOutStream == null) {
            openStream();
        }

        try {
            // Make code point conversion table ordered by occurrence of code points
            // Version 201 or later have codePointTable
            final CodePointTable codePointTable;
            if (mCodePointTableMode == CODE_POINT_TABLE_OFF || formatOptions.mVersion
                    < FormatSpec.MINIMUM_SUPPORTED_VERSION_OF_CODE_POINT_TABLE) {
                codePointTable = new CodePointTable();
            } else {
                codePointTable = makeCodePointTable(dict);
            }

            BinaryDictEncoderUtils.writeDictionaryHeader(mOutStream, dict, formatOptions,
                    codePointTable.mCodePointOccurrenceArray);

            // Addresses are limited to 3 bytes, but since addresses can be relative to each
            // node array, the structure itself is not limited to 16MB. However, if it is over
            // 16MB deciding the order of the PtNode arrays becomes a quite complicated problem,
            // because though the dictionary itself does not have a size limit, each node array
            // must still be within 16MB of all its children and parents. As long as this is
            // ensured, the dictionary file may grow to any size.

            // Leave the choice of the optimal node order to the flattenTree function.
            MakedictLog.i("Flattening the tree...");
            ArrayList<PtNodeArray> flatNodes =
                    BinaryDictEncoderUtils.flattenTree(dict.mRootNodeArray);

            MakedictLog.i("Computing addresses...");
            BinaryDictEncoderUtils.computeAddresses(dict, flatNodes,
                    codePointTable.mCodePointToOneByteCodeMap);
            MakedictLog.i("Checking PtNode array...");
            if (MakedictLog.DBG) BinaryDictEncoderUtils.checkFlatPtNodeArrayList(flatNodes);

            // Create a buffer that matches the final dictionary size.
            final PtNodeArray lastNodeArray = flatNodes.get(flatNodes.size() - 1);
            final int bufferSize =
                    lastNodeArray.mCachedAddressAfterUpdate + lastNodeArray.mCachedSize;
            mBuffer = new byte[bufferSize];

            MakedictLog.i("Writing file...");

            for (PtNodeArray nodeArray : flatNodes) {
                BinaryDictEncoderUtils.writePlacedPtNodeArray(dict, this, nodeArray,
                        codePointTable.mCodePointToOneByteCodeMap);
            }
            if (MakedictLog.DBG) BinaryDictEncoderUtils.showStatistics(flatNodes);
            mOutStream.write(mBuffer, 0, mPosition);

            MakedictLog.i("Done");
        } finally {
            // Release the stream on failure too, so a broken write does not leak the handle.
            close();
        }
    }

    /**
     * Moves the write position inside the buffer. The request is silently ignored when there
     * is no buffer yet or when the position lies outside of it.
     */
    @Override
    public void setPosition(final int position) {
        if (mBuffer == null || position < 0 || position >= mBuffer.length) return;
        mPosition = position;
    }

    @Override
    public int getPosition() {
        return mPosition;
    }

    /**
     * Writes the PtNode count of a node array at the current position, on one or two bytes.
     * Two-byte counts are flagged with
     * {@link FormatSpec#LARGE_PTNODE_ARRAY_SIZE_FIELD_SIZE_FLAG}.
     *
     * @throws RuntimeException if the computed count size is neither 1 nor 2.
     */
    @Override
    public void writePtNodeCount(final int ptNodeCount) {
        final int countSize = BinaryDictIOUtils.getPtNodeCountSize(ptNodeCount);
        if (countSize != 1 && countSize != 2) {
            throw new RuntimeException("Strange size from getGroupCountSize : " + countSize);
        }
        final int encodedPtNodeCount = (countSize == 2) ?
                (ptNodeCount | FormatSpec.LARGE_PTNODE_ARRAY_SIZE_FIELD_SIZE_FLAG) : ptNodeCount;
        mPosition = BinaryDictEncoderUtils.writeUIntToBuffer(mBuffer, mPosition, encodedPtNodeCount,
                countSize);
    }

    /** Writes the flags of a PtNode at the current position and advances past them. */
    private void writePtNodeFlags(final PtNode ptNode,
            final HashMap<Integer, Integer> codePointToOneByteCodeMap) {
        final int childrenPos = BinaryDictEncoderUtils.getChildrenPosition(ptNode,
                codePointToOneByteCodeMap);
        mPosition = BinaryDictEncoderUtils.writeUIntToBuffer(mBuffer, mPosition,
                BinaryDictEncoderUtils.makePtNodeFlags(ptNode, childrenPos),
                FormatSpec.PTNODE_FLAGS_SIZE);
    }

    /**
     * Writes the characters of a PtNode, followed by the terminator when the node holds
     * several characters.
     */
    private void writeCharacters(final int[] codePoints, final boolean hasSeveralChars,
            final HashMap<Integer, Integer> codePointToOneByteCodeMap) {
        mPosition = CharEncoding.writeCharArray(codePoints, mBuffer, mPosition,
                codePointToOneByteCodeMap);
        if (hasSeveralChars) {
            mBuffer[mPosition++] = FormatSpec.PTNODE_CHARACTERS_TERMINATOR;
        }
    }

    /** Writes the frequency of a PtNode; a negative frequency writes nothing. */
    private void writeFrequency(final int frequency) {
        if (frequency >= 0) {
            mPosition = BinaryDictEncoderUtils.writeUIntToBuffer(mBuffer, mPosition, frequency,
                    FormatSpec.PTNODE_FREQUENCY_SIZE);
        }
    }

    /**
     * Writes the children position of a PtNode and advances the position by the value the
     * helper returns (presumably the number of bytes written; confirm against
     * BinaryDictEncoderUtils).
     */
    private void writeChildrenPosition(final PtNode ptNode,
            final HashMap<Integer, Integer> codePointToOneByteCodeMap) {
        final int childrenPos = BinaryDictEncoderUtils.getChildrenPosition(ptNode,
                codePointToOneByteCodeMap);
        mPosition += BinaryDictEncoderUtils.writeChildrenPosition(mBuffer, mPosition,
                childrenPos);
    }

    /**
     * Write a bigram attributes list to mBuffer.
     *
     * Each bigram is written as a flags field followed by the absolute value of the offset
     * from the current position to the target word.
     * NOTE(review): the target word is assumed to exist in the tree; a missing target would
     * make the lookup result null here. Confirm that callers guarantee this.
     *
     * @param bigrams the bigram attributes list.
     * @param dict the dictionary the node array is a part of (for relative offsets).
     */
    private void writeBigrams(final ArrayList<WeightedString> bigrams,
            final FusionDictionary dict) {
        if (bigrams == null) return;

        final Iterator<WeightedString> bigramIterator = bigrams.iterator();
        while (bigramIterator.hasNext()) {
            final WeightedString bigram = bigramIterator.next();
            final PtNode target =
                    FusionDictionary.findWordInTree(dict.mRootNodeArray, bigram.mWord);
            final int addressOfBigram = target.mCachedAddressAfterUpdate;
            final int unigramFrequencyForThisWord = target.getProbability();
            final int offset = addressOfBigram
                    - (mPosition + FormatSpec.PTNODE_ATTRIBUTE_FLAGS_SIZE);
            // hasNext() after next() tells the flags whether more bigrams follow this one.
            final int bigramFlags = BinaryDictEncoderUtils.makeBigramFlags(bigramIterator.hasNext(),
                    offset, bigram.getProbability(), unigramFrequencyForThisWord, bigram.mWord);
            mPosition = BinaryDictEncoderUtils.writeUIntToBuffer(mBuffer, mPosition, bigramFlags,
                    FormatSpec.PTNODE_ATTRIBUTE_FLAGS_SIZE);
            mPosition += BinaryDictEncoderUtils.writeChildrenPosition(mBuffer, mPosition,
                    Math.abs(offset));
        }
    }

    /**
     * Writes one PtNode at the current position: flags, characters, frequency, children
     * position and bigrams, in that order.
     */
    @Override
    public void writePtNode(final PtNode ptNode, final FusionDictionary dict,
            final HashMap<Integer, Integer> codePointToOneByteCodeMap) {
        writePtNodeFlags(ptNode, codePointToOneByteCodeMap);
        writeCharacters(ptNode.mChars, ptNode.hasSeveralChars(), codePointToOneByteCodeMap);
        writeFrequency(ptNode.getProbability());
        writeChildrenPosition(ptNode, codePointToOneByteCodeMap);
        writeBigrams(ptNode.mBigrams, dict);
    }
}
+133 −0
Original line number Diff line number Diff line
/*
 * Copyright (C) 2013 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.android.inputmethod.latin.makedict;

import com.android.inputmethod.annotations.UsedForTesting;
import com.android.inputmethod.latin.BinaryDictionary;
import com.android.inputmethod.latin.Dictionary;
import com.android.inputmethod.latin.NgramContext;
import com.android.inputmethod.latin.common.LocaleUtils;
import com.android.inputmethod.latin.makedict.FormatSpec.FormatOptions;
import com.android.inputmethod.latin.makedict.FusionDictionary.PtNode;
import com.android.inputmethod.latin.utils.BinaryDictionaryUtils;

import java.io.File;
import java.io.IOException;
import java.util.HashMap;

/**
 * An implementation of DictEncoder for version 4 binary dictionary.
 *
 * The dictionary is produced by creating an empty dictionary in the target directory and then
 * adding every unigram and bigram of the source {@link FusionDictionary} to it. The
 * position-based methods of {@link DictEncoder} are no-ops in this implementation.
 */
@UsedForTesting
public class Ver4DictEncoder implements DictEncoder {
    private final File mDictPlacedDir;

    /**
     * @param dictPlacedDir the directory the dictionary is created in; it must already exist
     *        as a directory when {@link #writeDictionary} is called.
     */
    @UsedForTesting
    public Ver4DictEncoder(final File dictPlacedDir) {
        mDictPlacedDir = dictPlacedDir;
    }

    // TODO: This builds a FusionDictionary first and iterates it to add words to the binary
    // dictionary. However, it is possible to just add words directly to the binary dictionary
    // instead.
    // In the long run, when we stop supporting version 2, FusionDictionary will become deprecated
    // and we can remove it. Then we'll be able to just call BinaryDictionary directly.
    /**
     * Writes the dictionary into the target directory.
     *
     * A unigram that cannot be added is logged and skipped. A bigram that cannot be added, or
     * a failed flush, is logged and stops the writing without an exception. Once the binary
     * dictionary has been opened successfully it is closed on every exit path.
     *
     * @param dict the dictionary to write.
     * @param formatOptions the format options; the version must be
     *        {@link FormatSpec#VERSION4}.
     * @throws IOException if the dictionary cannot be created.
     * @throws UnsupportedFormatException if the version is wrong or the target is not a
     *         directory.
     */
    @Override
    public void writeDictionary(FusionDictionary dict, FormatOptions formatOptions)
            throws IOException, UnsupportedFormatException {
        if (formatOptions.mVersion != FormatSpec.VERSION4) {
            throw new UnsupportedFormatException("File header has a wrong version number : "
                    + formatOptions.mVersion);
        }
        if (!mDictPlacedDir.isDirectory()) {
            throw new UnsupportedFormatException("Given path is not a directory.");
        }
        final String dictPath = mDictPlacedDir.getAbsolutePath();
        final String localeString =
                dict.mOptions.mAttributes.get(DictionaryHeader.DICTIONARY_LOCALE_KEY);
        if (!BinaryDictionaryUtils.createEmptyDictFile(dictPath,
                FormatSpec.VERSION4, LocaleUtils.constructLocaleFromString(localeString),
                dict.mOptions.mAttributes)) {
            throw new IOException("Cannot create dictionary file : "
                + dictPath);
        }
        final BinaryDictionary binaryDict = new BinaryDictionary(dictPath,
                0L, mDictPlacedDir.length(), true /* useFullEditDistance */,
                LocaleUtils.constructLocaleFromString(localeString),
                Dictionary.TYPE_USER /* Dictionary type. Does not matter for us */,
                true /* isUpdatable */);
        if (!binaryDict.isValidDictionary()) {
            // Somehow createEmptyDictFile returned true, but the file was not created correctly
            throw new IOException("Cannot create dictionary file");
        }
        try {
            for (final WordProperty wordProperty : dict) {
                if (!binaryDict.addUnigramEntry(wordProperty.mWord, wordProperty.getProbability(),
                        wordProperty.mIsBeginningOfSentence, wordProperty.mIsNotAWord,
                        wordProperty.mIsPossiblyOffensive, 0 /* timestamp */)) {
                    MakedictLog.e("Cannot add unigram entry for " + wordProperty.mWord);
                }
                if (binaryDict.needsToRunGC(true /* mindsBlockByGC */)) {
                    if (!binaryDict.flushWithGC()) {
                        MakedictLog.e("Cannot flush dict with GC.");
                        return;
                    }
                }
            }
            for (final WordProperty word0Property : dict) {
                if (!word0Property.mHasNgrams) continue;
                // TODO: Support ngram.
                for (final WeightedString word1 : word0Property.getBigrams()) {
                    final NgramContext ngramContext =
                            new NgramContext(new NgramContext.WordInfo(word0Property.mWord));
                    if (!binaryDict.addNgramEntry(ngramContext, word1.mWord,
                            word1.getProbability(), 0 /* timestamp */)) {
                        MakedictLog.e("Cannot add n-gram entry for "
                                + ngramContext + " -> " + word1.mWord);
                        return;
                    }
                    if (binaryDict.needsToRunGC(true /* mindsBlockByGC */)) {
                        if (!binaryDict.flushWithGC()) {
                            MakedictLog.e("Cannot flush dict with GC.");
                            return;
                        }
                    }
                }
            }
            if (!binaryDict.flushWithGC()) {
                MakedictLog.e("Cannot flush dict with GC.");
                return;
            }
        } finally {
            // Close on every exit path: the early returns above, and any exception, would
            // otherwise leave the dictionary open.
            binaryDict.close();
        }
    }

    /** No-op: this encoder does not write through a positioned buffer. */
    @Override
    public void setPosition(int position) {
    }

    /** Always returns 0: this encoder does not track a write position. */
    @Override
    public int getPosition() {
        return 0;
    }

    /** No-op: this encoder does not write PtNode counts. */
    @Override
    public void writePtNodeCount(int ptNodeCount) {
    }

    /** No-op: this encoder does not write individual PtNodes. */
    @Override
    public void writePtNode(PtNode ptNode, FusionDictionary dict,
            HashMap<Integer, Integer> codePointToOneByteCodeMap) {
    }
}
Loading