Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit d5d0057a, authored by Akifumi Yoshimoto and committed by Android Git Automerger
Browse files

am 5e828b2c: Merge "Create a code point table based on occurrence counts."

* commit '5e828b2c':
  Create a code point table based on occurrence counts.
parents ebc2e6f3 5e828b2c
Loading
Loading
Loading
Loading
+1 −0
Original line number Diff line number Diff line
@@ -47,6 +47,7 @@ public final class DictionaryHeader {
    // String constants exposed by DictionaryHeader.
    // NOTE(review): the *_KEY names suggest these are header attribute keys, and
    // ATTRIBUTE_VALUE_TRUE the string stored for a "true" attribute -- confirm against usages.
    public static final String MAX_UNIGRAM_COUNT_KEY = "MAX_UNIGRAM_COUNT";
    public static final String MAX_BIGRAM_COUNT_KEY = "MAX_BIGRAM_COUNT";
    public static final String ATTRIBUTE_VALUE_TRUE = "1";
    public static final String CODE_POINT_TABLE_KEY = "codePointTable";

    public DictionaryHeader(final int headerSize, final DictionaryOptions dictionaryOptions,
            final FormatOptions formatOptions) throws UnsupportedFormatException {
+2 −0
Original line number Diff line number Diff line
@@ -237,6 +237,8 @@ public final class FormatSpec {
    // Largest values that fit in 16 and 24 unsigned bits, respectively.
    static final int UINT16_MAX = 0xFFFF;
    static final int UINT24_MAX = 0xFFFFFF;
    // Mask for the most significant bit of an 8-bit value.
    static final int MSB8 = 0x80;
    // First and last one-byte codes handed out by Ver2DictEncoder.makeCodePointTable when it
    // maps code points to one-byte codes.
    static final int MINIMAL_ONE_BYTE_CHARACTER_VALUE = 0x20;
    static final int MAXIMAL_ONE_BYTE_CHARACTER_VALUE = 0xFF;

    /**
     * Options about file format.
+19 −1
Original line number Diff line number Diff line
@@ -27,6 +27,8 @@ import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Map.Entry;

/**
 * Encodes binary files for a FusionDictionary.
@@ -791,10 +793,12 @@ public class BinaryDictEncoderUtils {
     * @param destination the stream to write the file header to.
     * @param dict the dictionary to write.
     * @param formatOptions file format options.
     * @param codePointOccurrenceArray code points ordered by occurrence count.
     * @return the size of the header.
     */
    /* package */ static int writeDictionaryHeader(final OutputStream destination,
            final FusionDictionary dict, final FormatOptions formatOptions)
            final FusionDictionary dict, final FormatOptions formatOptions,
            final ArrayList<Entry<Integer, Integer>> codePointOccurrenceArray)
                    throws IOException, UnsupportedFormatException {
        final int version = formatOptions.mVersion;
        if (version < FormatSpec.MINIMUM_SUPPORTED_VERSION
@@ -833,6 +837,9 @@ public class BinaryDictEncoderUtils {
            CharEncoding.writeString(headerBuffer, key);
            CharEncoding.writeString(headerBuffer, value);
        }

        // TODO: Write out the code point table.

        final int size = headerBuffer.size();
        final byte[] bytes = headerBuffer.toByteArray();
        // Write out the header size.
@@ -845,4 +852,15 @@ public class BinaryDictEncoderUtils {
        headerBuffer.close();
        return size;
    }

    /**
     * Holder for the two views of a code point table.
     *
     * The references passed to the constructor are stored as-is, without copying.
     */
    static final class CodePointTable {
        /** Maps an original code point to the one-byte code assigned to it. */
        final HashMap<Integer, Integer> mCodePointToOneByteCodeMap;
        /** Entries of (code point, occurrence count). */
        final ArrayList<Entry<Integer, Integer>> mCodePointOccurrenceArray;

        /**
         * @param oneByteCodesByCodePoint map from code point to its one-byte code.
         * @param occurrencesByCodePoint list of (code point, occurrence count) entries.
         */
        CodePointTable(final HashMap<Integer, Integer> oneByteCodesByCodePoint,
                final ArrayList<Entry<Integer, Integer>> occurrencesByCodePoint) {
            this.mCodePointOccurrenceArray = occurrencesByCodePoint;
            this.mCodePointToOneByteCodeMap = oneByteCodesByCodePoint;
        }
    }
}
+51 −1
Original line number Diff line number Diff line
@@ -18,6 +18,7 @@ package com.android.inputmethod.latin.makedict;

import com.android.inputmethod.annotations.UsedForTesting;
import com.android.inputmethod.latin.makedict.BinaryDictDecoderUtils.CharEncoding;
import com.android.inputmethod.latin.makedict.BinaryDictEncoderUtils.CodePointTable;
import com.android.inputmethod.latin.makedict.FormatSpec.FormatOptions;
import com.android.inputmethod.latin.makedict.FusionDictionary.PtNode;
import com.android.inputmethod.latin.makedict.FusionDictionary.PtNodeArray;
@@ -28,7 +29,11 @@ import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map.Entry;

/**
 * An implementation of DictEncoder for version 2 binary dictionary.
@@ -73,6 +78,46 @@ public class Ver2DictEncoder implements DictEncoder {
        }
    }

    /**
     * Builds the code point table of a dictionary.
     *
     * Counts how often each code point occurs across the words of {@code dict}, sorts the
     * code points by descending occurrence count, and assigns one-byte codes to the most
     * frequent ones, from {@link FormatSpec#MINIMAL_ONE_BYTE_CHARACTER_VALUE} up to and
     * including {@link FormatSpec#MAXIMAL_ONE_BYTE_CHARACTER_VALUE}. Code points that do not
     * fit in that range get no one-byte code but still appear in the occurrence array.
     *
     * Package-private for testing.
     *
     * @param dict the dictionary whose words are scanned.
     * @return the code point to one-byte code map together with the sorted occurrence array.
     */
    static CodePointTable makeCodePointTable(final FusionDictionary dict) {
        final HashMap<Integer, Integer> codePointOccurrenceCounts = new HashMap<>();
        for (final WordProperty word : dict) {
            // Store per code point occurrence
            final String wordString = word.mWord;
            final int length = wordString.length();
            int i = 0;
            while (i < length) {
                final int codePoint = Character.codePointAt(wordString, i);
                final Integer previousCount = codePointOccurrenceCounts.get(codePoint);
                codePointOccurrenceCounts.put(codePoint,
                        null == previousCount ? 1 : previousCount + 1);
                // Step over the whole code point: a supplementary character takes two chars,
                // and stepping by one would count its low surrogate as a separate code point.
                i += Character.charCount(codePoint);
            }
        }
        final ArrayList<Entry<Integer, Integer>> codePointOccurrenceArray =
                new ArrayList<>(codePointOccurrenceCounts.entrySet());
        // Descending order sort by occurrence (value side). The sort is stable, so code points
        // with equal counts keep the iteration order of the map's entry set.
        Collections.sort(codePointOccurrenceArray, new Comparator<Entry<Integer, Integer>>() {
            @Override
            public int compare(final Entry<Integer, Integer> a, final Entry<Integer, Integer> b) {
                return b.getValue().compareTo(a.getValue());
            }
        });
        int currentCodePointTableIndex = FormatSpec.MINIMAL_ONE_BYTE_CHARACTER_VALUE;
        // Temporary map for writing of nodes
        final HashMap<Integer, Integer> codePointToOneByteCodeMap = new HashMap<>();
        for (final Entry<Integer, Integer> entry : codePointOccurrenceArray) {
            // Put a relation from the original code point to the one byte code.
            codePointToOneByteCodeMap.put(entry.getKey(), currentCodePointTableIndex);
            // Stop once the last one-byte code has been handed out.
            if (FormatSpec.MAXIMAL_ONE_BYTE_CHARACTER_VALUE < ++currentCodePointTableIndex) {
                break;
            }
        }
        // codePointToOneByteCodeMap for writing the trie
        // codePointOccurrenceArray for writing the header
        return new CodePointTable(codePointToOneByteCodeMap, codePointOccurrenceArray);
    }

    @Override
    public void writeDictionary(final FusionDictionary dict, final FormatOptions formatOptions)
            throws IOException, UnsupportedFormatException {
@@ -85,7 +130,12 @@ public class Ver2DictEncoder implements DictEncoder {
        if (mOutStream == null) {
            openStream();
        }
        BinaryDictEncoderUtils.writeDictionaryHeader(mOutStream, dict, formatOptions);

        // Make code point conversion table ordered by occurrence of code points
        final CodePointTable codePointTable = makeCodePointTable(dict);

        BinaryDictEncoderUtils.writeDictionaryHeader(mOutStream, dict, formatOptions,
                codePointTable.mCodePointOccurrenceArray);

        // Addresses are limited to 3 bytes, but since addresses can be relative to each node
        // array, the structure itself is not limited to 16MB. However, if it is over 16MB deciding
+91 −0
Original line number Diff line number Diff line
/*
 * Copyright (C) 2014 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.android.inputmethod.latin.makedict;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map.Entry;

import com.android.inputmethod.latin.makedict.BinaryDictEncoderUtils.CodePointTable;
import com.android.inputmethod.latin.makedict.FusionDictionary.PtNodeArray;

import android.test.AndroidTestCase;
import android.test.suitebuilder.annotation.LargeTest;
import android.util.Log;

/**
 * Unit tests for Ver2DictEncoder.
 */
@LargeTest
public class Ver2DictEncoderTests extends AndroidTestCase {
    private static final String TAG = Ver2DictEncoderTests.class.getSimpleName();
    /** Frequency given to every unigram and shortcut added by these tests. */
    private static final int UNIGRAM_FREQ = 10;

    /**
     * Checks that {@link Ver2DictEncoder#makeCodePointTable} produces the expected occurrence
     * array and the expected code point to one-byte code map for a small fixed word list.
     */
    public void testCodePointTable() {
        final String[] wordSource = {"words", "used", "for", "testing", "a", "code point", "table"};
        final List<String> words = Arrays.asList(wordSource);
        // The characters whose one-byte codes are looked up, in lookup order.
        final String correctCodePointTable = "eotdsanirfg bclwup";
        // Concatenation of "<code point><occurrence count>" for each entry, in array order.
        final String correctCodePointOccurrenceArrayString =
                "10141164111411531003110297210521142103111911171108198199132111211021";
        // Concatenation of the one-byte codes of the characters of correctCodePointTable.
        final String correctCodePointExpectedMapString = "323433363538373940494147454644424348";
        final String dictName = "codePointTableTest";
        final String dictVersion = Long.toString(System.currentTimeMillis());

        final FormatSpec.FormatOptions formatOptions =
                new FormatSpec.FormatOptions(FormatSpec.VERSION2);
        final FusionDictionary sourcedict = new FusionDictionary(new PtNodeArray(),
                BinaryDictUtils.makeDictionaryOptions(dictName, dictVersion, formatOptions));
        addUnigrams(sourcedict, words, null /* shortcutMap */);
        final CodePointTable codePointTable = Ver2DictEncoder.makeCodePointTable(sourcedict);

        // Check if mCodePointOccurrenceArray is correct
        final StringBuilder codePointOccurrenceArrayString = new StringBuilder();
        for (Entry<Integer, Integer> entry : codePointTable.mCodePointOccurrenceArray) {
            codePointOccurrenceArrayString.append(entry.getKey());
            codePointOccurrenceArrayString.append(entry.getValue());
        }
        // assertEquals takes (expected, actual); keeping that order makes failure messages
        // report the right value as "expected".
        assertEquals(correctCodePointOccurrenceArrayString,
                codePointOccurrenceArrayString.toString());

        // Check if mCodePointToOneByteCodeMap is correct
        final StringBuilder codePointExpectedMapString = new StringBuilder();
        for (int i = 0; i < correctCodePointTable.length(); ++i) {
            codePointExpectedMapString.append(codePointTable.mCodePointToOneByteCodeMap.get(
                    correctCodePointTable.codePointAt(i)));
        }
        assertEquals(correctCodePointExpectedMapString, codePointExpectedMapString.toString());
    }

    /**
     * Adds unigrams to the dictionary.
     *
     * @param dict the dictionary to add the words to.
     * @param words the words to add, each with frequency {@link #UNIGRAM_FREQ}.
     * @param shortcutMap optional map from a word to its shortcut strings. When it is null,
     *        null is passed as the shortcut list of every word; otherwise each word gets a
     *        (possibly empty) list of its shortcuts.
     */
    private void addUnigrams(final FusionDictionary dict, final List<String> words,
            final HashMap<String, List<String>> shortcutMap) {
        for (final String word : words) {
            final ArrayList<WeightedString> shortcuts = new ArrayList<>();
            if (shortcutMap != null && shortcutMap.containsKey(word)) {
                for (final String shortcut : shortcutMap.get(word)) {
                    shortcuts.add(new WeightedString(shortcut, UNIGRAM_FREQ));
                }
            }
            dict.add(word, new ProbabilityInfo(UNIGRAM_FREQ),
                    (shortcutMap == null) ? null : shortcuts, false /* isNotAWord */);
        }
    }
}