Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit 54717534 authored by Ken Wakasa's avatar Ken Wakasa Committed by Android (Google) Code Review
Browse files

Merge "Add getTerminalPosition." into jb-mr1-dev

parents 81d2e937 d36245fa
Loading
Loading
Loading
Loading
+67 −1
Original line number Diff line number Diff line
@@ -16,10 +16,11 @@

package com.android.inputmethod.latin.makedict;

import com.android.inputmethod.latin.makedict.BinaryDictInputOutput;
import com.android.inputmethod.latin.Constants;
import com.android.inputmethod.latin.makedict.BinaryDictInputOutput.FusionDictionaryBufferInterface;
import com.android.inputmethod.latin.makedict.FormatSpec.FileHeader;
import com.android.inputmethod.latin.makedict.FormatSpec.FormatOptions;
import com.android.inputmethod.latin.makedict.FusionDictionary.CharGroup;

import java.io.IOException;
import java.util.ArrayList;
@@ -124,4 +125,69 @@ public class BinaryDictIOUtils {
        readUnigramsAndBigramsBinaryInner(buffer, header.mHeaderSize, words, frequencies, bigrams,
                header.mFormatOptions);
    }

    /**
     * Gets the position of the last CharGroup of the exact matching word in the dictionary.
     *
     * The buffer is rewound to its start and the file header is read, then the nodes are walked
     * from the one that follows the header. At each node the char groups are read in order; the
     * first one whose characters all match the word at the current offset is followed to its
     * children. The walk visits at most Constants.Dictionary.MAX_WORD_LENGTH nodes.
     *
     * The returned value is the buffer position at which the matching char group starts, so it
     * is counted from the start of the buffer, header included.
     *
     * @param buffer the buffer to read. Its position is changed by this call.
     * @param word the word we search for. A null or empty word is never found.
     * @return the position of the terminal char group, or FormatSpec.NOT_VALID_WORD if the word
     *         is absent, is only a prefix of other words, or cannot be reached.
     * @throws IOException
     * @throws UnsupportedFormatException
     */
    public static int getTerminalPosition(final FusionDictionaryBufferInterface buffer,
            final String word) throws IOException, UnsupportedFormatException {
        if (word == null) return FormatSpec.NOT_VALID_WORD;
        if (buffer.position() != 0) buffer.position(0);

        final FileHeader header = BinaryDictInputOutput.readHeader(buffer);
        // wordPos and wordLen are counted in code points, not in UTF-16 chars.
        int wordPos = 0;
        final int wordLen = word.codePointCount(0, word.length());
        for (int depth = 0; depth < Constants.Dictionary.MAX_WORD_LENGTH; ++depth) {
            if (wordPos >= wordLen) return FormatSpec.NOT_VALID_WORD;
            final int charGroupCount = BinaryDictInputOutput.readCharGroupCount(buffer);

            boolean foundNextCharGroup = false;
            for (int i = 0; i < charGroupCount; ++i) {
                final int charGroupPos = buffer.position();
                final CharGroupInfo currentInfo = BinaryDictInputOutput.readCharGroup(buffer,
                        charGroupPos, header.mFormatOptions);
                boolean same = true;
                for (int p = 0, j = word.offsetByCodePoints(0, wordPos);
                        p < currentInfo.mCharacters.length;
                        ++p, j = word.offsetByCodePoints(j, 1)) {
                    // The group does not match if it is longer than what is left of the word
                    // or if any of its characters differs.
                    if (wordPos + p >= wordLen
                            || word.codePointAt(j) != currentInfo.mCharacters[p]) {
                        same = false;
                        break;
                    }
                }
                if (!same) continue;

                if (wordPos + currentInfo.mCharacters.length == wordLen) {
                    // The whole word has been consumed: it is in the dictionary only if this
                    // char group is a terminal.
                    if (currentInfo.mFrequency == CharGroup.NOT_A_TERMINAL) {
                        return FormatSpec.NOT_VALID_WORD;
                    }
                    return charGroupPos;
                }
                wordPos += currentInfo.mCharacters.length;
                if (currentInfo.mChildrenAddress == FormatSpec.NO_CHILDREN_ADDRESS) {
                    return FormatSpec.NOT_VALID_WORD;
                }
                // Continue the search in the children node of the matching char group.
                buffer.position(currentInfo.mChildrenAddress);
                foundNextCharGroup = true;
                break;
            }

            // No char group of this node matches, which includes the case of a node without
            // any char group: the word is not in the dictionary.
            if (!foundNextCharGroup) return FormatSpec.NOT_VALID_WORD;
        }
        return FormatSpec.NOT_VALID_WORD;
    }
}
+3 −2
Original line number Diff line number Diff line
@@ -1242,8 +1242,9 @@ public class BinaryDictInputOutput {
     * @param formatOptions file format options.
     * @return the word, as a string.
     */
    private static String getWordAtAddress(final FusionDictionaryBufferInterface buffer,
            final int headerSize, final int address, final FormatOptions formatOptions) {
    /* packages for tests */ static String getWordAtAddress(
            final FusionDictionaryBufferInterface buffer, final int headerSize, final int address,
            final FormatOptions formatOptions) {
        final String cachedString = wordCache.get(address);
        if (null != cachedString) return cachedString;

+3 −0
Original line number Diff line number Diff line
@@ -207,6 +207,9 @@ public final class FormatSpec {
    static final int MAX_TERMINAL_FREQUENCY = 255;
    static final int MAX_BIGRAM_FREQUENCY = 15;

    // This constant must keep the same numeric value as its counterpart in binary_format.h.
    static final int NOT_VALID_WORD = -99;

    /**
     * Options about file format.
     */
+90 −1
Original line number Diff line number Diff line
@@ -19,7 +19,7 @@ package com.android.inputmethod.latin.makedict;
import com.android.inputmethod.latin.CollectionUtils;
import com.android.inputmethod.latin.UserHistoryDictIOUtils;
import com.android.inputmethod.latin.makedict.BinaryDictInputOutput.FusionDictionaryBufferInterface;
import com.android.inputmethod.latin.makedict.FormatSpec;
import com.android.inputmethod.latin.makedict.FormatSpec.FileHeader;
import com.android.inputmethod.latin.makedict.FusionDictionary.CharGroup;
import com.android.inputmethod.latin.makedict.FusionDictionary.Node;
import com.android.inputmethod.latin.makedict.FusionDictionary.WeightedString;
@@ -475,4 +475,93 @@ public class BinaryDictIOTests extends AndroidTestCase {
            Log.d(TAG, result);
        }
    }

    // Tests for getTerminalPosition
    /**
     * Reads back the word stored at the given position of a binary dictionary.
     *
     * The buffer is rewound to its start so that the header can be read first. The header size
     * is subtracted from the passed position before it is handed to getWordAtAddress.
     *
     * @param buffer the buffer holding the binary dictionary. Its position is changed.
     * @param address the position of the char group, counted from the start of the buffer.
     * @return the word, or null if the header could not be read.
     */
    private String getWordFromBinary(final FusionDictionaryBufferInterface buffer,
            final int address) {
        if (0 != buffer.position()) {
            buffer.position(0);
        }

        final FileHeader fileHeader;
        try {
            fileHeader = BinaryDictInputOutput.readHeader(buffer);
        } catch (IOException e) {
            return null;
        } catch (UnsupportedFormatException e) {
            return null;
        }
        if (null == fileHeader) {
            return null;
        }

        final int addressAfterHeader = address - fileHeader.mHeaderSize;
        return BinaryDictInputOutput.getWordAtAddress(buffer, fileHeader.mHeaderSize,
                addressAfterHeader, fileHeader.mFormatOptions);
    }

    /**
     * Looks a word up with BinaryDictIOUtils.getTerminalPosition and checks the result.
     *
     * The test fails if the lookup throws, if the word is found while it should not be (or the
     * other way around), or if the word read back from the returned position differs from the
     * word that was searched.
     *
     * @param buffer the buffer holding the binary dictionary.
     * @param word the word to look up.
     * @param index not used by the checks; kept so that the signature stays unchanged.
     * @param contained whether the word is expected to be in the dictionary.
     * @return the time spent in getTerminalPosition, in nanoseconds.
     */
    private long runGetTerminalPosition(final FusionDictionaryBufferInterface buffer,
            final String word, int index, boolean contained) {
        long diff = -1;
        int position = -1;
        try {
            final long now = System.nanoTime();
            position = BinaryDictIOUtils.getTerminalPosition(buffer, word);
            diff = System.nanoTime() - now;
        } catch (IOException e) {
            // A failed lookup must fail the test here rather than let the assertions below
            // run against the placeholder position.
            fail("IOException while getTerminalPosition: " + e);
        } catch (UnsupportedFormatException e) {
            fail("UnsupportedFormatException while getTerminalPosition: " + e);
        }

        assertEquals(contained, FormatSpec.NOT_VALID_WORD != position);
        if (contained) assertEquals(word, getWordFromBinary(buffer, position));
        return diff;
    }

    /**
     * Tests BinaryDictIOUtils.getTerminalPosition against a dictionary written to a temporary
     * file.
     *
     * The dictionary is filled with the words of sWords. The test then checks edge cases (a too
     * long word, null and the empty string), every word of sWords, and up to 1000 generated
     * words that are not in sWords.
     */
    public void testGetTerminalPosition() {
        File file = null;
        try {
            file = File.createTempFile("runReadUnigrams", ".dict");
        } catch (IOException e) {
            fail("IOException while creating the temporary dictionary file: " + e);
        }
        assertNotNull(file);

        final FusionDictionary dict = new FusionDictionary(new Node(),
                new FusionDictionary.DictionaryOptions(
                        new HashMap<String, String>(), false, false));
        addUnigrams(sWords.size(), dict, sWords, null /* shortcutMap */);
        timeWritingDictToFile(file, dict, VERSION3_WITH_LINKEDLIST_NODE);

        final FusionDictionaryBufferInterface buffer = getBuffer(file, USE_BYTE_ARRAY);

        try {
            // too long word
            final String longWord = "abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyz";
            assertEquals(FormatSpec.NOT_VALID_WORD,
                    BinaryDictIOUtils.getTerminalPosition(buffer, longWord));

            // null
            assertEquals(FormatSpec.NOT_VALID_WORD,
                    BinaryDictIOUtils.getTerminalPosition(buffer, null));

            // empty string
            assertEquals(FormatSpec.NOT_VALID_WORD,
                    BinaryDictIOUtils.getTerminalPosition(buffer, ""));
        } catch (IOException e) {
            // An exception would skip the remaining edge cases, so it has to fail the test.
            fail("IOException while getTerminalPosition: " + e);
        } catch (UnsupportedFormatException e) {
            fail("UnsupportedFormatException while getTerminalPosition: " + e);
        }

        // Test a word that is contained within the dictionary.
        long sum = 0;
        for (int i = 0; i < sWords.size(); ++i) {
            final long time = runGetTerminalPosition(buffer, sWords.get(i), i, true);
            sum += time == -1 ? 0 : time;
        }
        Log.d(TAG, "per a search : " + (((double)sum) / sWords.size() / 1000000));

        // Test a word that isn't contained within the dictionary.
        final Random random = new Random((int)System.currentTimeMillis());
        for (int i = 0; i < 1000; ++i) {
            final String word = generateWord(random.nextInt());
            // Generated words that happen to be in the dictionary are skipped.
            if (sWords.indexOf(word) != -1) continue;
            runGetTerminalPosition(buffer, word, i, false);
        }
    }
}