Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit 25f47828 authored by Ken Wakasa's avatar Ken Wakasa Committed by Android (Google) Code Review
Browse files

Merge "Separate bigram entries from the trie file."

parents d40a931a fd46e87d
Loading
Loading
Loading
Loading
+8 −6
Original line number Diff line number Diff line
@@ -385,6 +385,7 @@ public class BinaryDictEncoderUtils {
                        nodeSize + size, ptNode.mChildren));
            }
            nodeSize += getShortcutListSize(ptNode.mShortcutTargets);
            if (formatOptions.mVersion < FormatSpec.FIRST_VERSION_WITH_TERMINAL_ID) {
                if (null != ptNode.mBigrams) {
                    for (WeightedString bigram : ptNode.mBigrams) {
                        final int offset = getOffsetToTargetPtNodeDuringUpdate(ptNodeArray,
@@ -393,6 +394,7 @@ public class BinaryDictEncoderUtils {
                        nodeSize += getByteSize(offset) + FormatSpec.PTNODE_ATTRIBUTE_FLAGS_SIZE;
                    }
                }
            }
            ptNode.mCachedSize = nodeSize;
            size += nodeSize;
        }
+4 −0
Original line number Diff line number Diff line
@@ -265,8 +265,12 @@ public final class FormatSpec {
    // Extension of the frequency file of a version 4 dictionary.
    static final String FREQ_FILE_EXTENSION = ".freq";
    // tat = Terminal Address Table
    static final String TERMINAL_ADDRESS_TABLE_FILE_EXTENSION = ".tat";
    // Extension of the file that holds the bigram entries, kept separate from the trie file.
    static final String BIGRAM_FILE_EXTENSION = ".bigram";
    // Extensions of the two files backing the bigram address sparse table: the lookup table
    // file and the content (address table) file, in that order.
    static final String BIGRAM_LOOKUP_TABLE_FILE_EXTENSION = ".bigram_lookup";
    static final String BIGRAM_ADDRESS_TABLE_FILE_EXTENSION = ".bigram_index";
    // Stride of one terminal's record in the frequency buffer; the decoder seeks to
    // terminalId * FREQUENCY_AND_FLAGS_SIZE + 1 to read a frequency.
    static final int FREQUENCY_AND_FLAGS_SIZE = 2;
    // Stride of one terminal's record in the terminal address table; the decoder seeks to
    // terminalId * TERMINAL_ADDRESS_TABLE_ADDRESS_SIZE and reads an unsigned 24-bit address.
    static final int TERMINAL_ADDRESS_TABLE_ADDRESS_SIZE = 3;
    // Block size handed to SparseTable when creating or loading the bigram address table.
    static final int BIGRAM_ADDRESS_TABLE_BLOCK_SIZE = 4;

    // NOTE(review): presumably sentinel values meaning "no children" / "no parent"; their
    // use is not visible from here, confirm before relying on it.
    static final int NO_CHILDREN_ADDRESS = Integer.MIN_VALUE;
    static final int NO_PARENT_ADDRESS = 0;
+44 −0
Original line number Diff line number Diff line
@@ -18,6 +18,9 @@ package com.android.inputmethod.latin.makedict;

import com.android.inputmethod.annotations.UsedForTesting;

import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.util.ArrayList;
@@ -147,4 +150,45 @@ public class SparseTable {
            BinaryDictEncoderUtils.writeUIntToStream(contentOutStream, index, 4);
        }
    }

    /**
     * Writes this table to a pair of files by delegating to
     * {@code write(OutputStream, OutputStream)}.
     *
     * Both streams are closed before this method returns, even when writing fails or when
     * closing one of the streams throws.
     *
     * @param lookupTableFile the file that receives the lookup table.
     * @param contentFile the file that receives the content table.
     * @throws IOException if either file cannot be opened, written or closed.
     */
    @UsedForTesting
    public void writeToFiles(final File lookupTableFile, final File contentFile)
            throws IOException {
        final FileOutputStream lookupTableOutStream = new FileOutputStream(lookupTableFile);
        try {
            final FileOutputStream contentOutStream = new FileOutputStream(contentFile);
            try {
                write(lookupTableOutStream, contentOutStream);
            } finally {
                contentOutStream.close();
            }
        } finally {
            // Nested finally blocks guarantee this stream is closed even if closing the
            // content stream above threw.
            lookupTableOutStream.close();
        }
    }

    /**
     * Reads the whole content of a file into a newly allocated byte array.
     *
     * @param file the file to read.
     * @return the bytes of the file; the array length is the file length sampled before
     *         the file is opened.
     * @throws IOException if the file cannot be opened or read, is too large to fit in a
     *         byte array, or ends before the expected number of bytes could be read.
     */
    private static byte[] readFileToByteArray(final File file) throws IOException {
        final long fileLength = file.length();
        if (fileLength > Integer.MAX_VALUE) {
            throw new IOException("File is too large to read into memory: " + file);
        }
        final byte[] contents = new byte[(int) fileLength];
        final FileInputStream inStream = new FileInputStream(file);
        try {
            // A single read() call may return fewer bytes than requested, so keep reading
            // until the buffer is full.
            int offset = 0;
            while (offset < contents.length) {
                final int readCount =
                        inStream.read(contents, offset, contents.length - offset);
                if (readCount < 0) {
                    throw new IOException("Unexpected end of file: " + file);
                }
                offset += readCount;
            }
        } finally {
            inStream.close();
        }
        return contents;
    }

    /**
     * Builds a {@link SparseTable} from a lookup table file and a content file.
     *
     * The lookup table file is read first, then the content file; each is loaded entirely
     * into memory.
     *
     * @param lookupTableFile the file holding the lookup table bytes.
     * @param contentFile the file holding the content table bytes.
     * @param blockSize the block size passed on to the table constructor.
     * @return a table backed by the bytes of the two files.
     * @throws IOException if either file cannot be read.
     */
    @UsedForTesting
    public static SparseTable readFromFiles(final File lookupTableFile, final File contentFile,
            final int blockSize) throws IOException {
        final byte[] lookupBytes = readFileToByteArray(lookupTableFile);
        final byte[] contentBytes = readFileToByteArray(contentFile);
        final SparseTable table = new SparseTable(lookupBytes, contentBytes, blockSize);
        return table;
    }
}
+32 −2
Original line number Diff line number Diff line
@@ -42,12 +42,15 @@ public class Ver4DictDecoder extends DictDecoder {
    // Selectors passed to getFile() to pick one of the files that make up the dictionary.
    private static final int FILETYPE_TRIE = 1;
    private static final int FILETYPE_FREQUENCY = 2;
    private static final int FILETYPE_TERMINAL_ADDRESS_TABLE = 3;
    private static final int FILETYPE_BIGRAM = 4;

    // Directory holding the dictionary files; file names are derived from its name.
    private final File mDictDirectory;
    // Factory used to obtain a DictBuffer for each dictionary file.
    private final DictionaryBufferFactory mBufferFactory;
    private DictBuffer mDictBuffer;
    // Buffer over the frequency file.
    private DictBuffer mFrequencyBuffer;
    // Buffer over the terminal address table file; indexed by terminal id to look up the
    // address of a bigram target.
    private DictBuffer mTerminalAddressTableBuffer;
    // Buffer over the bigram file, which stores the bigram entries outside the trie.
    private DictBuffer mBigramBuffer;
    // Maps a terminal id to the position of its bigram list in mBigramBuffer.
    private SparseTable mBigramAddressTable;

    @UsedForTesting
    /* package */ Ver4DictDecoder(final File dictDirectory, final int factoryFlag) {
@@ -82,6 +85,9 @@ public class Ver4DictDecoder extends DictDecoder {
        } else if (fileType == FILETYPE_TERMINAL_ADDRESS_TABLE) {
            return new File(mDictDirectory,
                    mDictDirectory.getName() + FormatSpec.TERMINAL_ADDRESS_TABLE_FILE_EXTENSION);
        } else if (fileType == FILETYPE_BIGRAM) {
            return new File(mDictDirectory,
                    mDictDirectory.getName() + FormatSpec.BIGRAM_FILE_EXTENSION);
        } else {
            throw new RuntimeException("Unsupported kind of file : " + fileType);
        }
@@ -94,6 +100,8 @@ public class Ver4DictDecoder extends DictDecoder {
        mFrequencyBuffer = mBufferFactory.getDictionaryBuffer(getFile(FILETYPE_FREQUENCY));
        mTerminalAddressTableBuffer = mBufferFactory.getDictionaryBuffer(
                getFile(FILETYPE_TERMINAL_ADDRESS_TABLE));
        mBigramBuffer = mBufferFactory.getDictionaryBuffer(getFile(FILETYPE_BIGRAM));
        loadBigramAddressSparseTable();
    }

    @Override
@@ -118,6 +126,15 @@ public class Ver4DictDecoder extends DictDecoder {
        return header;
    }

    /**
     * Loads the bigram address sparse table from its two backing files and stores it in
     * {@code mBigramAddressTable}.
     *
     * Both files live in the dictionary directory and are named after that directory, with
     * the bigram lookup table and bigram address table extensions appended.
     *
     * @throws IOException if reading either file fails.
     */
    private void loadBigramAddressSparseTable() throws IOException {
        final String lookupTableName =
                mDictDirectory.getName() + FormatSpec.BIGRAM_LOOKUP_TABLE_FILE_EXTENSION;
        final String addressTableName =
                mDictDirectory.getName() + FormatSpec.BIGRAM_ADDRESS_TABLE_FILE_EXTENSION;
        mBigramAddressTable = SparseTable.readFromFiles(
                new File(mDictDirectory, lookupTableName),
                new File(mDictDirectory, addressTableName),
                FormatSpec.BIGRAM_ADDRESS_TABLE_BLOCK_SIZE);
    }

    protected static class PtNodeReader extends DictDecoder.PtNodeReader {
        protected static int readFrequency(final DictBuffer frequencyBuffer, final int terminalId) {
            frequencyBuffer.position(terminalId * FormatSpec.FREQUENCY_AND_FLAGS_SIZE + 1);
@@ -191,8 +208,21 @@ public class Ver4DictDecoder extends DictDecoder {
        final ArrayList<PendingAttribute> bigrams;
        if (0 != (flags & FormatSpec.FLAG_HAS_BIGRAMS)) {
            bigrams = new ArrayList<PendingAttribute>();
            addressPointer += PtNodeReader.readBigramAddresses(mDictBuffer, bigrams,
                    addressPointer);
            final int posOfBigrams = mBigramAddressTable.get(terminalId);
            mBigramBuffer.position(posOfBigrams);
            while (bigrams.size() < FormatSpec.MAX_BIGRAMS_IN_A_PTNODE) {
                // If bigrams.size() reaches FormatSpec.MAX_BIGRAMS_IN_A_PTNODE,
                // remaining bigram entries are ignored.
                final int bigramFlags = mBigramBuffer.readUnsignedByte();
                final int targetTerminalId = mBigramBuffer.readUnsignedInt24();
                mTerminalAddressTableBuffer.position(
                        targetTerminalId * FormatSpec.TERMINAL_ADDRESS_TABLE_ADDRESS_SIZE);
                final int targetAddress = mTerminalAddressTableBuffer.readUnsignedInt24();
                bigrams.add(new PendingAttribute(
                        bigramFlags & FormatSpec.FLAG_BIGRAM_SHORTCUT_ATTR_FREQUENCY,
                        targetAddress));
                if (0 == (bigramFlags & FormatSpec.FLAG_BIGRAM_SHORTCUT_ATTR_HAS_NEXT)) break;
            }
            if (bigrams.size() >= FormatSpec.MAX_BIGRAMS_IN_A_PTNODE) {
                MakedictLog.d("too many bigrams in a node.");
            }
+55 −23
Original line number Diff line number Diff line
@@ -26,6 +26,7 @@ import com.android.inputmethod.latin.makedict.FusionDictionary.PtNode;
import com.android.inputmethod.latin.makedict.FusionDictionary.PtNodeArray;
import com.android.inputmethod.latin.makedict.FusionDictionary.WeightedString;

import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
@@ -43,9 +44,13 @@ public class Ver4DictEncoder implements DictEncoder {
    // Buffer the trie is written into, and the current write position in it.
    private byte[] mTrieBuf;
    private int mTriePos;
    private int mHeaderSize;
    // Maps a terminal id to the start position of its bigram list in the bigram file.
    private SparseTable mBigramAddressTable;
    // Output streams for the individual dictionary files; opened in openStreams().
    private OutputStream mTrieOutStream;
    private OutputStream mFreqOutStream;
    private OutputStream mTerminalAddressTableOutStream;
    private OutputStream mBigramOutStream;
    // Directory receiving the dictionary files and the base name shared by all of them;
    // both are set in openStreams().
    private File mDictDir;
    private String mBaseFilename;

    @UsedForTesting
    public Ver4DictEncoder(final File dictPlacedDir) {
@@ -55,12 +60,14 @@ public class Ver4DictEncoder implements DictEncoder {
    private void openStreams(final FormatOptions formatOptions, final DictionaryOptions dictOptions)
            throws FileNotFoundException, IOException {
        final FileHeader header = new FileHeader(0, dictOptions, formatOptions);
        final String filename = header.getId() + "." + header.getVersion();
        final File mDictDir = new File(mDictPlacedDir, filename);
        final File trieFile = new File(mDictDir, filename + FormatSpec.TRIE_FILE_EXTENSION);
        final File freqFile = new File(mDictDir, filename + FormatSpec.FREQ_FILE_EXTENSION);
        mBaseFilename = header.getId() + "." + header.getVersion();
        mDictDir = new File(mDictPlacedDir, mBaseFilename);
        final File trieFile = new File(mDictDir, mBaseFilename + FormatSpec.TRIE_FILE_EXTENSION);
        final File freqFile = new File(mDictDir, mBaseFilename + FormatSpec.FREQ_FILE_EXTENSION);
        final File terminalAddressTableFile = new File(mDictDir,
                filename + FormatSpec.TERMINAL_ADDRESS_TABLE_FILE_EXTENSION);
                mBaseFilename + FormatSpec.TERMINAL_ADDRESS_TABLE_FILE_EXTENSION);
        final File bigramFile = new File(mDictDir,
                mBaseFilename + FormatSpec.BIGRAM_FILE_EXTENSION);
        if (!mDictDir.isDirectory()) {
            if (mDictDir.exists()) mDictDir.delete();
            mDictDir.mkdirs();
@@ -71,6 +78,7 @@ public class Ver4DictEncoder implements DictEncoder {
        mTrieOutStream = new FileOutputStream(trieFile);
        mFreqOutStream = new FileOutputStream(freqFile);
        mTerminalAddressTableOutStream = new FileOutputStream(terminalAddressTableFile);
        mBigramOutStream = new FileOutputStream(bigramFile);
    }

    private void close() throws IOException {
@@ -84,10 +92,14 @@ public class Ver4DictEncoder implements DictEncoder {
            if (mTerminalAddressTableOutStream != null) {
                mTerminalAddressTableOutStream.close();
            }
            if (mBigramOutStream != null) {
                mBigramOutStream.close();
            }
        } finally {
            mTrieOutStream = null;
            mFreqOutStream = null;
            mTerminalAddressTableOutStream = null;
            mBigramOutStream = null;
        }
    }

@@ -123,6 +135,10 @@ public class Ver4DictEncoder implements DictEncoder {
        if (MakedictLog.DBG) BinaryDictEncoderUtils.checkFlatPtNodeArrayList(flatNodes);

        writeTerminalData(flatNodes, terminalCount);
        mBigramAddressTable = new SparseTable(terminalCount,
                FormatSpec.BIGRAM_ADDRESS_TABLE_BLOCK_SIZE);
        writeBigrams(flatNodes, dict);
        writeBigramAddressSparseTable();

        final PtNodeArray lastNodeArray = flatNodes.get(flatNodes.size() - 1);
        final int bufferSize = lastNodeArray.mCachedAddressAfterUpdate + lastNodeArray.mCachedSize;
@@ -230,25 +246,42 @@ public class Ver4DictEncoder implements DictEncoder {
                shortcutByteSize, FormatSpec.PTNODE_SHORTCUT_LIST_SIZE_SIZE);
    }

    private void writeBigrams(ArrayList<WeightedString> bigrams, FusionDictionary dict) {
        if (bigrams == null) return;
    private void writeBigrams(final ArrayList<PtNodeArray> flatNodes, final FusionDictionary dict)
            throws IOException {
        final ByteArrayOutputStream bigramBuffer = new ByteArrayOutputStream();

        final Iterator<WeightedString> bigramIterator = bigrams.iterator();
        for (final PtNodeArray nodeArray : flatNodes) {
            for (final PtNode ptNode : nodeArray.mData) {
                if (ptNode.mBigrams != null) {
                    final int startPos = bigramBuffer.size();
                    mBigramAddressTable.set(ptNode.mTerminalId, startPos);
                    final Iterator<WeightedString> bigramIterator = ptNode.mBigrams.iterator();
                    while (bigramIterator.hasNext()) {
                        final WeightedString bigram = bigramIterator.next();
                        final PtNode target =
                            FusionDictionary.findWordInTree(dict.mRootNodeArray, bigram.mWord);
            final int addressOfBigram = target.mCachedAddressAfterUpdate;
                        final int unigramFrequencyForThisWord = target.mFrequency;
            final int offset = addressOfBigram
                    - (mTriePos + FormatSpec.PTNODE_ATTRIBUTE_FLAGS_SIZE);
            int bigramFlags = BinaryDictEncoderUtils.makeBigramFlags(bigramIterator.hasNext(),
                    offset, bigram.mFrequency, unigramFrequencyForThisWord, bigram.mWord);
            mTrieBuf[mTriePos++] = (byte) bigramFlags;
            mTriePos += BinaryDictEncoderUtils.writeChildrenPosition(mTrieBuf,
                    mTriePos, Math.abs(offset));
                        final int bigramFlags = BinaryDictEncoderUtils.makeBigramFlags(
                                bigramIterator.hasNext(), 0, bigram.mFrequency,
                                unigramFrequencyForThisWord, bigram.mWord);
                        BinaryDictEncoderUtils.writeUIntToStream(bigramBuffer, bigramFlags,
                                FormatSpec.PTNODE_ATTRIBUTE_FLAGS_SIZE);
                        BinaryDictEncoderUtils.writeUIntToStream(bigramBuffer, target.mTerminalId,
                                FormatSpec.PTNODE_ATTRIBUTE_MAX_ADDRESS_SIZE);
                    }
                }
            }
        }
        bigramBuffer.writeTo(mBigramOutStream);
    }

    /**
     * Writes the bigram address sparse table to its two backing files in the dictionary
     * directory.
     *
     * The file names are the base filename with the bigram lookup table and bigram address
     * table extensions appended.
     *
     * @throws IOException if writing either file fails.
     */
    private void writeBigramAddressSparseTable() throws IOException {
        final String lookupTableName =
                mBaseFilename + FormatSpec.BIGRAM_LOOKUP_TABLE_FILE_EXTENSION;
        final String addressTableName =
                mBaseFilename + FormatSpec.BIGRAM_ADDRESS_TABLE_FILE_EXTENSION;
        mBigramAddressTable.writeToFiles(
                new File(mDictDir, lookupTableName),
                new File(mDictDir, addressTableName));
    }

    @Override
    public void writeForwardLinkAddress(int forwardLinkAddress) {
@@ -267,7 +300,6 @@ public class Ver4DictEncoder implements DictEncoder {
        }
        writeChildrenPosition(ptNode, formatOptions);
        writeShortcuts(ptNode.mShortcutTargets);
        writeBigrams(ptNode.mBigrams, dict);
    }

    private void writeTerminalData(final ArrayList<PtNodeArray> flatNodes,