Loading java/src/com/android/inputmethod/latin/makedict/BinaryDictDecoderUtils.java +4 −4 Original line number Original line Diff line number Diff line Loading @@ -23,11 +23,11 @@ import com.android.inputmethod.latin.makedict.FusionDictionary.PtNode; import com.android.inputmethod.latin.makedict.FusionDictionary.PtNodeArray; import com.android.inputmethod.latin.makedict.FusionDictionary.PtNodeArray; import com.android.inputmethod.latin.makedict.FusionDictionary.WeightedString; import com.android.inputmethod.latin.makedict.FusionDictionary.WeightedString; import java.io.ByteArrayOutputStream; import java.io.File; import java.io.File; import java.io.FileInputStream; import java.io.FileInputStream; import java.io.FileNotFoundException; import java.io.FileNotFoundException; import java.io.IOException; import java.io.IOException; import java.io.OutputStream; import java.nio.ByteBuffer; import java.nio.ByteBuffer; import java.nio.channels.FileChannel; import java.nio.channels.FileChannel; import java.util.ArrayList; import java.util.ArrayList; Loading Loading @@ -219,14 +219,14 @@ public final class BinaryDictDecoderUtils { } } /** /** * Writes a string with our character format to a ByteArrayOutputStream. * Writes a string with our character format to an OutputStream. * * * This will also write the terminator byte. * This will also write the terminator byte. * * * @param buffer the ByteArrayOutputStream to write to. * @param buffer the OutputStream to write to. * @param word the string to write. * @param word the string to write. */ */ static void writeString(final ByteArrayOutputStream buffer, final String word) { static void writeString(final OutputStream buffer, final String word) throws IOException { final int length = word.length(); final int length = word.length(); for (int i = 0; i < length; i = word.offsetByCodePoints(i, 1)) { for (int i = 0; i < length; i = word.offsetByCodePoints(i, 1)) { final int codePoint = word.codePointAt(i); final int codePoint = word.codePointAt(i); Loading java/src/com/android/inputmethod/latin/makedict/BinaryDictEncoderUtils.java +1 −1 Original line number Original line Diff line number Diff line Loading @@ -383,8 +383,8 @@ public class BinaryDictEncoderUtils { nodeSize += getByteSize(getOffsetToTargetNodeArrayDuringUpdate(ptNodeArray, nodeSize += getByteSize(getOffsetToTargetNodeArrayDuringUpdate(ptNodeArray, nodeSize + size, ptNode.mChildren)); nodeSize + size, ptNode.mChildren)); } } nodeSize += getShortcutListSize(ptNode.mShortcutTargets); if (formatOptions.mVersion < FormatSpec.FIRST_VERSION_WITH_TERMINAL_ID) { if (formatOptions.mVersion < FormatSpec.FIRST_VERSION_WITH_TERMINAL_ID) { nodeSize += getShortcutListSize(ptNode.mShortcutTargets); if (null != ptNode.mBigrams) { if (null != ptNode.mBigrams) { for (WeightedString bigram : ptNode.mBigrams) { for (WeightedString bigram : ptNode.mBigrams) { final int offset = getOffsetToTargetPtNodeDuringUpdate(ptNodeArray, final int offset = getOffsetToTargetPtNodeDuringUpdate(ptNodeArray, Loading java/src/com/android/inputmethod/latin/makedict/FormatSpec.java +13 −0 Original line number Original line Diff line number Diff line Loading @@ -266,15 +266,28 @@ public final class FormatSpec { // tat = Terminal Address Table // tat = Terminal Address Table static final String TERMINAL_ADDRESS_TABLE_FILE_EXTENSION = ".tat"; static final String TERMINAL_ADDRESS_TABLE_FILE_EXTENSION = ".tat"; static final String BIGRAM_FILE_EXTENSION = ".bigram"; static final String BIGRAM_FILE_EXTENSION = ".bigram"; static final String SHORTCUT_FILE_EXTENSION = ".shortcut"; static final String LOOKUP_TABLE_FILE_SUFFIX = "_lookup"; static final String LOOKUP_TABLE_FILE_SUFFIX = "_lookup"; static final String CONTENT_TABLE_FILE_SUFFIX = "_index"; static final String CONTENT_TABLE_FILE_SUFFIX = "_index"; static final int FREQUENCY_AND_FLAGS_SIZE = 2; static final int FREQUENCY_AND_FLAGS_SIZE = 2; static final int TERMINAL_ADDRESS_TABLE_ADDRESS_SIZE = 3; static final int TERMINAL_ADDRESS_TABLE_ADDRESS_SIZE = 3; // With the English main dictionary as of October 2013, the size of bigram address table is // is 584KB with the block size being 4. // This is 91% of that of full address table. static final int BIGRAM_ADDRESS_TABLE_BLOCK_SIZE = 4; static final int BIGRAM_ADDRESS_TABLE_BLOCK_SIZE = 4; static final int BIGRAM_CONTENT_COUNT = 1; static final int BIGRAM_CONTENT_COUNT = 1; static final int BIGRAM_FREQ_CONTENT_INDEX = 0; static final int BIGRAM_FREQ_CONTENT_INDEX = 0; static final String BIGRAM_FREQ_CONTENT_ID = "_freq"; static final String BIGRAM_FREQ_CONTENT_ID = "_freq"; static final int SHORTCUT_CONTENT_COUNT = 1; static final int SHORTCUT_CONTENT_INDEX = 0; // With the English main dictionary as of October 2013, the size of shortcut address table is // 29KB with the block size being 64. // This is only 4.4% of that of full address table. static final int SHORTCUT_ADDRESS_TABLE_BLOCK_SIZE = 64; static final String SHORTCUT_CONTENT_ID = "_shortcut"; static final int NO_CHILDREN_ADDRESS = Integer.MIN_VALUE; static final int NO_CHILDREN_ADDRESS = Integer.MIN_VALUE; static final int NO_PARENT_ADDRESS = 0; static final int NO_PARENT_ADDRESS = 0; static final int NO_FORWARD_LINK_ADDRESS = 0; static final int NO_FORWARD_LINK_ADDRESS = 0; Loading java/src/com/android/inputmethod/latin/makedict/Ver4DictDecoder.java +40 −8 Original line number Original line Diff line number Diff line Loading @@ -23,6 +23,7 @@ import com.android.inputmethod.latin.makedict.FormatSpec.FileHeader; import com.android.inputmethod.latin.makedict.FormatSpec.FormatOptions; import com.android.inputmethod.latin.makedict.FormatSpec.FormatOptions; import com.android.inputmethod.latin.makedict.FusionDictionary.PtNode; import com.android.inputmethod.latin.makedict.FusionDictionary.PtNode; import com.android.inputmethod.latin.makedict.FusionDictionary.WeightedString; import com.android.inputmethod.latin.makedict.FusionDictionary.WeightedString; import com.android.inputmethod.latin.utils.CollectionUtils; import android.util.Log; import android.util.Log; Loading @@ -43,6 +44,7 @@ public class Ver4DictDecoder extends DictDecoder { private static final int FILETYPE_FREQUENCY = 2; private static final int FILETYPE_FREQUENCY = 2; private static final int FILETYPE_TERMINAL_ADDRESS_TABLE = 3; private static final int FILETYPE_TERMINAL_ADDRESS_TABLE = 3; private static final int FILETYPE_BIGRAM_FREQ = 4; private static final int FILETYPE_BIGRAM_FREQ = 4; private static final int FILETYPE_SHORTCUT = 5; private final File mDictDirectory; private final File mDictDirectory; private final DictionaryBufferFactory mBufferFactory; private final DictionaryBufferFactory mBufferFactory; Loading @@ -50,7 +52,9 @@ public class Ver4DictDecoder extends DictDecoder { private DictBuffer mFrequencyBuffer; private DictBuffer mFrequencyBuffer; private DictBuffer mTerminalAddressTableBuffer; private DictBuffer mTerminalAddressTableBuffer; private DictBuffer mBigramBuffer; private DictBuffer mBigramBuffer; private DictBuffer mShortcutBuffer; private SparseTable mBigramAddressTable; private SparseTable mBigramAddressTable; private SparseTable mShortcutAddressTable; @UsedForTesting @UsedForTesting /* package */ Ver4DictDecoder(final File dictDirectory, final int factoryFlag) { /* package */ Ver4DictDecoder(final File dictDirectory, final int factoryFlag) { Loading Loading @@ -89,6 +93,10 @@ public class Ver4DictDecoder extends DictDecoder { return new File(mDictDirectory, return new File(mDictDirectory, mDictDirectory.getName() + FormatSpec.BIGRAM_FILE_EXTENSION mDictDirectory.getName() + FormatSpec.BIGRAM_FILE_EXTENSION + FormatSpec.BIGRAM_FREQ_CONTENT_ID); + FormatSpec.BIGRAM_FREQ_CONTENT_ID); } else if (fileType == FILETYPE_SHORTCUT) { return new File(mDictDirectory, mDictDirectory.getName() + FormatSpec.SHORTCUT_FILE_EXTENSION + FormatSpec.SHORTCUT_CONTENT_ID); } else { } else { throw new RuntimeException("Unsupported kind of file : " + fileType); throw new RuntimeException("Unsupported kind of file : " + fileType); } } Loading @@ -102,6 +110,8 @@ public class Ver4DictDecoder extends DictDecoder { getFile(FILETYPE_TERMINAL_ADDRESS_TABLE)); getFile(FILETYPE_TERMINAL_ADDRESS_TABLE)); mBigramBuffer = mBufferFactory.getDictionaryBuffer(getFile(FILETYPE_BIGRAM_FREQ)); mBigramBuffer = mBufferFactory.getDictionaryBuffer(getFile(FILETYPE_BIGRAM_FREQ)); loadBigramAddressSparseTable(); loadBigramAddressSparseTable(); mShortcutBuffer = mBufferFactory.getDictionaryBuffer(getFile(FILETYPE_SHORTCUT)); loadShortcutAddressSparseTable(); } } @Override @Override Loading Loading @@ -136,6 +146,18 @@ public class Ver4DictDecoder extends DictDecoder { FormatSpec.BIGRAM_ADDRESS_TABLE_BLOCK_SIZE); FormatSpec.BIGRAM_ADDRESS_TABLE_BLOCK_SIZE); } } // TODO: Let's have something like SparseTableContentsReader in this class. private void loadShortcutAddressSparseTable() throws IOException { final File lookupIndexFile = new File(mDictDirectory, mDictDirectory.getName() + FormatSpec.SHORTCUT_FILE_EXTENSION + FormatSpec.LOOKUP_TABLE_FILE_SUFFIX); final File contentFile = new File(mDictDirectory, mDictDirectory.getName() + FormatSpec.SHORTCUT_FILE_EXTENSION + FormatSpec.CONTENT_TABLE_FILE_SUFFIX + FormatSpec.SHORTCUT_CONTENT_ID); mShortcutAddressTable = SparseTable.readFromFiles(lookupIndexFile, new File[] { contentFile }, FormatSpec.SHORTCUT_ADDRESS_TABLE_BLOCK_SIZE); } protected static class PtNodeReader extends DictDecoder.PtNodeReader { protected static class PtNodeReader extends DictDecoder.PtNodeReader { protected static int readFrequency(final DictBuffer frequencyBuffer, final int terminalId) { protected static int readFrequency(final DictBuffer frequencyBuffer, final int terminalId) { frequencyBuffer.position(terminalId * FormatSpec.FREQUENCY_AND_FLAGS_SIZE + 1); frequencyBuffer.position(terminalId * FormatSpec.FREQUENCY_AND_FLAGS_SIZE + 1); Loading @@ -147,6 +169,23 @@ public class Ver4DictDecoder extends DictDecoder { } } } } private ArrayList<WeightedString> readShortcuts(final int terminalId) { if (mShortcutAddressTable.get(0, terminalId) == SparseTable.NOT_EXIST) return null; final ArrayList<WeightedString> ret = CollectionUtils.newArrayList(); final int posOfShortcuts = mShortcutAddressTable.get(FormatSpec.SHORTCUT_CONTENT_INDEX, terminalId); mShortcutBuffer.position(posOfShortcuts); while (true) { final int flags = mShortcutBuffer.readUnsignedByte(); final String word = CharEncoding.readString(mShortcutBuffer); ret.add(new WeightedString(word, flags & FormatSpec.FLAG_BIGRAM_SHORTCUT_ATTR_FREQUENCY)); if (0 == (flags & FormatSpec.FLAG_BIGRAM_SHORTCUT_ATTR_HAS_NEXT)) break; } return ret; } // TODO: Make this buffer thread safe. // TODO: Make this buffer thread safe. // TODO: Support words longer than FormatSpec.MAX_WORD_LENGTH. // TODO: Support words longer than FormatSpec.MAX_WORD_LENGTH. private final int[] mCharacterBuffer = new int[FormatSpec.MAX_WORD_LENGTH]; private final int[] mCharacterBuffer = new int[FormatSpec.MAX_WORD_LENGTH]; Loading Loading @@ -197,14 +236,7 @@ public class Ver4DictDecoder extends DictDecoder { childrenAddress += addressPointer; childrenAddress += addressPointer; } } addressPointer += BinaryDictIOUtils.getChildrenAddressSize(flags, options); addressPointer += BinaryDictIOUtils.getChildrenAddressSize(flags, options); final ArrayList<WeightedString> shortcutTargets; final ArrayList<WeightedString> shortcutTargets = readShortcuts(terminalId); if (0 != (flags & FormatSpec.FLAG_HAS_SHORTCUT_TARGETS)) { // readShortcut will add shortcuts to shortcutTargets. shortcutTargets = new ArrayList<WeightedString>(); addressPointer += PtNodeReader.readShortcut(mDictBuffer, shortcutTargets); } else { shortcutTargets = null; } final ArrayList<PendingAttribute> bigrams; final ArrayList<PendingAttribute> bigrams; if (0 != (flags & FormatSpec.FLAG_HAS_BIGRAMS)) { if (0 != (flags & FormatSpec.FLAG_HAS_BIGRAMS)) { Loading java/src/com/android/inputmethod/latin/makedict/Ver4DictEncoder.java +49 −24 Original line number Original line Diff line number Diff line Loading @@ -49,6 +49,7 @@ public class Ver4DictEncoder implements DictEncoder { private File mDictDir; private File mDictDir; private String mBaseFilename; private String mBaseFilename; private BigramContentWriter mBigramWriter; private BigramContentWriter mBigramWriter; private ShortcutContentWriter mShortcutWriter; @UsedForTesting @UsedForTesting public Ver4DictEncoder(final File dictPlacedDir) { public Ver4DictEncoder(final File dictPlacedDir) { Loading Loading @@ -152,6 +153,39 @@ public class Ver4DictEncoder implements DictEncoder { } } } } private static class ShortcutContentWriter extends SparseTableContentWriter { public ShortcutContentWriter(final String name, final int initialCapacity, final File baseDir) { super(name + FormatSpec.SHORTCUT_FILE_EXTENSION, FormatSpec.SHORTCUT_CONTENT_COUNT, initialCapacity, FormatSpec.SHORTCUT_ADDRESS_TABLE_BLOCK_SIZE, baseDir, new String[] { name + FormatSpec.SHORTCUT_FILE_EXTENSION }, new String[] { FormatSpec.SHORTCUT_CONTENT_ID }); } public void writeShortcutForOneWord(final int terminalId, final Iterator<WeightedString> shortcutIterator) throws IOException { write(FormatSpec.SHORTCUT_CONTENT_INDEX, terminalId, new SparseTableContentWriterInterface() { @Override public void write(final OutputStream outStream) throws IOException { writeShortcutForOneWordInternal(outStream, shortcutIterator); } }); } private void writeShortcutForOneWordInternal(final OutputStream outStream, final Iterator<WeightedString> shortcutIterator) throws IOException { while (shortcutIterator.hasNext()) { final WeightedString target = shortcutIterator.next(); final int shortcutFlags = BinaryDictEncoderUtils.makeShortcutFlags( shortcutIterator.hasNext(), target.mFrequency); BinaryDictEncoderUtils.writeUIntToStream(outStream, shortcutFlags, FormatSpec.PTNODE_ATTRIBUTE_FLAGS_SIZE); CharEncoding.writeString(outStream, target.mWord); } } } private void openStreams(final FormatOptions formatOptions, final DictionaryOptions dictOptions) private void openStreams(final FormatOptions formatOptions, final DictionaryOptions dictOptions) throws FileNotFoundException, IOException { throws FileNotFoundException, IOException { final FileHeader header = new FileHeader(0, dictOptions, formatOptions); final FileHeader header = new FileHeader(0, dictOptions, formatOptions); Loading Loading @@ -225,6 +259,8 @@ public class Ver4DictEncoder implements DictEncoder { writeTerminalData(flatNodes, terminalCount); writeTerminalData(flatNodes, terminalCount); mBigramWriter = new BigramContentWriter(mBaseFilename, terminalCount, mDictDir); mBigramWriter = new BigramContentWriter(mBaseFilename, terminalCount, mDictDir); writeBigrams(flatNodes, dict); writeBigrams(flatNodes, dict); mShortcutWriter = new ShortcutContentWriter(mBaseFilename, terminalCount, mDictDir); writeShortcuts(flatNodes); final PtNodeArray lastNodeArray = flatNodes.get(flatNodes.size() - 1); final PtNodeArray lastNodeArray = flatNodes.get(flatNodes.size() - 1); final int bufferSize = lastNodeArray.mCachedAddressAfterUpdate + lastNodeArray.mCachedSize; final int bufferSize = lastNodeArray.mCachedAddressAfterUpdate + lastNodeArray.mCachedSize; Loading Loading @@ -306,29 +342,6 @@ public class Ver4DictEncoder implements DictEncoder { } } } } private void writeShortcuts(ArrayList<WeightedString> shortcuts) { if (null == shortcuts || shortcuts.isEmpty()) return; final int indexOfShortcutByteSize = mTriePos; mTriePos += FormatSpec.PTNODE_SHORTCUT_LIST_SIZE_SIZE; final Iterator<WeightedString> shortcutIterator = shortcuts.iterator(); while (shortcutIterator.hasNext()) { final WeightedString target = shortcutIterator.next(); final int shortcutFlags = BinaryDictEncoderUtils.makeShortcutFlags( shortcutIterator.hasNext(), target.mFrequency); mTrieBuf[mTriePos++] = (byte)shortcutFlags; final int shortcutShift = CharEncoding.writeString(mTrieBuf, mTriePos, target.mWord); mTriePos += shortcutShift; } final int shortcutByteSize = mTriePos - indexOfShortcutByteSize; if (shortcutByteSize > FormatSpec.MAX_SHORTCUT_LIST_SIZE_IN_A_PTNODE) { throw new RuntimeException("Shortcut list too large : " + shortcutByteSize); } BinaryDictEncoderUtils.writeUIntToBuffer(mTrieBuf, indexOfShortcutByteSize, shortcutByteSize, FormatSpec.PTNODE_SHORTCUT_LIST_SIZE_SIZE); } private void writeBigrams(final ArrayList<PtNodeArray> flatNodes, final FusionDictionary dict) private void writeBigrams(final ArrayList<PtNodeArray> flatNodes, final FusionDictionary dict) throws IOException { throws IOException { mBigramWriter.openStreams(); mBigramWriter.openStreams(); Loading @@ -343,6 +356,19 @@ public class Ver4DictEncoder implements DictEncoder { mBigramWriter.closeStreams(); mBigramWriter.closeStreams(); } } private void writeShortcuts(final ArrayList<PtNodeArray> flatNodes) throws IOException { mShortcutWriter.openStreams(); for (final PtNodeArray nodeArray : flatNodes) { for (final PtNode ptNode : nodeArray.mData) { if (ptNode.mShortcutTargets != null && !ptNode.mShortcutTargets.isEmpty()) { mShortcutWriter.writeShortcutForOneWord(ptNode.mTerminalId, ptNode.mShortcutTargets.iterator()); } } } mShortcutWriter.closeStreams(); } @Override @Override public void writeForwardLinkAddress(int forwardLinkAddress) { public void writeForwardLinkAddress(int forwardLinkAddress) { mTriePos = BinaryDictEncoderUtils.writeUIntToBuffer(mTrieBuf, mTriePos, mTriePos = BinaryDictEncoderUtils.writeUIntToBuffer(mTrieBuf, mTriePos, Loading @@ -359,7 +385,6 @@ public class Ver4DictEncoder implements DictEncoder { writeTerminalId(ptNode.mTerminalId); writeTerminalId(ptNode.mTerminalId); } } writeChildrenPosition(ptNode, formatOptions); writeChildrenPosition(ptNode, formatOptions); writeShortcuts(ptNode.mShortcutTargets); } } private void writeTerminalData(final ArrayList<PtNodeArray> flatNodes, private void writeTerminalData(final ArrayList<PtNodeArray> flatNodes, Loading Loading
java/src/com/android/inputmethod/latin/makedict/BinaryDictDecoderUtils.java +4 −4 Original line number Original line Diff line number Diff line Loading @@ -23,11 +23,11 @@ import com.android.inputmethod.latin.makedict.FusionDictionary.PtNode; import com.android.inputmethod.latin.makedict.FusionDictionary.PtNodeArray; import com.android.inputmethod.latin.makedict.FusionDictionary.PtNodeArray; import com.android.inputmethod.latin.makedict.FusionDictionary.WeightedString; import com.android.inputmethod.latin.makedict.FusionDictionary.WeightedString; import java.io.ByteArrayOutputStream; import java.io.File; import java.io.File; import java.io.FileInputStream; import java.io.FileInputStream; import java.io.FileNotFoundException; import java.io.FileNotFoundException; import java.io.IOException; import java.io.IOException; import java.io.OutputStream; import java.nio.ByteBuffer; import java.nio.ByteBuffer; import java.nio.channels.FileChannel; import java.nio.channels.FileChannel; import java.util.ArrayList; import java.util.ArrayList; Loading Loading @@ -219,14 +219,14 @@ public final class BinaryDictDecoderUtils { } } /** /** * Writes a string with our character format to a ByteArrayOutputStream. * Writes a string with our character format to an OutputStream. * * * This will also write the terminator byte. * This will also write the terminator byte. * * * @param buffer the ByteArrayOutputStream to write to. * @param buffer the OutputStream to write to. * @param word the string to write. * @param word the string to write. */ */ static void writeString(final ByteArrayOutputStream buffer, final String word) { static void writeString(final OutputStream buffer, final String word) throws IOException { final int length = word.length(); final int length = word.length(); for (int i = 0; i < length; i = word.offsetByCodePoints(i, 1)) { for (int i = 0; i < length; i = word.offsetByCodePoints(i, 1)) { final int codePoint = word.codePointAt(i); final int codePoint = word.codePointAt(i); Loading
java/src/com/android/inputmethod/latin/makedict/BinaryDictEncoderUtils.java +1 −1 Original line number Original line Diff line number Diff line Loading @@ -383,8 +383,8 @@ public class BinaryDictEncoderUtils { nodeSize += getByteSize(getOffsetToTargetNodeArrayDuringUpdate(ptNodeArray, nodeSize += getByteSize(getOffsetToTargetNodeArrayDuringUpdate(ptNodeArray, nodeSize + size, ptNode.mChildren)); nodeSize + size, ptNode.mChildren)); } } nodeSize += getShortcutListSize(ptNode.mShortcutTargets); if (formatOptions.mVersion < FormatSpec.FIRST_VERSION_WITH_TERMINAL_ID) { if (formatOptions.mVersion < FormatSpec.FIRST_VERSION_WITH_TERMINAL_ID) { nodeSize += getShortcutListSize(ptNode.mShortcutTargets); if (null != ptNode.mBigrams) { if (null != ptNode.mBigrams) { for (WeightedString bigram : ptNode.mBigrams) { for (WeightedString bigram : ptNode.mBigrams) { final int offset = getOffsetToTargetPtNodeDuringUpdate(ptNodeArray, final int offset = getOffsetToTargetPtNodeDuringUpdate(ptNodeArray, Loading
java/src/com/android/inputmethod/latin/makedict/FormatSpec.java +13 −0 Original line number Original line Diff line number Diff line Loading @@ -266,15 +266,28 @@ public final class FormatSpec { // tat = Terminal Address Table // tat = Terminal Address Table static final String TERMINAL_ADDRESS_TABLE_FILE_EXTENSION = ".tat"; static final String TERMINAL_ADDRESS_TABLE_FILE_EXTENSION = ".tat"; static final String BIGRAM_FILE_EXTENSION = ".bigram"; static final String BIGRAM_FILE_EXTENSION = ".bigram"; static final String SHORTCUT_FILE_EXTENSION = ".shortcut"; static final String LOOKUP_TABLE_FILE_SUFFIX = "_lookup"; static final String LOOKUP_TABLE_FILE_SUFFIX = "_lookup"; static final String CONTENT_TABLE_FILE_SUFFIX = "_index"; static final String CONTENT_TABLE_FILE_SUFFIX = "_index"; static final int FREQUENCY_AND_FLAGS_SIZE = 2; static final int FREQUENCY_AND_FLAGS_SIZE = 2; static final int TERMINAL_ADDRESS_TABLE_ADDRESS_SIZE = 3; static final int TERMINAL_ADDRESS_TABLE_ADDRESS_SIZE = 3; // With the English main dictionary as of October 2013, the size of bigram address table is // is 584KB with the block size being 4. // This is 91% of that of full address table. static final int BIGRAM_ADDRESS_TABLE_BLOCK_SIZE = 4; static final int BIGRAM_ADDRESS_TABLE_BLOCK_SIZE = 4; static final int BIGRAM_CONTENT_COUNT = 1; static final int BIGRAM_CONTENT_COUNT = 1; static final int BIGRAM_FREQ_CONTENT_INDEX = 0; static final int BIGRAM_FREQ_CONTENT_INDEX = 0; static final String BIGRAM_FREQ_CONTENT_ID = "_freq"; static final String BIGRAM_FREQ_CONTENT_ID = "_freq"; static final int SHORTCUT_CONTENT_COUNT = 1; static final int SHORTCUT_CONTENT_INDEX = 0; // With the English main dictionary as of October 2013, the size of shortcut address table is // 29KB with the block size being 64. // This is only 4.4% of that of full address table. static final int SHORTCUT_ADDRESS_TABLE_BLOCK_SIZE = 64; static final String SHORTCUT_CONTENT_ID = "_shortcut"; static final int NO_CHILDREN_ADDRESS = Integer.MIN_VALUE; static final int NO_CHILDREN_ADDRESS = Integer.MIN_VALUE; static final int NO_PARENT_ADDRESS = 0; static final int NO_PARENT_ADDRESS = 0; static final int NO_FORWARD_LINK_ADDRESS = 0; static final int NO_FORWARD_LINK_ADDRESS = 0; Loading
java/src/com/android/inputmethod/latin/makedict/Ver4DictDecoder.java +40 −8 Original line number Original line Diff line number Diff line Loading @@ -23,6 +23,7 @@ import com.android.inputmethod.latin.makedict.FormatSpec.FileHeader; import com.android.inputmethod.latin.makedict.FormatSpec.FormatOptions; import com.android.inputmethod.latin.makedict.FormatSpec.FormatOptions; import com.android.inputmethod.latin.makedict.FusionDictionary.PtNode; import com.android.inputmethod.latin.makedict.FusionDictionary.PtNode; import com.android.inputmethod.latin.makedict.FusionDictionary.WeightedString; import com.android.inputmethod.latin.makedict.FusionDictionary.WeightedString; import com.android.inputmethod.latin.utils.CollectionUtils; import android.util.Log; import android.util.Log; Loading @@ -43,6 +44,7 @@ public class Ver4DictDecoder extends DictDecoder { private static final int FILETYPE_FREQUENCY = 2; private static final int FILETYPE_FREQUENCY = 2; private static final int FILETYPE_TERMINAL_ADDRESS_TABLE = 3; private static final int FILETYPE_TERMINAL_ADDRESS_TABLE = 3; private static final int FILETYPE_BIGRAM_FREQ = 4; private static final int FILETYPE_BIGRAM_FREQ = 4; private static final int FILETYPE_SHORTCUT = 5; private final File mDictDirectory; private final File mDictDirectory; private final DictionaryBufferFactory mBufferFactory; private final DictionaryBufferFactory mBufferFactory; Loading @@ -50,7 +52,9 @@ public class Ver4DictDecoder extends DictDecoder { private DictBuffer mFrequencyBuffer; private DictBuffer mFrequencyBuffer; private DictBuffer mTerminalAddressTableBuffer; private DictBuffer mTerminalAddressTableBuffer; private DictBuffer mBigramBuffer; private DictBuffer mBigramBuffer; private DictBuffer mShortcutBuffer; private SparseTable mBigramAddressTable; private SparseTable mBigramAddressTable; private SparseTable mShortcutAddressTable; @UsedForTesting @UsedForTesting /* package */ Ver4DictDecoder(final File dictDirectory, final int factoryFlag) { /* package */ Ver4DictDecoder(final File dictDirectory, final int factoryFlag) { Loading Loading @@ -89,6 +93,10 @@ public class Ver4DictDecoder extends DictDecoder { return new File(mDictDirectory, return new File(mDictDirectory, mDictDirectory.getName() + FormatSpec.BIGRAM_FILE_EXTENSION mDictDirectory.getName() + FormatSpec.BIGRAM_FILE_EXTENSION + FormatSpec.BIGRAM_FREQ_CONTENT_ID); + FormatSpec.BIGRAM_FREQ_CONTENT_ID); } else if (fileType == FILETYPE_SHORTCUT) { return new File(mDictDirectory, mDictDirectory.getName() + FormatSpec.SHORTCUT_FILE_EXTENSION + FormatSpec.SHORTCUT_CONTENT_ID); } else { } else { throw new RuntimeException("Unsupported kind of file : " + fileType); throw new RuntimeException("Unsupported kind of file : " + fileType); } } Loading @@ -102,6 +110,8 @@ public class Ver4DictDecoder extends DictDecoder { getFile(FILETYPE_TERMINAL_ADDRESS_TABLE)); getFile(FILETYPE_TERMINAL_ADDRESS_TABLE)); mBigramBuffer = mBufferFactory.getDictionaryBuffer(getFile(FILETYPE_BIGRAM_FREQ)); mBigramBuffer = mBufferFactory.getDictionaryBuffer(getFile(FILETYPE_BIGRAM_FREQ)); loadBigramAddressSparseTable(); loadBigramAddressSparseTable(); mShortcutBuffer = mBufferFactory.getDictionaryBuffer(getFile(FILETYPE_SHORTCUT)); loadShortcutAddressSparseTable(); } } @Override @Override Loading Loading @@ -136,6 +146,18 @@ public class Ver4DictDecoder extends DictDecoder { FormatSpec.BIGRAM_ADDRESS_TABLE_BLOCK_SIZE); FormatSpec.BIGRAM_ADDRESS_TABLE_BLOCK_SIZE); } } // TODO: Let's have something like SparseTableContentsReader in this class. private void loadShortcutAddressSparseTable() throws IOException { final File lookupIndexFile = new File(mDictDirectory, mDictDirectory.getName() + FormatSpec.SHORTCUT_FILE_EXTENSION + FormatSpec.LOOKUP_TABLE_FILE_SUFFIX); final File contentFile = new File(mDictDirectory, mDictDirectory.getName() + FormatSpec.SHORTCUT_FILE_EXTENSION + FormatSpec.CONTENT_TABLE_FILE_SUFFIX + FormatSpec.SHORTCUT_CONTENT_ID); mShortcutAddressTable = SparseTable.readFromFiles(lookupIndexFile, new File[] { contentFile }, FormatSpec.SHORTCUT_ADDRESS_TABLE_BLOCK_SIZE); } protected static class PtNodeReader extends DictDecoder.PtNodeReader { protected static class PtNodeReader extends DictDecoder.PtNodeReader { protected static int readFrequency(final DictBuffer frequencyBuffer, final int terminalId) { protected static int readFrequency(final DictBuffer frequencyBuffer, final int terminalId) { frequencyBuffer.position(terminalId * FormatSpec.FREQUENCY_AND_FLAGS_SIZE + 1); frequencyBuffer.position(terminalId * FormatSpec.FREQUENCY_AND_FLAGS_SIZE + 1); Loading @@ -147,6 +169,23 @@ public class Ver4DictDecoder extends DictDecoder { } } } } private ArrayList<WeightedString> readShortcuts(final int terminalId) { if (mShortcutAddressTable.get(0, terminalId) == SparseTable.NOT_EXIST) return null; final ArrayList<WeightedString> ret = CollectionUtils.newArrayList(); final int posOfShortcuts = mShortcutAddressTable.get(FormatSpec.SHORTCUT_CONTENT_INDEX, terminalId); mShortcutBuffer.position(posOfShortcuts); while (true) { final int flags = mShortcutBuffer.readUnsignedByte(); final String word = CharEncoding.readString(mShortcutBuffer); ret.add(new WeightedString(word, flags & FormatSpec.FLAG_BIGRAM_SHORTCUT_ATTR_FREQUENCY)); if (0 == (flags & FormatSpec.FLAG_BIGRAM_SHORTCUT_ATTR_HAS_NEXT)) break; } return ret; } // TODO: Make this buffer thread safe. // TODO: Make this buffer thread safe. // TODO: Support words longer than FormatSpec.MAX_WORD_LENGTH. // TODO: Support words longer than FormatSpec.MAX_WORD_LENGTH. private final int[] mCharacterBuffer = new int[FormatSpec.MAX_WORD_LENGTH]; private final int[] mCharacterBuffer = new int[FormatSpec.MAX_WORD_LENGTH]; Loading Loading @@ -197,14 +236,7 @@ public class Ver4DictDecoder extends DictDecoder { childrenAddress += addressPointer; childrenAddress += addressPointer; } } addressPointer += BinaryDictIOUtils.getChildrenAddressSize(flags, options); addressPointer += BinaryDictIOUtils.getChildrenAddressSize(flags, options); final ArrayList<WeightedString> shortcutTargets; final ArrayList<WeightedString> shortcutTargets = readShortcuts(terminalId); if (0 != (flags & FormatSpec.FLAG_HAS_SHORTCUT_TARGETS)) { // readShortcut will add shortcuts to shortcutTargets. shortcutTargets = new ArrayList<WeightedString>(); addressPointer += PtNodeReader.readShortcut(mDictBuffer, shortcutTargets); } else { shortcutTargets = null; } final ArrayList<PendingAttribute> bigrams; final ArrayList<PendingAttribute> bigrams; if (0 != (flags & FormatSpec.FLAG_HAS_BIGRAMS)) { if (0 != (flags & FormatSpec.FLAG_HAS_BIGRAMS)) { Loading
java/src/com/android/inputmethod/latin/makedict/Ver4DictEncoder.java +49 −24 Original line number Original line Diff line number Diff line Loading @@ -49,6 +49,7 @@ public class Ver4DictEncoder implements DictEncoder { private File mDictDir; private File mDictDir; private String mBaseFilename; private String mBaseFilename; private BigramContentWriter mBigramWriter; private BigramContentWriter mBigramWriter; private ShortcutContentWriter mShortcutWriter; @UsedForTesting @UsedForTesting public Ver4DictEncoder(final File dictPlacedDir) { public Ver4DictEncoder(final File dictPlacedDir) { Loading Loading @@ -152,6 +153,39 @@ public class Ver4DictEncoder implements DictEncoder { } } } } private static class ShortcutContentWriter extends SparseTableContentWriter { public ShortcutContentWriter(final String name, final int initialCapacity, final File baseDir) { super(name + FormatSpec.SHORTCUT_FILE_EXTENSION, FormatSpec.SHORTCUT_CONTENT_COUNT, initialCapacity, FormatSpec.SHORTCUT_ADDRESS_TABLE_BLOCK_SIZE, baseDir, new String[] { name + FormatSpec.SHORTCUT_FILE_EXTENSION }, new String[] { FormatSpec.SHORTCUT_CONTENT_ID }); } public void writeShortcutForOneWord(final int terminalId, final Iterator<WeightedString> shortcutIterator) throws IOException { write(FormatSpec.SHORTCUT_CONTENT_INDEX, terminalId, new SparseTableContentWriterInterface() { @Override public void write(final OutputStream outStream) throws IOException { writeShortcutForOneWordInternal(outStream, shortcutIterator); } }); } private void writeShortcutForOneWordInternal(final OutputStream outStream, final Iterator<WeightedString> shortcutIterator) throws IOException { while (shortcutIterator.hasNext()) { final WeightedString target = shortcutIterator.next(); final int shortcutFlags = BinaryDictEncoderUtils.makeShortcutFlags( shortcutIterator.hasNext(), target.mFrequency); BinaryDictEncoderUtils.writeUIntToStream(outStream, shortcutFlags, FormatSpec.PTNODE_ATTRIBUTE_FLAGS_SIZE); CharEncoding.writeString(outStream, target.mWord); } } } private void openStreams(final FormatOptions formatOptions, final DictionaryOptions dictOptions) private void openStreams(final FormatOptions formatOptions, final DictionaryOptions dictOptions) throws FileNotFoundException, IOException { throws FileNotFoundException, IOException { final FileHeader header = new FileHeader(0, dictOptions, formatOptions); final FileHeader header = new FileHeader(0, dictOptions, formatOptions); Loading Loading @@ -225,6 +259,8 @@ public class Ver4DictEncoder implements DictEncoder { writeTerminalData(flatNodes, terminalCount); writeTerminalData(flatNodes, terminalCount); mBigramWriter = new BigramContentWriter(mBaseFilename, terminalCount, mDictDir); mBigramWriter = new BigramContentWriter(mBaseFilename, terminalCount, mDictDir); writeBigrams(flatNodes, dict); writeBigrams(flatNodes, dict); mShortcutWriter = new ShortcutContentWriter(mBaseFilename, terminalCount, mDictDir); writeShortcuts(flatNodes); final PtNodeArray lastNodeArray = flatNodes.get(flatNodes.size() - 1); final PtNodeArray lastNodeArray = flatNodes.get(flatNodes.size() - 1); final int bufferSize = lastNodeArray.mCachedAddressAfterUpdate + lastNodeArray.mCachedSize; final int bufferSize = lastNodeArray.mCachedAddressAfterUpdate + lastNodeArray.mCachedSize; Loading Loading @@ -306,29 +342,6 @@ public class Ver4DictEncoder implements DictEncoder { } } } } private void writeShortcuts(ArrayList<WeightedString> shortcuts) { if (null == shortcuts || shortcuts.isEmpty()) return; final int indexOfShortcutByteSize = mTriePos; mTriePos += FormatSpec.PTNODE_SHORTCUT_LIST_SIZE_SIZE; final Iterator<WeightedString> shortcutIterator = shortcuts.iterator(); while (shortcutIterator.hasNext()) { final WeightedString target = shortcutIterator.next(); final int shortcutFlags = BinaryDictEncoderUtils.makeShortcutFlags( shortcutIterator.hasNext(), target.mFrequency); mTrieBuf[mTriePos++] = (byte)shortcutFlags; final int shortcutShift = CharEncoding.writeString(mTrieBuf, mTriePos, target.mWord); mTriePos += shortcutShift; } final int shortcutByteSize = mTriePos - indexOfShortcutByteSize; if (shortcutByteSize > FormatSpec.MAX_SHORTCUT_LIST_SIZE_IN_A_PTNODE) { throw new RuntimeException("Shortcut list too large : " + shortcutByteSize); } BinaryDictEncoderUtils.writeUIntToBuffer(mTrieBuf, indexOfShortcutByteSize, shortcutByteSize, FormatSpec.PTNODE_SHORTCUT_LIST_SIZE_SIZE); } private void writeBigrams(final ArrayList<PtNodeArray> flatNodes, final FusionDictionary dict) private void writeBigrams(final ArrayList<PtNodeArray> flatNodes, final FusionDictionary dict) throws IOException { throws IOException { mBigramWriter.openStreams(); mBigramWriter.openStreams(); Loading @@ -343,6 +356,19 @@ public class Ver4DictEncoder implements DictEncoder { mBigramWriter.closeStreams(); mBigramWriter.closeStreams(); } } private void writeShortcuts(final ArrayList<PtNodeArray> flatNodes) throws IOException { mShortcutWriter.openStreams(); for (final PtNodeArray nodeArray : flatNodes) { for (final PtNode ptNode : nodeArray.mData) { if (ptNode.mShortcutTargets != null && !ptNode.mShortcutTargets.isEmpty()) { mShortcutWriter.writeShortcutForOneWord(ptNode.mTerminalId, ptNode.mShortcutTargets.iterator()); } } } mShortcutWriter.closeStreams(); } @Override @Override public void writeForwardLinkAddress(int forwardLinkAddress) { public void writeForwardLinkAddress(int forwardLinkAddress) { mTriePos = BinaryDictEncoderUtils.writeUIntToBuffer(mTrieBuf, mTriePos, mTriePos = BinaryDictEncoderUtils.writeUIntToBuffer(mTrieBuf, mTriePos, Loading @@ -359,7 +385,6 @@ public class Ver4DictEncoder implements DictEncoder { writeTerminalId(ptNode.mTerminalId); writeTerminalId(ptNode.mTerminalId); } } writeChildrenPosition(ptNode, formatOptions); writeChildrenPosition(ptNode, formatOptions); writeShortcuts(ptNode.mShortcutTargets); } } private void writeTerminalData(final ArrayList<PtNodeArray> flatNodes, private void writeTerminalData(final ArrayList<PtNodeArray> flatNodes, Loading