Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit 5cc59616 authored by Jean Chalard's avatar Jean Chalard Committed by Android (Google) Code Review
Browse files

Merge "Use ByteBuffer when reading FusionDictionary from file." into jb-mr1-dev

parents 687a2447 d4fe7fda
Loading
Loading
Loading
Loading
+21 −6
Original line number Diff line number Diff line
@@ -25,7 +25,10 @@ import android.content.res.AssetFileDescriptor;
import android.util.Log;

import java.io.File;
import java.io.RandomAccessFile;
import java.io.FileInputStream;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.channels.FileChannel;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Locale;
@@ -349,17 +352,21 @@ class BinaryDictionaryGetter {
        // ad-hock ## HACK ##
        if (!Locale.ENGLISH.getLanguage().equals(locale.getLanguage())) return true;

        FileInputStream inStream = null;
        try {
            // Read the version of the file
            final RandomAccessFile raf = new RandomAccessFile(f, "r");
            final int magic = raf.readInt();
            inStream = new FileInputStream(f);
            final ByteBuffer buffer = inStream.getChannel().map(
                    FileChannel.MapMode.READ_ONLY, 0, f.length());
            final int magic = buffer.getInt();
            if (magic != BinaryDictInputOutput.VERSION_2_MAGIC_NUMBER) {
                return false;
            }
            final int formatVersion = raf.readInt();
            final int headerSize = raf.readInt();
            final int formatVersion = buffer.getInt();
            final int headerSize = buffer.getInt();
            final HashMap<String, String> options = CollectionUtils.newHashMap();
            BinaryDictInputOutput.populateOptionsFromFile(raf, headerSize, options);
            BinaryDictInputOutput.populateOptions(buffer, headerSize, options);

            final String version = options.get(VERSION_KEY);
            if (null == version) {
                // No version in the options : the format is unexpected
@@ -374,6 +381,14 @@ class BinaryDictionaryGetter {
            return false;
        } catch (NumberFormatException e) {
            return false;
        } finally {
            if (inStream != null) {
                try {
                    inStream.close();
                } catch (IOException e) {
                    // do nothing
                }
            }
        }
    }

+134 −89
Original line number Diff line number Diff line
@@ -22,10 +22,13 @@ import com.android.inputmethod.latin.makedict.FusionDictionary.Node;
import com.android.inputmethod.latin.makedict.FusionDictionary.WeightedString;

import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.OutputStream;
import java.io.RandomAccessFile;
import java.nio.ByteBuffer;
import java.nio.channels.FileChannel;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
@@ -307,33 +310,32 @@ public class BinaryDictInputOutput {
        }

        /**
         * Reads a string from a RandomAccessFile. This is the converse of the above method.
         * Reads a string from a ByteBuffer. This is the converse of the above method.
         */
        private static String readString(final RandomAccessFile source) throws IOException {
        private static String readString(final ByteBuffer buffer) {
            final StringBuilder s = new StringBuilder();
            int character = readChar(source);
            int character = readChar(buffer);
            while (character != INVALID_CHARACTER) {
                s.appendCodePoint(character);
                character = readChar(source);
                character = readChar(buffer);
            }
            return s.toString();
        }

        /**
         * Reads a character from the file.
         * Reads a character from the ByteBuffer.
         *
         * This follows the character format documented earlier in this source file.
         *
         * @param source the file, positioned over an encoded character.
         * @param buffer the buffer, positioned over an encoded character.
         * @return the character code.
         */
        private static int readChar(RandomAccessFile source) throws IOException {
            int character = source.readUnsignedByte();
        private static int readChar(final ByteBuffer buffer) {
            int character = readUnsignedByte(buffer);
            if (!fitsOnOneByte(character)) {
                if (GROUP_CHARACTERS_TERMINATOR == character)
                    return INVALID_CHARACTER;
                if (GROUP_CHARACTERS_TERMINATOR == character) return INVALID_CHARACTER;
                character <<= 16;
                character += source.readUnsignedShort();
                character += readUnsignedShort(buffer);
            }
            return character;
        }
@@ -1091,46 +1093,46 @@ public class BinaryDictInputOutput {
    // readDictionaryBinary is the public entry point for them.

    static final int[] characterBuffer = new int[MAX_WORD_LENGTH];
    private static CharGroupInfo readCharGroup(RandomAccessFile source,
            final int originalGroupAddress) throws IOException {
    private static CharGroupInfo readCharGroup(final ByteBuffer buffer,
            final int originalGroupAddress) {
        int addressPointer = originalGroupAddress;
        final int flags = source.readUnsignedByte();
        final int flags = readUnsignedByte(buffer);
        ++addressPointer;
        final int characters[];
        if (0 != (flags & FLAG_HAS_MULTIPLE_CHARS)) {
            int index = 0;
            int character = CharEncoding.readChar(source);
            int character = CharEncoding.readChar(buffer);
            addressPointer += CharEncoding.getCharSize(character);
            while (-1 != character) {
                characterBuffer[index++] = character;
                character = CharEncoding.readChar(source);
                character = CharEncoding.readChar(buffer);
                addressPointer += CharEncoding.getCharSize(character);
            }
            characters = Arrays.copyOfRange(characterBuffer, 0, index);
        } else {
            final int character = CharEncoding.readChar(source);
            final int character = CharEncoding.readChar(buffer);
            addressPointer += CharEncoding.getCharSize(character);
            characters = new int[] { character };
        }
        final int frequency;
        if (0 != (FLAG_IS_TERMINAL & flags)) {
            ++addressPointer;
            frequency = source.readUnsignedByte();
            frequency = readUnsignedByte(buffer);
        } else {
            frequency = CharGroup.NOT_A_TERMINAL;
        }
        int childrenAddress = addressPointer;
        switch (flags & MASK_GROUP_ADDRESS_TYPE) {
        case FLAG_GROUP_ADDRESS_TYPE_ONEBYTE:
            childrenAddress += source.readUnsignedByte();
            childrenAddress += readUnsignedByte(buffer);
            addressPointer += 1;
            break;
        case FLAG_GROUP_ADDRESS_TYPE_TWOBYTES:
            childrenAddress += source.readUnsignedShort();
            childrenAddress += readUnsignedShort(buffer);
            addressPointer += 2;
            break;
        case FLAG_GROUP_ADDRESS_TYPE_THREEBYTES:
            childrenAddress += (source.readUnsignedByte() << 16) + source.readUnsignedShort();
            childrenAddress += readUnsignedInt24(buffer);
            addressPointer += 3;
            break;
        case FLAG_GROUP_ADDRESS_TYPE_NOADDRESS:
@@ -1140,38 +1142,38 @@ public class BinaryDictInputOutput {
        }
        ArrayList<WeightedString> shortcutTargets = null;
        if (0 != (flags & FLAG_HAS_SHORTCUT_TARGETS)) {
            final long pointerBefore = source.getFilePointer();
            final int pointerBefore = buffer.position();
            shortcutTargets = new ArrayList<WeightedString>();
            source.readUnsignedShort(); // Skip the size
            buffer.getShort(); // Skip the size
            while (true) {
                final int targetFlags = source.readUnsignedByte();
                final String word = CharEncoding.readString(source);
                final int targetFlags = readUnsignedByte(buffer);
                final String word = CharEncoding.readString(buffer);
                shortcutTargets.add(new WeightedString(word,
                        targetFlags & FLAG_ATTRIBUTE_FREQUENCY));
                if (0 == (targetFlags & FLAG_ATTRIBUTE_HAS_NEXT)) break;
            }
            addressPointer += (source.getFilePointer() - pointerBefore);
            addressPointer += buffer.position() - pointerBefore;
        }
        ArrayList<PendingAttribute> bigrams = null;
        if (0 != (flags & FLAG_HAS_BIGRAMS)) {
            bigrams = new ArrayList<PendingAttribute>();
            while (true) {
                final int bigramFlags = source.readUnsignedByte();
                final int bigramFlags = readUnsignedByte(buffer);
                ++addressPointer;
                final int sign = 0 == (bigramFlags & FLAG_ATTRIBUTE_OFFSET_NEGATIVE) ? 1 : -1;
                int bigramAddress = addressPointer;
                switch (bigramFlags & MASK_ATTRIBUTE_ADDRESS_TYPE) {
                case FLAG_ATTRIBUTE_ADDRESS_TYPE_ONEBYTE:
                    bigramAddress += sign * source.readUnsignedByte();
                    bigramAddress += sign * readUnsignedByte(buffer);
                    addressPointer += 1;
                    break;
                case FLAG_ATTRIBUTE_ADDRESS_TYPE_TWOBYTES:
                    bigramAddress += sign * source.readUnsignedShort();
                    bigramAddress += sign * readUnsignedShort(buffer);
                    addressPointer += 2;
                    break;
                case FLAG_ATTRIBUTE_ADDRESS_TYPE_THREEBYTES:
                    final int offset = ((source.readUnsignedByte() << 16)
                            + source.readUnsignedShort());
                    final int offset = (readUnsignedByte(buffer) << 16)
                            + readUnsignedShort(buffer);
                    bigramAddress += sign * offset;
                    addressPointer += 3;
                    break;
@@ -1188,15 +1190,15 @@ public class BinaryDictInputOutput {
    }

    /**
     * Reads and returns the char group count out of a file and forwards the pointer.
     * Reads and returns the char group count out of a buffer and forwards the pointer.
     */
    private static int readCharGroupCount(RandomAccessFile source) throws IOException {
        final int msb = source.readUnsignedByte();
    private static int readCharGroupCount(final ByteBuffer buffer) {
        final int msb = readUnsignedByte(buffer);
        if (MAX_CHARGROUPS_FOR_ONE_BYTE_CHARGROUP_COUNT >= msb) {
            return msb;
        } else {
            return ((MAX_CHARGROUPS_FOR_ONE_BYTE_CHARGROUP_COUNT & msb) << 8)
                    + source.readUnsignedByte();
                    + readUnsignedByte(buffer);
        }
    }

@@ -1204,31 +1206,29 @@ public class BinaryDictInputOutput {
    // of this method. Since it performs direct, unbuffered random access to the file and
    // may be called hundreds of thousands of times, the resulting performance is not
    // reasonable without some kind of cache. Thus:
    // TODO: perform buffered I/O here and in other places in the code.
    private static TreeMap<Integer, String> wordCache = new TreeMap<Integer, String>();
    /**
     * Finds, as a string, the word at the address passed as an argument.
     *
     * @param source the file to read from.
     * @param buffer the buffer to read from.
     * @param headerSize the size of the header.
     * @param address the address to seek.
     * @return the word, as a string.
     * @throws IOException if the file can't be read.
     */
    private static String getWordAtAddress(final RandomAccessFile source, final long headerSize,
            int address) throws IOException {
    private static String getWordAtAddress(final ByteBuffer buffer, final int headerSize,
            final int address) {
        final String cachedString = wordCache.get(address);
        if (null != cachedString) return cachedString;
        final long originalPointer = source.getFilePointer();
        source.seek(headerSize);
        final int count = readCharGroupCount(source);
        final int originalPointer = buffer.position();
        buffer.position(headerSize);
        final int count = readCharGroupCount(buffer);
        int groupOffset = getGroupCountSize(count);
        final StringBuilder builder = new StringBuilder();
        String result = null;

        CharGroupInfo last = null;
        for (int i = count - 1; i >= 0; --i) {
            CharGroupInfo info = readCharGroup(source, groupOffset);
            CharGroupInfo info = readCharGroup(buffer, groupOffset);
            groupOffset = info.mEndAddress;
            if (info.mOriginalAddress == address) {
                builder.append(new String(info.mCharacters, 0, info.mCharacters.length));
@@ -1239,9 +1239,9 @@ public class BinaryDictInputOutput {
                if (info.mChildrenAddress > address) {
                    if (null == last) continue;
                    builder.append(new String(last.mCharacters, 0, last.mCharacters.length));
                    source.seek(last.mChildrenAddress + headerSize);
                    buffer.position(last.mChildrenAddress + headerSize);
                    groupOffset = last.mChildrenAddress + 1;
                    i = source.readUnsignedByte();
                    i = readUnsignedByte(buffer);
                    last = null;
                    continue;
                }
@@ -1249,14 +1249,14 @@ public class BinaryDictInputOutput {
            }
            if (0 == i && hasChildrenAddress(last.mChildrenAddress)) {
                builder.append(new String(last.mCharacters, 0, last.mCharacters.length));
                source.seek(last.mChildrenAddress + headerSize);
                buffer.position(last.mChildrenAddress + headerSize);
                groupOffset = last.mChildrenAddress + 1;
                i = source.readUnsignedByte();
                i = readUnsignedByte(buffer);
                last = null;
                continue;
            }
        }
        source.seek(originalPointer);
        buffer.position(originalPointer);
        wordCache.put(address, result);
        return result;
    }
@@ -1269,44 +1269,47 @@ public class BinaryDictInputOutput {
     * This will recursively read other nodes into the structure, populating the reverse
     * maps on the fly and using them to keep track of already read nodes.
     *
     * @param source the data file, correctly positioned at the start of a node.
     * @param buffer the buffer, correctly positioned at the start of a node.
     * @param headerSize the size, in bytes, of the file header.
     * @param reverseNodeMap a mapping from addresses to already read nodes.
     * @param reverseGroupMap a mapping from addresses to already read character groups.
     * @return the read node with all his children already read.
     */
    private static Node readNode(RandomAccessFile source, long headerSize,
            Map<Integer, Node> reverseNodeMap, Map<Integer, CharGroup> reverseGroupMap)
    private static Node readNode(final ByteBuffer buffer, final int headerSize,
            final Map<Integer, Node> reverseNodeMap, final Map<Integer, CharGroup> reverseGroupMap)
            throws IOException {
        final int nodeOrigin = (int)(source.getFilePointer() - headerSize);
        final int count = readCharGroupCount(source);
        final int nodeOrigin = buffer.position() - headerSize;
        final int count = readCharGroupCount(buffer);
        final ArrayList<CharGroup> nodeContents = new ArrayList<CharGroup>();
        int groupOffset = nodeOrigin + getGroupCountSize(count);
        for (int i = count; i > 0; --i) {
            CharGroupInfo info = readCharGroup(source, groupOffset);
            CharGroupInfo info =readCharGroup(buffer, groupOffset);
            ArrayList<WeightedString> shortcutTargets = info.mShortcutTargets;
            ArrayList<WeightedString> bigrams = null;
            if (null != info.mBigrams) {
                bigrams = new ArrayList<WeightedString>();
                for (PendingAttribute bigram : info.mBigrams) {
                    final String word = getWordAtAddress(source, headerSize, bigram.mAddress);
                    final String word = getWordAtAddress(
                            buffer, headerSize, bigram.mAddress);
                    bigrams.add(new WeightedString(word, bigram.mFrequency));
                }
            }
            if (hasChildrenAddress(info.mChildrenAddress)) {
                Node children = reverseNodeMap.get(info.mChildrenAddress);
                if (null == children) {
                    final long currentPosition = source.getFilePointer();
                    source.seek(info.mChildrenAddress + headerSize);
                    children = readNode(source, headerSize, reverseNodeMap, reverseGroupMap);
                    source.seek(currentPosition);
                    final int currentPosition = buffer.position();
                    buffer.position(info.mChildrenAddress + headerSize);
                    children = readNode(
                            buffer, headerSize, reverseNodeMap, reverseGroupMap);
                    buffer.position(currentPosition);
                }
                nodeContents.add(
                        new CharGroup(info.mCharacters, shortcutTargets, bigrams, info.mFrequency,
                                children));
                        new CharGroup(info.mCharacters, shortcutTargets,
                                bigrams, info.mFrequency, children));
            } else {
                nodeContents.add(
                        new CharGroup(info.mCharacters, shortcutTargets, bigrams, info.mFrequency));
                        new CharGroup(info.mCharacters, shortcutTargets,
                                bigrams, info.mFrequency));
            }
            groupOffset = info.mEndAddress;
        }
@@ -1318,12 +1321,13 @@ public class BinaryDictInputOutput {

    /**
     * Helper function to get the binary format version from the header.
     * @throws IOException
     */
    private static int getFormatVersion(final RandomAccessFile source) throws IOException {
        final int magic_v1 = source.readUnsignedShort();
        if (VERSION_1_MAGIC_NUMBER == magic_v1) return source.readUnsignedByte();
        final int magic_v2 = (magic_v1 << 16) + source.readUnsignedShort();
        if (VERSION_2_MAGIC_NUMBER == magic_v2) return source.readUnsignedShort();
    private static int getFormatVersion(final ByteBuffer buffer) throws IOException {
        final int magic_v1 = readUnsignedShort(buffer);
        if (VERSION_1_MAGIC_NUMBER == magic_v1) return readUnsignedByte(buffer);
        final int magic_v2 = (magic_v1 << 16) + readUnsignedShort(buffer);
        if (VERSION_2_MAGIC_NUMBER == magic_v2) return readUnsignedShort(buffer);
        return NOT_A_VERSION_NUMBER;
    }

@@ -1333,53 +1337,60 @@ public class BinaryDictInputOutput {
     * The file is read at the current file pointer, so the caller must take care the pointer
     * is in the right place before calling this.
     */
    public static void populateOptionsFromFile(final RandomAccessFile source, final long headerSize,
            final HashMap<String, String> options) throws IOException {
        while (source.getFilePointer() < headerSize) {
            final String key = CharEncoding.readString(source);
            final String value = CharEncoding.readString(source);
    public static void populateOptions(final ByteBuffer buffer, final int headerSize,
            final HashMap<String, String> options) {
        while (buffer.position() < headerSize) {
            final String key = CharEncoding.readString(buffer);
            final String value = CharEncoding.readString(buffer);
            options.put(key, value);
        }
    }

    /**
     * Reads a random access file and returns the memory representation of the dictionary.
     * Reads a byte buffer and returns the memory representation of the dictionary.
     *
     * This high-level method takes a binary file and reads its contents, populating a
     * FusionDictionary structure. The optional dict argument is an existing dictionary to
     * which words from the file should be added. If it is null, a new dictionary is created.
     *
     * @param source the file to read.
     * @param buffer the buffer to read.
     * @param dict an optional dictionary to add words to, or null.
     * @return the created (or merged) dictionary.
     */
    public static FusionDictionary readDictionaryBinary(final RandomAccessFile source,
    public static FusionDictionary readDictionaryBinary(final ByteBuffer buffer,
            final FusionDictionary dict) throws IOException, UnsupportedFormatException {
        // Check file version
        final int version = getFormatVersion(source);
        final int version = getFormatVersion(buffer);
        if (version < MINIMUM_SUPPORTED_VERSION || version > MAXIMUM_SUPPORTED_VERSION) {
            throw new UnsupportedFormatException("This file has version " + version
                    + ", but this implementation does not support versions above "
                    + MAXIMUM_SUPPORTED_VERSION);
        }

        // clear cache
        wordCache.clear();

        // Read options
        final int optionsFlags = source.readUnsignedShort();
        final int optionsFlags = readUnsignedShort(buffer);

        final long headerSize;
        final int headerSize;
        final HashMap<String, String> options = new HashMap<String, String>();
        if (version < FIRST_VERSION_WITH_HEADER_SIZE) {
            headerSize = source.getFilePointer();
            headerSize = buffer.position();
        } else {
            headerSize = (source.readUnsignedByte() << 24) + (source.readUnsignedByte() << 16)
                    + (source.readUnsignedByte() << 8) + source.readUnsignedByte();
            populateOptionsFromFile(source, headerSize, options);
            source.seek(headerSize);
            headerSize = buffer.getInt();
            populateOptions(buffer, headerSize, options);
            buffer.position(headerSize);
        }

        if (headerSize < 0) {
            throw new UnsupportedFormatException("header size can't be negative.");
        }

        Map<Integer, Node> reverseNodeMapping = new TreeMap<Integer, Node>();
        Map<Integer, CharGroup> reverseGroupMapping = new TreeMap<Integer, CharGroup>();
        final Node root = readNode(source, headerSize, reverseNodeMapping, reverseGroupMapping);
        final Node root = readNode(
                buffer, headerSize, reverseNodeMapping, reverseGroupMapping);

        FusionDictionary newDict = new FusionDictionary(root,
                new FusionDictionary.DictionaryOptions(options,
@@ -1402,6 +1413,28 @@ public class BinaryDictInputOutput {
        return newDict;
    }

    /**
     * Helper function to read one byte from ByteBuffer.
     */
    private static int readUnsignedByte(final ByteBuffer buffer) {
        return ((int)buffer.get()) & 0xFF;
    }

    /**
     * Helper function to read two byte from ByteBuffer.
     */
    private static int readUnsignedShort(final ByteBuffer buffer) {
        return ((int)buffer.getShort()) & 0xFFFF;
    }

    /**
     * Helper function to read three byte from ByteBuffer.
     */
    private static int readUnsignedInt24(final ByteBuffer buffer) {
        final int value = readUnsignedByte(buffer) << 16;
        return value + readUnsignedShort(buffer);
    }

    /**
     * Basic test to find out whether the file is a binary dictionary or not.
     *
@@ -1411,14 +1444,26 @@ public class BinaryDictInputOutput {
     * @return true if it's a binary dictionary, false otherwise
     */
    public static boolean isBinaryDictionary(final String filename) {
        FileInputStream inStream = null;
        try {
            RandomAccessFile f = new RandomAccessFile(filename, "r");
            final int version = getFormatVersion(f);
            final File file = new File(filename);
            inStream = new FileInputStream(file);
            final ByteBuffer buffer = inStream.getChannel().map(
                    FileChannel.MapMode.READ_ONLY, 0, file.length());
            final int version = getFormatVersion(buffer);
            return (version >= MINIMUM_SUPPORTED_VERSION && version <= MAXIMUM_SUPPORTED_VERSION);
        } catch (FileNotFoundException e) {
            return false;
        } catch (IOException e) {
            return false;
        } finally {
            if (inStream != null) {
                try {
                    inStream.close();
                } catch (IOException e) {
                    // do nothing
                }
            }
        }
    }