Add a variable-length header region to the binary format. (46a1eec4) · Commits · e / os / android_packages_inputmethods_LatinIME

native/src/bigram_dictionary.cpp

+1 −1

Original line number	Diff line number	Diff line
		@@ -28,7 +28,7 @@ namespace latinime {
		BigramDictionary::BigramDictionary(const unsigned char *dict, int maxWordLength,
		int maxAlternatives, const bool isLatestDictVersion, const bool hasBigram,
		Dictionary *parentDictionary)
		: DICT(dict + NEW_DICTIONARY_HEADER_SIZE), MAX_WORD_LENGTH(maxWordLength),
		: DICT(dict), MAX_WORD_LENGTH(maxWordLength),
		MAX_ALTERNATIVES(maxAlternatives), IS_LATEST_DICT_VERSION(isLatestDictVersion),
		HAS_BIGRAM(hasBigram), mParentDictionary(parentDictionary) {
		if (DEBUG_DICT) {

native/src/binary_format.h

+42 −5

Original line number	Diff line number	Diff line
		@@ -17,6 +17,7 @@
		#ifndef LATINIME_BINARY_FORMAT_H
		#define LATINIME_BINARY_FORMAT_H

		#include <limits>
		#include "unigram_dictionary.h"

		namespace latinime {
		@@ -29,10 +30,18 @@ class BinaryFormat {

		public:
		const static int UNKNOWN_FORMAT = -1;
		const static int FORMAT_VERSION_1 = 1;
		const static uint16_t FORMAT_VERSION_1_MAGIC_NUMBER = 0x78B1;
		// Originally, format version 1 had a 16-bit magic number, then the version number `01'
		// then options that must be 0. Hence the first 32 bits of the format are always as follows,
		// and it's okay to consider them a magic number as a whole.
		const static uint32_t FORMAT_VERSION_1_MAGIC_NUMBER = 0x78B10100;
		const static unsigned int FORMAT_VERSION_1_HEADER_SIZE = 5;
		// The versions of Latin IME that only handle format version 1 only test for the magic
		// number, so we had to change it so that version 2 files would be rejected by older
		// implementations. On this occasion, we made the magic number 32 bits long.
		const static uint32_t FORMAT_VERSION_2_MAGIC_NUMBER = 0x9BC13AFE;

		static int detectFormat(const uint8_t* const dict);
		static unsigned int getHeaderSize(const uint8_t* const dict);
		static int getGroupCountAndForwardPointer(const uint8_t* const dict, int* pos);
		static uint8_t getFlagsAndForwardPointer(const uint8_t* const dict, int* pos);
		static int32_t getCharCodeAndForwardPointer(const uint8_t* const dict, int* pos);
		@@ -55,10 +64,38 @@ class BinaryFormat {
		};

		/**
		 * Identifies the binary dictionary format from the first bytes of the buffer.
		 *
		 * The first four bytes are read as a big-endian 32-bit magic number. No length check is
		 * performed here: the function reads dict[0..3] unconditionally and dict[4..5] when the
		 * version 2 magic number matches.
		 *
		 * @param dict pointer to the start of the dictionary buffer.
		 * @return FORMAT_VERSION_1 for the version 1 magic number, the big-endian 16-bit version
		 *         field stored at bytes 4-5 for the version 2 magic number, or UNKNOWN_FORMAT.
		 */
		inline int BinaryFormat::detectFormat(const uint8_t* const dict) {
		    // The magic number is stored big-endian. Each byte is widened to uint32_t before being
		    // shifted: a bare uint8_t promotes to (signed) int, and moving a byte >= 0x80 -- such as
		    // the 0x9B that opens the version 2 magic number -- into bits 24-31 would shift into
		    // the sign bit.
		    const uint32_t magicNumber = (static_cast<uint32_t>(dict[0]) << 24)
		            | (static_cast<uint32_t>(dict[1]) << 16)
		            | (static_cast<uint32_t>(dict[2]) << 8)
		            | static_cast<uint32_t>(dict[3]);
		    switch (magicNumber) {
		    case FORMAT_VERSION_1_MAGIC_NUMBER:
		        // Format 1 header is exactly 5 bytes long and looks like:
		        // Magic number (2 bytes) 0x78 0xB1
		        // Version number (1 byte) 0x01
		        // Options (2 bytes) must be 0x00 0x00
		        return FORMAT_VERSION_1;
		    case FORMAT_VERSION_2_MAGIC_NUMBER:
		        // Format 2 header is as follows:
		        // Magic number (4 bytes) 0x9B 0xC1 0x3A 0xFE
		        // Version number (2 bytes) 0x00 0x02
		        // Options (2 bytes) must be 0x00 0x00
		        // Header size (4 bytes) : integer, big endian
		        return (dict[4] << 8) + dict[5];
		    default:
		        return UNKNOWN_FORMAT;
		    }
		}

		inline unsigned int BinaryFormat::getHeaderSize(const uint8_t* const dict) {
		switch (detectFormat(dict)) {
		case 1:
		return FORMAT_VERSION_1_HEADER_SIZE;
		case 2:
		// See the format of the header in the comment in detectFormat() above
		return (dict[8] << 24) + (dict[9] << 16) + (dict[10] << 8) + dict[11];
		default:
		return std::numeric_limits<unsigned int>::max();
		}
		}

		inline int BinaryFormat::getGroupCountAndForwardPointer(const uint8_t* const dict, int* pos) {
		const int msb = dict[(*pos)++];

native/src/defines.h

+0 −3

Original line number	Diff line number	Diff line
		@@ -162,9 +162,6 @@ static inline void prof_out(void) {
		#define FLAG_BIGRAM_FREQ 0x7F

		#define DICTIONARY_VERSION_MIN 200
		// TODO: remove this constant when the switch to the new dict format is over
		#define DICTIONARY_HEADER_SIZE 2
		#define NEW_DICTIONARY_HEADER_SIZE 5
		#define NOT_VALID_WORD -99
		#define NOT_A_CHARACTER -1
		#define NOT_A_DISTANCE -1

native/src/dictionary.cpp

+6 −8

Original line number	Diff line number	Diff line
		@@ -19,6 +19,7 @@

		#define LOG_TAG "LatinIME: dictionary.cpp"

		#include "binary_format.h"
		#include "dictionary.h"

		namespace latinime {
		@@ -41,10 +42,11 @@ Dictionary::Dictionary(void *dict, int dictSize, int mmapFd, int dictBufAdjust,
		mCorrection = new Correction(typedLetterMultiplier, fullWordMultiplier);
		mWordsPriorityQueuePool = new WordsPriorityQueuePool(
		maxWords, SUB_QUEUE_MAX_WORDS, maxWordLength);
		mUnigramDictionary = new UnigramDictionary(mDict, typedLetterMultiplier, fullWordMultiplier,
		maxWordLength, maxWords, maxAlternatives, IS_LATEST_DICT_VERSION);
		mBigramDictionary = new BigramDictionary(mDict, maxWordLength, maxAlternatives,
		IS_LATEST_DICT_VERSION, hasBigram(), this);
		const unsigned int headerSize = BinaryFormat::getHeaderSize(mDict);
		mUnigramDictionary = new UnigramDictionary(mDict + headerSize, typedLetterMultiplier,
		fullWordMultiplier, maxWordLength, maxWords, maxAlternatives, IS_LATEST_DICT_VERSION);
		mBigramDictionary = new BigramDictionary(mDict + headerSize, maxWordLength, maxAlternatives,
		IS_LATEST_DICT_VERSION, true /* hasBigram */, this);
		}

		Dictionary::~Dictionary() {
		@@ -54,10 +56,6 @@ Dictionary::~Dictionary() {
		delete mBigramDictionary;
		}

		// Reports whether the second byte of the dictionary buffer (mDict[1]) is exactly 1.
		bool Dictionary::hasBigram() {
		    const int secondByte = mDict[1] & 0xFF;
		    return secondByte == 1;
		}

		// Forwards the lookup to the unigram dictionary and hands back its answer unchanged.
		bool Dictionary::isValidWord(unsigned short *word, int length) {
		    const bool found = mUnigramDictionary->isValidWord(word, length);
		    return found;
		}

native/src/unigram_dictionary.cpp

+1 −2

Original line number	Diff line number	Diff line
		@@ -38,8 +38,7 @@ const UnigramDictionary::digraph_t UnigramDictionary::GERMAN_UMLAUT_DIGRAPHS[] =
		UnigramDictionary::UnigramDictionary(const uint8_t* const streamStart, int typedLetterMultiplier,
		int fullWordMultiplier, int maxWordLength, int maxWords, int maxProximityChars,
		const bool isLatestDictVersion)
		: DICT_ROOT(streamStart + NEW_DICTIONARY_HEADER_SIZE),
		MAX_WORD_LENGTH(maxWordLength), MAX_WORDS(maxWords),
		: DICT_ROOT(streamStart), MAX_WORD_LENGTH(maxWordLength), MAX_WORDS(maxWords),
		MAX_PROXIMITY_CHARS(maxProximityChars), IS_LATEST_DICT_VERSION(isLatestDictVersion),
		TYPED_LETTER_MULTIPLIER(typedLetterMultiplier), FULL_WORD_MULTIPLIER(fullWordMultiplier),
		// TODO : remove this variable.