Loading native/src/bigram_dictionary.cpp +1 −1 Original line number Diff line number Diff line Loading @@ -28,7 +28,7 @@ namespace latinime { BigramDictionary::BigramDictionary(const unsigned char *dict, int maxWordLength, int maxAlternatives, const bool isLatestDictVersion, const bool hasBigram, Dictionary *parentDictionary) : DICT(dict + NEW_DICTIONARY_HEADER_SIZE), MAX_WORD_LENGTH(maxWordLength), : DICT(dict), MAX_WORD_LENGTH(maxWordLength), MAX_ALTERNATIVES(maxAlternatives), IS_LATEST_DICT_VERSION(isLatestDictVersion), HAS_BIGRAM(hasBigram), mParentDictionary(parentDictionary) { if (DEBUG_DICT) { Loading native/src/binary_format.h +42 −5 Original line number Diff line number Diff line Loading @@ -17,6 +17,7 @@ #ifndef LATINIME_BINARY_FORMAT_H #define LATINIME_BINARY_FORMAT_H #include <limits> #include "unigram_dictionary.h" namespace latinime { Loading @@ -29,10 +30,18 @@ class BinaryFormat { public: const static int UNKNOWN_FORMAT = -1; const static int FORMAT_VERSION_1 = 1; const static uint16_t FORMAT_VERSION_1_MAGIC_NUMBER = 0x78B1; // Originally, format version 1 had a 16-bit magic number, then the version number `01' // then options that must be 0. Hence the first 32-bits of the format are always as follow // and it's okay to consider them a magic number as a whole. const static uint32_t FORMAT_VERSION_1_MAGIC_NUMBER = 0x78B10100; const static unsigned int FORMAT_VERSION_1_HEADER_SIZE = 5; // The versions of Latin IME that only handle format version 1 only test for the magic // number, so we had to change it so that version 2 files would be rejected by older // implementations. On this occasion, we made the magic number 32 bits long. const static uint32_t FORMAT_VERSION_2_MAGIC_NUMBER = 0x9BC13AFE; static int detectFormat(const uint8_t* const dict); static unsigned int getHeaderSize(const uint8_t* const dict); static int getGroupCountAndForwardPointer(const uint8_t* const dict, int* pos); static uint8_t getFlagsAndForwardPointer(const uint8_t* const dict, int* pos); static int32_t getCharCodeAndForwardPointer(const uint8_t* const dict, int* pos); Loading @@ -55,10 +64,38 @@ class BinaryFormat { }; inline int BinaryFormat::detectFormat(const uint8_t* const dict) { const uint16_t magicNumber = (dict[0] << 8) + dict[1]; // big endian if (FORMAT_VERSION_1_MAGIC_NUMBER == magicNumber) return FORMAT_VERSION_1; // The magic number is stored big-endian. const uint32_t magicNumber = (dict[0] << 24) + (dict[1] << 16) + (dict[2] << 8) + dict[3]; switch (magicNumber) { case FORMAT_VERSION_1_MAGIC_NUMBER: // Format 1 header is exactly 5 bytes long and looks like: // Magic number (2 bytes) 0x78 0xB1 // Version number (1 byte) 0x01 // Options (2 bytes) must be 0x00 0x00 return 1; case FORMAT_VERSION_2_MAGIC_NUMBER: // Format 2 header is as follows: // Magic number (4 bytes) 0x9B 0xC1 0x3A 0xFE // Version number (2 bytes) 0x00 0x02 // Options (2 bytes) must be 0x00 0x00 // Header size (4 bytes) : integer, big endian return (dict[4] << 8) + dict[5]; default: return UNKNOWN_FORMAT; } } inline unsigned int BinaryFormat::getHeaderSize(const uint8_t* const dict) { switch (detectFormat(dict)) { case 1: return FORMAT_VERSION_1_HEADER_SIZE; case 2: // See the format of the header in the comment in detectFormat() above return (dict[8] << 24) + (dict[9] << 16) + (dict[10] << 8) + dict[11]; default: return std::numeric_limits<unsigned int>::max(); } } inline int BinaryFormat::getGroupCountAndForwardPointer(const uint8_t* const dict, int* pos) { const int msb = dict[(*pos)++]; Loading native/src/defines.h +0 −3 Original line number Diff line number Diff line Loading @@ -162,9 +162,6 @@ static inline void prof_out(void) { #define FLAG_BIGRAM_FREQ 0x7F #define DICTIONARY_VERSION_MIN 200 // TODO: remove this constant when the switch to the new dict format is over #define DICTIONARY_HEADER_SIZE 2 #define NEW_DICTIONARY_HEADER_SIZE 5 #define NOT_VALID_WORD -99 #define NOT_A_CHARACTER -1 #define NOT_A_DISTANCE -1 Loading native/src/dictionary.cpp +6 −8 Original line number Diff line number Diff line Loading @@ -19,6 +19,7 @@ #define LOG_TAG "LatinIME: dictionary.cpp" #include "binary_format.h" #include "dictionary.h" namespace latinime { Loading @@ -41,10 +42,11 @@ Dictionary::Dictionary(void *dict, int dictSize, int mmapFd, int dictBufAdjust, mCorrection = new Correction(typedLetterMultiplier, fullWordMultiplier); mWordsPriorityQueuePool = new WordsPriorityQueuePool( maxWords, SUB_QUEUE_MAX_WORDS, maxWordLength); mUnigramDictionary = new UnigramDictionary(mDict, typedLetterMultiplier, fullWordMultiplier, maxWordLength, maxWords, maxAlternatives, IS_LATEST_DICT_VERSION); mBigramDictionary = new BigramDictionary(mDict, maxWordLength, maxAlternatives, IS_LATEST_DICT_VERSION, hasBigram(), this); const unsigned int headerSize = BinaryFormat::getHeaderSize(mDict); mUnigramDictionary = new UnigramDictionary(mDict + headerSize, typedLetterMultiplier, fullWordMultiplier, maxWordLength, maxWords, maxAlternatives, IS_LATEST_DICT_VERSION); mBigramDictionary = new BigramDictionary(mDict + headerSize, maxWordLength, maxAlternatives, IS_LATEST_DICT_VERSION, true /* hasBigram */, this); } Dictionary::~Dictionary() { Loading @@ -54,10 +56,6 @@ Dictionary::~Dictionary() { delete mBigramDictionary; } bool Dictionary::hasBigram() { return ((mDict[1] & 0xFF) == 1); } bool Dictionary::isValidWord(unsigned short *word, int length) { return mUnigramDictionary->isValidWord(word, length); } Loading native/src/unigram_dictionary.cpp +1 −2 Original line number Diff line number Diff line Loading @@ -38,8 +38,7 @@ const UnigramDictionary::digraph_t UnigramDictionary::GERMAN_UMLAUT_DIGRAPHS[] = UnigramDictionary::UnigramDictionary(const uint8_t* const streamStart, int typedLetterMultiplier, int fullWordMultiplier, int maxWordLength, int maxWords, int maxProximityChars, const bool isLatestDictVersion) : DICT_ROOT(streamStart + NEW_DICTIONARY_HEADER_SIZE), MAX_WORD_LENGTH(maxWordLength), MAX_WORDS(maxWords), : DICT_ROOT(streamStart), MAX_WORD_LENGTH(maxWordLength), MAX_WORDS(maxWords), MAX_PROXIMITY_CHARS(maxProximityChars), IS_LATEST_DICT_VERSION(isLatestDictVersion), TYPED_LETTER_MULTIPLIER(typedLetterMultiplier), FULL_WORD_MULTIPLIER(fullWordMultiplier), // TODO : remove this variable. Loading Loading
native/src/bigram_dictionary.cpp +1 −1 Original line number Diff line number Diff line Loading @@ -28,7 +28,7 @@ namespace latinime { BigramDictionary::BigramDictionary(const unsigned char *dict, int maxWordLength, int maxAlternatives, const bool isLatestDictVersion, const bool hasBigram, Dictionary *parentDictionary) : DICT(dict + NEW_DICTIONARY_HEADER_SIZE), MAX_WORD_LENGTH(maxWordLength), : DICT(dict), MAX_WORD_LENGTH(maxWordLength), MAX_ALTERNATIVES(maxAlternatives), IS_LATEST_DICT_VERSION(isLatestDictVersion), HAS_BIGRAM(hasBigram), mParentDictionary(parentDictionary) { if (DEBUG_DICT) { Loading
native/src/binary_format.h +42 −5 Original line number Diff line number Diff line Loading @@ -17,6 +17,7 @@ #ifndef LATINIME_BINARY_FORMAT_H #define LATINIME_BINARY_FORMAT_H #include <limits> #include "unigram_dictionary.h" namespace latinime { Loading @@ -29,10 +30,18 @@ class BinaryFormat { public: const static int UNKNOWN_FORMAT = -1; const static int FORMAT_VERSION_1 = 1; const static uint16_t FORMAT_VERSION_1_MAGIC_NUMBER = 0x78B1; // Originally, format version 1 had a 16-bit magic number, then the version number `01' // then options that must be 0. Hence the first 32-bits of the format are always as follow // and it's okay to consider them a magic number as a whole. const static uint32_t FORMAT_VERSION_1_MAGIC_NUMBER = 0x78B10100; const static unsigned int FORMAT_VERSION_1_HEADER_SIZE = 5; // The versions of Latin IME that only handle format version 1 only test for the magic // number, so we had to change it so that version 2 files would be rejected by older // implementations. On this occasion, we made the magic number 32 bits long. const static uint32_t FORMAT_VERSION_2_MAGIC_NUMBER = 0x9BC13AFE; static int detectFormat(const uint8_t* const dict); static unsigned int getHeaderSize(const uint8_t* const dict); static int getGroupCountAndForwardPointer(const uint8_t* const dict, int* pos); static uint8_t getFlagsAndForwardPointer(const uint8_t* const dict, int* pos); static int32_t getCharCodeAndForwardPointer(const uint8_t* const dict, int* pos); Loading @@ -55,10 +64,38 @@ class BinaryFormat { }; inline int BinaryFormat::detectFormat(const uint8_t* const dict) { const uint16_t magicNumber = (dict[0] << 8) + dict[1]; // big endian if (FORMAT_VERSION_1_MAGIC_NUMBER == magicNumber) return FORMAT_VERSION_1; // The magic number is stored big-endian. const uint32_t magicNumber = (dict[0] << 24) + (dict[1] << 16) + (dict[2] << 8) + dict[3]; switch (magicNumber) { case FORMAT_VERSION_1_MAGIC_NUMBER: // Format 1 header is exactly 5 bytes long and looks like: // Magic number (2 bytes) 0x78 0xB1 // Version number (1 byte) 0x01 // Options (2 bytes) must be 0x00 0x00 return 1; case FORMAT_VERSION_2_MAGIC_NUMBER: // Format 2 header is as follows: // Magic number (4 bytes) 0x9B 0xC1 0x3A 0xFE // Version number (2 bytes) 0x00 0x02 // Options (2 bytes) must be 0x00 0x00 // Header size (4 bytes) : integer, big endian return (dict[4] << 8) + dict[5]; default: return UNKNOWN_FORMAT; } } inline unsigned int BinaryFormat::getHeaderSize(const uint8_t* const dict) { switch (detectFormat(dict)) { case 1: return FORMAT_VERSION_1_HEADER_SIZE; case 2: // See the format of the header in the comment in detectFormat() above return (dict[8] << 24) + (dict[9] << 16) + (dict[10] << 8) + dict[11]; default: return std::numeric_limits<unsigned int>::max(); } } inline int BinaryFormat::getGroupCountAndForwardPointer(const uint8_t* const dict, int* pos) { const int msb = dict[(*pos)++]; Loading
native/src/defines.h +0 −3 Original line number Diff line number Diff line Loading @@ -162,9 +162,6 @@ static inline void prof_out(void) { #define FLAG_BIGRAM_FREQ 0x7F #define DICTIONARY_VERSION_MIN 200 // TODO: remove this constant when the switch to the new dict format is over #define DICTIONARY_HEADER_SIZE 2 #define NEW_DICTIONARY_HEADER_SIZE 5 #define NOT_VALID_WORD -99 #define NOT_A_CHARACTER -1 #define NOT_A_DISTANCE -1 Loading
native/src/dictionary.cpp +6 −8 Original line number Diff line number Diff line Loading @@ -19,6 +19,7 @@ #define LOG_TAG "LatinIME: dictionary.cpp" #include "binary_format.h" #include "dictionary.h" namespace latinime { Loading @@ -41,10 +42,11 @@ Dictionary::Dictionary(void *dict, int dictSize, int mmapFd, int dictBufAdjust, mCorrection = new Correction(typedLetterMultiplier, fullWordMultiplier); mWordsPriorityQueuePool = new WordsPriorityQueuePool( maxWords, SUB_QUEUE_MAX_WORDS, maxWordLength); mUnigramDictionary = new UnigramDictionary(mDict, typedLetterMultiplier, fullWordMultiplier, maxWordLength, maxWords, maxAlternatives, IS_LATEST_DICT_VERSION); mBigramDictionary = new BigramDictionary(mDict, maxWordLength, maxAlternatives, IS_LATEST_DICT_VERSION, hasBigram(), this); const unsigned int headerSize = BinaryFormat::getHeaderSize(mDict); mUnigramDictionary = new UnigramDictionary(mDict + headerSize, typedLetterMultiplier, fullWordMultiplier, maxWordLength, maxWords, maxAlternatives, IS_LATEST_DICT_VERSION); mBigramDictionary = new BigramDictionary(mDict + headerSize, maxWordLength, maxAlternatives, IS_LATEST_DICT_VERSION, true /* hasBigram */, this); } Dictionary::~Dictionary() { Loading @@ -54,10 +56,6 @@ Dictionary::~Dictionary() { delete mBigramDictionary; } bool Dictionary::hasBigram() { return ((mDict[1] & 0xFF) == 1); } bool Dictionary::isValidWord(unsigned short *word, int length) { return mUnigramDictionary->isValidWord(word, length); } Loading
native/src/unigram_dictionary.cpp +1 −2 Original line number Diff line number Diff line Loading @@ -38,8 +38,7 @@ const UnigramDictionary::digraph_t UnigramDictionary::GERMAN_UMLAUT_DIGRAPHS[] = UnigramDictionary::UnigramDictionary(const uint8_t* const streamStart, int typedLetterMultiplier, int fullWordMultiplier, int maxWordLength, int maxWords, int maxProximityChars, const bool isLatestDictVersion) : DICT_ROOT(streamStart + NEW_DICTIONARY_HEADER_SIZE), MAX_WORD_LENGTH(maxWordLength), MAX_WORDS(maxWords), : DICT_ROOT(streamStart), MAX_WORD_LENGTH(maxWordLength), MAX_WORDS(maxWords), MAX_PROXIMITY_CHARS(maxProximityChars), IS_LATEST_DICT_VERSION(isLatestDictVersion), TYPED_LETTER_MULTIPLIER(typedLetterMultiplier), FULL_WORD_MULTIPLIER(fullWordMultiplier), // TODO : remove this variable. Loading