Loading native/jni/Android.mk +1 −0 Original line number Diff line number Diff line Loading @@ -74,6 +74,7 @@ LATIN_IME_CORE_SRC_FILES := \ dictionary_structure_with_buffer_policy_factory.cpp \ dynamic_patricia_trie_node_reader.cpp \ dynamic_patricia_trie_policy.cpp \ dynamic_patricia_trie_reading_helper.cpp \ dynamic_patricia_trie_reading_utils.cpp \ patricia_trie_policy.cpp \ patricia_trie_reading_utils.cpp) \ Loading native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_policy.cpp +65 −126 Original line number Diff line number Diff line Loading @@ -20,95 +20,68 @@ #include "suggest/core/dicnode/dic_node.h" #include "suggest/core/dicnode/dic_node_vector.h" #include "suggest/policyimpl/dictionary/dynamic_patricia_trie_node_reader.h" #include "suggest/policyimpl/dictionary/dynamic_patricia_trie_reading_helper.h" #include "suggest/policyimpl/dictionary/dynamic_patricia_trie_reading_utils.h" #include "suggest/policyimpl/dictionary/patricia_trie_reading_utils.h" namespace latinime { // To avoid infinite loop caused by invalid or malicious forward links. const int DynamicPatriciaTriePolicy::MAX_CHILD_COUNT_TO_AVOID_INFINITE_LOOP = 100000; void DynamicPatriciaTriePolicy::createAndGetAllChildNodes(const DicNode *const dicNode, DicNodeVector *const childDicNodes) const { if (!dicNode->hasChildren()) { return; } DynamicPatriciaTrieNodeReader nodeReader(mDictRoot, mOriginalDictSize, &mExtendableBuffer, getBigramsStructurePolicy(), getShortcutsStructurePolicy()); int mergedNodeCodePoints[MAX_WORD_LENGTH]; int nextPos = dicNode->getChildrenPos(); int totalChildCount = 0; do { const int childCount = PatriciaTrieReadingUtils::getPtNodeArraySizeAndAdvancePosition( mDictRoot, &nextPos); totalChildCount += childCount; if (childCount <= 0 || totalChildCount > MAX_CHILD_COUNT_TO_AVOID_INFINITE_LOOP) { // Invalid dictionary. AKLOGI("Invalid dictionary. childCount: %d, totalChildCount: %d, MAX: %d", childCount, totalChildCount, MAX_CHILD_COUNT_TO_AVOID_INFINITE_LOOP); ASSERT(false); return; DynamicPatriciaTrieReadingHelper readingHelper(mDictRoot, mOriginalDictSize, &mExtendableBuffer, getBigramsStructurePolicy(), getShortcutsStructurePolicy()); readingHelper.initWithNodeArrayPos(dicNode->getChildrenPos()); const DynamicPatriciaTrieNodeReader *const nodeReader = readingHelper.getNodeReader(); while (!readingHelper.isEnd()) { childDicNodes->pushLeavingChild(dicNode, nodeReader->getNodePos(), nodeReader->getChildrenPos(), nodeReader->getProbability(), nodeReader->isTerminal() && !nodeReader->isDeleted(), nodeReader->hasChildren(), nodeReader->isBlacklisted() || nodeReader->isNotAWord(), nodeReader->getCodePointCount(), readingHelper.getMergedNodeCodePoints()); readingHelper.readNextSiblingNode(); } for (int i = 0; i < childCount; i++) { nodeReader.fetchNodeInfoFromBufferAndGetNodeCodePoints(nextPos, MAX_WORD_LENGTH, mergedNodeCodePoints); if (!nodeReader.isDeleted()) { // Push child node when the node is not a deleted node. childDicNodes->pushLeavingChild(dicNode, nodeReader.getNodePos(), nodeReader.getChildrenPos(), nodeReader.getProbability(), nodeReader.isTerminal(), nodeReader.hasChildren(), nodeReader.isBlacklisted() || nodeReader.isNotAWord(), nodeReader.getCodePointCount(), mergedNodeCodePoints); } nextPos = nodeReader.getSiblingNodePos(); } nextPos = DynamicPatriciaTrieReadingUtils::getForwardLinkPosition(mDictRoot, nextPos); } while (DynamicPatriciaTrieReadingUtils::isValidForwardLinkPosition(nextPos)); } int DynamicPatriciaTriePolicy::getCodePointsAndProbabilityAndReturnCodePointCount( const int nodePos, const int maxCodePointCount, int *const outCodePoints, int *const outUnigramProbability) const { if (nodePos == NOT_A_VALID_WORD_POS) { *outUnigramProbability = NOT_A_PROBABILITY; return 0; } // This method traverses parent nodes from the terminal by following parent pointers; thus, // node code points are stored in the buffer in the reverse order. int reverseCodePoints[maxCodePointCount]; int mergedNodeCodePoints[maxCodePointCount]; int codePointCount = 0; DynamicPatriciaTrieNodeReader nodeReader(mDictRoot, mOriginalDictSize, &mExtendableBuffer, getBigramsStructurePolicy(), getShortcutsStructurePolicy()); // First, read terminal node and get its probability. nodeReader.fetchNodeInfoFromBufferAndGetNodeCodePoints(nodePos, maxCodePointCount, mergedNodeCodePoints); DynamicPatriciaTrieReadingHelper readingHelper(mDictRoot, mOriginalDictSize, &mExtendableBuffer, getBigramsStructurePolicy(), getShortcutsStructurePolicy()); // First, read the terminal node and get its probability. readingHelper.initWithNodePos(nodePos); if (!readingHelper.isValidTerminalNode()) { // Node at the nodePos is not a valid terminal node. *outUnigramProbability = NOT_A_PROBABILITY; return 0; } // Store terminal node probability. *outUnigramProbability = nodeReader.getProbability(); // Store terminal node code points to buffer in the reverse order. for (int i = nodeReader.getCodePointCount() - 1; i >= 0; --i) { reverseCodePoints[codePointCount++] = mergedNodeCodePoints[i]; } // Then, follow parent pos toward the root node. while (nodeReader.getParentPos() != NOT_A_DICT_POS) { // codePointCount must be incremented at least once in each iteration to ensure preventing // infinite loop. if (nodeReader.isDeleted() || codePointCount > maxCodePointCount || nodeReader.getCodePointCount() <= 0) { *outUnigramProbability = readingHelper.getNodeReader()->getProbability(); // Then, following parent node link to the dictionary root and fetch node code points. while (!readingHelper.isEnd()) { if (readingHelper.getTotalCodePointCount() > maxCodePointCount) { // The nodePos is not a valid terminal node position in the dictionary. *outUnigramProbability = NOT_A_PROBABILITY; return 0; } // Read parent node. nodeReader.fetchNodeInfoFromBufferAndGetNodeCodePoints(nodeReader.getParentPos(), maxCodePointCount, mergedNodeCodePoints); // Store node code points to buffer in the reverse order. for (int i = nodeReader.getCodePointCount() - 1; i >= 0; --i) { reverseCodePoints[codePointCount++] = mergedNodeCodePoints[i]; readingHelper.fetchMergedNodeCodePointsInReverseOrder( readingHelper.getPrevTotalCodePointCount(), reverseCodePoints); // Follow parent node toward the root node. readingHelper.readParentNode(); } if (readingHelper.isError()) { // The node position or the dictionary is invalid. *outUnigramProbability = NOT_A_PROBABILITY; return 0; } // Reverse the stored code points to output them. const int codePointCount = readingHelper.getTotalCodePointCount(); for (int i = 0; i < codePointCount; ++i) { outCodePoints[i] = reverseCodePoints[codePointCount - i - 1]; } Loading @@ -121,73 +94,39 @@ int DynamicPatriciaTriePolicy::getTerminalNodePositionOfWord(const int *const in for (int i = 0; i < length; ++i) { searchCodePoints[i] = forceLowerCaseSearch ? CharUtils::toLowerCase(inWord[i]) : inWord[i]; } int mergedNodeCodePoints[MAX_WORD_LENGTH]; int currentLength = 0; int pos = getRootPosition(); DynamicPatriciaTrieNodeReader nodeReader(mDictRoot, mOriginalDictSize, &mExtendableBuffer, getBigramsStructurePolicy(), getShortcutsStructurePolicy()); while (currentLength < length) { // When foundMatchedNode becomes true, currentLength is increased at least once. bool foundMatchedNode = false; int totalChildCount = 0; do { const int childCount = PatriciaTrieReadingUtils::getPtNodeArraySizeAndAdvancePosition( mDictRoot, &pos); totalChildCount += childCount; if (childCount <= 0 || totalChildCount > MAX_CHILD_COUNT_TO_AVOID_INFINITE_LOOP) { // Invalid dictionary. AKLOGI("Invalid dictionary. childCount: %d, totalChildCount: %d, MAX: %d", childCount, totalChildCount, MAX_CHILD_COUNT_TO_AVOID_INFINITE_LOOP); ASSERT(false); return NOT_A_VALID_WORD_POS; } for (int i = 0; i < childCount; i++) { nodeReader.fetchNodeInfoFromBufferAndGetNodeCodePoints(pos, MAX_WORD_LENGTH, mergedNodeCodePoints); const int nodeCodePointCount = nodeReader.getCodePointCount(); if (nodeReader.isDeleted() || nodeCodePointCount <= 0 || currentLength + nodeCodePointCount > length) { // Skip deleted or empty node. pos = nodeReader.getSiblingNodePos(); DynamicPatriciaTrieReadingHelper readingHelper(mDictRoot, mOriginalDictSize, &mExtendableBuffer, getBigramsStructurePolicy(), getShortcutsStructurePolicy()); readingHelper.initWithNodeArrayPos(getRootPosition()); const DynamicPatriciaTrieNodeReader *const nodeReader = readingHelper.getNodeReader(); while (!readingHelper.isEnd()) { const int matchedCodePointCount = readingHelper.getPrevTotalCodePointCount(); if (readingHelper.getTotalCodePointCount() > length || !readingHelper.isMatchedCodePoint(0 /* index */, searchCodePoints[matchedCodePointCount])) { // Current node has too many code points or its first code point is different from // target code point. Skip this node and read the next sibling node. readingHelper.readNextSiblingNode(); continue; } bool matched = true; for (int j = 0; j < nodeCodePointCount; ++j) { if (mergedNodeCodePoints[j] != searchCodePoints[currentLength + j]) { // Different code point is found. matched = false; break; // Check following merged node code points. const int nodeCodePointCount = nodeReader->getCodePointCount(); for (int j = 1; j < nodeCodePointCount; ++j) { if (!readingHelper.isMatchedCodePoint( j, searchCodePoints[matchedCodePointCount + j])) { // Different code point is found. The given word is not included in the dictionary. return NOT_A_VALID_WORD_POS; } } if (matched) { currentLength += nodeCodePointCount; if (length == currentLength) { // All characters are matched. if (length == readingHelper.getTotalCodePointCount()) { // Terminal position is found. return nodeReader.getNodePos(); return nodeReader->getNodePos(); } if (!nodeReader.hasChildren()) { if (!nodeReader->hasChildren()) { return NOT_A_VALID_WORD_POS; } foundMatchedNode = true; // Advance to the children nodes. pos = nodeReader.getChildrenPos(); break; } // Try next sibling node. pos = nodeReader.getSiblingNodePos(); } if (foundMatchedNode) { break; } // If the matched node is not found in the current PtNode array, try to follow the // forward link. pos = DynamicPatriciaTrieReadingUtils::getForwardLinkPosition( mDictRoot, pos); } while (DynamicPatriciaTrieReadingUtils::isValidForwardLinkPosition(pos)); if (!foundMatchedNode) { // Matched node is not found. return NOT_A_VALID_WORD_POS; } readingHelper.readChildNode(); } // If we already traversed the tree further than the word is long, there means // there was no match (or we would have found it). Loading native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_policy.h +0 −1 Original line number Diff line number Diff line Loading @@ -87,7 +87,6 @@ class DynamicPatriciaTriePolicy : public DictionaryStructureWithBufferPolicy { private: DISALLOW_IMPLICIT_CONSTRUCTORS(DynamicPatriciaTriePolicy); static const int MAX_CHILD_COUNT_TO_AVOID_INFINITE_LOOP; const MmappedBuffer *const mBuffer; const ExtendableBuffer mExtendableBuffer; Loading native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_reading_helper.cpp 0 → 100644 +83 −0 Original line number Diff line number Diff line /* * Copyright (C) 2013, The Android Open Source Project * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ #include "suggest/policyimpl/dictionary/dynamic_patricia_trie_reading_helper.h" namespace latinime { // To avoid infinite loop caused by invalid or malicious forward links. const int DynamicPatriciaTrieReadingHelper::MAX_CHILD_COUNT_TO_AVOID_INFINITE_LOOP = 100000; const int DynamicPatriciaTrieReadingHelper::MAX_NODE_ARRAY_COUNT_TO_AVOID_INFINITE_LOOP = 100000; // Read node array size and process empty node arrays. Nodes and arrays are counted up in this // method to avoid an infinite loop. void DynamicPatriciaTrieReadingHelper::nextNodeArray() { const bool usesAdditionalBuffer = mPos >= mOriginalDictSize; const uint8_t *const dictBuf = (usesAdditionalBuffer) ? mExtendableBuffer->getBuffer() : mDictRoot; if (usesAdditionalBuffer) { mPos -= mOriginalDictSize; } mNodeCount = PatriciaTrieReadingUtils::getPtNodeArraySizeAndAdvancePosition(dictBuf, &mPos); if (usesAdditionalBuffer) { mPos += mOriginalDictSize; } // Count up nodes and node arrays to avoid infinite loop. mTotalNodeCount += mNodeCount; mNodeArrayCount++; if (mNodeCount < 0 || mTotalNodeCount > MAX_CHILD_COUNT_TO_AVOID_INFINITE_LOOP || mNodeArrayCount > MAX_NODE_ARRAY_COUNT_TO_AVOID_INFINITE_LOOP) { // Invalid dictionary. AKLOGI("Invalid dictionary. nodeCount: %d, totalNodeCount: %d, MAX_CHILD_COUNT: %d" "nodeArrayCount: %d, MAX_NODE_ARRAY_COUNT: %d", mNodeCount, mTotalNodeCount, MAX_CHILD_COUNT_TO_AVOID_INFINITE_LOOP, mNodeArrayCount, MAX_NODE_ARRAY_COUNT_TO_AVOID_INFINITE_LOOP); ASSERT(false); mIsError = true; mPos = NOT_A_DICT_POS; return; } if (mNodeCount == 0) { // Empty node array. Try following forward link. followForwardLink(); } } // Follow the forward link and read the next node array if exists. void DynamicPatriciaTrieReadingHelper::followForwardLink() { const bool usesAdditionalBuffer = mPos >= mOriginalDictSize; const uint8_t *const dictBuf = (usesAdditionalBuffer) ? mExtendableBuffer->getBuffer() : mDictRoot; if (usesAdditionalBuffer) { mPos -= mOriginalDictSize; } const int forwardLinkPosition = DynamicPatriciaTrieReadingUtils::getForwardLinkPosition(dictBuf, mPos); if (usesAdditionalBuffer) { mPos += mOriginalDictSize; } if (DynamicPatriciaTrieReadingUtils::isValidForwardLinkPosition(forwardLinkPosition)) { // Follow the forward link. mPos = forwardLinkPosition; nextNodeArray(); } else { // All node arrays have been read. mPos = NOT_A_DICT_POS; } } } // namespace latinime native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_reading_helper.h 0 → 100644 +201 −0 Original line number Diff line number Diff line /* * Copyright (C) 2013, The Android Open Source Project * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ #ifndef LATINIME_DYNAMIC_PATRICIA_TRIE_READING_HELPER_H #define LATINIME_DYNAMIC_PATRICIA_TRIE_READING_HELPER_H #include "defines.h" #include "suggest/policyimpl/dictionary/dynamic_patricia_trie_node_reader.h" #include "suggest/policyimpl/dictionary/dynamic_patricia_trie_reading_utils.h" #include "suggest/policyimpl/dictionary/patricia_trie_reading_utils.h" #include "suggest/policyimpl/dictionary/utils/extendable_buffer.h" namespace latinime { /* * This class is used for traversing dynamic patricia trie. This class supports iterating nodes and * dealing with additional buffer. This class counts nodes and node arrays to avoid infinite loop. */ class DynamicPatriciaTrieReadingHelper { public: DynamicPatriciaTrieReadingHelper(const uint8_t *const dictRoot, const int originalDictSize, const ExtendableBuffer *const extendableBuffer, const DictionaryBigramsStructurePolicy *const bigramsPolicy, const DictionaryShortcutsStructurePolicy *const shortcutsPolicy) : mIsError(false), mPos(NOT_A_DICT_POS), mNodeCount(0), mPrevTotalCodePointCount(0), mTotalNodeCount(0), mNodeArrayCount(0), mDictRoot(dictRoot), mOriginalDictSize(originalDictSize), mExtendableBuffer(extendableBuffer), mNodeReader(mDictRoot, mOriginalDictSize, mExtendableBuffer, bigramsPolicy, shortcutsPolicy) {} ~DynamicPatriciaTrieReadingHelper() {} AK_FORCE_INLINE bool isError() const { return mIsError; } AK_FORCE_INLINE bool isEnd() const { return mPos == NOT_A_DICT_POS; } // Initialize reading state with the head position of a node array. AK_FORCE_INLINE void initWithNodeArrayPos(const int nodeArrayPos) { if (nodeArrayPos == NOT_A_DICT_POS) { mPos = NOT_A_DICT_POS; } else { mIsError = false; mPos = nodeArrayPos; mNodeCount = 0; mPrevTotalCodePointCount = 0; mTotalNodeCount = 0; mNodeArrayCount = 0; nextNodeArray(); if (!isEnd()) { fetchNodeInfo(); } } } // Initialize reading state with the head position of a node. AK_FORCE_INLINE void initWithNodePos(const int nodePos) { // TODO: Consolidate NOT_A_VALID_WORD_POS and NOT_A_DICT_POS if (nodePos == NOT_A_VALID_WORD_POS || nodePos == NOT_A_DICT_POS) { mPos = NOT_A_DICT_POS; } else { mIsError = false; mPos = nodePos; mNodeCount = 1; mPrevTotalCodePointCount = 0; mTotalNodeCount = 1; mNodeArrayCount = 1; fetchNodeInfo(); } } AK_FORCE_INLINE const DynamicPatriciaTrieNodeReader* getNodeReader() const { return &mNodeReader; } AK_FORCE_INLINE bool isValidTerminalNode() const { return !isEnd() && !mNodeReader.isDeleted() && mNodeReader.isTerminal(); } AK_FORCE_INLINE bool isMatchedCodePoint(const int index, const int codePoint) const { return mMergedNodeCodePoints[index] == codePoint; } // Return code point count exclude the last read node's code points. AK_FORCE_INLINE int getPrevTotalCodePointCount() const { return mPrevTotalCodePointCount; } // Return code point count include the last read node's code points. AK_FORCE_INLINE int getTotalCodePointCount() const { return mPrevTotalCodePointCount + mNodeReader.getCodePointCount(); } AK_FORCE_INLINE void fetchMergedNodeCodePointsInReverseOrder( const int index, int *const outCodePoints) const { const int nodeCodePointCount = mNodeReader.getCodePointCount(); for (int i = 0; i < nodeCodePointCount; ++i) { outCodePoints[index + i] = mMergedNodeCodePoints[nodeCodePointCount - 1 - i]; } } AK_FORCE_INLINE const int *getMergedNodeCodePoints() const { return mMergedNodeCodePoints; } AK_FORCE_INLINE void readNextSiblingNode() { mNodeCount -= 1; mPos = mNodeReader.getSiblingNodePos(); if (mNodeCount <= 0) { // All nodes in the current node array have been read. followForwardLink(); if (!isEnd()) { fetchNodeInfo(); } } else { fetchNodeInfo(); } } // Read the first child node of the current node. AK_FORCE_INLINE void readChildNode() { if (mNodeReader.hasChildren()) { mPrevTotalCodePointCount += mNodeReader.getCodePointCount(); mTotalNodeCount = 0; mNodeArrayCount = 0; mPos = mNodeReader.getChildrenPos(); // Read children node array. nextNodeArray(); if (!isEnd()) { fetchNodeInfo(); } } else { mPos = NOT_A_DICT_POS; } } // Read the parent node of the current node. AK_FORCE_INLINE void readParentNode() { if (mNodeReader.getParentPos() != NOT_A_DICT_POS) { mPrevTotalCodePointCount += mNodeReader.getCodePointCount(); mTotalNodeCount = 1; mNodeArrayCount = 1; mNodeCount = 1; mPos = mNodeReader.getParentPos(); fetchNodeInfo(); } else { mPos = NOT_A_DICT_POS; } } private: DISALLOW_COPY_AND_ASSIGN(DynamicPatriciaTrieReadingHelper); static const int MAX_CHILD_COUNT_TO_AVOID_INFINITE_LOOP; static const int MAX_NODE_ARRAY_COUNT_TO_AVOID_INFINITE_LOOP; bool mIsError; int mPos; // Node count of a node array. int mNodeCount; int mPrevTotalCodePointCount; int mTotalNodeCount; int mNodeArrayCount; const uint8_t *const mDictRoot; const int mOriginalDictSize; const ExtendableBuffer *const mExtendableBuffer; DynamicPatriciaTrieNodeReader mNodeReader; int mMergedNodeCodePoints[MAX_WORD_LENGTH]; void nextNodeArray(); void followForwardLink(); AK_FORCE_INLINE void fetchNodeInfo() { mNodeReader.fetchNodeInfoFromBufferAndGetNodeCodePoints(mPos, MAX_WORD_LENGTH, mMergedNodeCodePoints); if (mNodeReader.getCodePointCount() <= 0) { // Empty node is not allowed. mIsError = true; mPos = NOT_A_DICT_POS; } } }; } // namespace latinime #endif /* LATINIME_DYNAMIC_PATRICIA_TRIE_READING_HELPER_H */ Loading
native/jni/Android.mk +1 −0 Original line number Diff line number Diff line Loading @@ -74,6 +74,7 @@ LATIN_IME_CORE_SRC_FILES := \ dictionary_structure_with_buffer_policy_factory.cpp \ dynamic_patricia_trie_node_reader.cpp \ dynamic_patricia_trie_policy.cpp \ dynamic_patricia_trie_reading_helper.cpp \ dynamic_patricia_trie_reading_utils.cpp \ patricia_trie_policy.cpp \ patricia_trie_reading_utils.cpp) \ Loading
native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_policy.cpp +65 −126 Original line number Diff line number Diff line Loading @@ -20,95 +20,68 @@ #include "suggest/core/dicnode/dic_node.h" #include "suggest/core/dicnode/dic_node_vector.h" #include "suggest/policyimpl/dictionary/dynamic_patricia_trie_node_reader.h" #include "suggest/policyimpl/dictionary/dynamic_patricia_trie_reading_helper.h" #include "suggest/policyimpl/dictionary/dynamic_patricia_trie_reading_utils.h" #include "suggest/policyimpl/dictionary/patricia_trie_reading_utils.h" namespace latinime { // To avoid infinite loop caused by invalid or malicious forward links. const int DynamicPatriciaTriePolicy::MAX_CHILD_COUNT_TO_AVOID_INFINITE_LOOP = 100000; void DynamicPatriciaTriePolicy::createAndGetAllChildNodes(const DicNode *const dicNode, DicNodeVector *const childDicNodes) const { if (!dicNode->hasChildren()) { return; } DynamicPatriciaTrieNodeReader nodeReader(mDictRoot, mOriginalDictSize, &mExtendableBuffer, getBigramsStructurePolicy(), getShortcutsStructurePolicy()); int mergedNodeCodePoints[MAX_WORD_LENGTH]; int nextPos = dicNode->getChildrenPos(); int totalChildCount = 0; do { const int childCount = PatriciaTrieReadingUtils::getPtNodeArraySizeAndAdvancePosition( mDictRoot, &nextPos); totalChildCount += childCount; if (childCount <= 0 || totalChildCount > MAX_CHILD_COUNT_TO_AVOID_INFINITE_LOOP) { // Invalid dictionary. AKLOGI("Invalid dictionary. childCount: %d, totalChildCount: %d, MAX: %d", childCount, totalChildCount, MAX_CHILD_COUNT_TO_AVOID_INFINITE_LOOP); ASSERT(false); return; DynamicPatriciaTrieReadingHelper readingHelper(mDictRoot, mOriginalDictSize, &mExtendableBuffer, getBigramsStructurePolicy(), getShortcutsStructurePolicy()); readingHelper.initWithNodeArrayPos(dicNode->getChildrenPos()); const DynamicPatriciaTrieNodeReader *const nodeReader = readingHelper.getNodeReader(); while (!readingHelper.isEnd()) { childDicNodes->pushLeavingChild(dicNode, nodeReader->getNodePos(), nodeReader->getChildrenPos(), nodeReader->getProbability(), nodeReader->isTerminal() && !nodeReader->isDeleted(), nodeReader->hasChildren(), nodeReader->isBlacklisted() || nodeReader->isNotAWord(), nodeReader->getCodePointCount(), readingHelper.getMergedNodeCodePoints()); readingHelper.readNextSiblingNode(); } for (int i = 0; i < childCount; i++) { nodeReader.fetchNodeInfoFromBufferAndGetNodeCodePoints(nextPos, MAX_WORD_LENGTH, mergedNodeCodePoints); if (!nodeReader.isDeleted()) { // Push child node when the node is not a deleted node. childDicNodes->pushLeavingChild(dicNode, nodeReader.getNodePos(), nodeReader.getChildrenPos(), nodeReader.getProbability(), nodeReader.isTerminal(), nodeReader.hasChildren(), nodeReader.isBlacklisted() || nodeReader.isNotAWord(), nodeReader.getCodePointCount(), mergedNodeCodePoints); } nextPos = nodeReader.getSiblingNodePos(); } nextPos = DynamicPatriciaTrieReadingUtils::getForwardLinkPosition(mDictRoot, nextPos); } while (DynamicPatriciaTrieReadingUtils::isValidForwardLinkPosition(nextPos)); } int DynamicPatriciaTriePolicy::getCodePointsAndProbabilityAndReturnCodePointCount( const int nodePos, const int maxCodePointCount, int *const outCodePoints, int *const outUnigramProbability) const { if (nodePos == NOT_A_VALID_WORD_POS) { *outUnigramProbability = NOT_A_PROBABILITY; return 0; } // This method traverses parent nodes from the terminal by following parent pointers; thus, // node code points are stored in the buffer in the reverse order. int reverseCodePoints[maxCodePointCount]; int mergedNodeCodePoints[maxCodePointCount]; int codePointCount = 0; DynamicPatriciaTrieNodeReader nodeReader(mDictRoot, mOriginalDictSize, &mExtendableBuffer, getBigramsStructurePolicy(), getShortcutsStructurePolicy()); // First, read terminal node and get its probability. nodeReader.fetchNodeInfoFromBufferAndGetNodeCodePoints(nodePos, maxCodePointCount, mergedNodeCodePoints); DynamicPatriciaTrieReadingHelper readingHelper(mDictRoot, mOriginalDictSize, &mExtendableBuffer, getBigramsStructurePolicy(), getShortcutsStructurePolicy()); // First, read the terminal node and get its probability. readingHelper.initWithNodePos(nodePos); if (!readingHelper.isValidTerminalNode()) { // Node at the nodePos is not a valid terminal node. *outUnigramProbability = NOT_A_PROBABILITY; return 0; } // Store terminal node probability. *outUnigramProbability = nodeReader.getProbability(); // Store terminal node code points to buffer in the reverse order. for (int i = nodeReader.getCodePointCount() - 1; i >= 0; --i) { reverseCodePoints[codePointCount++] = mergedNodeCodePoints[i]; } // Then, follow parent pos toward the root node. while (nodeReader.getParentPos() != NOT_A_DICT_POS) { // codePointCount must be incremented at least once in each iteration to ensure preventing // infinite loop. if (nodeReader.isDeleted() || codePointCount > maxCodePointCount || nodeReader.getCodePointCount() <= 0) { *outUnigramProbability = readingHelper.getNodeReader()->getProbability(); // Then, following parent node link to the dictionary root and fetch node code points. while (!readingHelper.isEnd()) { if (readingHelper.getTotalCodePointCount() > maxCodePointCount) { // The nodePos is not a valid terminal node position in the dictionary. *outUnigramProbability = NOT_A_PROBABILITY; return 0; } // Read parent node. nodeReader.fetchNodeInfoFromBufferAndGetNodeCodePoints(nodeReader.getParentPos(), maxCodePointCount, mergedNodeCodePoints); // Store node code points to buffer in the reverse order. for (int i = nodeReader.getCodePointCount() - 1; i >= 0; --i) { reverseCodePoints[codePointCount++] = mergedNodeCodePoints[i]; readingHelper.fetchMergedNodeCodePointsInReverseOrder( readingHelper.getPrevTotalCodePointCount(), reverseCodePoints); // Follow parent node toward the root node. readingHelper.readParentNode(); } if (readingHelper.isError()) { // The node position or the dictionary is invalid. *outUnigramProbability = NOT_A_PROBABILITY; return 0; } // Reverse the stored code points to output them. const int codePointCount = readingHelper.getTotalCodePointCount(); for (int i = 0; i < codePointCount; ++i) { outCodePoints[i] = reverseCodePoints[codePointCount - i - 1]; } Loading @@ -121,73 +94,39 @@ int DynamicPatriciaTriePolicy::getTerminalNodePositionOfWord(const int *const in for (int i = 0; i < length; ++i) { searchCodePoints[i] = forceLowerCaseSearch ? CharUtils::toLowerCase(inWord[i]) : inWord[i]; } int mergedNodeCodePoints[MAX_WORD_LENGTH]; int currentLength = 0; int pos = getRootPosition(); DynamicPatriciaTrieNodeReader nodeReader(mDictRoot, mOriginalDictSize, &mExtendableBuffer, getBigramsStructurePolicy(), getShortcutsStructurePolicy()); while (currentLength < length) { // When foundMatchedNode becomes true, currentLength is increased at least once. bool foundMatchedNode = false; int totalChildCount = 0; do { const int childCount = PatriciaTrieReadingUtils::getPtNodeArraySizeAndAdvancePosition( mDictRoot, &pos); totalChildCount += childCount; if (childCount <= 0 || totalChildCount > MAX_CHILD_COUNT_TO_AVOID_INFINITE_LOOP) { // Invalid dictionary. AKLOGI("Invalid dictionary. childCount: %d, totalChildCount: %d, MAX: %d", childCount, totalChildCount, MAX_CHILD_COUNT_TO_AVOID_INFINITE_LOOP); ASSERT(false); return NOT_A_VALID_WORD_POS; } for (int i = 0; i < childCount; i++) { nodeReader.fetchNodeInfoFromBufferAndGetNodeCodePoints(pos, MAX_WORD_LENGTH, mergedNodeCodePoints); const int nodeCodePointCount = nodeReader.getCodePointCount(); if (nodeReader.isDeleted() || nodeCodePointCount <= 0 || currentLength + nodeCodePointCount > length) { // Skip deleted or empty node. pos = nodeReader.getSiblingNodePos(); DynamicPatriciaTrieReadingHelper readingHelper(mDictRoot, mOriginalDictSize, &mExtendableBuffer, getBigramsStructurePolicy(), getShortcutsStructurePolicy()); readingHelper.initWithNodeArrayPos(getRootPosition()); const DynamicPatriciaTrieNodeReader *const nodeReader = readingHelper.getNodeReader(); while (!readingHelper.isEnd()) { const int matchedCodePointCount = readingHelper.getPrevTotalCodePointCount(); if (readingHelper.getTotalCodePointCount() > length || !readingHelper.isMatchedCodePoint(0 /* index */, searchCodePoints[matchedCodePointCount])) { // Current node has too many code points or its first code point is different from // target code point. Skip this node and read the next sibling node. readingHelper.readNextSiblingNode(); continue; } bool matched = true; for (int j = 0; j < nodeCodePointCount; ++j) { if (mergedNodeCodePoints[j] != searchCodePoints[currentLength + j]) { // Different code point is found. matched = false; break; // Check following merged node code points. const int nodeCodePointCount = nodeReader->getCodePointCount(); for (int j = 1; j < nodeCodePointCount; ++j) { if (!readingHelper.isMatchedCodePoint( j, searchCodePoints[matchedCodePointCount + j])) { // Different code point is found. The given word is not included in the dictionary. return NOT_A_VALID_WORD_POS; } } if (matched) { currentLength += nodeCodePointCount; if (length == currentLength) { // All characters are matched. if (length == readingHelper.getTotalCodePointCount()) { // Terminal position is found. return nodeReader.getNodePos(); return nodeReader->getNodePos(); } if (!nodeReader.hasChildren()) { if (!nodeReader->hasChildren()) { return NOT_A_VALID_WORD_POS; } foundMatchedNode = true; // Advance to the children nodes. pos = nodeReader.getChildrenPos(); break; } // Try next sibling node. pos = nodeReader.getSiblingNodePos(); } if (foundMatchedNode) { break; } // If the matched node is not found in the current PtNode array, try to follow the // forward link. pos = DynamicPatriciaTrieReadingUtils::getForwardLinkPosition( mDictRoot, pos); } while (DynamicPatriciaTrieReadingUtils::isValidForwardLinkPosition(pos)); if (!foundMatchedNode) { // Matched node is not found. return NOT_A_VALID_WORD_POS; } readingHelper.readChildNode(); } // If we already traversed the tree further than the word is long, there means // there was no match (or we would have found it). Loading
native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_policy.h +0 −1 Original line number Diff line number Diff line Loading @@ -87,7 +87,6 @@ class DynamicPatriciaTriePolicy : public DictionaryStructureWithBufferPolicy { private: DISALLOW_IMPLICIT_CONSTRUCTORS(DynamicPatriciaTriePolicy); static const int MAX_CHILD_COUNT_TO_AVOID_INFINITE_LOOP; const MmappedBuffer *const mBuffer; const ExtendableBuffer mExtendableBuffer; Loading
native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_reading_helper.cpp 0 → 100644 +83 −0 Original line number Diff line number Diff line /* * Copyright (C) 2013, The Android Open Source Project * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ #include "suggest/policyimpl/dictionary/dynamic_patricia_trie_reading_helper.h" namespace latinime { // To avoid infinite loop caused by invalid or malicious forward links. const int DynamicPatriciaTrieReadingHelper::MAX_CHILD_COUNT_TO_AVOID_INFINITE_LOOP = 100000; const int DynamicPatriciaTrieReadingHelper::MAX_NODE_ARRAY_COUNT_TO_AVOID_INFINITE_LOOP = 100000; // Read node array size and process empty node arrays. Nodes and arrays are counted up in this // method to avoid an infinite loop. void DynamicPatriciaTrieReadingHelper::nextNodeArray() { const bool usesAdditionalBuffer = mPos >= mOriginalDictSize; const uint8_t *const dictBuf = (usesAdditionalBuffer) ? mExtendableBuffer->getBuffer() : mDictRoot; if (usesAdditionalBuffer) { mPos -= mOriginalDictSize; } mNodeCount = PatriciaTrieReadingUtils::getPtNodeArraySizeAndAdvancePosition(dictBuf, &mPos); if (usesAdditionalBuffer) { mPos += mOriginalDictSize; } // Count up nodes and node arrays to avoid infinite loop. mTotalNodeCount += mNodeCount; mNodeArrayCount++; if (mNodeCount < 0 || mTotalNodeCount > MAX_CHILD_COUNT_TO_AVOID_INFINITE_LOOP || mNodeArrayCount > MAX_NODE_ARRAY_COUNT_TO_AVOID_INFINITE_LOOP) { // Invalid dictionary. AKLOGI("Invalid dictionary. nodeCount: %d, totalNodeCount: %d, MAX_CHILD_COUNT: %d" "nodeArrayCount: %d, MAX_NODE_ARRAY_COUNT: %d", mNodeCount, mTotalNodeCount, MAX_CHILD_COUNT_TO_AVOID_INFINITE_LOOP, mNodeArrayCount, MAX_NODE_ARRAY_COUNT_TO_AVOID_INFINITE_LOOP); ASSERT(false); mIsError = true; mPos = NOT_A_DICT_POS; return; } if (mNodeCount == 0) { // Empty node array. Try following forward link. followForwardLink(); } } // Follow the forward link and read the next node array if exists. void DynamicPatriciaTrieReadingHelper::followForwardLink() { const bool usesAdditionalBuffer = mPos >= mOriginalDictSize; const uint8_t *const dictBuf = (usesAdditionalBuffer) ? mExtendableBuffer->getBuffer() : mDictRoot; if (usesAdditionalBuffer) { mPos -= mOriginalDictSize; } const int forwardLinkPosition = DynamicPatriciaTrieReadingUtils::getForwardLinkPosition(dictBuf, mPos); if (usesAdditionalBuffer) { mPos += mOriginalDictSize; } if (DynamicPatriciaTrieReadingUtils::isValidForwardLinkPosition(forwardLinkPosition)) { // Follow the forward link. mPos = forwardLinkPosition; nextNodeArray(); } else { // All node arrays have been read. mPos = NOT_A_DICT_POS; } } } // namespace latinime
native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_reading_helper.h 0 → 100644 +201 −0 Original line number Diff line number Diff line /* * Copyright (C) 2013, The Android Open Source Project * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ #ifndef LATINIME_DYNAMIC_PATRICIA_TRIE_READING_HELPER_H #define LATINIME_DYNAMIC_PATRICIA_TRIE_READING_HELPER_H #include "defines.h" #include "suggest/policyimpl/dictionary/dynamic_patricia_trie_node_reader.h" #include "suggest/policyimpl/dictionary/dynamic_patricia_trie_reading_utils.h" #include "suggest/policyimpl/dictionary/patricia_trie_reading_utils.h" #include "suggest/policyimpl/dictionary/utils/extendable_buffer.h" namespace latinime { /* * This class is used for traversing dynamic patricia trie. This class supports iterating nodes and * dealing with additional buffer. This class counts nodes and node arrays to avoid infinite loop. */ class DynamicPatriciaTrieReadingHelper { public: DynamicPatriciaTrieReadingHelper(const uint8_t *const dictRoot, const int originalDictSize, const ExtendableBuffer *const extendableBuffer, const DictionaryBigramsStructurePolicy *const bigramsPolicy, const DictionaryShortcutsStructurePolicy *const shortcutsPolicy) : mIsError(false), mPos(NOT_A_DICT_POS), mNodeCount(0), mPrevTotalCodePointCount(0), mTotalNodeCount(0), mNodeArrayCount(0), mDictRoot(dictRoot), mOriginalDictSize(originalDictSize), mExtendableBuffer(extendableBuffer), mNodeReader(mDictRoot, mOriginalDictSize, mExtendableBuffer, bigramsPolicy, shortcutsPolicy) {} ~DynamicPatriciaTrieReadingHelper() {} AK_FORCE_INLINE bool isError() const { return mIsError; } AK_FORCE_INLINE bool isEnd() const { return mPos == NOT_A_DICT_POS; } // Initialize reading state with the head position of a node array. AK_FORCE_INLINE void initWithNodeArrayPos(const int nodeArrayPos) { if (nodeArrayPos == NOT_A_DICT_POS) { mPos = NOT_A_DICT_POS; } else { mIsError = false; mPos = nodeArrayPos; mNodeCount = 0; mPrevTotalCodePointCount = 0; mTotalNodeCount = 0; mNodeArrayCount = 0; nextNodeArray(); if (!isEnd()) { fetchNodeInfo(); } } } // Initialize reading state with the head position of a node. AK_FORCE_INLINE void initWithNodePos(const int nodePos) { // TODO: Consolidate NOT_A_VALID_WORD_POS and NOT_A_DICT_POS if (nodePos == NOT_A_VALID_WORD_POS || nodePos == NOT_A_DICT_POS) { mPos = NOT_A_DICT_POS; } else { mIsError = false; mPos = nodePos; mNodeCount = 1; mPrevTotalCodePointCount = 0; mTotalNodeCount = 1; mNodeArrayCount = 1; fetchNodeInfo(); } } AK_FORCE_INLINE const DynamicPatriciaTrieNodeReader* getNodeReader() const { return &mNodeReader; } AK_FORCE_INLINE bool isValidTerminalNode() const { return !isEnd() && !mNodeReader.isDeleted() && mNodeReader.isTerminal(); } AK_FORCE_INLINE bool isMatchedCodePoint(const int index, const int codePoint) const { return mMergedNodeCodePoints[index] == codePoint; } // Return code point count exclude the last read node's code points. AK_FORCE_INLINE int getPrevTotalCodePointCount() const { return mPrevTotalCodePointCount; } // Return code point count include the last read node's code points. AK_FORCE_INLINE int getTotalCodePointCount() const { return mPrevTotalCodePointCount + mNodeReader.getCodePointCount(); } AK_FORCE_INLINE void fetchMergedNodeCodePointsInReverseOrder( const int index, int *const outCodePoints) const { const int nodeCodePointCount = mNodeReader.getCodePointCount(); for (int i = 0; i < nodeCodePointCount; ++i) { outCodePoints[index + i] = mMergedNodeCodePoints[nodeCodePointCount - 1 - i]; } } AK_FORCE_INLINE const int *getMergedNodeCodePoints() const { return mMergedNodeCodePoints; } AK_FORCE_INLINE void readNextSiblingNode() { mNodeCount -= 1; mPos = mNodeReader.getSiblingNodePos(); if (mNodeCount <= 0) { // All nodes in the current node array have been read. followForwardLink(); if (!isEnd()) { fetchNodeInfo(); } } else { fetchNodeInfo(); } } // Read the first child node of the current node. AK_FORCE_INLINE void readChildNode() { if (mNodeReader.hasChildren()) { mPrevTotalCodePointCount += mNodeReader.getCodePointCount(); mTotalNodeCount = 0; mNodeArrayCount = 0; mPos = mNodeReader.getChildrenPos(); // Read children node array. nextNodeArray(); if (!isEnd()) { fetchNodeInfo(); } } else { mPos = NOT_A_DICT_POS; } } // Read the parent node of the current node. AK_FORCE_INLINE void readParentNode() { if (mNodeReader.getParentPos() != NOT_A_DICT_POS) { mPrevTotalCodePointCount += mNodeReader.getCodePointCount(); mTotalNodeCount = 1; mNodeArrayCount = 1; mNodeCount = 1; mPos = mNodeReader.getParentPos(); fetchNodeInfo(); } else { mPos = NOT_A_DICT_POS; } } private: DISALLOW_COPY_AND_ASSIGN(DynamicPatriciaTrieReadingHelper); static const int MAX_CHILD_COUNT_TO_AVOID_INFINITE_LOOP; static const int MAX_NODE_ARRAY_COUNT_TO_AVOID_INFINITE_LOOP; bool mIsError; int mPos; // Node count of a node array. int mNodeCount; int mPrevTotalCodePointCount; int mTotalNodeCount; int mNodeArrayCount; const uint8_t *const mDictRoot; const int mOriginalDictSize; const ExtendableBuffer *const mExtendableBuffer; DynamicPatriciaTrieNodeReader mNodeReader; int mMergedNodeCodePoints[MAX_WORD_LENGTH]; void nextNodeArray(); void followForwardLink(); AK_FORCE_INLINE void fetchNodeInfo() { mNodeReader.fetchNodeInfoFromBufferAndGetNodeCodePoints(mPos, MAX_WORD_LENGTH, mMergedNodeCodePoints); if (mNodeReader.getCodePointCount() <= 0) { // Empty node is not allowed. mIsError = true; mPos = NOT_A_DICT_POS; } } }; } // namespace latinime #endif /* LATINIME_DYNAMIC_PATRICIA_TRIE_READING_HELPER_H */