Loading tools/makedict/src/com/android/tools/dict/MakeBinaryDictionary.java +118 −116 Original line number Diff line number Diff line Loading @@ -41,18 +41,17 @@ import javax.xml.parsers.SAXParserFactory; * in the data. There is no need to increase the version when only the words in the data changes. */ public class MakeBinaryDictionary { private static final int VERSION_NUM = 200; public static final int ALPHA_SIZE = 256; public static final String TAG_WORD = "w"; public static final String ATTR_FREQ = "f"; private static final String TAG_WORD = "w"; private static final String ATTR_FREQ = "f"; private static final int FLAG_ADDRESS_MASK = 0x400000; private static final int FLAG_TERMINAL_MASK = 0x800000; private static final int ADDRESS_MASK = 0x3FFFFF; private static final int INITIAL_STRING_BUILDER_CAPACITY = 48; /** * Unit for this variable is in bytes * If destination file name is main.dict and file limit causes dictionary to be separated into Loading @@ -61,15 +60,15 @@ public class MakeBinaryDictionary { private static int sOutputFileSize; private static boolean sSplitOutput; public static final CharNode EMPTY_NODE = new CharNode(); private static final CharNode EMPTY_NODE = new CharNode(); List<CharNode> roots; Map<String, Integer> mDictionary; int mWordCount; private List<CharNode> mRoots; private Map<String, Integer> mDictionary; private int mWordCount; BigramDictionary bigramDict; private BigramDictionary mBigramDict; static class CharNode { private static class CharNode { char data; int freq; boolean terminal; Loading @@ -81,7 +80,7 @@ public class MakeBinaryDictionary { } } public static void usage() { private static void usage() { System.err.println("Usage: makedict -s <src_dict.xml> [-b <src_bigram.xml>] " + "-d <dest.dict> [--size filesize]"); System.exit(-1); Loading Loading @@ -118,36 +117,37 @@ public class MakeBinaryDictionary { } } public MakeBinaryDictionary(String srcFilename, String bigramSrcFilename, String destFilename){ private MakeBinaryDictionary(String srcFilename, String bigramSrcFilename, String destFilename) { System.out.println("Generating dictionary version " + VERSION_NUM); bigramDict = new BigramDictionary(bigramSrcFilename, (bigramSrcFilename != null)); mBigramDict = new BigramDictionary(bigramSrcFilename, (bigramSrcFilename != null)); populateDictionary(srcFilename); writeToDict(destFilename); // Enable the code below to verify that the generated tree is traversable // and bigram data is stored correctly. if (false) { bigramDict.reverseLookupAll(mDictionary, dict); mBigramDict.reverseLookupAll(mDictionary, mDict); traverseDict(2, new char[32], 0); } } private void populateDictionary(String filename) { roots = new ArrayList<CharNode>(); mRoots = new ArrayList<CharNode>(); mDictionary = new HashMap<String, Integer>(); try { SAXParser parser = SAXParserFactory.newInstance().newSAXParser(); parser.parse(new File(filename), new DefaultHandler() { boolean inWord; int freq; StringBuilder wordBuilder = new StringBuilder(48); StringBuilder wordBuilder = new StringBuilder(INITIAL_STRING_BUILDER_CAPACITY); @Override public void startElement(String uri, String localName, String qName, Attributes attributes) { if (qName.equals("w")) { if (qName.equals(TAG_WORD)) { inWord = true; freq = Integer.parseInt(attributes.getValue(0)); freq = Integer.parseInt(attributes.getValue(ATTR_FREQ)); wordBuilder.setLength(0); } } Loading @@ -162,7 +162,7 @@ public class MakeBinaryDictionary { @Override public void endElement(String uri, String localName, String qName) { if (qName.equals("w")) { if (qName.equals(TAG_WORD)) { if (wordBuilder.length() >= 1) { addWordTop(wordBuilder.toString(), freq); mWordCount++; Loading @@ -178,7 +178,7 @@ public class MakeBinaryDictionary { System.out.println("Nodes = " + CharNode.sNodes); } private int indexOf(List<CharNode> children, char c) { private static int indexOf(List<CharNode> children, char c) { if (children == null) { return -1; } Loading @@ -190,27 +190,30 @@ public class MakeBinaryDictionary { return -1; } private void addWordTop(String word, int occur) { if (occur > 255) occur = 255; private void addWordTop(String word, int freq) { if (freq < 0) { freq = 0; } else if (freq > 255) { freq = 255; } char firstChar = word.charAt(0); int index = indexOf(roots, firstChar); int index = indexOf(mRoots, firstChar); if (index == -1) { CharNode newNode = new CharNode(); newNode.data = firstChar; newNode.freq = occur; index = roots.size(); roots.add(newNode); } else { roots.get(index).freq += occur; index = mRoots.size(); mRoots.add(newNode); } final CharNode node = mRoots.get(index); if (word.length() > 1) { addWordRec(roots.get(index), word, 1, occur); addWordRec(node, word, 1, freq); } else { roots.get(index).terminal = true; node.terminal = true; node.freq = freq; } } private void addWordRec(CharNode parent, String word, int charAt, int occur) { private void addWordRec(CharNode parent, String word, int charAt, int freq) { CharNode child = null; char data = word.charAt(charAt); if (parent.children == null) { Loading @@ -229,89 +232,89 @@ public class MakeBinaryDictionary { parent.children.add(child); } child.data = data; if (child.freq == 0) child.freq = occur; if (word.length() > charAt + 1) { addWordRec(child, word, charAt + 1, occur); addWordRec(child, word, charAt + 1, freq); } else { child.terminal = true; child.freq = occur; child.freq = freq; } } byte[] dict; int dictSize; static final int CHAR_WIDTH = 8; static final int FLAGS_WIDTH = 1; // Terminal flag (word end) static final int ADDR_WIDTH = 23; // Offset to children static final int FREQ_WIDTH_BYTES = 1; static final int COUNT_WIDTH_BYTES = 1; private byte[] mDict; private int mDictSize; private static final int CHAR_WIDTH = 8; private static final int FLAGS_WIDTH = 1; // Terminal flag (word end) private static final int ADDR_WIDTH = 23; // Offset to children private static final int FREQ_WIDTH_BYTES = 1; private static final int COUNT_WIDTH_BYTES = 1; private void addCount(int count) { dict[dictSize++] = (byte) (0xFF & count); mDict[mDictSize++] = (byte) (0xFF & count); } private void addNode(CharNode node, String word1) { if (node.terminal) { // store address of each word1 mDictionary.put(word1, dictSize); if (node.terminal) { // store address of each word1 for bigram dic generation mDictionary.put(word1, mDictSize); } int charData = 0xFFFF & node.data; if (charData > 254) { dict[dictSize++] = (byte) 255; dict[dictSize++] = (byte) ((node.data >> 8) & 0xFF); dict[dictSize++] = (byte) (node.data & 0xFF); mDict[mDictSize++] = (byte) 255; mDict[mDictSize++] = (byte) ((node.data >> 8) & 0xFF); mDict[mDictSize++] = (byte) (node.data & 0xFF); } else { dict[dictSize++] = (byte) (0xFF & node.data); mDict[mDictSize++] = (byte) (0xFF & node.data); } if (node.children != null) { dictSize += 3; // Space for children address mDictSize += 3; // Space for children address } else { dictSize += 1; // Space for just the terminal/address flags mDictSize += 1; // Space for just the terminal/address flags } if ((0xFFFFFF & node.freq) > 255) { node.freq = 255; } if (node.terminal) { byte freq = (byte) (0xFF & node.freq); dict[dictSize++] = freq; mDict[mDictSize++] = freq; // bigram if (bigramDict.mBi.containsKey(word1)) { int count = bigramDict.mBi.get(word1).count; bigramDict.mBigramToFill.add(word1); bigramDict.mBigramToFillAddress.add(dictSize); dictSize += (4 * count); if (mBigramDict.mBi.containsKey(word1)) { int count = mBigramDict.mBi.get(word1).count; mBigramDict.mBigramToFill.add(word1); mBigramDict.mBigramToFillAddress.add(mDictSize); mDictSize += (4 * count); } else { dict[dictSize++] = (byte) (0x00); mDict[mDictSize++] = (byte) (0x00); } } } int nullChildrenCount = 0; int notTerminalCount = 0; private int mNullChildrenCount = 0; private int mNotTerminalCount = 0; private void updateNodeAddress(int nodeAddress, CharNode node, int childrenAddress) { if ((dict[nodeAddress] & 0xFF) == 0xFF) { // 3 byte character if ((mDict[nodeAddress] & 0xFF) == 0xFF) { // 3 byte character nodeAddress += 2; } childrenAddress = ADDRESS_MASK & childrenAddress; if (childrenAddress == 0) { nullChildrenCount++; mNullChildrenCount++; } else { childrenAddress |= FLAG_ADDRESS_MASK; } if (node.terminal) { childrenAddress |= FLAG_TERMINAL_MASK; } else { notTerminalCount++; mNotTerminalCount++; } dict[nodeAddress + 1] = (byte) (childrenAddress >> 16); mDict[nodeAddress + 1] = (byte) (childrenAddress >> 16); if ((childrenAddress & FLAG_ADDRESS_MASK) != 0) { dict[nodeAddress + 2] = (byte) ((childrenAddress & 0xFF00) >> 8); dict[nodeAddress + 3] = (byte) ((childrenAddress & 0xFF)); mDict[nodeAddress + 2] = (byte) ((childrenAddress & 0xFF00) >> 8); mDict[nodeAddress + 3] = (byte) ((childrenAddress & 0xFF)); } } void writeWordsRec(List<CharNode> children, StringBuilder word) { private void writeWordsRec(List<CharNode> children, StringBuilder word) { if (children == null || children.size() == 0) { return; } Loading @@ -319,60 +322,59 @@ public class MakeBinaryDictionary { addCount(childCount); int[] childrenAddresses = new int[childCount]; for (int j = 0; j < childCount; j++) { CharNode node = children.get(j); childrenAddresses[j] = dictSize; word.append(children.get(j).data); addNode(node, word.toString()); word.deleteCharAt(word.length()-1); CharNode child = children.get(j); childrenAddresses[j] = mDictSize; word.append(child.data); addNode(child, word.toString()); word.setLength(word.length() - 1); } for (int j = 0; j < childCount; j++) { CharNode node = children.get(j); CharNode child = children.get(j); int nodeAddress = childrenAddresses[j]; int cacheDictSize = dictSize; word.append(children.get(j).data); writeWordsRec(node.children, word); word.deleteCharAt(word.length()-1); updateNodeAddress(nodeAddress, node, node.children != null ? cacheDictSize : 0); int cacheDictSize = mDictSize; word.append(child.data); writeWordsRec(child.children, word); word.setLength(word.length() - 1); updateNodeAddress(nodeAddress, child, child.children != null ? cacheDictSize : 0); } } void writeToDict(String dictFilename) { private void writeToDict(String dictFilename) { // 4MB max, 22-bit offsets dict = new byte[4 * 1024 * 1024]; // 4MB upper limit. Actual is probably mDict = new byte[4 * 1024 * 1024]; // 4MB upper limit. Actual is probably // < 1MB in most cases, as there is a limit in the // resource size in apks. dictSize = 0; mDictSize = 0; dict[dictSize++] = (byte) (0xFF & VERSION_NUM); // version info dict[dictSize++] = (byte) (0xFF & (bigramDict.mHasBigram ? 1 : 0)); mDict[mDictSize++] = (byte) (0xFF & VERSION_NUM); // version info mDict[mDictSize++] = (byte) (0xFF & (mBigramDict.mHasBigram ? 1 : 0)); StringBuilder word = new StringBuilder(48); writeWordsRec(roots, word); dict = bigramDict.writeBigrams(dict, mDictionary); System.out.println("Dict Size = " + dictSize); final StringBuilder word = new StringBuilder(INITIAL_STRING_BUILDER_CAPACITY); writeWordsRec(mRoots, word); mDict = mBigramDict.writeBigrams(mDict, mDictionary); System.out.println("Dict Size = " + mDictSize); if (!sSplitOutput) { sOutputFileSize = dictSize; sOutputFileSize = mDictSize; } try { int currentLoc = 0; int i = 0; int extension = dictFilename.indexOf(".dict"); String filename = dictFilename.substring(0, extension); while (dictSize > 0) { while (mDictSize > 0) { FileOutputStream fos; if (sSplitOutput) { fos = new FileOutputStream(filename + i + ".dict"); } else { fos = new FileOutputStream(filename + ".dict"); } if (dictSize > sOutputFileSize) { fos.write(dict, currentLoc, sOutputFileSize); dictSize -= sOutputFileSize; if (mDictSize > sOutputFileSize) { fos.write(mDict, currentLoc, sOutputFileSize); mDictSize -= sOutputFileSize; currentLoc += sOutputFileSize; } else { fos.write(dict, currentLoc, dictSize); dictSize = 0; fos.write(mDict, currentLoc, mDictSize); mDictSize = 0; } fos.close(); i++; Loading @@ -382,36 +384,36 @@ public class MakeBinaryDictionary { } } void traverseDict(int pos, char[] word, int depth) { int count = dict[pos++] & 0xFF; private void traverseDict(int pos, char[] word, int depth) { int count = mDict[pos++] & 0xFF; for (int i = 0; i < count; i++) { char c = (char) (dict[pos++] & 0xFF); char c = (char) (mDict[pos++] & 0xFF); if (c == 0xFF) { // two byte character c = (char) (((dict[pos] & 0xFF) << 8) | (dict[pos+1] & 0xFF)); c = (char) (((mDict[pos] & 0xFF) << 8) | (mDict[pos+1] & 0xFF)); pos += 2; } word[depth] = c; boolean terminal = getFirstBitOfByte(pos, dict); boolean terminal = getFirstBitOfByte(pos, mDict); int address = 0; if ((dict[pos] & (FLAG_ADDRESS_MASK >> 16)) > 0) { // address check address = get22BitAddress(pos, dict); if ((mDict[pos] & (FLAG_ADDRESS_MASK >> 16)) > 0) { // address check address = get22BitAddress(pos, mDict); pos += 3; } else { pos += 1; } if (terminal) { showWord(word, depth + 1, dict[pos] & 0xFF); showWord(word, depth + 1, mDict[pos] & 0xFF); pos++; int bigramExist = (dict[pos] & bigramDict.FLAG_BIGRAM_READ); int bigramExist = (mDict[pos] & mBigramDict.FLAG_BIGRAM_READ); if (bigramExist > 0) { int nextBigramExist = 1; while (nextBigramExist > 0) { int bigramAddress = get22BitAddress(pos, dict); int bigramAddress = get22BitAddress(pos, mDict); pos += 3; int frequency = (bigramDict.FLAG_BIGRAM_FREQ & dict[pos]); bigramDict.searchForTerminalNode(bigramAddress, frequency, dict); nextBigramExist = (dict[pos++] & bigramDict.FLAG_BIGRAM_CONTINUED); int frequency = (mBigramDict.FLAG_BIGRAM_FREQ & mDict[pos]); mBigramDict.searchForTerminalNode(bigramAddress, frequency, mDict); nextBigramExist = (mDict[pos++] & mBigramDict.FLAG_BIGRAM_CONTINUED); } } else { pos++; Loading @@ -423,21 +425,21 @@ public class MakeBinaryDictionary { } } void showWord(char[] word, int size, int freq) { private static void showWord(char[] word, int size, int freq) { System.out.print(new String(word, 0, size) + " " + freq + "\n"); } static int get22BitAddress(int pos, byte[] dict) { /* package */ static int get22BitAddress(int pos, byte[] dict) { return ((dict[pos + 0] & 0x3F) << 16) | ((dict[pos + 1] & 0xFF) << 8) | ((dict[pos + 2] & 0xFF)); } static boolean getFirstBitOfByte(int pos, byte[] dict) { /* package */ static boolean getFirstBitOfByte(int pos, byte[] dict) { return (dict[pos] & 0x80) > 0; } static boolean getSecondBitOfByte(int pos, byte[] dict) { /* package */ static boolean getSecondBitOfByte(int pos, byte[] dict) { return (dict[pos] & 0x40) > 0; } } Loading
tools/makedict/src/com/android/tools/dict/MakeBinaryDictionary.java +118 −116 Original line number Diff line number Diff line Loading @@ -41,18 +41,17 @@ import javax.xml.parsers.SAXParserFactory; * in the data. There is no need to increase the version when only the words in the data changes. */ public class MakeBinaryDictionary { private static final int VERSION_NUM = 200; public static final int ALPHA_SIZE = 256; public static final String TAG_WORD = "w"; public static final String ATTR_FREQ = "f"; private static final String TAG_WORD = "w"; private static final String ATTR_FREQ = "f"; private static final int FLAG_ADDRESS_MASK = 0x400000; private static final int FLAG_TERMINAL_MASK = 0x800000; private static final int ADDRESS_MASK = 0x3FFFFF; private static final int INITIAL_STRING_BUILDER_CAPACITY = 48; /** * Unit for this variable is in bytes * If destination file name is main.dict and file limit causes dictionary to be separated into Loading @@ -61,15 +60,15 @@ public class MakeBinaryDictionary { private static int sOutputFileSize; private static boolean sSplitOutput; public static final CharNode EMPTY_NODE = new CharNode(); private static final CharNode EMPTY_NODE = new CharNode(); List<CharNode> roots; Map<String, Integer> mDictionary; int mWordCount; private List<CharNode> mRoots; private Map<String, Integer> mDictionary; private int mWordCount; BigramDictionary bigramDict; private BigramDictionary mBigramDict; static class CharNode { private static class CharNode { char data; int freq; boolean terminal; Loading @@ -81,7 +80,7 @@ public class MakeBinaryDictionary { } } public static void usage() { private static void usage() { System.err.println("Usage: makedict -s <src_dict.xml> [-b <src_bigram.xml>] " + "-d <dest.dict> [--size filesize]"); System.exit(-1); Loading Loading @@ -118,36 +117,37 @@ public class MakeBinaryDictionary { } } public MakeBinaryDictionary(String srcFilename, String bigramSrcFilename, String destFilename){ private MakeBinaryDictionary(String srcFilename, String bigramSrcFilename, String destFilename) { System.out.println("Generating dictionary version " + VERSION_NUM); bigramDict = new BigramDictionary(bigramSrcFilename, (bigramSrcFilename != null)); mBigramDict = new BigramDictionary(bigramSrcFilename, (bigramSrcFilename != null)); populateDictionary(srcFilename); writeToDict(destFilename); // Enable the code below to verify that the generated tree is traversable // and bigram data is stored correctly. if (false) { bigramDict.reverseLookupAll(mDictionary, dict); mBigramDict.reverseLookupAll(mDictionary, mDict); traverseDict(2, new char[32], 0); } } private void populateDictionary(String filename) { roots = new ArrayList<CharNode>(); mRoots = new ArrayList<CharNode>(); mDictionary = new HashMap<String, Integer>(); try { SAXParser parser = SAXParserFactory.newInstance().newSAXParser(); parser.parse(new File(filename), new DefaultHandler() { boolean inWord; int freq; StringBuilder wordBuilder = new StringBuilder(48); StringBuilder wordBuilder = new StringBuilder(INITIAL_STRING_BUILDER_CAPACITY); @Override public void startElement(String uri, String localName, String qName, Attributes attributes) { if (qName.equals("w")) { if (qName.equals(TAG_WORD)) { inWord = true; freq = Integer.parseInt(attributes.getValue(0)); freq = Integer.parseInt(attributes.getValue(ATTR_FREQ)); wordBuilder.setLength(0); } } Loading @@ -162,7 +162,7 @@ public class MakeBinaryDictionary { @Override public void endElement(String uri, String localName, String qName) { if (qName.equals("w")) { if (qName.equals(TAG_WORD)) { if (wordBuilder.length() >= 1) { addWordTop(wordBuilder.toString(), freq); mWordCount++; Loading @@ -178,7 +178,7 @@ public class MakeBinaryDictionary { System.out.println("Nodes = " + CharNode.sNodes); } private int indexOf(List<CharNode> children, char c) { private static int indexOf(List<CharNode> children, char c) { if (children == null) { return -1; } Loading @@ -190,27 +190,30 @@ public class MakeBinaryDictionary { return -1; } private void addWordTop(String word, int occur) { if (occur > 255) occur = 255; private void addWordTop(String word, int freq) { if (freq < 0) { freq = 0; } else if (freq > 255) { freq = 255; } char firstChar = word.charAt(0); int index = indexOf(roots, firstChar); int index = indexOf(mRoots, firstChar); if (index == -1) { CharNode newNode = new CharNode(); newNode.data = firstChar; newNode.freq = occur; index = roots.size(); roots.add(newNode); } else { roots.get(index).freq += occur; index = mRoots.size(); mRoots.add(newNode); } final CharNode node = mRoots.get(index); if (word.length() > 1) { addWordRec(roots.get(index), word, 1, occur); addWordRec(node, word, 1, freq); } else { roots.get(index).terminal = true; node.terminal = true; node.freq = freq; } } private void addWordRec(CharNode parent, String word, int charAt, int occur) { private void addWordRec(CharNode parent, String word, int charAt, int freq) { CharNode child = null; char data = word.charAt(charAt); if (parent.children == null) { Loading @@ -229,89 +232,89 @@ public class MakeBinaryDictionary { parent.children.add(child); } child.data = data; if (child.freq == 0) child.freq = occur; if (word.length() > charAt + 1) { addWordRec(child, word, charAt + 1, occur); addWordRec(child, word, charAt + 1, freq); } else { child.terminal = true; child.freq = occur; child.freq = freq; } } byte[] dict; int dictSize; static final int CHAR_WIDTH = 8; static final int FLAGS_WIDTH = 1; // Terminal flag (word end) static final int ADDR_WIDTH = 23; // Offset to children static final int FREQ_WIDTH_BYTES = 1; static final int COUNT_WIDTH_BYTES = 1; private byte[] mDict; private int mDictSize; private static final int CHAR_WIDTH = 8; private static final int FLAGS_WIDTH = 1; // Terminal flag (word end) private static final int ADDR_WIDTH = 23; // Offset to children private static final int FREQ_WIDTH_BYTES = 1; private static final int COUNT_WIDTH_BYTES = 1; private void addCount(int count) { dict[dictSize++] = (byte) (0xFF & count); mDict[mDictSize++] = (byte) (0xFF & count); } private void addNode(CharNode node, String word1) { if (node.terminal) { // store address of each word1 mDictionary.put(word1, dictSize); if (node.terminal) { // store address of each word1 for bigram dic generation mDictionary.put(word1, mDictSize); } int charData = 0xFFFF & node.data; if (charData > 254) { dict[dictSize++] = (byte) 255; dict[dictSize++] = (byte) ((node.data >> 8) & 0xFF); dict[dictSize++] = (byte) (node.data & 0xFF); mDict[mDictSize++] = (byte) 255; mDict[mDictSize++] = (byte) ((node.data >> 8) & 0xFF); mDict[mDictSize++] = (byte) (node.data & 0xFF); } else { dict[dictSize++] = (byte) (0xFF & node.data); mDict[mDictSize++] = (byte) (0xFF & node.data); } if (node.children != null) { dictSize += 3; // Space for children address mDictSize += 3; // Space for children address } else { dictSize += 1; // Space for just the terminal/address flags mDictSize += 1; // Space for just the terminal/address flags } if ((0xFFFFFF & node.freq) > 255) { node.freq = 255; } if (node.terminal) { byte freq = (byte) (0xFF & node.freq); dict[dictSize++] = freq; mDict[mDictSize++] = freq; // bigram if (bigramDict.mBi.containsKey(word1)) { int count = bigramDict.mBi.get(word1).count; bigramDict.mBigramToFill.add(word1); bigramDict.mBigramToFillAddress.add(dictSize); dictSize += (4 * count); if (mBigramDict.mBi.containsKey(word1)) { int count = mBigramDict.mBi.get(word1).count; mBigramDict.mBigramToFill.add(word1); mBigramDict.mBigramToFillAddress.add(mDictSize); mDictSize += (4 * count); } else { dict[dictSize++] = (byte) (0x00); mDict[mDictSize++] = (byte) (0x00); } } } int nullChildrenCount = 0; int notTerminalCount = 0; private int mNullChildrenCount = 0; private int mNotTerminalCount = 0; private void updateNodeAddress(int nodeAddress, CharNode node, int childrenAddress) { if ((dict[nodeAddress] & 0xFF) == 0xFF) { // 3 byte character if ((mDict[nodeAddress] & 0xFF) == 0xFF) { // 3 byte character nodeAddress += 2; } childrenAddress = ADDRESS_MASK & childrenAddress; if (childrenAddress == 0) { nullChildrenCount++; mNullChildrenCount++; } else { childrenAddress |= FLAG_ADDRESS_MASK; } if (node.terminal) { childrenAddress |= FLAG_TERMINAL_MASK; } else { notTerminalCount++; mNotTerminalCount++; } dict[nodeAddress + 1] = (byte) (childrenAddress >> 16); mDict[nodeAddress + 1] = (byte) (childrenAddress >> 16); if ((childrenAddress & FLAG_ADDRESS_MASK) != 0) { dict[nodeAddress + 2] = (byte) ((childrenAddress & 0xFF00) >> 8); dict[nodeAddress + 3] = (byte) ((childrenAddress & 0xFF)); mDict[nodeAddress + 2] = (byte) ((childrenAddress & 0xFF00) >> 8); mDict[nodeAddress + 3] = (byte) ((childrenAddress & 0xFF)); } } void writeWordsRec(List<CharNode> children, StringBuilder word) { private void writeWordsRec(List<CharNode> children, StringBuilder word) { if (children == null || children.size() == 0) { return; } Loading @@ -319,60 +322,59 @@ public class MakeBinaryDictionary { addCount(childCount); int[] childrenAddresses = new int[childCount]; for (int j = 0; j < childCount; j++) { CharNode node = children.get(j); childrenAddresses[j] = dictSize; word.append(children.get(j).data); addNode(node, word.toString()); word.deleteCharAt(word.length()-1); CharNode child = children.get(j); childrenAddresses[j] = mDictSize; word.append(child.data); addNode(child, word.toString()); word.setLength(word.length() - 1); } for (int j = 0; j < childCount; j++) { CharNode node = children.get(j); CharNode child = children.get(j); int nodeAddress = childrenAddresses[j]; int cacheDictSize = dictSize; word.append(children.get(j).data); writeWordsRec(node.children, word); word.deleteCharAt(word.length()-1); updateNodeAddress(nodeAddress, node, node.children != null ? cacheDictSize : 0); int cacheDictSize = mDictSize; word.append(child.data); writeWordsRec(child.children, word); word.setLength(word.length() - 1); updateNodeAddress(nodeAddress, child, child.children != null ? cacheDictSize : 0); } } void writeToDict(String dictFilename) { private void writeToDict(String dictFilename) { // 4MB max, 22-bit offsets dict = new byte[4 * 1024 * 1024]; // 4MB upper limit. Actual is probably mDict = new byte[4 * 1024 * 1024]; // 4MB upper limit. Actual is probably // < 1MB in most cases, as there is a limit in the // resource size in apks. dictSize = 0; mDictSize = 0; dict[dictSize++] = (byte) (0xFF & VERSION_NUM); // version info dict[dictSize++] = (byte) (0xFF & (bigramDict.mHasBigram ? 1 : 0)); mDict[mDictSize++] = (byte) (0xFF & VERSION_NUM); // version info mDict[mDictSize++] = (byte) (0xFF & (mBigramDict.mHasBigram ? 1 : 0)); StringBuilder word = new StringBuilder(48); writeWordsRec(roots, word); dict = bigramDict.writeBigrams(dict, mDictionary); System.out.println("Dict Size = " + dictSize); final StringBuilder word = new StringBuilder(INITIAL_STRING_BUILDER_CAPACITY); writeWordsRec(mRoots, word); mDict = mBigramDict.writeBigrams(mDict, mDictionary); System.out.println("Dict Size = " + mDictSize); if (!sSplitOutput) { sOutputFileSize = dictSize; sOutputFileSize = mDictSize; } try { int currentLoc = 0; int i = 0; int extension = dictFilename.indexOf(".dict"); String filename = dictFilename.substring(0, extension); while (dictSize > 0) { while (mDictSize > 0) { FileOutputStream fos; if (sSplitOutput) { fos = new FileOutputStream(filename + i + ".dict"); } else { fos = new FileOutputStream(filename + ".dict"); } if (dictSize > sOutputFileSize) { fos.write(dict, currentLoc, sOutputFileSize); dictSize -= sOutputFileSize; if (mDictSize > sOutputFileSize) { fos.write(mDict, currentLoc, sOutputFileSize); mDictSize -= sOutputFileSize; currentLoc += sOutputFileSize; } else { fos.write(dict, currentLoc, dictSize); dictSize = 0; fos.write(mDict, currentLoc, mDictSize); mDictSize = 0; } fos.close(); i++; Loading @@ -382,36 +384,36 @@ public class MakeBinaryDictionary { } } void traverseDict(int pos, char[] word, int depth) { int count = dict[pos++] & 0xFF; private void traverseDict(int pos, char[] word, int depth) { int count = mDict[pos++] & 0xFF; for (int i = 0; i < count; i++) { char c = (char) (dict[pos++] & 0xFF); char c = (char) (mDict[pos++] & 0xFF); if (c == 0xFF) { // two byte character c = (char) (((dict[pos] & 0xFF) << 8) | (dict[pos+1] & 0xFF)); c = (char) (((mDict[pos] & 0xFF) << 8) | (mDict[pos+1] & 0xFF)); pos += 2; } word[depth] = c; boolean terminal = getFirstBitOfByte(pos, dict); boolean terminal = getFirstBitOfByte(pos, mDict); int address = 0; if ((dict[pos] & (FLAG_ADDRESS_MASK >> 16)) > 0) { // address check address = get22BitAddress(pos, dict); if ((mDict[pos] & (FLAG_ADDRESS_MASK >> 16)) > 0) { // address check address = get22BitAddress(pos, mDict); pos += 3; } else { pos += 1; } if (terminal) { showWord(word, depth + 1, dict[pos] & 0xFF); showWord(word, depth + 1, mDict[pos] & 0xFF); pos++; int bigramExist = (dict[pos] & bigramDict.FLAG_BIGRAM_READ); int bigramExist = (mDict[pos] & mBigramDict.FLAG_BIGRAM_READ); if (bigramExist > 0) { int nextBigramExist = 1; while (nextBigramExist > 0) { int bigramAddress = get22BitAddress(pos, dict); int bigramAddress = get22BitAddress(pos, mDict); pos += 3; int frequency = (bigramDict.FLAG_BIGRAM_FREQ & dict[pos]); bigramDict.searchForTerminalNode(bigramAddress, frequency, dict); nextBigramExist = (dict[pos++] & bigramDict.FLAG_BIGRAM_CONTINUED); int frequency = (mBigramDict.FLAG_BIGRAM_FREQ & mDict[pos]); mBigramDict.searchForTerminalNode(bigramAddress, frequency, mDict); nextBigramExist = (mDict[pos++] & mBigramDict.FLAG_BIGRAM_CONTINUED); } } else { pos++; Loading @@ -423,21 +425,21 @@ public class MakeBinaryDictionary { } } void showWord(char[] word, int size, int freq) { private static void showWord(char[] word, int size, int freq) { System.out.print(new String(word, 0, size) + " " + freq + "\n"); } static int get22BitAddress(int pos, byte[] dict) { /* package */ static int get22BitAddress(int pos, byte[] dict) { return ((dict[pos + 0] & 0x3F) << 16) | ((dict[pos + 1] & 0xFF) << 8) | ((dict[pos + 2] & 0xFF)); } static boolean getFirstBitOfByte(int pos, byte[] dict) { /* package */ static boolean getFirstBitOfByte(int pos, byte[] dict) { return (dict[pos] & 0x80) > 0; } static boolean getSecondBitOfByte(int pos, byte[] dict) { /* package */ static boolean getSecondBitOfByte(int pos, byte[] dict) { return (dict[pos] & 0x40) > 0; } }