Loading java/src/com/android/inputmethod/latin/makedict/ProbabilityInfo.java +2 −3 Original line number Diff line number Diff line Loading @@ -17,7 +17,7 @@ package com.android.inputmethod.latin.makedict; import com.android.inputmethod.latin.BinaryDictionary; import com.android.inputmethod.latin.makedict.FusionDictionary.WeightedString; import com.android.inputmethod.latin.utils.CombinedFormatUtils; import java.util.Arrays; Loading Loading @@ -57,8 +57,7 @@ public final class ProbabilityInfo { @Override public String toString() { return "f=" + mProbability + (hasHistoricalInfo() ? ",historicalInfo=" + mTimestamp + ":" + mLevel + ":" + mCount : ""); return CombinedFormatUtils.formatProbabilityInfo(this); } @Override Loading java/src/com/android/inputmethod/latin/makedict/WordProperty.java +4 −29 Original line number Diff line number Diff line Loading @@ -20,6 +20,7 @@ import com.android.inputmethod.annotations.UsedForTesting; import com.android.inputmethod.latin.BinaryDictionary; import com.android.inputmethod.latin.makedict.FusionDictionary.WeightedString; import com.android.inputmethod.latin.utils.CollectionUtils; import com.android.inputmethod.latin.utils.CombinedFormatUtils; import com.android.inputmethod.latin.utils.StringUtils; import java.util.ArrayList; Loading Loading @@ -52,8 +53,8 @@ public final class WordProperty implements Comparable<WordProperty> { mBigrams = bigrams; mIsNotAWord = isNotAWord; mIsBlacklistEntry = isBlacklistEntry; mHasBigrams = !bigrams.isEmpty(); mHasShortcuts = !shortcutTargets.isEmpty(); mHasBigrams = bigrams != null && !bigrams.isEmpty(); mHasShortcuts = shortcutTargets != null && !shortcutTargets.isEmpty(); } private static ProbabilityInfo createProbabilityInfoFromArray(final int[] probabilityInfo) { Loading Loading @@ -158,32 +159,6 @@ public final class WordProperty implements Comparable<WordProperty> { @Override public String toString() { // TODO: Move this logic to CombinedInputOutput. 
final StringBuffer builder = new StringBuffer(); builder.append(" word=" + mWord); builder.append(","); builder.append(mProbabilityInfo.toString()); if (mIsNotAWord) { builder.append(","); builder.append("not_a_word=true"); } if (mIsBlacklistEntry) { builder.append(","); builder.append("blacklisted=true"); } builder.append("\n"); for (int i = 0; i < mBigrams.size(); i++) { builder.append(" bigram=" + mBigrams.get(i).mWord); builder.append(","); builder.append(mBigrams.get(i).mProbabilityInfo.toString()); builder.append("\n"); } for (int i = 0; i < mShortcutTargets.size(); i++) { builder.append(" shortcut=" + mShortcutTargets.get(i).mWord); builder.append(","); builder.append(mShortcutTargets.get(i).mProbabilityInfo.toString()); builder.append("\n"); } return builder.toString(); return CombinedFormatUtils.formatWordProperty(this); } } java/src/com/android/inputmethod/latin/utils/CombinedFormatUtils.java 0 → 100644 +99 −0 Original line number Diff line number Diff line /* * Copyright (C) 2014 The Android Open Source Project * * Licensed under the Apache License, Version 2.0 (the "License"); you may not * use this file except in compliance with the License. You may obtain a copy of * the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the * License for the specific language governing permissions and limitations under * the License. 
*/ package com.android.inputmethod.latin.utils; import com.android.inputmethod.latin.makedict.DictionaryHeader; import com.android.inputmethod.latin.makedict.ProbabilityInfo; import com.android.inputmethod.latin.makedict.WordProperty; import com.android.inputmethod.latin.makedict.FusionDictionary.WeightedString; import java.util.HashMap; public class CombinedFormatUtils { public static final String DICTIONARY_TAG = "dictionary"; public static final String BIGRAM_TAG = "bigram"; public static final String SHORTCUT_TAG = "shortcut"; public static final String PROBABILITY_TAG = "f"; public static final String HISTORICAL_INFO_TAG = "historicalInfo"; public static final String HISTORICAL_INFO_SEPARATOR = ":"; public static final String WORD_TAG = "word"; public static final String NOT_A_WORD_TAG = "not_a_word"; public static final String BLACKLISTED_TAG = "blacklisted"; public static String formatAttributeMap(final HashMap<String, String> attributeMap) { final StringBuilder builder = new StringBuilder(); builder.append(DICTIONARY_TAG + "="); if (attributeMap.containsKey(DictionaryHeader.DICTIONARY_DESCRIPTION_KEY)) { builder.append(attributeMap.get(DictionaryHeader.DICTIONARY_DESCRIPTION_KEY)); } for (final String key : attributeMap.keySet()) { if (key == DictionaryHeader.DICTIONARY_DESCRIPTION_KEY) { continue; } final String value = attributeMap.get(key); builder.append("," + key + "=" + value); } builder.append("\n"); return builder.toString(); } public static String formatWordProperty(final WordProperty wordProperty) { final StringBuilder builder = new StringBuilder(); builder.append(" " + WORD_TAG + "=" + wordProperty.mWord); builder.append(","); builder.append(formatProbabilityInfo(wordProperty.mProbabilityInfo)); if (wordProperty.mIsNotAWord) { builder.append("," + NOT_A_WORD_TAG + "=true"); } if (wordProperty.mIsBlacklistEntry) { builder.append("," + BLACKLISTED_TAG + "=true"); } builder.append("\n"); if (wordProperty.mShortcutTargets != null) { for (final 
WeightedString shortcutTarget : wordProperty.mShortcutTargets) { builder.append(" " + SHORTCUT_TAG + "=" + shortcutTarget.mWord); builder.append(","); builder.append(formatProbabilityInfo(shortcutTarget.mProbabilityInfo)); builder.append("\n"); } } if (wordProperty.mBigrams != null) { for (final WeightedString bigram : wordProperty.mBigrams) { builder.append(" " + BIGRAM_TAG + "=" + bigram.mWord); builder.append(","); builder.append(formatProbabilityInfo(bigram.mProbabilityInfo)); builder.append("\n"); } } return builder.toString(); } public static String formatProbabilityInfo(final ProbabilityInfo probabilityInfo) { final StringBuilder builder = new StringBuilder(); builder.append(PROBABILITY_TAG + "=" + probabilityInfo.mProbability); if (probabilityInfo.hasHistoricalInfo()) { builder.append(","); builder.append(HISTORICAL_INFO_TAG + "="); builder.append(probabilityInfo.mTimestamp); builder.append(HISTORICAL_INFO_SEPARATOR); builder.append(probabilityInfo.mLevel); builder.append(HISTORICAL_INFO_SEPARATOR); builder.append(probabilityInfo.mCount); } return builder.toString(); } } tools/dicttool/Android.mk +1 −0 Original line number Diff line number Diff line Loading @@ -43,6 +43,7 @@ USED_TARGETTED_UTILS := \ $(LATINIME_CORE_SOURCE_DIRECTORY)/settings/NativeSuggestOptions.java \ $(LATINIME_CORE_SOURCE_DIRECTORY)/utils/ByteArrayDictBuffer.java \ $(LATINIME_CORE_SOURCE_DIRECTORY)/utils/CollectionUtils.java \ $(LATINIME_CORE_SOURCE_DIRECTORY)/utils/CombinedFormatUtils.java \ $(LATINIME_CORE_SOURCE_DIRECTORY)/utils/CoordinateUtils.java \ $(LATINIME_CORE_SOURCE_DIRECTORY)/utils/FileUtils.java \ $(LATINIME_CORE_SOURCE_DIRECTORY)/utils/JniUtils.java \ Loading tools/dicttool/src/com/android/inputmethod/latin/dicttool/CombinedInputOutput.java +34 −48 Original line number Diff line number Diff line Loading @@ -22,6 +22,7 @@ import com.android.inputmethod.latin.makedict.FusionDictionary.DictionaryOptions import 
com.android.inputmethod.latin.makedict.FusionDictionary.PtNodeArray; import com.android.inputmethod.latin.makedict.FusionDictionary.WeightedString; import com.android.inputmethod.latin.makedict.WordProperty; import com.android.inputmethod.latin.utils.CombinedFormatUtils; import java.io.BufferedReader; import java.io.File; Loading @@ -41,16 +42,10 @@ import java.util.TreeSet; * All functions in this class are static. */ public class CombinedInputOutput { private static final String DICTIONARY_TAG = "dictionary"; private static final String BIGRAM_TAG = "bigram"; private static final String SHORTCUT_TAG = "shortcut"; private static final String PROBABILITY_TAG = "f"; private static final String WORD_TAG = "word"; private static final String NOT_A_WORD_TAG = "not_a_word"; private static final String WHITELIST_TAG = "whitelist"; private static final String OPTIONS_TAG = "options"; private static final String COMMENT_LINE_STARTER = "#"; private static final int HISTORICAL_INFO_ELEMENT_COUNT = 3; /** * Basic test to find out whether the file is in the combined format or not. Loading @@ -68,7 +63,8 @@ public class CombinedInputOutput { while (firstLine.startsWith(COMMENT_LINE_STARTER)) { firstLine = reader.readLine(); } return firstLine.matches("^" + DICTIONARY_TAG + "=[^:]+(:[^=]+=[^:]+)*"); return firstLine.matches( "^" + CombinedFormatUtils.DICTIONARY_TAG + "=[^:]+(:[^=]+=[^:]+)*"); } catch (FileNotFoundException e) { return false; } catch (IOException e) { Loading Loading @@ -123,7 +119,7 @@ public class CombinedInputOutput { while (null != (line = reader.readLine())) { if (line.startsWith(COMMENT_LINE_STARTER)) continue; final String args[] = line.trim().split(","); if (args[0].matches(WORD_TAG + "=.*")) { if (args[0].matches(CombinedFormatUtils.WORD_TAG + "=.*")) { if (null != word) { dict.add(word, freq, shortcuts.isEmpty() ? 
null : shortcuts, isNotAWord); for (WeightedString s : bigrams) { Loading @@ -136,23 +132,30 @@ public class CombinedInputOutput { for (String param : args) { final String params[] = param.split("=", 2); if (2 != params.length) throw new RuntimeException("Wrong format : " + line); if (WORD_TAG.equals(params[0])) { if (CombinedFormatUtils.WORD_TAG.equals(params[0])) { word = params[1]; } else if (PROBABILITY_TAG.equals(params[0])) { } else if (CombinedFormatUtils.PROBABILITY_TAG.equals(params[0])) { freq = Integer.parseInt(params[1]); } else if (NOT_A_WORD_TAG.equals(params[0])) { } else if (CombinedFormatUtils.HISTORICAL_INFO_TAG.equals(params[0])) { final String[] historicalInfoParams = params[1].split(CombinedFormatUtils.HISTORICAL_INFO_SEPARATOR); if (historicalInfoParams.length != HISTORICAL_INFO_ELEMENT_COUNT) { throw new RuntimeException("Wrong format (historical info) : " + line); } // TODO: Use parsed historical info. } else if (CombinedFormatUtils.NOT_A_WORD_TAG.equals(params[0])) { isNotAWord = "true".equals(params[1]); } } } else if (args[0].matches(SHORTCUT_TAG + "=.*")) { } else if (args[0].matches(CombinedFormatUtils.SHORTCUT_TAG + "=.*")) { String shortcut = null; int shortcutFreq = 0; for (String param : args) { final String params[] = param.split("=", 2); if (2 != params.length) throw new RuntimeException("Wrong format : " + line); if (SHORTCUT_TAG.equals(params[0])) { if (CombinedFormatUtils.SHORTCUT_TAG.equals(params[0])) { shortcut = params[1]; } else if (PROBABILITY_TAG.equals(params[0])) { } else if (CombinedFormatUtils.PROBABILITY_TAG.equals(params[0])) { shortcutFreq = WHITELIST_TAG.equals(params[1]) ? 
FormatSpec.SHORTCUT_WHITELIST_FREQUENCY : Integer.parseInt(params[1]); Loading @@ -163,16 +166,23 @@ public class CombinedInputOutput { } else { throw new RuntimeException("Wrong format : " + line); } } else if (args[0].matches(BIGRAM_TAG + "=.*")) { } else if (args[0].matches(CombinedFormatUtils.BIGRAM_TAG + "=.*")) { String secondWordOfBigram = null; int bigramFreq = 0; for (String param : args) { final String params[] = param.split("=", 2); if (2 != params.length) throw new RuntimeException("Wrong format : " + line); if (BIGRAM_TAG.equals(params[0])) { if (CombinedFormatUtils.BIGRAM_TAG.equals(params[0])) { secondWordOfBigram = params[1]; } else if (PROBABILITY_TAG.equals(params[0])) { } else if (CombinedFormatUtils.PROBABILITY_TAG.equals(params[0])) { bigramFreq = Integer.parseInt(params[1]); } else if (CombinedFormatUtils.HISTORICAL_INFO_TAG.equals(params[0])) { final String[] historicalInfoParams = params[1].split(CombinedFormatUtils.HISTORICAL_INFO_SEPARATOR); if (historicalInfoParams.length != HISTORICAL_INFO_ELEMENT_COUNT) { throw new RuntimeException("Wrong format (historical info) : " + line); } // TODO: Use parsed historical info. } } if (null != secondWordOfBigram) { Loading @@ -198,40 +208,16 @@ public class CombinedInputOutput { * @param destination a destination stream to write to. * @param dict the dictionary to write. 
*/ public static void writeDictionaryCombined(Writer destination, FusionDictionary dict) throws IOException { public static void writeDictionaryCombined( final Writer destination, final FusionDictionary dict) throws IOException { final TreeSet<WordProperty> wordPropertiesInDict = new TreeSet<WordProperty>(); for (WordProperty wordProperty: dict) { for (final WordProperty wordProperty : dict) { // This for ordering by frequency, then by asciibetic order wordPropertiesInDict.add(wordProperty); } final HashMap<String, String> options = dict.mOptions.mAttributes; destination.write(DICTIONARY_TAG + "="); if (options.containsKey(DICTIONARY_TAG)) { destination.write(options.get(DICTIONARY_TAG)); options.remove(DICTIONARY_TAG); } for (final String key : dict.mOptions.mAttributes.keySet()) { final String value = dict.mOptions.mAttributes.get(key); destination.write("," + key + "=" + value); } destination.write("\n"); for (WordProperty wordProperty : wordPropertiesInDict) { destination.write(" " + WORD_TAG + "=" + wordProperty.mWord + "," + PROBABILITY_TAG + "=" + wordProperty.getProbability() + (wordProperty.mIsNotAWord ? "," + NOT_A_WORD_TAG + "=true\n" : "\n")); if (null != wordProperty.mShortcutTargets) { for (WeightedString target : wordProperty.mShortcutTargets) { destination.write(" " + SHORTCUT_TAG + "=" + target.mWord + "," + PROBABILITY_TAG + "=" + target.getProbability() + "\n"); } } if (null != wordProperty.mBigrams) { for (WeightedString bigram : wordProperty.mBigrams) { destination.write(" " + BIGRAM_TAG + "=" + bigram.mWord + "," + PROBABILITY_TAG + "=" + bigram.getProbability() + "\n"); } } destination.write(CombinedFormatUtils.formatAttributeMap(dict.mOptions.mAttributes)); for (final WordProperty wordProperty : wordPropertiesInDict) { destination.write(CombinedFormatUtils.formatWordProperty(wordProperty)); } destination.close(); } Loading Loading
java/src/com/android/inputmethod/latin/makedict/ProbabilityInfo.java +2 −3 Original line number Diff line number Diff line Loading @@ -17,7 +17,7 @@ package com.android.inputmethod.latin.makedict; import com.android.inputmethod.latin.BinaryDictionary; import com.android.inputmethod.latin.makedict.FusionDictionary.WeightedString; import com.android.inputmethod.latin.utils.CombinedFormatUtils; import java.util.Arrays; Loading Loading @@ -57,8 +57,7 @@ public final class ProbabilityInfo { @Override public String toString() { return "f=" + mProbability + (hasHistoricalInfo() ? ",historicalInfo=" + mTimestamp + ":" + mLevel + ":" + mCount : ""); return CombinedFormatUtils.formatProbabilityInfo(this); } @Override Loading
java/src/com/android/inputmethod/latin/makedict/WordProperty.java +4 −29 Original line number Diff line number Diff line Loading @@ -20,6 +20,7 @@ import com.android.inputmethod.annotations.UsedForTesting; import com.android.inputmethod.latin.BinaryDictionary; import com.android.inputmethod.latin.makedict.FusionDictionary.WeightedString; import com.android.inputmethod.latin.utils.CollectionUtils; import com.android.inputmethod.latin.utils.CombinedFormatUtils; import com.android.inputmethod.latin.utils.StringUtils; import java.util.ArrayList; Loading Loading @@ -52,8 +53,8 @@ public final class WordProperty implements Comparable<WordProperty> { mBigrams = bigrams; mIsNotAWord = isNotAWord; mIsBlacklistEntry = isBlacklistEntry; mHasBigrams = !bigrams.isEmpty(); mHasShortcuts = !shortcutTargets.isEmpty(); mHasBigrams = bigrams != null && !bigrams.isEmpty(); mHasShortcuts = shortcutTargets != null && !shortcutTargets.isEmpty(); } private static ProbabilityInfo createProbabilityInfoFromArray(final int[] probabilityInfo) { Loading Loading @@ -158,32 +159,6 @@ public final class WordProperty implements Comparable<WordProperty> { @Override public String toString() { // TODO: Move this logic to CombinedInputOutput. 
final StringBuffer builder = new StringBuffer(); builder.append(" word=" + mWord); builder.append(","); builder.append(mProbabilityInfo.toString()); if (mIsNotAWord) { builder.append(","); builder.append("not_a_word=true"); } if (mIsBlacklistEntry) { builder.append(","); builder.append("blacklisted=true"); } builder.append("\n"); for (int i = 0; i < mBigrams.size(); i++) { builder.append(" bigram=" + mBigrams.get(i).mWord); builder.append(","); builder.append(mBigrams.get(i).mProbabilityInfo.toString()); builder.append("\n"); } for (int i = 0; i < mShortcutTargets.size(); i++) { builder.append(" shortcut=" + mShortcutTargets.get(i).mWord); builder.append(","); builder.append(mShortcutTargets.get(i).mProbabilityInfo.toString()); builder.append("\n"); } return builder.toString(); return CombinedFormatUtils.formatWordProperty(this); } }
java/src/com/android/inputmethod/latin/utils/CombinedFormatUtils.java 0 → 100644 +99 −0 Original line number Diff line number Diff line
/*
 * Copyright (C) 2014 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not
 * use this file except in compliance with the License. You may obtain a copy of
 * the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations under
 * the License.
 */

package com.android.inputmethod.latin.utils;

import com.android.inputmethod.latin.makedict.DictionaryHeader;
import com.android.inputmethod.latin.makedict.ProbabilityInfo;
import com.android.inputmethod.latin.makedict.WordProperty;
import com.android.inputmethod.latin.makedict.FusionDictionary.WeightedString;

import java.util.HashMap;

/**
 * Static helpers that render dictionary data as text in the "combined" format, together with
 * the tag names used by that format.
 */
public class CombinedFormatUtils {
    public static final String DICTIONARY_TAG = "dictionary";
    public static final String BIGRAM_TAG = "bigram";
    public static final String SHORTCUT_TAG = "shortcut";
    public static final String PROBABILITY_TAG = "f";
    public static final String HISTORICAL_INFO_TAG = "historicalInfo";
    public static final String HISTORICAL_INFO_SEPARATOR = ":";
    public static final String WORD_TAG = "word";
    public static final String NOT_A_WORD_TAG = "not_a_word";
    public static final String BLACKLISTED_TAG = "blacklisted";

    /**
     * Formats the header line of a combined-format dictionary.
     *
     * The line starts with {@code dictionary=} followed by the value stored under
     * {@link DictionaryHeader#DICTIONARY_DESCRIPTION_KEY} (nothing if that key is absent), then
     * one {@code ,key=value} pair for every other entry in the order the map's key set iterates,
     * and ends with a newline.
     *
     * @param attributeMap the header attributes to format.
     * @return the formatted header line, including the trailing newline.
     */
    public static String formatAttributeMap(final HashMap<String, String> attributeMap) {
        final StringBuilder builder = new StringBuilder();
        builder.append(DICTIONARY_TAG).append("=");
        if (attributeMap.containsKey(DictionaryHeader.DICTIONARY_DESCRIPTION_KEY)) {
            builder.append(attributeMap.get(DictionaryHeader.DICTIONARY_DESCRIPTION_KEY));
        }
        for (final String key : attributeMap.keySet()) {
            // Compare by value, not by reference: a key that is equal to the constant but is a
            // different String instance must still be skipped, otherwise the description would
            // be written a second time as a regular attribute.
            if (DictionaryHeader.DICTIONARY_DESCRIPTION_KEY.equals(key)) {
                continue;
            }
            final String value = attributeMap.get(key);
            builder.append(",").append(key).append("=").append(value);
        }
        builder.append("\n");
        return builder.toString();
    }

    /**
     * Formats a word entry followed by its shortcut targets and then its bigrams.
     *
     * The first line holds the word, its probability info and the optional
     * {@code not_a_word=true} / {@code blacklisted=true} flags. Each shortcut target and each
     * bigram is written on its own line after it. A null shortcut or bigram list is skipped.
     *
     * @param wordProperty the word entry to format.
     * @return the formatted lines, each terminated by a newline.
     */
    public static String formatWordProperty(final WordProperty wordProperty) {
        final StringBuilder builder = new StringBuilder();
        builder.append(" ").append(WORD_TAG).append("=").append(wordProperty.mWord);
        builder.append(",");
        builder.append(formatProbabilityInfo(wordProperty.mProbabilityInfo));
        if (wordProperty.mIsNotAWord) {
            builder.append(",").append(NOT_A_WORD_TAG).append("=true");
        }
        if (wordProperty.mIsBlacklistEntry) {
            builder.append(",").append(BLACKLISTED_TAG).append("=true");
        }
        builder.append("\n");
        if (wordProperty.mShortcutTargets != null) {
            for (final WeightedString shortcutTarget : wordProperty.mShortcutTargets) {
                builder.append(" ").append(SHORTCUT_TAG).append("=").append(shortcutTarget.mWord);
                builder.append(",");
                builder.append(formatProbabilityInfo(shortcutTarget.mProbabilityInfo));
                builder.append("\n");
            }
        }
        if (wordProperty.mBigrams != null) {
            for (final WeightedString bigram : wordProperty.mBigrams) {
                builder.append(" ").append(BIGRAM_TAG).append("=").append(bigram.mWord);
                builder.append(",");
                builder.append(formatProbabilityInfo(bigram.mProbabilityInfo));
                builder.append("\n");
            }
        }
        return builder.toString();
    }

    /**
     * Formats probability information as {@code f=<probability>}. When
     * {@code probabilityInfo.hasHistoricalInfo()} is true, the timestamp, level and count are
     * appended as {@code ,historicalInfo=<timestamp>:<level>:<count>}.
     *
     * @param probabilityInfo the probability information to format.
     * @return the formatted text, without a trailing newline.
     */
    public static String formatProbabilityInfo(final ProbabilityInfo probabilityInfo) {
        final StringBuilder builder = new StringBuilder();
        builder.append(PROBABILITY_TAG).append("=").append(probabilityInfo.mProbability);
        if (probabilityInfo.hasHistoricalInfo()) {
            builder.append(",");
            builder.append(HISTORICAL_INFO_TAG).append("=");
            builder.append(probabilityInfo.mTimestamp);
            builder.append(HISTORICAL_INFO_SEPARATOR);
            builder.append(probabilityInfo.mLevel);
            builder.append(HISTORICAL_INFO_SEPARATOR);
            builder.append(probabilityInfo.mCount);
        }
        return builder.toString();
    }
}
tools/dicttool/Android.mk +1 −0 Original line number Diff line number Diff line Loading @@ -43,6 +43,7 @@ USED_TARGETTED_UTILS := \ $(LATINIME_CORE_SOURCE_DIRECTORY)/settings/NativeSuggestOptions.java \ $(LATINIME_CORE_SOURCE_DIRECTORY)/utils/ByteArrayDictBuffer.java \ $(LATINIME_CORE_SOURCE_DIRECTORY)/utils/CollectionUtils.java \ $(LATINIME_CORE_SOURCE_DIRECTORY)/utils/CombinedFormatUtils.java \ $(LATINIME_CORE_SOURCE_DIRECTORY)/utils/CoordinateUtils.java \ $(LATINIME_CORE_SOURCE_DIRECTORY)/utils/FileUtils.java \ $(LATINIME_CORE_SOURCE_DIRECTORY)/utils/JniUtils.java \ Loading
tools/dicttool/src/com/android/inputmethod/latin/dicttool/CombinedInputOutput.java +34 −48 Original line number Diff line number Diff line Loading @@ -22,6 +22,7 @@ import com.android.inputmethod.latin.makedict.FusionDictionary.DictionaryOptions import com.android.inputmethod.latin.makedict.FusionDictionary.PtNodeArray; import com.android.inputmethod.latin.makedict.FusionDictionary.WeightedString; import com.android.inputmethod.latin.makedict.WordProperty; import com.android.inputmethod.latin.utils.CombinedFormatUtils; import java.io.BufferedReader; import java.io.File; Loading @@ -41,16 +42,10 @@ import java.util.TreeSet; * All functions in this class are static. */ public class CombinedInputOutput { private static final String DICTIONARY_TAG = "dictionary"; private static final String BIGRAM_TAG = "bigram"; private static final String SHORTCUT_TAG = "shortcut"; private static final String PROBABILITY_TAG = "f"; private static final String WORD_TAG = "word"; private static final String NOT_A_WORD_TAG = "not_a_word"; private static final String WHITELIST_TAG = "whitelist"; private static final String OPTIONS_TAG = "options"; private static final String COMMENT_LINE_STARTER = "#"; private static final int HISTORICAL_INFO_ELEMENT_COUNT = 3; /** * Basic test to find out whether the file is in the combined format or not. 
Loading @@ -68,7 +63,8 @@ public class CombinedInputOutput { while (firstLine.startsWith(COMMENT_LINE_STARTER)) { firstLine = reader.readLine(); } return firstLine.matches("^" + DICTIONARY_TAG + "=[^:]+(:[^=]+=[^:]+)*"); return firstLine.matches( "^" + CombinedFormatUtils.DICTIONARY_TAG + "=[^:]+(:[^=]+=[^:]+)*"); } catch (FileNotFoundException e) { return false; } catch (IOException e) { Loading Loading @@ -123,7 +119,7 @@ public class CombinedInputOutput { while (null != (line = reader.readLine())) { if (line.startsWith(COMMENT_LINE_STARTER)) continue; final String args[] = line.trim().split(","); if (args[0].matches(WORD_TAG + "=.*")) { if (args[0].matches(CombinedFormatUtils.WORD_TAG + "=.*")) { if (null != word) { dict.add(word, freq, shortcuts.isEmpty() ? null : shortcuts, isNotAWord); for (WeightedString s : bigrams) { Loading @@ -136,23 +132,30 @@ public class CombinedInputOutput { for (String param : args) { final String params[] = param.split("=", 2); if (2 != params.length) throw new RuntimeException("Wrong format : " + line); if (WORD_TAG.equals(params[0])) { if (CombinedFormatUtils.WORD_TAG.equals(params[0])) { word = params[1]; } else if (PROBABILITY_TAG.equals(params[0])) { } else if (CombinedFormatUtils.PROBABILITY_TAG.equals(params[0])) { freq = Integer.parseInt(params[1]); } else if (NOT_A_WORD_TAG.equals(params[0])) { } else if (CombinedFormatUtils.HISTORICAL_INFO_TAG.equals(params[0])) { final String[] historicalInfoParams = params[1].split(CombinedFormatUtils.HISTORICAL_INFO_SEPARATOR); if (historicalInfoParams.length != HISTORICAL_INFO_ELEMENT_COUNT) { throw new RuntimeException("Wrong format (historical info) : " + line); } // TODO: Use parsed historical info. 
} else if (CombinedFormatUtils.NOT_A_WORD_TAG.equals(params[0])) { isNotAWord = "true".equals(params[1]); } } } else if (args[0].matches(SHORTCUT_TAG + "=.*")) { } else if (args[0].matches(CombinedFormatUtils.SHORTCUT_TAG + "=.*")) { String shortcut = null; int shortcutFreq = 0; for (String param : args) { final String params[] = param.split("=", 2); if (2 != params.length) throw new RuntimeException("Wrong format : " + line); if (SHORTCUT_TAG.equals(params[0])) { if (CombinedFormatUtils.SHORTCUT_TAG.equals(params[0])) { shortcut = params[1]; } else if (PROBABILITY_TAG.equals(params[0])) { } else if (CombinedFormatUtils.PROBABILITY_TAG.equals(params[0])) { shortcutFreq = WHITELIST_TAG.equals(params[1]) ? FormatSpec.SHORTCUT_WHITELIST_FREQUENCY : Integer.parseInt(params[1]); Loading @@ -163,16 +166,23 @@ public class CombinedInputOutput { } else { throw new RuntimeException("Wrong format : " + line); } } else if (args[0].matches(BIGRAM_TAG + "=.*")) { } else if (args[0].matches(CombinedFormatUtils.BIGRAM_TAG + "=.*")) { String secondWordOfBigram = null; int bigramFreq = 0; for (String param : args) { final String params[] = param.split("=", 2); if (2 != params.length) throw new RuntimeException("Wrong format : " + line); if (BIGRAM_TAG.equals(params[0])) { if (CombinedFormatUtils.BIGRAM_TAG.equals(params[0])) { secondWordOfBigram = params[1]; } else if (PROBABILITY_TAG.equals(params[0])) { } else if (CombinedFormatUtils.PROBABILITY_TAG.equals(params[0])) { bigramFreq = Integer.parseInt(params[1]); } else if (CombinedFormatUtils.HISTORICAL_INFO_TAG.equals(params[0])) { final String[] historicalInfoParams = params[1].split(CombinedFormatUtils.HISTORICAL_INFO_SEPARATOR); if (historicalInfoParams.length != HISTORICAL_INFO_ELEMENT_COUNT) { throw new RuntimeException("Wrong format (historical info) : " + line); } // TODO: Use parsed historical info. 
} } if (null != secondWordOfBigram) { Loading @@ -198,40 +208,16 @@ public class CombinedInputOutput { * @param destination a destination stream to write to. * @param dict the dictionary to write. */ public static void writeDictionaryCombined(Writer destination, FusionDictionary dict) throws IOException { public static void writeDictionaryCombined( final Writer destination, final FusionDictionary dict) throws IOException { final TreeSet<WordProperty> wordPropertiesInDict = new TreeSet<WordProperty>(); for (WordProperty wordProperty: dict) { for (final WordProperty wordProperty : dict) { // This for ordering by frequency, then by asciibetic order wordPropertiesInDict.add(wordProperty); } final HashMap<String, String> options = dict.mOptions.mAttributes; destination.write(DICTIONARY_TAG + "="); if (options.containsKey(DICTIONARY_TAG)) { destination.write(options.get(DICTIONARY_TAG)); options.remove(DICTIONARY_TAG); } for (final String key : dict.mOptions.mAttributes.keySet()) { final String value = dict.mOptions.mAttributes.get(key); destination.write("," + key + "=" + value); } destination.write("\n"); for (WordProperty wordProperty : wordPropertiesInDict) { destination.write(" " + WORD_TAG + "=" + wordProperty.mWord + "," + PROBABILITY_TAG + "=" + wordProperty.getProbability() + (wordProperty.mIsNotAWord ? 
"," + NOT_A_WORD_TAG + "=true\n" : "\n")); if (null != wordProperty.mShortcutTargets) { for (WeightedString target : wordProperty.mShortcutTargets) { destination.write(" " + SHORTCUT_TAG + "=" + target.mWord + "," + PROBABILITY_TAG + "=" + target.getProbability() + "\n"); } } if (null != wordProperty.mBigrams) { for (WeightedString bigram : wordProperty.mBigrams) { destination.write(" " + BIGRAM_TAG + "=" + bigram.mWord + "," + PROBABILITY_TAG + "=" + bigram.getProbability() + "\n"); } } destination.write(CombinedFormatUtils.formatAttributeMap(dict.mOptions.mAttributes)); for (final WordProperty wordProperty : wordPropertiesInDict) { destination.write(CombinedFormatUtils.formatWordProperty(wordProperty)); } destination.close(); } Loading