Loading core/java/com/android/internal/util/HanziToPinyin.java +77 −37 Original line number Diff line number Diff line Loading @@ -16,8 +16,6 @@ package com.android.internal.util; import com.google.android.util.AbstractMessageParser.Token; import android.text.TextUtils; import android.util.Log; Loading Loading @@ -298,8 +296,10 @@ public class HanziToPinyin { }; /** First and last Chinese character with known Pinyin according to zh collation */ private static final String FIRST_UNIHAN = "\u5416"; private static final String LAST_UNIHAN = "\u5497"; private static final String FIRST_PINYIN_UNIHAN = "\u5416"; private static final String LAST_PINYIN_UNIHAN = "\u5497"; /** The first Chinese character in Unicode block */ private static final char FIRST_UNIHAN = '\u3400'; private static final Collator COLLATOR = Collator.getInstance(Locale.CHINA); private static HanziToPinyin sInstance; Loading @@ -311,10 +311,18 @@ public class HanziToPinyin { */ public static final String SEPARATOR = " "; public static final int ASCII = 1; public static final int LATIN = 1; public static final int PINYIN = 2; public static final int UNKNOWN = 3; public Token() { } public Token(int type, String source, String target) { this.type = type; this.source = source; this.target = target; } /** * Type of this token, ASCII, PINYIN or UNKNOWN. */ Loading Loading @@ -347,6 +355,7 @@ public class HanziToPinyin { return sInstance; } } Log.w(TAG, "There is no Chinese collator, HanziToPinyin is disabled"); sInstance = new HanziToPinyin(false); return sInstance; } Loading @@ -359,11 +368,15 @@ public class HanziToPinyin { int offset = -1; int cmp; if (character < 256) { token.type = Token.ASCII; token.type = Token.LATIN; token.target = letter; return token; } else if (character < FIRST_UNIHAN) { token.type = Token.UNKNOWN; token.target = letter; return token; } else { cmp = COLLATOR.compare(letter, FIRST_UNIHAN); cmp = COLLATOR.compare(letter, FIRST_PINYIN_UNIHAN); if (cmp < 0) { token.type = Token.UNKNOWN; token.target = letter; Loading @@ -372,7 +385,7 @@ public class HanziToPinyin { token.type = Token.PINYIN; offset = 0; } else { cmp = COLLATOR.compare(letter, LAST_UNIHAN); cmp = COLLATOR.compare(letter, LAST_PINYIN_UNIHAN); if (cmp > 0) { token.type = Token.UNKNOWN; token.target = letter; Loading Loading @@ -412,44 +425,71 @@ public class HanziToPinyin { return token; } /** * Convert the input to a array of tokens. The sequence of ASCII or Unknown * characters without space will be put into a Token, One Hanzi character * which has pinyin will be treated as a Token. * If these is no China collator, the empty token array is returned. */ public ArrayList<Token> get(final String input) { ArrayList<Token> tokens = new ArrayList<Token>(); if (!mHasChinaCollator || TextUtils.isEmpty(input)) { return null; // return empty tokens. return tokens; } ArrayList<Token> tokens = new ArrayList<Token>(); Token currentToken; final int inputLength = input.length(); currentToken = getToken(input.charAt(0)); for (int i = 1; i < inputLength; i++) { final StringBuilder sb = new StringBuilder(); int tokenType = Token.LATIN; // Go through the input, create a new token when // a. Token type changed // b. Get the Pinyin of current charater. // c. current character is space. for (int i = 0; i < inputLength; i++) { final char character = input.charAt(i); Token token = getToken(character); if (token.type != currentToken.type) { currentToken.target = currentToken.target.trim(); tokens.add(currentToken); currentToken = token; if (character == ' ') { if (sb.length() > 0) { addToken(sb, tokens, tokenType); } } else if (character < 256) { if (tokenType != Token.LATIN && sb.length() > 0) { addToken(sb, tokens, tokenType); } tokenType = Token.LATIN; sb.append(character); } else if (character < FIRST_UNIHAN) { if (tokenType != Token.UNKNOWN && sb.length() > 0) { addToken(sb, tokens, tokenType); } tokenType = Token.UNKNOWN; sb.append(character); } else { switch (token.type) { case Token.ASCII: case Token.UNKNOWN: currentToken.source += token.source; currentToken.target += token.target; break; case Token.PINYIN: currentToken.source += token.source; currentToken.target += " " + token.target; break; Token t = getToken(character); if (t.type == Token.PINYIN) { if (sb.length() > 0) { addToken(sb, tokens, tokenType); } tokens.add(t); tokenType = Token.PINYIN; } else { if (tokenType != t.type && sb.length() > 0) { addToken(sb, tokens, tokenType); } tokenType = t.type; sb.append(character); } } } if (sb.length() > 0) { addToken(sb, tokens, tokenType); } currentToken.target = currentToken.target.trim(); tokens.add(currentToken); return tokens; } private void addToken(final StringBuilder sb, final ArrayList<Token> tokens, final int tokenType) { String str = sb.toString(); tokens.add(new Token(tokenType, str, str)); sb.setLength(0); } } tests/AndroidTests/src/com/android/unit_tests/internal/util/HanziToPinyinTest.java 0 → 100644 +70 −0 Original line number Diff line number Diff line /* * Copyright (C) 2010 The Android Open Source Project * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.android.unit_tests.internal.util; import java.text.Collator; import java.util.ArrayList; import java.util.Locale; import android.test.suitebuilder.annotation.SmallTest; import android.util.Log; import com.android.internal.util.HanziToPinyin; import com.android.internal.util.HanziToPinyin.Token; import junit.framework.TestCase; public class HanziToPinyinTest extends TestCase { private final static String ONE_HANZI = "\u675C"; private final static String TWO_HANZI = "\u675C\u9D51"; private final static String ASSIC = "test"; private final static String ONE_UNKNOWN = "\uFF71"; private final static String MISC = "test\u675C Test with space\uFF71\uFF71\u675C"; @SmallTest public void testGetToken() throws Exception { ArrayList<Token> tokens = HanziToPinyin.getInstance().get(ONE_HANZI); assertEquals(tokens.size(), 1); assertEquals(tokens.get(0).type, Token.PINYIN); assertTrue(tokens.get(0).target.equalsIgnoreCase("DU")); tokens = HanziToPinyin.getInstance().get(TWO_HANZI); assertEquals(tokens.size(), 2); assertEquals(tokens.get(0).type, Token.PINYIN); assertEquals(tokens.get(1).type, Token.PINYIN); assertTrue(tokens.get(0).target.equalsIgnoreCase("DU")); assertTrue(tokens.get(1).target.equalsIgnoreCase("JUAN")); tokens = HanziToPinyin.getInstance().get(ASSIC); assertEquals(tokens.size(), 1); assertEquals(tokens.get(0).type, Token.LATIN); tokens = HanziToPinyin.getInstance().get(ONE_UNKNOWN); assertEquals(tokens.size(), 1); assertEquals(tokens.get(0).type, Token.UNKNOWN); tokens = HanziToPinyin.getInstance().get(MISC); assertEquals(tokens.size(), 7); assertEquals(tokens.get(0).type, Token.LATIN); assertEquals(tokens.get(1).type, Token.PINYIN); assertEquals(tokens.get(2).type, Token.LATIN); assertEquals(tokens.get(3).type, Token.LATIN); assertEquals(tokens.get(4).type, Token.LATIN); assertEquals(tokens.get(5).type, Token.UNKNOWN); assertEquals(tokens.get(6).type, Token.PINYIN); } } Loading
core/java/com/android/internal/util/HanziToPinyin.java +77 −37 Original line number Diff line number Diff line Loading @@ -16,8 +16,6 @@ package com.android.internal.util; import com.google.android.util.AbstractMessageParser.Token; import android.text.TextUtils; import android.util.Log; Loading Loading @@ -298,8 +296,10 @@ public class HanziToPinyin { }; /** First and last Chinese character with known Pinyin according to zh collation */ private static final String FIRST_UNIHAN = "\u5416"; private static final String LAST_UNIHAN = "\u5497"; private static final String FIRST_PINYIN_UNIHAN = "\u5416"; private static final String LAST_PINYIN_UNIHAN = "\u5497"; /** The first Chinese character in Unicode block */ private static final char FIRST_UNIHAN = '\u3400'; private static final Collator COLLATOR = Collator.getInstance(Locale.CHINA); private static HanziToPinyin sInstance; Loading @@ -311,10 +311,18 @@ public class HanziToPinyin { */ public static final String SEPARATOR = " "; public static final int ASCII = 1; public static final int LATIN = 1; public static final int PINYIN = 2; public static final int UNKNOWN = 3; public Token() { } public Token(int type, String source, String target) { this.type = type; this.source = source; this.target = target; } /** * Type of this token, ASCII, PINYIN or UNKNOWN. */ Loading Loading @@ -347,6 +355,7 @@ public class HanziToPinyin { return sInstance; } } Log.w(TAG, "There is no Chinese collator, HanziToPinyin is disabled"); sInstance = new HanziToPinyin(false); return sInstance; } Loading @@ -359,11 +368,15 @@ public class HanziToPinyin { int offset = -1; int cmp; if (character < 256) { token.type = Token.ASCII; token.type = Token.LATIN; token.target = letter; return token; } else if (character < FIRST_UNIHAN) { token.type = Token.UNKNOWN; token.target = letter; return token; } else { cmp = COLLATOR.compare(letter, FIRST_UNIHAN); cmp = COLLATOR.compare(letter, FIRST_PINYIN_UNIHAN); if (cmp < 0) { token.type = Token.UNKNOWN; token.target = letter; Loading @@ -372,7 +385,7 @@ public class HanziToPinyin { token.type = Token.PINYIN; offset = 0; } else { cmp = COLLATOR.compare(letter, LAST_UNIHAN); cmp = COLLATOR.compare(letter, LAST_PINYIN_UNIHAN); if (cmp > 0) { token.type = Token.UNKNOWN; token.target = letter; Loading Loading @@ -412,44 +425,71 @@ public class HanziToPinyin { return token; } /** * Convert the input to a array of tokens. The sequence of ASCII or Unknown * characters without space will be put into a Token, One Hanzi character * which has pinyin will be treated as a Token. * If these is no China collator, the empty token array is returned. */ public ArrayList<Token> get(final String input) { ArrayList<Token> tokens = new ArrayList<Token>(); if (!mHasChinaCollator || TextUtils.isEmpty(input)) { return null; // return empty tokens. return tokens; } ArrayList<Token> tokens = new ArrayList<Token>(); Token currentToken; final int inputLength = input.length(); currentToken = getToken(input.charAt(0)); for (int i = 1; i < inputLength; i++) { final StringBuilder sb = new StringBuilder(); int tokenType = Token.LATIN; // Go through the input, create a new token when // a. Token type changed // b. Get the Pinyin of current charater. // c. current character is space. for (int i = 0; i < inputLength; i++) { final char character = input.charAt(i); Token token = getToken(character); if (token.type != currentToken.type) { currentToken.target = currentToken.target.trim(); tokens.add(currentToken); currentToken = token; if (character == ' ') { if (sb.length() > 0) { addToken(sb, tokens, tokenType); } } else if (character < 256) { if (tokenType != Token.LATIN && sb.length() > 0) { addToken(sb, tokens, tokenType); } tokenType = Token.LATIN; sb.append(character); } else if (character < FIRST_UNIHAN) { if (tokenType != Token.UNKNOWN && sb.length() > 0) { addToken(sb, tokens, tokenType); } tokenType = Token.UNKNOWN; sb.append(character); } else { switch (token.type) { case Token.ASCII: case Token.UNKNOWN: currentToken.source += token.source; currentToken.target += token.target; break; case Token.PINYIN: currentToken.source += token.source; currentToken.target += " " + token.target; break; Token t = getToken(character); if (t.type == Token.PINYIN) { if (sb.length() > 0) { addToken(sb, tokens, tokenType); } tokens.add(t); tokenType = Token.PINYIN; } else { if (tokenType != t.type && sb.length() > 0) { addToken(sb, tokens, tokenType); } tokenType = t.type; sb.append(character); } } } if (sb.length() > 0) { addToken(sb, tokens, tokenType); } currentToken.target = currentToken.target.trim(); tokens.add(currentToken); return tokens; } private void addToken(final StringBuilder sb, final ArrayList<Token> tokens, final int tokenType) { String str = sb.toString(); tokens.add(new Token(tokenType, str, str)); sb.setLength(0); } }
tests/AndroidTests/src/com/android/unit_tests/internal/util/HanziToPinyinTest.java 0 → 100644 +70 −0 Original line number Diff line number Diff line /* * Copyright (C) 2010 The Android Open Source Project * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.android.unit_tests.internal.util; import java.text.Collator; import java.util.ArrayList; import java.util.Locale; import android.test.suitebuilder.annotation.SmallTest; import android.util.Log; import com.android.internal.util.HanziToPinyin; import com.android.internal.util.HanziToPinyin.Token; import junit.framework.TestCase; public class HanziToPinyinTest extends TestCase { private final static String ONE_HANZI = "\u675C"; private final static String TWO_HANZI = "\u675C\u9D51"; private final static String ASSIC = "test"; private final static String ONE_UNKNOWN = "\uFF71"; private final static String MISC = "test\u675C Test with space\uFF71\uFF71\u675C"; @SmallTest public void testGetToken() throws Exception { ArrayList<Token> tokens = HanziToPinyin.getInstance().get(ONE_HANZI); assertEquals(tokens.size(), 1); assertEquals(tokens.get(0).type, Token.PINYIN); assertTrue(tokens.get(0).target.equalsIgnoreCase("DU")); tokens = HanziToPinyin.getInstance().get(TWO_HANZI); assertEquals(tokens.size(), 2); assertEquals(tokens.get(0).type, Token.PINYIN); assertEquals(tokens.get(1).type, Token.PINYIN); assertTrue(tokens.get(0).target.equalsIgnoreCase("DU")); assertTrue(tokens.get(1).target.equalsIgnoreCase("JUAN")); tokens = HanziToPinyin.getInstance().get(ASSIC); assertEquals(tokens.size(), 1); assertEquals(tokens.get(0).type, Token.LATIN); tokens = HanziToPinyin.getInstance().get(ONE_UNKNOWN); assertEquals(tokens.size(), 1); assertEquals(tokens.get(0).type, Token.UNKNOWN); tokens = HanziToPinyin.getInstance().get(MISC); assertEquals(tokens.size(), 7); assertEquals(tokens.get(0).type, Token.LATIN); assertEquals(tokens.get(1).type, Token.PINYIN); assertEquals(tokens.get(2).type, Token.LATIN); assertEquals(tokens.get(3).type, Token.LATIN); assertEquals(tokens.get(4).type, Token.LATIN); assertEquals(tokens.get(5).type, Token.UNKNOWN); assertEquals(tokens.get(6).type, Token.PINYIN); } }