Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit 42565866 authored by Bai Tao
Browse files

Modify the interface of HanziToPinyin class to make it generic and add test class

parent 52a01449
Loading
Loading
Loading
Loading
+77 −37
Original line number Diff line number Diff line
@@ -16,8 +16,6 @@

package com.android.internal.util;

import com.google.android.util.AbstractMessageParser.Token;

import android.text.TextUtils;
import android.util.Log;

@@ -298,8 +296,10 @@ public class HanziToPinyin {
        };

    /** First and last Chinese character with known Pinyin according to zh collation */
    private static final String FIRST_UNIHAN =  "\u5416";
    private static final String LAST_UNIHAN =  "\u5497";
    private static final String FIRST_PINYIN_UNIHAN =  "\u5416";
    private static final String LAST_PINYIN_UNIHAN =  "\u5497";
    /** The first Chinese character in Unicode block */
    private static final char FIRST_UNIHAN = '\u3400';
    private static final Collator COLLATOR = Collator.getInstance(Locale.CHINA);

    private static HanziToPinyin sInstance;
@@ -311,10 +311,18 @@ public class HanziToPinyin {
         */
        public static final String SEPARATOR = " ";

        public static final int ASCII = 1;
        public static final int LATIN = 1;
        public static final int PINYIN = 2;
        public static final int UNKNOWN = 3;

        /**
         * Creates a token without assigning any of its fields.
         */
        public Token() {
        }

        /**
         * Creates a token with all three fields assigned.
         *
         * @param type   value stored in {@code type}; expected to be one of
         *               {@link #LATIN}, {@link #PINYIN} or {@link #UNKNOWN}
         * @param source value stored in {@code source}; presumably the original text
         * @param target value stored in {@code target}; presumably the converted
         *               form of {@code source}
         */
        public Token(int type, String source, String target) {
            this.target = target;
            this.source = source;
            this.type = type;
        }
        /**
         * Type of this token: LATIN, PINYIN or UNKNOWN.
         */
@@ -347,6 +355,7 @@ public class HanziToPinyin {
                    return sInstance;
                }
            }
            Log.w(TAG, "There is no Chinese collator, HanziToPinyin is disabled");
            sInstance = new HanziToPinyin(false);
            return sInstance;
        }
@@ -359,11 +368,15 @@ public class HanziToPinyin {
        int offset = -1;
        int cmp;
        if (character < 256) {
            token.type = Token.ASCII;
            token.type = Token.LATIN;
            token.target = letter;
            return token;
        } else if (character < FIRST_UNIHAN) {
            token.type = Token.UNKNOWN;
            token.target = letter;
            return token;
        } else {
            cmp = COLLATOR.compare(letter, FIRST_UNIHAN);
            cmp = COLLATOR.compare(letter, FIRST_PINYIN_UNIHAN);
            if (cmp < 0) {
                token.type = Token.UNKNOWN;
                token.target = letter;
@@ -372,7 +385,7 @@ public class HanziToPinyin {
                token.type = Token.PINYIN;
                offset = 0;
            } else {
                cmp = COLLATOR.compare(letter, LAST_UNIHAN);
                cmp = COLLATOR.compare(letter, LAST_PINYIN_UNIHAN);
                if (cmp > 0) {
                    token.type = Token.UNKNOWN;
                    token.target = letter;
@@ -412,44 +425,71 @@ public class HanziToPinyin {
        return token;
    }

    /**
     * Converts the input to an array of tokens. A sequence of ASCII or unknown
     * characters without a space is put into a single Token; each Hanzi character
     * which has a Pinyin is treated as its own Token.
     * If there is no China collator, an empty token array is returned.
     */
    public ArrayList<Token> get(final String input) {
        ArrayList<Token> tokens = new ArrayList<Token>();
        if (!mHasChinaCollator || TextUtils.isEmpty(input)) {
            return null;
            // return empty tokens.
            return tokens;
        }

        ArrayList<Token> tokens = new ArrayList<Token>();
        Token currentToken;

        final int inputLength = input.length();

        currentToken = getToken(input.charAt(0));

        for (int i = 1; i < inputLength; i++) {
        final StringBuilder sb = new StringBuilder();
        int tokenType = Token.LATIN;
        // Go through the input and start a new token when:
        // a. the token type changes,
        // b. the current character has a Pinyin (it becomes its own token), or
        // c. the current character is a space.
        for (int i = 0; i < inputLength; i++) {
            final char character = input.charAt(i);
            Token token = getToken(character);

            if (token.type != currentToken.type) {
                currentToken.target = currentToken.target.trim();
                tokens.add(currentToken);
                currentToken = token;
            if (character == ' ') {
                if (sb.length() > 0) {
                    addToken(sb, tokens, tokenType);
                }
            } else if (character < 256) {
                if (tokenType != Token.LATIN && sb.length() > 0) {
                    addToken(sb, tokens, tokenType);
                }
                tokenType = Token.LATIN;
                sb.append(character);
            } else if (character < FIRST_UNIHAN) {
                if (tokenType != Token.UNKNOWN && sb.length() > 0) {
                    addToken(sb, tokens, tokenType);
                }
                tokenType = Token.UNKNOWN;
                sb.append(character);
            } else {
                switch (token.type) {
                    case Token.ASCII:
                    case Token.UNKNOWN:
                        currentToken.source += token.source;
                        currentToken.target += token.target;
                        break;
                    case Token.PINYIN:
                        currentToken.source += token.source;
                        currentToken.target += " " + token.target;
                        break;
                Token t = getToken(character);
                if (t.type == Token.PINYIN) {
                    if (sb.length() > 0) {
                        addToken(sb, tokens, tokenType);
                    }
                    tokens.add(t);
                    tokenType = Token.PINYIN;
                } else {
                    if (tokenType != t.type && sb.length() > 0) {
                        addToken(sb, tokens, tokenType);
                    }
                    tokenType = t.type;
                    sb.append(character);
                }
            }
        }
        if (sb.length() > 0) {
            addToken(sb, tokens, tokenType);
        }

        currentToken.target = currentToken.target.trim();
        tokens.add(currentToken);

        return tokens;
    }

    /**
     * Flushes the text accumulated in {@code sb} into {@code tokens} as a single
     * token, then empties the builder so it can collect the next run of characters.
     *
     * The accumulated text is used for both the source and the target of the new
     * token.
     *
     * @param sb        builder holding the pending characters; emptied on return
     * @param tokens    list the new token is appended to
     * @param tokenType type assigned to the new token
     */
    private void addToken(final StringBuilder sb, final ArrayList<Token> tokens,
            final int tokenType) {
        final String text = sb.toString();
        final Token pending = new Token(tokenType, text, text);
        tokens.add(pending);
        // Reset rather than reallocate: the caller keeps reusing this builder.
        sb.setLength(0);
    }

}
+70 −0
Original line number Diff line number Diff line
/*
 * Copyright (C) 2010 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.android.unit_tests.internal.util;

import java.text.Collator;
import java.util.ArrayList;
import java.util.Locale;

import android.test.suitebuilder.annotation.SmallTest;
import android.util.Log;

import com.android.internal.util.HanziToPinyin;
import com.android.internal.util.HanziToPinyin.Token;

import junit.framework.TestCase;

/**
 * Unit test for the tokenization done by {@link HanziToPinyin#get(String)}.
 *
 * All assertions use JUnit's {@code (expected, actual)} argument order so that a
 * failure message reports the two values the right way round.
 */
public class HanziToPinyinTest extends TestCase {
    /** A single Hanzi character; the test expects its Pinyin to be "DU". */
    private static final String ONE_HANZI = "\u675C";
    /** Two Hanzi characters; the test expects "DU" followed by "JUAN". */
    private static final String TWO_HANZI = "\u675C\u9D51";
    /** Plain ASCII text, expected to come back as one LATIN token. */
    private static final String ASCII = "test";
    /** A single non-Latin character that the test expects to be typed UNKNOWN. */
    private static final String ONE_UNKNOWN = "\uFF71";
    /** Mix of Latin words, runs of spaces, unknown characters and Hanzi. */
    private static final String MISC = "test\u675C   Test with space\uFF71\uFF71\u675C";

    /**
     * Checks the token count, token types and (for Hanzi) the Pinyin targets
     * produced for each of the sample inputs above.
     */
    @SmallTest
    public void testGetToken() throws Exception {
        final HanziToPinyin converter = HanziToPinyin.getInstance();

        // One Hanzi character yields exactly one PINYIN token.
        ArrayList<Token> tokens = converter.get(ONE_HANZI);
        assertEquals(1, tokens.size());
        assertEquals(Token.PINYIN, tokens.get(0).type);
        assertTrue(tokens.get(0).target.equalsIgnoreCase("DU"));

        // Each Hanzi character becomes its own PINYIN token.
        tokens = converter.get(TWO_HANZI);
        assertEquals(2, tokens.size());
        assertEquals(Token.PINYIN, tokens.get(0).type);
        assertEquals(Token.PINYIN, tokens.get(1).type);
        assertTrue(tokens.get(0).target.equalsIgnoreCase("DU"));
        assertTrue(tokens.get(1).target.equalsIgnoreCase("JUAN"));

        // A run of ASCII characters without spaces is a single LATIN token.
        tokens = converter.get(ASCII);
        assertEquals(1, tokens.size());
        assertEquals(Token.LATIN, tokens.get(0).type);

        // A character without a known Pinyin is reported as UNKNOWN.
        tokens = converter.get(ONE_UNKNOWN);
        assertEquals(1, tokens.size());
        assertEquals(Token.UNKNOWN, tokens.get(0).type);

        // Mixed input: "test", one Hanzi, "Test", "with", "space", the two
        // unknown characters grouped together, and a final Hanzi.
        tokens = converter.get(MISC);
        assertEquals(7, tokens.size());
        assertEquals(Token.LATIN, tokens.get(0).type);
        assertEquals(Token.PINYIN, tokens.get(1).type);
        assertEquals(Token.LATIN, tokens.get(2).type);
        assertEquals(Token.LATIN, tokens.get(3).type);
        assertEquals(Token.LATIN, tokens.get(4).type);
        assertEquals(Token.UNKNOWN, tokens.get(5).type);
        assertEquals(Token.PINYIN, tokens.get(6).type);
    }
}