Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit 5cb9b70a authored by Jean Chalard's avatar Jean Chalard Committed by Android (Google) Code Review
Browse files

Merge "Fix abbreviations processing for English (A7)" into jb-mr1-dev

parents a3f06baa 2c0d9162
Loading
Loading
Loading
Loading
+81 −26
Original line number Diff line number Diff line
@@ -304,34 +304,89 @@ public final class StringUtils {
        }

        if (j <= 0) return TextUtils.CAP_MODE_CHARACTERS & reqModes;
        char c = cs.charAt(j - 1);
        if (c == Keyboard.CODE_PERIOD || c == Keyboard.CODE_QUESTION_MARK
                || c == Keyboard.CODE_EXCLAMATION_MARK) {
            // Here we found a marker for sentence end (we consider these to be one of
            // either . or ? or ! only). So this is probably the end of a sentence, but if we
            // found a period, we still want to check the case where this is a abbreviation
            // period rather than a full stop. To do this, we look for a period within a word
            // before the period we just found; if any, we take that to mean it was an
            // abbreviation.
            // A typical example of the above is "In the U.S. ", where the last period is
            // not a full stop and we should not capitalize.
            // TODO: the rule below is broken. In particular it fails for runs of periods,
            // whatever the reason. In the example "in the U.S..", the last period is a full
            // stop following the abbreviation period, and we should capitalize but we don't.
            // Likewise, "I don't know... " should capitalize, but fails to do so.
            if (c == Keyboard.CODE_PERIOD) {
                for (int k = j - 2; k >= 0; k--) {
                    c = cs.charAt(k);
                    if (c == Keyboard.CODE_PERIOD) {
                        return TextUtils.CAP_MODE_CHARACTERS & reqModes;
        char c = cs.charAt(--j);

        // We found the next interesting chunk of text ; next we need to determine if it's the
        // end of a sentence. If we have a question mark or an exclamation mark, it's the end of
        // a sentence. If it's neither, the only remaining case is the period so we get the opposite
        // case out of the way.
        if (c == Keyboard.CODE_QUESTION_MARK || c == Keyboard.CODE_EXCLAMATION_MARK) {
            return (TextUtils.CAP_MODE_CHARACTERS | TextUtils.CAP_MODE_SENTENCES) & reqModes;
        }
        if (c != Keyboard.CODE_PERIOD || j <= 0) {
            return (TextUtils.CAP_MODE_CHARACTERS | TextUtils.CAP_MODE_WORDS) & reqModes;
        }

        // We found out that we have a period. We need to determine if this is a full stop or
        // otherwise sentence-ending period, or an abbreviation like "e.g.". An abbreviation
        // looks like (\w\.){2,}
        // To find out, we will have a simple state machine with the following states :
        // START, WORD, PERIOD, ABBREVIATION
        // On START : (just before the first period)
        //           letter => WORD
        //           whitespace => end with no caps (it was a stand-alone period)
        //           otherwise => end with caps (several periods/symbols in a row)
        // On WORD : (within the word just before the first period)
        //           letter => WORD
        //           period => PERIOD
        //           otherwise => end with caps (it was a word with a full stop at the end)
        // On PERIOD : (period within a potential abbreviation)
        //           letter => LETTER
        //           otherwise => end with caps (it was not an abbreviation)
        // On LETTER : (letter within a potential abbreviation)
        //           letter => LETTER
        //           period => PERIOD
        //           otherwise => end with no caps (it was an abbreviation)
        // "Not an abbreviation" in the above chart essentially covers cases like "...yes.". This
        // should capitalize.

        final int START = 0;
        final int WORD = 1;
        final int PERIOD = 2;
        final int LETTER = 3;
        final int caps = (TextUtils.CAP_MODE_CHARACTERS | TextUtils.CAP_MODE_WORDS
                | TextUtils.CAP_MODE_SENTENCES) & reqModes;
        final int noCaps = (TextUtils.CAP_MODE_CHARACTERS | TextUtils.CAP_MODE_WORDS) & reqModes;
        int state = START;
        while (j > 0) {
            c = cs.charAt(--j);
            switch (state) {
            case START:
                if (Character.isLetter(c)) {
                    state = WORD;
                } else if (Character.isWhitespace(c)) {
                    return noCaps;
                } else {
                    return caps;
                }
                    if (!Character.isLetter(c)) {
                break;
            case WORD:
                if (Character.isLetter(c)) {
                    state = WORD;
                } else if (c == Keyboard.CODE_PERIOD) {
                    state = PERIOD;
                } else {
                    return caps;
                }
                break;
            case PERIOD:
                if (Character.isLetter(c)) {
                    state = LETTER;
                } else {
                    return caps;
                }
                break;
            case LETTER:
                if (Character.isLetter(c)) {
                    state = LETTER;
                } else if (c == Keyboard.CODE_PERIOD) {
                    state = PERIOD;
                } else {
                    return noCaps;
                }
            return (TextUtils.CAP_MODE_CHARACTERS | TextUtils.CAP_MODE_SENTENCES) & reqModes;
            }
        return TextUtils.CAP_MODE_CHARACTERS & reqModes;
        }
        // Here we arrived at the start of the line. This should behave exactly like whitespace.
        return (START == state || LETTER == state) ? noCaps : caps;
    }
}
+41 −0
Original line number Diff line number Diff line
@@ -17,6 +17,7 @@
package com.android.inputmethod.latin;

import android.test.AndroidTestCase;
import android.text.TextUtils;

public class StringUtilsTests extends AndroidTestCase {
    public void testContainsInArray() {
@@ -99,4 +100,44 @@ public class StringUtilsTests extends AndroidTestCase {
        assertFalse("lower-case string", StringUtils.hasUpperCase("string"));
        assertFalse("lower-case string with non-letters", StringUtils.hasUpperCase("he's"));
    }

    private void onePathForCaps(final CharSequence cs, final int expectedResult, final int mask) {
        int oneTimeResult = expectedResult & mask;
        assertEquals("After >" + cs + "<", oneTimeResult, StringUtils.getCapsMode(cs, mask));
    }

    private void allPathsForCaps(final CharSequence cs, final int expectedResult) {
        final int c = TextUtils.CAP_MODE_CHARACTERS;
        final int w = TextUtils.CAP_MODE_WORDS;
        final int s = TextUtils.CAP_MODE_SENTENCES;
        onePathForCaps(cs, expectedResult, c | w | s);
        onePathForCaps(cs, expectedResult, w | s);
        onePathForCaps(cs, expectedResult, c | s);
        onePathForCaps(cs, expectedResult, c | w);
        onePathForCaps(cs, expectedResult, c);
        onePathForCaps(cs, expectedResult, w);
        onePathForCaps(cs, expectedResult, s);
    }

    public void testGetCapsMode() {
        final int c = TextUtils.CAP_MODE_CHARACTERS;
        final int w = TextUtils.CAP_MODE_WORDS;
        final int s = TextUtils.CAP_MODE_SENTENCES;
        allPathsForCaps("", c | w | s);
        allPathsForCaps("Word", c);
        allPathsForCaps("Word.", c);
        allPathsForCaps("Word ", c | w);
        allPathsForCaps("Word. ", c | w | s);
        allPathsForCaps("Word..", c);
        allPathsForCaps("Word.. ", c | w | s);
        allPathsForCaps("Word... ", c | w | s);
        allPathsForCaps("Word ... ", c | w | s);
        allPathsForCaps("Word . ", c | w);
        allPathsForCaps("In the U.S ", c | w);
        allPathsForCaps("In the U.S. ", c | w);
        allPathsForCaps("Some stuff (e.g. ", c | w);
        allPathsForCaps("In the U.S.. ", c | w | s);
        allPathsForCaps("\"Word.\" ", c | w | s);
        allPathsForCaps("\"Word\" ", c | w);
    }
}