Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit caa15518 authored by Shimeng (Simon) Wang's avatar Shimeng (Simon) Wang Committed by Android (Google) Code Review
Browse files

Merge "Enhance URL regular expression to match more Unicode chars."

parents 8b77bd86 3207e29d
Loading
Loading
Loading
Loading
+37 −14
Original line number Diff line number Diff line
@@ -24,12 +24,12 @@ import java.util.regex.Pattern;
 */
public class Patterns {
    /**
     *  Regular expression pattern to match all IANA top-level domains.
     *  Regular expression to match all IANA top-level domains.
     *  List accurate as of 2010/02/05.  List taken from:
     *  http://data.iana.org/TLD/tlds-alpha-by-domain.txt
     *  This pattern is auto-generated by development/tools/make-iana-tld-pattern.py
     *  This pattern is auto-generated by frameworks/base/common/tools/make-iana-tld-pattern.py
     */
    public static final Pattern TOP_LEVEL_DOMAIN = Pattern.compile(
    public static final String TOP_LEVEL_DOMAIN_STR =
        "((aero|arpa|asia|a[cdefgilmnoqrstuwxz])"
        + "|(biz|b[abdefghijmnorstvwyz])"
        + "|(cat|com|coop|c[acdfghiklmnoruvxyz])"
@@ -55,20 +55,22 @@ public class Patterns {
        + "|w[fs]"
        + "|(xn\\-\\-0zwm56d|xn\\-\\-11b5bs3a9aj6g|xn\\-\\-80akhbyknj4f|xn\\-\\-9t4b11yi5a|xn\\-\\-deba0ad|xn\\-\\-g6w251d|xn\\-\\-hgbk6aj7f53bba|xn\\-\\-hlcj6aya9esc7a|xn\\-\\-jxalpdlp|xn\\-\\-kgbechtv|xn\\-\\-zckzah)"
        + "|y[etu]"
        + "|z[amw])");
        + "|z[amw])";

    /**
     *  Regular expression pattern to match all IANA top-level domains.
     */
    public static final Pattern TOP_LEVEL_DOMAIN =
        Pattern.compile(TOP_LEVEL_DOMAIN_STR);

    /**
     *  Regular expression pattern to match RFC 1738 URLs
     *  Regular expression to match all IANA top-level domains for WEB_URL.
     *  List accurate as of 2010/02/05.  List taken from:
     *  http://data.iana.org/TLD/tlds-alpha-by-domain.txt
     *  This pattern is auto-generated by development/tools/make-iana-tld-pattern.py
     *  This pattern is auto-generated by frameworks/base/common/tools/make-iana-tld-pattern.py
     */
    public static final Pattern WEB_URL = Pattern.compile(
        "((?:(http|https|Http|Https|rtsp|Rtsp):\\/\\/(?:(?:[a-zA-Z0-9\\$\\-\\_\\.\\+\\!\\*\\'\\(\\)"
        + "\\,\\;\\?\\&\\=]|(?:\\%[a-fA-F0-9]{2})){1,64}(?:\\:(?:[a-zA-Z0-9\\$\\-\\_"
        + "\\.\\+\\!\\*\\'\\(\\)\\,\\;\\?\\&\\=]|(?:\\%[a-fA-F0-9]{2})){1,25})?\\@)?)?"
        + "((?:(?:[a-zA-Z0-9][a-zA-Z0-9\\-]{0,64}\\.)+"   // named host
        + "(?:"   // plus top level domain
    public static final String TOP_LEVEL_DOMAIN_STR_FOR_WEB_URL =
        "(?:"
        + "(?:aero|arpa|asia|a[cdefgilmnoqrstuwxz])"
        + "|(?:biz|b[abdefghijmnorstvwyz])"
        + "|(?:cat|com|coop|c[acdfghiklmnoruvxyz])"
@@ -94,7 +96,28 @@ public class Patterns {
        + "|w[fs]"
        + "|(?:xn\\-\\-0zwm56d|xn\\-\\-11b5bs3a9aj6g|xn\\-\\-80akhbyknj4f|xn\\-\\-9t4b11yi5a|xn\\-\\-deba0ad|xn\\-\\-g6w251d|xn\\-\\-hgbk6aj7f53bba|xn\\-\\-hlcj6aya9esc7a|xn\\-\\-jxalpdlp|xn\\-\\-kgbechtv|xn\\-\\-zckzah)"
        + "|y[etu]"
        + "|z[amw]))"
        + "|z[amw]))";

    /**
     * Good characters for Internationalized Resource Identifiers (IRI).
     * This comprises most common used Unicode characters allowed in IRI
     * as detailed in RFC 3987.
     * Specifically, those two byte Unicode characters are not included.
     */
    public static final String GOOD_IRI_CHAR =
        "a-zA-Z0-9\u00A0-\uD7FF\uF900-\uFDCF\uFDF0-\uFFEF";

    /**
     *  Regular expression pattern to match most part of RFC 3987
     *  Internationalized URLs, aka IRIs.  Commonly used Unicode characters are
     *  added.
     */
    public static final Pattern WEB_URL = Pattern.compile(
        "((?:(http|https|Http|Https|rtsp|Rtsp):\\/\\/(?:(?:[a-zA-Z0-9\\$\\-\\_\\.\\+\\!\\*\\'\\(\\)"
        + "\\,\\;\\?\\&\\=]|(?:\\%[a-fA-F0-9]{2})){1,64}(?:\\:(?:[a-zA-Z0-9\\$\\-\\_"
        + "\\.\\+\\!\\*\\'\\(\\)\\,\\;\\?\\&\\=]|(?:\\%[a-fA-F0-9]{2})){1,25})?\\@)?)?"
        + "((?:(?:[" + GOOD_IRI_CHAR + "][" + GOOD_IRI_CHAR + "\\-]{0,64}\\.)+"   // named host
        + TOP_LEVEL_DOMAIN_STR_FOR_WEB_URL
        + "|(?:(?:25[0-5]|2[0-4]" // or ip address
        + "[0-9]|[0-1][0-9]{2}|[1-9][0-9]|[1-9])\\.(?:25[0-5]|2[0-4][0-9]"
        + "|[0-1][0-9]{2}|[1-9][0-9]|[1-9]|0)\\.(?:25[0-5]|2[0-4][0-9]|[0-1]"
@@ -116,7 +139,7 @@ public class Patterns {

    public static final Pattern DOMAIN_NAME
        = Pattern.compile(
            "(((([a-zA-Z0-9][a-zA-Z0-9\\-]*)*[a-zA-Z0-9]\\.)+"
            "(((([" + GOOD_IRI_CHAR + "][" + GOOD_IRI_CHAR + "\\-]*)*[" + GOOD_IRI_CHAR + "]\\.)+"
            + TOP_LEVEL_DOMAIN + ")|"
            + IP_ADDRESS + ")");

+13 −0
Original line number Diff line number Diff line
@@ -68,6 +68,12 @@ public class PatternsTest extends TestCase {
        t = Patterns.WEB_URL.matcher("xn--fsqu00a.xn--0zwm56d").matches();
        assertTrue("Valid URL", t);

        // Internationalized URL.
        t = Patterns.WEB_URL.matcher("http://\uD604\uAE08\uC601\uC218\uC99D.kr").matches();
        assertTrue("Valid URL", t);
        t = Patterns.WEB_URL.matcher("\uD604\uAE08\uC601\uC218\uC99D.kr").matches();
        assertTrue("Valid URL", t);

        t = Patterns.WEB_URL.matcher("ftp://www.example.com").matches();
        assertFalse("Matched invalid protocol", t);

@@ -99,6 +105,13 @@ public class PatternsTest extends TestCase {
        t = Patterns.DOMAIN_NAME.matcher("mail.example.com").matches();
        assertTrue("Valid domain", t);

        t = Patterns.WEB_URL.matcher("google.me").matches();
        assertTrue("Valid domain", t);

        // Internationalized domains.
        t = Patterns.DOMAIN_NAME.matcher("\uD604\uAE08\uC601\uC218\uC99D.kr").matches();
        assertTrue("Valid domain", t);

        t = Patterns.DOMAIN_NAME.matcher("__+&42.xer").matches();
        assertFalse("Invalid domain", t);
    }
+8 −24
Original line number Diff line number Diff line
@@ -4,43 +4,27 @@ from urllib2 import urlopen

TLD_PREFIX = r"""
    /**
     *  Regular expression pattern to match all IANA top-level domains.
     *  Regular expression to match all IANA top-level domains.
     *  List accurate as of 2010/02/05.  List taken from:
     *  http://data.iana.org/TLD/tlds-alpha-by-domain.txt
     *  This pattern is auto-generated by frameworks/base/common/tools/make-iana-tld-pattern.py
     */
    public static final Pattern TOP_LEVEL_DOMAIN = Pattern.compile(
    public static final String TOP_LEVEL_DOMAIN_STR =
"""
TLD_SUFFIX = '");'
TLD_SUFFIX = '";'

URL_PREFIX = r"""
    /**
     *  Regular expression pattern to match RFC 1738 URLs
     *  Regular expression to match all IANA top-level domains for WEB_URL.
     *  List accurate as of 2010/02/05.  List taken from:
     *  http://data.iana.org/TLD/tlds-alpha-by-domain.txt
     *  This pattern is auto-generated by frameworkds/base/common/tools/make-iana-tld-pattern.py
     *  This pattern is auto-generated by frameworks/base/common/tools/make-iana-tld-pattern.py
     */
    public static final Pattern WEB_URL = Pattern.compile(
        "((?:(http|https|Http|Https|rtsp|Rtsp):\\/\\/(?:(?:[a-zA-Z0-9\\$\\-\\_\\.\\+\\!\\*\\'\\(\\)"
        + "\\,\\;\\?\\&\\=]|(?:\\%[a-fA-F0-9]{2})){1,64}(?:\\:(?:[a-zA-Z0-9\\$\\-\\_"
        + "\\.\\+\\!\\*\\'\\(\\)\\,\\;\\?\\&\\=]|(?:\\%[a-fA-F0-9]{2})){1,25})?\\@)?)?"
        + "((?:(?:[a-zA-Z0-9][a-zA-Z0-9\\-]{0,64}\\.)+"   // named host
        + "(?:"   // plus top level domain
    public static final String TOP_LEVEL_DOMAIN_STR_FOR_WEB_URL =
        "(?:"
"""

URL_SUFFIX = r"""
        + "|(?:(?:25[0-5]|2[0-4]" // or ip address
        + "[0-9]|[0-1][0-9]{2}|[1-9][0-9]|[1-9])\\.(?:25[0-5]|2[0-4][0-9]"
        + "|[0-1][0-9]{2}|[1-9][0-9]|[1-9]|0)\\.(?:25[0-5]|2[0-4][0-9]|[0-1]"
        + "[0-9]{2}|[1-9][0-9]|[1-9]|0)\\.(?:25[0-5]|2[0-4][0-9]|[0-1][0-9]{2}"
        + "|[1-9][0-9]|[0-9])))"
        + "(?:\\:\\d{1,5})?)" // plus option port number
        + "(\\/(?:(?:[a-zA-Z0-9\\;\\/\\?\\:\\@\\&\\=\\#\\~"  // plus option query params
        + "\\-\\.\\+\\!\\*\\'\\(\\)\\,\\_])|(?:\\%[a-fA-F0-9]{2}))*)?"
        + "(?:\\b|$)"); // and finally, a word boundary or end of
                        // input.  This is to stop foo.sure from
                        // matching as foo.su
"""
URL_SUFFIX = ';'

class Bucket:
    def __init__(self, baseLetter):