Loading common/java/com/android/common/Patterns.java +37 −14 Original line number Diff line number Diff line Loading @@ -24,12 +24,12 @@ import java.util.regex.Pattern; */ public class Patterns { /** * Regular expression pattern to match all IANA top-level domains. * Regular expression to match all IANA top-level domains. * List accurate as of 2010/02/05. List taken from: * http://data.iana.org/TLD/tlds-alpha-by-domain.txt * This pattern is auto-generated by development/tools/make-iana-tld-pattern.py * This pattern is auto-generated by frameworks/base/common/tools/make-iana-tld-pattern.py */ public static final Pattern TOP_LEVEL_DOMAIN = Pattern.compile( public static final String TOP_LEVEL_DOMAIN_STR = "((aero|arpa|asia|a[cdefgilmnoqrstuwxz])" + "|(biz|b[abdefghijmnorstvwyz])" + "|(cat|com|coop|c[acdfghiklmnoruvxyz])" Loading @@ -55,20 +55,22 @@ public class Patterns { + "|w[fs]" + "|(xn\\-\\-0zwm56d|xn\\-\\-11b5bs3a9aj6g|xn\\-\\-80akhbyknj4f|xn\\-\\-9t4b11yi5a|xn\\-\\-deba0ad|xn\\-\\-g6w251d|xn\\-\\-hgbk6aj7f53bba|xn\\-\\-hlcj6aya9esc7a|xn\\-\\-jxalpdlp|xn\\-\\-kgbechtv|xn\\-\\-zckzah)" + "|y[etu]" + "|z[amw])"); + "|z[amw])"; /** * Regular expression pattern to match all IANA top-level domains. */ public static final Pattern TOP_LEVEL_DOMAIN = Pattern.compile(TOP_LEVEL_DOMAIN_STR); /** * Regular expression pattern to match RFC 1738 URLs * Regular expression to match all IANA top-level domains for WEB_URL. * List accurate as of 2010/02/05. List taken from: * http://data.iana.org/TLD/tlds-alpha-by-domain.txt * This pattern is auto-generated by development/tools/make-iana-tld-pattern.py * This pattern is auto-generated by frameworks/base/common/tools/make-iana-tld-pattern.py */ public static final Pattern WEB_URL = Pattern.compile( "((?:(http|https|Http|Https|rtsp|Rtsp):\\/\\/(?:(?:[a-zA-Z0-9\\$\\-\\_\\.\\+\\!\\*\\'\\(\\)" + "\\,\\;\\?\\&\\=]|(?:\\%[a-fA-F0-9]{2})){1,64}(?:\\:(?:[a-zA-Z0-9\\$\\-\\_" + "\\.\\+\\!\\*\\'\\(\\)\\,\\;\\?\\&\\=]|(?:\\%[a-fA-F0-9]{2})){1,25})?\\@)?)?" + "((?:(?:[a-zA-Z0-9][a-zA-Z0-9\\-]{0,64}\\.)+" // named host + "(?:" // plus top level domain public static final String TOP_LEVEL_DOMAIN_STR_FOR_WEB_URL = "(?:" + "(?:aero|arpa|asia|a[cdefgilmnoqrstuwxz])" + "|(?:biz|b[abdefghijmnorstvwyz])" + "|(?:cat|com|coop|c[acdfghiklmnoruvxyz])" Loading @@ -94,7 +96,28 @@ public class Patterns { + "|w[fs]" + "|(?:xn\\-\\-0zwm56d|xn\\-\\-11b5bs3a9aj6g|xn\\-\\-80akhbyknj4f|xn\\-\\-9t4b11yi5a|xn\\-\\-deba0ad|xn\\-\\-g6w251d|xn\\-\\-hgbk6aj7f53bba|xn\\-\\-hlcj6aya9esc7a|xn\\-\\-jxalpdlp|xn\\-\\-kgbechtv|xn\\-\\-zckzah)" + "|y[etu]" + "|z[amw]))" + "|z[amw]))"; /** * Good characters for Internationalized Resource Identifiers (IRI). * This comprises most common used Unicode characters allowed in IRI * as detailed in RFC 3987. * Specifically, those two byte Unicode characters are not included. */ public static final String GOOD_IRI_CHAR = "a-zA-Z0-9\u00A0-\uD7FF\uF900-\uFDCF\uFDF0-\uFFEF"; /** * Regular expression pattern to match most part of RFC 3987 * Internationalized URLs, aka IRIs. Commonly used Unicode characters are * added. */ public static final Pattern WEB_URL = Pattern.compile( "((?:(http|https|Http|Https|rtsp|Rtsp):\\/\\/(?:(?:[a-zA-Z0-9\\$\\-\\_\\.\\+\\!\\*\\'\\(\\)" + "\\,\\;\\?\\&\\=]|(?:\\%[a-fA-F0-9]{2})){1,64}(?:\\:(?:[a-zA-Z0-9\\$\\-\\_" + "\\.\\+\\!\\*\\'\\(\\)\\,\\;\\?\\&\\=]|(?:\\%[a-fA-F0-9]{2})){1,25})?\\@)?)?" + "((?:(?:[" + GOOD_IRI_CHAR + "][" + GOOD_IRI_CHAR + "\\-]{0,64}\\.)+" // named host + TOP_LEVEL_DOMAIN_STR_FOR_WEB_URL + "|(?:(?:25[0-5]|2[0-4]" // or ip address + "[0-9]|[0-1][0-9]{2}|[1-9][0-9]|[1-9])\\.(?:25[0-5]|2[0-4][0-9]" + "|[0-1][0-9]{2}|[1-9][0-9]|[1-9]|0)\\.(?:25[0-5]|2[0-4][0-9]|[0-1]" Loading @@ -116,7 +139,7 @@ public class Patterns { public static final Pattern DOMAIN_NAME = Pattern.compile( "(((([a-zA-Z0-9][a-zA-Z0-9\\-]*)*[a-zA-Z0-9]\\.)+" "(((([" + GOOD_IRI_CHAR + "][" + GOOD_IRI_CHAR + "\\-]*)*[" + GOOD_IRI_CHAR + "]\\.)+" + TOP_LEVEL_DOMAIN + ")|" + IP_ADDRESS + ")"); Loading common/tests/src/com/android/common/PatternsTest.java +13 −0 Original line number Diff line number Diff line Loading @@ -68,6 +68,12 @@ public class PatternsTest extends TestCase { t = Patterns.WEB_URL.matcher("xn--fsqu00a.xn--0zwm56d").matches(); assertTrue("Valid URL", t); // Internationalized URL. t = Patterns.WEB_URL.matcher("http://\uD604\uAE08\uC601\uC218\uC99D.kr").matches(); assertTrue("Valid URL", t); t = Patterns.WEB_URL.matcher("\uD604\uAE08\uC601\uC218\uC99D.kr").matches(); assertTrue("Valid URL", t); t = Patterns.WEB_URL.matcher("ftp://www.example.com").matches(); assertFalse("Matched invalid protocol", t); Loading Loading @@ -99,6 +105,13 @@ public class PatternsTest extends TestCase { t = Patterns.DOMAIN_NAME.matcher("mail.example.com").matches(); assertTrue("Valid domain", t); t = Patterns.WEB_URL.matcher("google.me").matches(); assertTrue("Valid domain", t); // Internationalized domains. t = Patterns.DOMAIN_NAME.matcher("\uD604\uAE08\uC601\uC218\uC99D.kr").matches(); assertTrue("Valid domain", t); t = Patterns.DOMAIN_NAME.matcher("__+&42.xer").matches(); assertFalse("Invalid domain", t); } Loading common/tools/make-iana-tld-pattern.py +8 −24 Original line number Diff line number Diff line Loading @@ -4,43 +4,27 @@ from urllib2 import urlopen TLD_PREFIX = r""" /** * Regular expression pattern to match all IANA top-level domains. * Regular expression to match all IANA top-level domains. * List accurate as of 2010/02/05. List taken from: * http://data.iana.org/TLD/tlds-alpha-by-domain.txt * This pattern is auto-generated by frameworks/base/common/tools/make-iana-tld-pattern.py */ public static final Pattern TOP_LEVEL_DOMAIN = Pattern.compile( public static final String TOP_LEVEL_DOMAIN_STR = """ TLD_SUFFIX = '");' TLD_SUFFIX = '";' URL_PREFIX = r""" /** * Regular expression pattern to match RFC 1738 URLs * Regular expression to match all IANA top-level domains for WEB_URL. * List accurate as of 2010/02/05. List taken from: * http://data.iana.org/TLD/tlds-alpha-by-domain.txt * This pattern is auto-generated by frameworkds/base/common/tools/make-iana-tld-pattern.py * This pattern is auto-generated by frameworks/base/common/tools/make-iana-tld-pattern.py */ public static final Pattern WEB_URL = Pattern.compile( "((?:(http|https|Http|Https|rtsp|Rtsp):\\/\\/(?:(?:[a-zA-Z0-9\\$\\-\\_\\.\\+\\!\\*\\'\\(\\)" + "\\,\\;\\?\\&\\=]|(?:\\%[a-fA-F0-9]{2})){1,64}(?:\\:(?:[a-zA-Z0-9\\$\\-\\_" + "\\.\\+\\!\\*\\'\\(\\)\\,\\;\\?\\&\\=]|(?:\\%[a-fA-F0-9]{2})){1,25})?\\@)?)?" + "((?:(?:[a-zA-Z0-9][a-zA-Z0-9\\-]{0,64}\\.)+" // named host + "(?:" // plus top level domain public static final String TOP_LEVEL_DOMAIN_STR_FOR_WEB_URL = "(?:" """ URL_SUFFIX = r""" + "|(?:(?:25[0-5]|2[0-4]" // or ip address + "[0-9]|[0-1][0-9]{2}|[1-9][0-9]|[1-9])\\.(?:25[0-5]|2[0-4][0-9]" + "|[0-1][0-9]{2}|[1-9][0-9]|[1-9]|0)\\.(?:25[0-5]|2[0-4][0-9]|[0-1]" + "[0-9]{2}|[1-9][0-9]|[1-9]|0)\\.(?:25[0-5]|2[0-4][0-9]|[0-1][0-9]{2}" + "|[1-9][0-9]|[0-9])))" + "(?:\\:\\d{1,5})?)" // plus option port number + "(\\/(?:(?:[a-zA-Z0-9\\;\\/\\?\\:\\@\\&\\=\\#\\~" // plus option query params + "\\-\\.\\+\\!\\*\\'\\(\\)\\,\\_])|(?:\\%[a-fA-F0-9]{2}))*)?" + "(?:\\b|$)"); // and finally, a word boundary or end of // input. This is to stop foo.sure from // matching as foo.su """ URL_SUFFIX = ';' class Bucket: def __init__(self, baseLetter): Loading Loading
common/java/com/android/common/Patterns.java +37 −14 Original line number Diff line number Diff line Loading @@ -24,12 +24,12 @@ import java.util.regex.Pattern; */ public class Patterns { /** * Regular expression pattern to match all IANA top-level domains. * Regular expression to match all IANA top-level domains. * List accurate as of 2010/02/05. List taken from: * http://data.iana.org/TLD/tlds-alpha-by-domain.txt * This pattern is auto-generated by development/tools/make-iana-tld-pattern.py * This pattern is auto-generated by frameworks/base/common/tools/make-iana-tld-pattern.py */ public static final Pattern TOP_LEVEL_DOMAIN = Pattern.compile( public static final String TOP_LEVEL_DOMAIN_STR = "((aero|arpa|asia|a[cdefgilmnoqrstuwxz])" + "|(biz|b[abdefghijmnorstvwyz])" + "|(cat|com|coop|c[acdfghiklmnoruvxyz])" Loading @@ -55,20 +55,22 @@ public class Patterns { + "|w[fs]" + "|(xn\\-\\-0zwm56d|xn\\-\\-11b5bs3a9aj6g|xn\\-\\-80akhbyknj4f|xn\\-\\-9t4b11yi5a|xn\\-\\-deba0ad|xn\\-\\-g6w251d|xn\\-\\-hgbk6aj7f53bba|xn\\-\\-hlcj6aya9esc7a|xn\\-\\-jxalpdlp|xn\\-\\-kgbechtv|xn\\-\\-zckzah)" + "|y[etu]" + "|z[amw])"); + "|z[amw])"; /** * Regular expression pattern to match all IANA top-level domains. */ public static final Pattern TOP_LEVEL_DOMAIN = Pattern.compile(TOP_LEVEL_DOMAIN_STR); /** * Regular expression pattern to match RFC 1738 URLs * Regular expression to match all IANA top-level domains for WEB_URL. * List accurate as of 2010/02/05. List taken from: * http://data.iana.org/TLD/tlds-alpha-by-domain.txt * This pattern is auto-generated by development/tools/make-iana-tld-pattern.py * This pattern is auto-generated by frameworks/base/common/tools/make-iana-tld-pattern.py */ public static final Pattern WEB_URL = Pattern.compile( "((?:(http|https|Http|Https|rtsp|Rtsp):\\/\\/(?:(?:[a-zA-Z0-9\\$\\-\\_\\.\\+\\!\\*\\'\\(\\)" + "\\,\\;\\?\\&\\=]|(?:\\%[a-fA-F0-9]{2})){1,64}(?:\\:(?:[a-zA-Z0-9\\$\\-\\_" + "\\.\\+\\!\\*\\'\\(\\)\\,\\;\\?\\&\\=]|(?:\\%[a-fA-F0-9]{2})){1,25})?\\@)?)?" + "((?:(?:[a-zA-Z0-9][a-zA-Z0-9\\-]{0,64}\\.)+" // named host + "(?:" // plus top level domain public static final String TOP_LEVEL_DOMAIN_STR_FOR_WEB_URL = "(?:" + "(?:aero|arpa|asia|a[cdefgilmnoqrstuwxz])" + "|(?:biz|b[abdefghijmnorstvwyz])" + "|(?:cat|com|coop|c[acdfghiklmnoruvxyz])" Loading @@ -94,7 +96,28 @@ public class Patterns { + "|w[fs]" + "|(?:xn\\-\\-0zwm56d|xn\\-\\-11b5bs3a9aj6g|xn\\-\\-80akhbyknj4f|xn\\-\\-9t4b11yi5a|xn\\-\\-deba0ad|xn\\-\\-g6w251d|xn\\-\\-hgbk6aj7f53bba|xn\\-\\-hlcj6aya9esc7a|xn\\-\\-jxalpdlp|xn\\-\\-kgbechtv|xn\\-\\-zckzah)" + "|y[etu]" + "|z[amw]))" + "|z[amw]))"; /** * Good characters for Internationalized Resource Identifiers (IRI). * This comprises most common used Unicode characters allowed in IRI * as detailed in RFC 3987. * Specifically, those two byte Unicode characters are not included. */ public static final String GOOD_IRI_CHAR = "a-zA-Z0-9\u00A0-\uD7FF\uF900-\uFDCF\uFDF0-\uFFEF"; /** * Regular expression pattern to match most part of RFC 3987 * Internationalized URLs, aka IRIs. Commonly used Unicode characters are * added. */ public static final Pattern WEB_URL = Pattern.compile( "((?:(http|https|Http|Https|rtsp|Rtsp):\\/\\/(?:(?:[a-zA-Z0-9\\$\\-\\_\\.\\+\\!\\*\\'\\(\\)" + "\\,\\;\\?\\&\\=]|(?:\\%[a-fA-F0-9]{2})){1,64}(?:\\:(?:[a-zA-Z0-9\\$\\-\\_" + "\\.\\+\\!\\*\\'\\(\\)\\,\\;\\?\\&\\=]|(?:\\%[a-fA-F0-9]{2})){1,25})?\\@)?)?" + "((?:(?:[" + GOOD_IRI_CHAR + "][" + GOOD_IRI_CHAR + "\\-]{0,64}\\.)+" // named host + TOP_LEVEL_DOMAIN_STR_FOR_WEB_URL + "|(?:(?:25[0-5]|2[0-4]" // or ip address + "[0-9]|[0-1][0-9]{2}|[1-9][0-9]|[1-9])\\.(?:25[0-5]|2[0-4][0-9]" + "|[0-1][0-9]{2}|[1-9][0-9]|[1-9]|0)\\.(?:25[0-5]|2[0-4][0-9]|[0-1]" Loading @@ -116,7 +139,7 @@ public class Patterns { public static final Pattern DOMAIN_NAME = Pattern.compile( "(((([a-zA-Z0-9][a-zA-Z0-9\\-]*)*[a-zA-Z0-9]\\.)+" "(((([" + GOOD_IRI_CHAR + "][" + GOOD_IRI_CHAR + "\\-]*)*[" + GOOD_IRI_CHAR + "]\\.)+" + TOP_LEVEL_DOMAIN + ")|" + IP_ADDRESS + ")"); Loading
common/tests/src/com/android/common/PatternsTest.java +13 −0 Original line number Diff line number Diff line Loading @@ -68,6 +68,12 @@ public class PatternsTest extends TestCase { t = Patterns.WEB_URL.matcher("xn--fsqu00a.xn--0zwm56d").matches(); assertTrue("Valid URL", t); // Internationalized URL. t = Patterns.WEB_URL.matcher("http://\uD604\uAE08\uC601\uC218\uC99D.kr").matches(); assertTrue("Valid URL", t); t = Patterns.WEB_URL.matcher("\uD604\uAE08\uC601\uC218\uC99D.kr").matches(); assertTrue("Valid URL", t); t = Patterns.WEB_URL.matcher("ftp://www.example.com").matches(); assertFalse("Matched invalid protocol", t); Loading Loading @@ -99,6 +105,13 @@ public class PatternsTest extends TestCase { t = Patterns.DOMAIN_NAME.matcher("mail.example.com").matches(); assertTrue("Valid domain", t); t = Patterns.WEB_URL.matcher("google.me").matches(); assertTrue("Valid domain", t); // Internationalized domains. t = Patterns.DOMAIN_NAME.matcher("\uD604\uAE08\uC601\uC218\uC99D.kr").matches(); assertTrue("Valid domain", t); t = Patterns.DOMAIN_NAME.matcher("__+&42.xer").matches(); assertFalse("Invalid domain", t); } Loading
common/tools/make-iana-tld-pattern.py +8 −24 Original line number Diff line number Diff line Loading @@ -4,43 +4,27 @@ from urllib2 import urlopen TLD_PREFIX = r""" /** * Regular expression pattern to match all IANA top-level domains. * Regular expression to match all IANA top-level domains. * List accurate as of 2010/02/05. List taken from: * http://data.iana.org/TLD/tlds-alpha-by-domain.txt * This pattern is auto-generated by frameworks/base/common/tools/make-iana-tld-pattern.py */ public static final Pattern TOP_LEVEL_DOMAIN = Pattern.compile( public static final String TOP_LEVEL_DOMAIN_STR = """ TLD_SUFFIX = '");' TLD_SUFFIX = '";' URL_PREFIX = r""" /** * Regular expression pattern to match RFC 1738 URLs * Regular expression to match all IANA top-level domains for WEB_URL. * List accurate as of 2010/02/05. List taken from: * http://data.iana.org/TLD/tlds-alpha-by-domain.txt * This pattern is auto-generated by frameworkds/base/common/tools/make-iana-tld-pattern.py * This pattern is auto-generated by frameworks/base/common/tools/make-iana-tld-pattern.py */ public static final Pattern WEB_URL = Pattern.compile( "((?:(http|https|Http|Https|rtsp|Rtsp):\\/\\/(?:(?:[a-zA-Z0-9\\$\\-\\_\\.\\+\\!\\*\\'\\(\\)" + "\\,\\;\\?\\&\\=]|(?:\\%[a-fA-F0-9]{2})){1,64}(?:\\:(?:[a-zA-Z0-9\\$\\-\\_" + "\\.\\+\\!\\*\\'\\(\\)\\,\\;\\?\\&\\=]|(?:\\%[a-fA-F0-9]{2})){1,25})?\\@)?)?" + "((?:(?:[a-zA-Z0-9][a-zA-Z0-9\\-]{0,64}\\.)+" // named host + "(?:" // plus top level domain public static final String TOP_LEVEL_DOMAIN_STR_FOR_WEB_URL = "(?:" """ URL_SUFFIX = r""" + "|(?:(?:25[0-5]|2[0-4]" // or ip address + "[0-9]|[0-1][0-9]{2}|[1-9][0-9]|[1-9])\\.(?:25[0-5]|2[0-4][0-9]" + "|[0-1][0-9]{2}|[1-9][0-9]|[1-9]|0)\\.(?:25[0-5]|2[0-4][0-9]|[0-1]" + "[0-9]{2}|[1-9][0-9]|[1-9]|0)\\.(?:25[0-5]|2[0-4][0-9]|[0-1][0-9]{2}" + "|[1-9][0-9]|[0-9])))" + "(?:\\:\\d{1,5})?)" // plus option port number + "(\\/(?:(?:[a-zA-Z0-9\\;\\/\\?\\:\\@\\&\\=\\#\\~" // plus option query params + "\\-\\.\\+\\!\\*\\'\\(\\)\\,\\_])|(?:\\%[a-fA-F0-9]{2}))*)?" + "(?:\\b|$)"); // and finally, a word boundary or end of // input. This is to stop foo.sure from // matching as foo.su """ URL_SUFFIX = ';' class Bucket: def __init__(self, baseLetter): Loading