Merge "Add back lost python script." (6e00aae5) · Commits · e / os / android_frameworks_native-old

common/tools/make-iana-tld-pattern.py

0 → 100755

+160 −0

Original line number	Diff line number	Diff line
		#!/usr/bin/env python

		from urllib2 import urlopen

		# Java source text printed ahead of the generated TLD alternation: a Javadoc
		# comment plus the opening of the TOP_LEVEL_DOMAIN Pattern.compile( call.
		# Raw string, so the text is emitted exactly as written here.
		TLD_PREFIX = r"""
		/**
		* Regular expression pattern to match all IANA top-level domains.
		* List accurate as of 2010/02/05. List taken from:
		* http://data.iana.org/TLD/tlds-alpha-by-domain.txt
		* This pattern is auto-generated by frameworks/base/common/tools/make-iana-tld-pattern.py
		*/
		public static final Pattern TOP_LEVEL_DOMAIN = Pattern.compile(
		"""
		# Printed after the generated alternation: closes the Java string literal,
		# the Pattern.compile( call and the statement.
		TLD_SUFFIX = '");'

		# Java source text printed ahead of the generated TLD alternation for WEB_URL:
		# a Javadoc comment, the opening of the Pattern.compile( call, and the regex
		# for scheme, optional user:password@ and the named-host labels.
		# Raw string: every "\\" below reaches the Java source as two backslashes,
		# i.e. one regex backslash once javac has processed the string literal.
		# The user-info character classes list the RFC 1738 "safe" and "extra"
		# characters ($ - _ . + ! * ' ( ) ,) followed by ; ? & =.
		URL_PREFIX = r"""
		/**
		* Regular expression pattern to match RFC 1738 URLs
		* List accurate as of 2010/02/05. List taken from:
		* http://data.iana.org/TLD/tlds-alpha-by-domain.txt
		* This pattern is auto-generated by frameworks/base/common/tools/make-iana-tld-pattern.py
		*/
		public static final Pattern WEB_URL = Pattern.compile(
		"((?:(http|https|Http|Https|rtsp|Rtsp):\\/\\/(?:(?:[a-zA-Z0-9\\$\\-\\_\\.\\+\\!\\*\\'\\(\\)"
		+ "\\,\\;\\?\\&\\=]|(?:\\%[a-fA-F0-9]{2})){1,64}(?:\\:(?:[a-zA-Z0-9\\$\\-\\_"
		+ "\\.\\+\\!\\*\\'\\(\\)\\,\\;\\?\\&\\=]|(?:\\%[a-fA-F0-9]{2})){1,25})?\\@)?)?"
		+ "((?:(?:[a-zA-Z0-9][a-zA-Z0-9\\-]{0,64}\\.)+" // named host
		+ "(?:" // plus top level domain
		"""

		# Java source text printed after the generated TLD alternation for WEB_URL:
		# the dotted-quad IP alternative to a named host, an optional port, an
		# optional path/query part, and the trailing boundary check, followed by the
		# close of the Pattern.compile( call.
		# Raw string: every "\\" below reaches the Java source as two backslashes.
		# The path character class carries the RFC 1738 "extra" characters
		# (! * ' ( ) ,) and the group repeats ("*") so that more than one character
		# may follow the leading slash.
		URL_SUFFIX = r"""
		+ "|(?:(?:25[0-5]|2[0-4]" // or ip address
		+ "[0-9]|[0-1][0-9]{2}|[1-9][0-9]|[1-9])\\.(?:25[0-5]|2[0-4][0-9]"
		+ "|[0-1][0-9]{2}|[1-9][0-9]|[1-9]|0)\\.(?:25[0-5]|2[0-4][0-9]|[0-1]"
		+ "[0-9]{2}|[1-9][0-9]|[1-9]|0)\\.(?:25[0-5]|2[0-4][0-9]|[0-1][0-9]{2}"
		+ "|[1-9][0-9]|[0-9])))"
		+ "(?:\\:\\d{1,5})?)" // plus option port number
		+ "(\\/(?:(?:[a-zA-Z0-9\\;\\/\\?\\:\\@\\&\\=\\#\\~" // plus option query params
		+ "\\-\\.\\+\\!\\*\\'\\(\\)\\,\\_])|(?:\\%[a-fA-F0-9]{2}))*)?"
		+ "(?:\\b|$)"); // and finally, a word boundary or end of
		// input. This is to stop foo.sure from
		// matching as foo.su
		"""

		class Bucket:
		    """Collects the TLDs that share one first letter.

		    Two-character entries are stored as their second character only (in
		    ``letters``) so they can be rendered as a character class after the base
		    letter; every other entry is stored whole in ``words``.
		    """

		    def __init__(self, baseLetter):
		        """Create an empty bucket for entries starting with ``baseLetter``."""
		        self.base = baseLetter
		        self.words = []    # entries whose length is not 2, stored whole
		        self.letters = []  # second characters of the two-character entries

		    def dump(self, isWebUrl=False, isFirst=False, isLast=False):
		        """Render this bucket as one line of Java string-concatenation source.

		        isWebUrl -- use a non-capturing group ``(?:`` instead of ``(`` and
		                    start the first line with ``+ "`` (it continues an
		                    already-open expression).
		        isFirst  -- this is the first alternative, so no leading ``|``.
		        isLast   -- leave the Java string literal open (no closing quote or
		                    newline) so the caller can append the closing text.

		        Returns '' when the bucket holds nothing. Sorts ``words`` and
		        ``letters`` in place as a side effect.
		        """
		        if not self.words and not self.letters:
		            return ''

		        self.words.sort()
		        self.letters.sort()

		        if not isFirst:
		            # Every later bucket is a further regex alternative.
		            opener = '+ "|'
		        elif isWebUrl:
		            opener = '+ "'
		        else:
		            opener = '"('

		        # '-' is emitted as \\- in the Java source, i.e. \- in the regex.
		        alternatives = [word.replace('-', '\\\\-') for word in self.words]

		        if len(self.letters) == 1:
		            alternatives.append(self.base + self.letters[0])
		        elif self.letters:
		            alternatives.append(
		                '%s[%s]' % (self.base, ''.join(self.letters)))

		        body = '|'.join(alternatives)

		        # A group is only opened when there are whole words; a bucket made
		        # of two-character entries alone is emitted bare.
		        if self.words:
		            body = ('(?:' if isWebUrl else '(') + body + ')'

		        # Leading blank separates the emitted line from the left margin.
		        output = ' ' + opener + body

		        if not isLast:
		            output += '"\n'

		        return output

		    def add(self, line):
		        """File one stripped, lower-cased line of the TLD list.

		        Empty lines and lines starting with '#' are ignored.
		        """
		        if not line or line.startswith('#'):
		            return

		        if len(line) == 2:
		            self.letters.append(line[1:2])
		        else:
		            self.words.append(line)

		def getBucket(buckets, line):
		    """Return the bucket keyed by the first character of ``line``.

		    ``buckets`` is a dict mapping a single character to its Bucket. When no
		    bucket is stored for the character yet, a new one is created, stored in
		    ``buckets`` and returned.
		    """
		    key = line[0]
		    existing = buckets.get(key)

		    if existing is not None:
		        return existing

		    created = Bucket(key)
		    buckets[key] = created
		    return created

		def makePattern(prefix, suffix, buckets, isWebUrl=False):
		    """Print one complete Java pattern declaration to standard output.

		    prefix   -- Java text emitted before the generated alternation.
		    suffix   -- Java text emitted after it.
		    buckets  -- dict mapping a letter to its Bucket; buckets missing for a
		                letter are created (empty) by getBucket as a side effect.
		    isWebUrl -- forwarded to Bucket.dump, and selects the closing text
		                placed between the last bucket and ``suffix``.
		    """
		    pieces = [
		        prefix,
		        getBucket(buckets, 'a').dump(isFirst=True, isWebUrl=isWebUrl),
		    ]

		    # 'b' through 'y'; 'a' and 'z' are dumped separately because they carry
		    # the isFirst / isLast flags.
		    for code in range(ord('b'), ord('z')):
		        pieces.append(getBucket(buckets, chr(code)).dump(isWebUrl=isWebUrl))

		    pieces.append(getBucket(buckets, 'z').dump(isLast=True, isWebUrl=isWebUrl))

		    # The last bucket leaves its string literal open; close the regex
		    # group(s) here, and for the web URL form the literal as well.
		    pieces.append('))"' if isWebUrl else ')')
		    pieces.append(suffix)

		    # Call form with a single argument prints the same text on Python 2
		    # and is also valid syntax on Python 3.
		    print(''.join(pieces))

		if __name__ == "__main__":
		    # Download the current IANA list: one TLD per line, with '#' comment lines.
		    f = urlopen('http://data.iana.org/TLD/tlds-alpha-by-domain.txt')
		    try:
		        domains = f.readlines()
		    finally:
		        # Release the connection even if the read fails.
		        f.close()

		    buckets = {}

		    for domain in domains:
		        domain = domain.strip().lower()

		        # Skip blank and comment lines before bucketing so they do not
		        # create buckets keyed by '#' or a newline character.
		        if not domain or domain.startswith('#'):
		            continue

		        getBucket(buckets, domain[0]).add(domain)

		    makePattern(TLD_PREFIX, TLD_SUFFIX, buckets, isWebUrl=False)
		    makePattern(URL_PREFIX, URL_SUFFIX, buckets, isWebUrl=True)