[mod] Wikipedia: fetch engine traits (data_type: supported_languages) (7daf4f95) · Commits · e / infra / spot

searx/data/engine_traits.json

+220 −2

Original line number	Diff line number	Diff line
		@@ -5121,7 +5121,116 @@
		"all_locale": null,
		"custom": {},
		"data_type": "supported_languages",
		"languages": {},
		"languages": {
		"af": "af",
		"ak": "tw",
		"am": "am",
		"ar": "ar",
		"as": "as",
		"az": "az",
		"be": "be",
		"bg": "bg",
		"bn": "bn",
		"bo": "bo",
		"bs": "bs",
		"ca": "ca",
		"chr": "chr",
		"ckb": "ckb",
		"cs": "cs",
		"da": "da",
		"de": "de",
		"dsb": "dsb",
		"el": "el",
		"en": "en",
		"es": "es",
		"et": "et",
		"fa": "fa",
		"fi": "fi",
		"fil": "tl",
		"fo": "fo",
		"fr": "fr",
		"fur": "fur",
		"fy": "fy",
		"gl": "gl",
		"gsw": "als",
		"gu": "gu",
		"gv": "gv",
		"haw": "haw",
		"he": "he",
		"hi": "hi",
		"hsb": "hsb",
		"hu": "hu",
		"hy": "hy",
		"id": "id",
		"is": "is",
		"it": "it",
		"ja": "ja",
		"jv": "jv",
		"ka": "ka",
		"km": "km",
		"kn": "kn",
		"ko": "ko",
		"ks": "ks",
		"ksh": "ksh",
		"kw": "kw",
		"lb": "lb",
		"lg": "lg",
		"ln": "ln",
		"lo": "lo",
		"lt": "lt",
		"lv": "lv",
		"mai": "mai",
		"mk": "mk",
		"ml": "ml",
		"mn": "mn",
		"mr": "mr",
		"ms": "ms",
		"mt": "mt",
		"nds": "nds-nl",
		"ne": "ne",
		"no": "no",
		"om": "om",
		"or": "or",
		"os": "os",
		"pa": "pa",
		"pl": "pl",
		"ps": "ps",
		"pt": "pt",
		"qu": "qu",
		"rm": "rm",
		"ro": "ro",
		"ru": "ru",
		"rw": "rw",
		"sa": "sa",
		"sah": "sah",
		"sd": "sd",
		"se": "se",
		"shi": "shi",
		"si": "si",
		"sk": "sk",
		"sl": "sl",
		"smn": "smn",
		"so": "so",
		"sq": "sq",
		"sr": "sr",
		"ta": "ta",
		"te": "te",
		"th": "th",
		"tk": "tk",
		"to": "to",
		"tr": "tr",
		"ug": "ug",
		"uk": "uk",
		"ur": "ur",
		"uz": "uz",
		"vi": "vi",
		"wo": "wo",
		"xh": "xh",
		"yi": "yi",
		"zh": "zh",
		"zh_Hans": "zh",
		"zh_Hant": "zh-classical"
		},
		"regions": {},
		"supported_languages": {
		"ab": {
		@@ -6402,7 +6511,116 @@
		"all_locale": null,
		"custom": {},
		"data_type": "supported_languages",
		"languages": {},
		"languages": {
		"af": "af",
		"ak": "tw",
		"am": "am",
		"ar": "ar",
		"as": "as",
		"az": "az",
		"be": "be",
		"bg": "bg",
		"bn": "bn",
		"bo": "bo",
		"bs": "bs",
		"ca": "ca",
		"chr": "chr",
		"ckb": "ckb",
		"cs": "cs",
		"da": "da",
		"de": "de",
		"dsb": "dsb",
		"el": "el",
		"en": "en",
		"es": "es",
		"et": "et",
		"fa": "fa",
		"fi": "fi",
		"fil": "tl",
		"fo": "fo",
		"fr": "fr",
		"fur": "fur",
		"fy": "fy",
		"gl": "gl",
		"gsw": "als",
		"gu": "gu",
		"gv": "gv",
		"haw": "haw",
		"he": "he",
		"hi": "hi",
		"hsb": "hsb",
		"hu": "hu",
		"hy": "hy",
		"id": "id",
		"is": "is",
		"it": "it",
		"ja": "ja",
		"jv": "jv",
		"ka": "ka",
		"km": "km",
		"kn": "kn",
		"ko": "ko",
		"ks": "ks",
		"ksh": "ksh",
		"kw": "kw",
		"lb": "lb",
		"lg": "lg",
		"ln": "ln",
		"lo": "lo",
		"lt": "lt",
		"lv": "lv",
		"mai": "mai",
		"mk": "mk",
		"ml": "ml",
		"mn": "mn",
		"mr": "mr",
		"ms": "ms",
		"mt": "mt",
		"nds": "nds-nl",
		"ne": "ne",
		"no": "no",
		"om": "om",
		"or": "or",
		"os": "os",
		"pa": "pa",
		"pl": "pl",
		"ps": "ps",
		"pt": "pt",
		"qu": "qu",
		"rm": "rm",
		"ro": "ro",
		"ru": "ru",
		"rw": "rw",
		"sa": "sa",
		"sah": "sah",
		"sd": "sd",
		"se": "se",
		"shi": "shi",
		"si": "si",
		"sk": "sk",
		"sl": "sl",
		"smn": "smn",
		"so": "so",
		"sq": "sq",
		"sr": "sr",
		"ta": "ta",
		"te": "te",
		"th": "th",
		"tk": "tk",
		"to": "to",
		"tr": "tr",
		"ug": "ug",
		"uk": "uk",
		"ur": "ur",
		"uz": "uz",
		"vi": "vi",
		"wo": "wo",
		"xh": "xh",
		"yi": "yi",
		"zh": "zh",
		"zh_Hans": "zh",
		"zh_Hant": "zh-classical"
		},
		"regions": {},
		"supported_languages": {
		"ab": {

searx/engines/wikidata.py

+1 −0

Original line number	Diff line number	Diff line
		@@ -16,6 +16,7 @@ from searx.network import post, get
		from searx.utils import match_language, searx_useragent, get_string_replaces_function
		from searx.external_urls import get_external_url, get_earth_coordinates_url, area_to_osm_zoom
		from searx.engines.wikipedia import ( # pylint: disable=unused-import
		fetch_traits,
		_fetch_supported_languages,
		supported_languages_url,
		)

searx/engines/wikipedia.py

+170 −4

Original line number	Diff line number	Diff line
		@@ -5,9 +5,12 @@

		from urllib.parse import quote
		from json import loads
		from lxml.html import fromstring
		from lxml import html
		from searx.utils import match_language, searx_useragent
		from searx.network import raise_for_httperror
		from searx import network
		from searx.enginelib.traits import EngineTraits

		engine_traits: EngineTraits

		# about
		about = {
		@@ -68,7 +71,7 @@ def response(resp):
		):
		return []

		raise_for_httperror(resp)
		network.raise_for_httperror(resp)

		results = []
		api_result = loads(resp.text)
		@@ -98,7 +101,7 @@ def response(resp):
		# get supported languages from their site
		def _fetch_supported_languages(resp):
		supported_languages = {}
		dom = fromstring(resp.text)
		dom = html.fromstring(resp.text)
		tables = dom.xpath('//table[contains(@class,"sortable")]')
		for table in tables:
		# exclude header row
		@@ -114,3 +117,166 @@ def _fetch_supported_languages(resp):
		supported_languages[code] = {"name": name, "english_name": english_name}

		return supported_languages


		# Nonstandard language codes
		#
		# These Wikipedias use language codes that do not conform to the ISO 639
		# standard (which is how wiki subdomains are chosen nowadays).

		# Maps a Wikipedia subdomain code (key) to the code that is handed to babel
		# (value).  Codes without an entry here are passed to babel unchanged.
		#
		# Deliberately not mapped:
		#   roa-tara -- invented code used for the Tarantino Wikipedia ("roa" is the
		#               standard code for the whole family of Romance languages that
		#               the Tarantino dialect falls within)
		#   simple   -- invented code used for the Simple English Wikipedia (not the
		#               official IETF code en-simple)
		lang_map = {
		    "be-tarask": "bel",
		    "ak": "aka",
		    "als": "gsw",
		    "bat-smg": "sgs",
		    "cbk-zam": "cbk",
		    "fiu-vro": "vro",
		    "map-bms": "map",
		    "nrm": "nrf",
		    "roa-rup": "rup",
		    "nds-nl": "nds",
		    "zh-classical": "zh_Hant",
		    "zh-min-nan": "nan",
		    "zh-yue": "yue",
		    "an": "arg",
		}

		# Wikipedia subdomain codes that are skipped when the engine traits are
		# fetched.  Judging by the name and the notes on individual entries these are
		# codes babel has no locale for -- TODO confirm for the entries without a note.
		# Every code is listed exactly once.
		unknown_langs = [
		    'ab',  # Abkhazian
		    'alt',  # Southern Altai
		    'an',  # Aragonese
		    'ang',  # Anglo-Saxon
		    'arc',  # Aramaic
		    'ary',  # Moroccan Arabic
		    'av',  # Avar
		    'ba',  # Bashkir
		    'be-tarask',  # Belarusian (Taraškievica orthography)
		    'bar',  # Bavarian
		    'bcl',  # Central Bicolano
		    'bh',  # Bhojpuri
		    'bi',  # Bislama
		    'bjn',  # Banjar
		    'blk',  # Pa'O
		    'bpy',  # Bishnupriya Manipuri
		    'bxr',  # Buryat
		    'cbk-zam',  # Zamboanga Chavacano
		    'co',  # Corsican
		    'cu',  # Old Church Slavonic
		    'dty',  # Doteli
		    'dv',  # Divehi
		    'ext',  # Extremaduran
		    'fj',  # Fijian
		    'frp',  # Franco-Provençal
		    'gan',  # Gan
		    'gom',  # Goan Konkani
		    'hif',  # Fiji Hindi
		    'ilo',  # Ilokano
		    'inh',  # Ingush
		    'jbo',  # Lojban
		    'kaa',  # Karakalpak
		    'kbd',  # Kabardian Circassian
		    'kg',  # Kongo
		    'koi',  # Komi-Permyak
		    'krc',  # Karachay-Balkar
		    'kv',  # Komi
		    'lad',  # Ladino
		    'lbe',  # Lak
		    'lez',  # Lezgian
		    'li',  # Limburgish
		    'ltg',  # Latgalian
		    'mdf',  # Moksha
		    'mnw',  # Mon
		    'mwl',  # Mirandese
		    'myv',  # Erzya
		    'na',  # Nauruan
		    'nah',  # Nahuatl
		    'nov',  # Novial
		    'nrm',  # Norman
		    'pag',  # Pangasinan
		    'pam',  # Kapampangan
		    'pap',  # Papiamentu
		    'pdc',  # Pennsylvania German
		    'pfl',  # Palatinate German
		    'roa-rup',  # Aromanian
		    # Scots (https://sco.wikipedia.org) is not known by babel, Scottish Gaelic
		    # (https://gd.wikipedia.org) is known by babel
		    'sco',
		    'sh',  # Serbo-Croatian
		    # Simple English is not known (by babel) as a natural language different
		    # from English
		    'simple',
		    'sm',  # Samoan
		    'srn',  # Sranan
		    'stq',  # Saterland Frisian
		    'szy',  # Sakizaya
		    'tcy',  # Tulu
		    'tet',  # Tetum
		    'tpi',  # Tok Pisin
		    'trv',  # Seediq
		    'ty',  # Tahitian
		    'tyv',  # Tuvan
		    'udm',  # Udmurt
		    'vep',  # Vepsian
		    'vls',  # West Flemish
		    'vo',  # Volapük
		    'wa',  # Walloon
		    'xal',  # Kalmyk
		]


		def fetch_traits(engine_traits: EngineTraits):
		    """Fetch the languages of Wikipedia and store them in ``engine_traits``.

		    Downloads the *List of Wikipedias* page from meta.wikimedia.org and walks
		    the rows of its sortable tables.  For every Wikipedia that passes the
		    filters below, ``engine_traits.languages`` gets an entry mapping the tag
		    returned by ``searx.locales.language_tag`` to the Wikipedia code.

		    A row is skipped when

		    - it has fewer than 12 cells or its article / user / depth cells are not
		      numeric (header rows, unrelated tables),
		    - the Wikipedia has fewer than 1000 articles,
		    - its depth is below 20, or no depth is given and it has fewer than 1000
		      users,
		    - its code is listed in ``unknown_langs``,
		    - babel cannot turn the (``lang_map`` translated) code into a locale, or
		    - another Wikipedia has already been registered for the same tag.

		    If the HTTP response is not OK an error is printed and the function
		    returns without touching ``engine_traits.languages``.

		    :param engine_traits: traits object that is updated in place.
		    """
		    # pylint: disable=import-outside-toplevel

		    engine_traits.data_type = 'supported_languages'  # deprecated

		    import babel
		    from searx.locales import language_tag

		    resp = network.get('https://meta.wikimedia.org/wiki/List_of_Wikipedias')
		    if not resp.ok:
		        print("ERROR: response from Wikipedia is not OK.")
		        # there is nothing meaningful to parse in an error page
		        return

		    dom = html.fromstring(resp.text)
		    for row in dom.xpath('//table[contains(@class,"sortable")]//tbody/tr'):

		        cols = [c.text_content().strip() for c in row.xpath('./td')]
		        if len(cols) < 12:
		            # header rows have no <td> at all; narrower rows do not carry the
		            # columns (index 3, 4, 8 and 11) that are evaluated below
		            continue

		        try:
		            # a lone '-' in a counter cell stands for "no value" --> 0
		            articles = int(cols[4].replace(',', '').replace('-', '0'))
		            users = int(cols[8].replace(',', '').replace('-', '0'))
		            depth = cols[11].strip('-')
		            depth = int(depth) if depth else None
		        except ValueError:
		            # not a data row of the expected layout
		            continue

		        if articles < 1000:
		            # exclude languages with too few articles
		            continue

		        # depth: rough indicator of a Wikipedia's quality, showing how
		        # frequently its articles are updated.
		        if depth is None:
		            if users < 1000:
		                # depth is not calculated --> at least 1000 users should be
		                # registered
		                continue
		        elif depth < 20:
		            continue

		        eng_tag = cols[3]

		        if eng_tag in unknown_langs:
		            continue

		        try:
		            sxng_tag = language_tag(babel.Locale.parse(lang_map.get(eng_tag, eng_tag)))
		        except (babel.UnknownLocaleError, ValueError):
		            # UnknownLocaleError: babel has no data for this locale
		            # ValueError: the code is not a well-formed locale identifier
		            print(f"ERROR: {cols[1]} -> {eng_tag} is unknown by babel")
		            continue

		        conflict = engine_traits.languages.get(sxng_tag)
		        if conflict:
		            # first Wikipedia registered for a tag wins
		            if conflict != eng_tag:
		                print(f"CONFLICT: babel {sxng_tag} --> {conflict}, {eng_tag}")
		            continue
		        engine_traits.languages[sxng_tag] = eng_tag

		    engine_traits.languages['zh_Hans'] = 'zh'