Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit 858aa3e6 authored by Markus Heiser's avatar Markus Heiser
Browse files

[mod] wikipedia & wikidata: upgrade to data_type: traits_v1



BTW this fixes an issue in wikipedia: SearXNG's locales zh-TW and zh-HK are now
using language `zh-classical` from wikipedia (and not `zh`).

Signed-off-by: default avatarMarkus Heiser <markus.heiser@darmarit.de>
parent e0a6ca96
Loading
Loading
Loading
Loading
+27 −0
Original line number Diff line number Diff line
.. _wikimedia engines:

=========
Wikimedia
=========

.. contents:: Contents
   :depth: 2
   :local:
   :backlinks: entry


.. _wikipedia engine:

Wikipedia
=========

.. automodule:: searx.engines.wikipedia
  :members:

.. _wikidata engine:

Wikidata
=========

.. automodule:: searx.engines.wikidata
  :members:
+24 −7
Original line number Diff line number Diff line
@@ -143,14 +143,31 @@ def qwant(query, sxng_locale):
    return results


def wikipedia(query, lang):
    # wikipedia autocompleter
    url = 'https://' + lang + '.wikipedia.org/w/api.php?action=opensearch&{0}&limit=10&namespace=0&format=json'
def wikipedia(query, sxng_locale):
    """Autocomplete from Wikipedia. Supports Wikipedia's languages (aka netloc)."""
    results = []
    eng_traits = engines['wikipedia'].traits
    wiki_lang = eng_traits.get_language(sxng_locale, 'en')
    wiki_netloc = eng_traits.custom['wiki_netloc'].get(wiki_lang, 'en.wikipedia.org')

    url = 'https://{wiki_netloc}/w/api.php?{args}'
    args = urlencode(
        {
            'action': 'opensearch',
            'format': 'json',
            'formatversion': '2',
            'search': query,
            'namespace': '0',
            'limit': '10',
        }
    )
    resp = get(url.format(args=args, wiki_netloc=wiki_netloc))
    if resp.ok:
        data = resp.json()
        if len(data) > 1:
            results = data[1]

    resp = loads(get(url.format(urlencode(dict(search=query)))).text)
    if len(resp) > 1:
        return resp[1]
    return []
    return results


def yandex(query, _lang):
+175 −2696

File changed.

Preview size limit exceeded, changes collapsed.

+36 −15
Original line number Diff line number Diff line
# SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint
"""Wikidata
"""This module implements the Wikidata engine.  Some implementations are shared
from :ref:`wikipedia engine`.

"""
# pylint: disable=missing-class-docstring

from typing import TYPE_CHECKING
from hashlib import md5
from urllib.parse import urlencode, unquote
from json import loads
@@ -13,13 +16,17 @@ from babel.dates import format_datetime, format_date, format_time, get_datetime_

from searx.data import WIKIDATA_UNITS
from searx.network import post, get
from searx.utils import match_language, searx_useragent, get_string_replaces_function
from searx.utils import searx_useragent, get_string_replaces_function
from searx.external_urls import get_external_url, get_earth_coordinates_url, area_to_osm_zoom
from searx.engines.wikipedia import (  # pylint: disable=unused-import
    fetch_traits,
    _fetch_supported_languages,
    supported_languages_url,
)
from searx.engines.wikipedia import fetch_traits as _fetch_traits
from searx.enginelib.traits import EngineTraits

if TYPE_CHECKING:
    import logging

    logger: logging.Logger

traits: EngineTraits

# about
about = {
@@ -155,33 +162,35 @@ def send_wikidata_query(query, method='GET'):


def request(query, params):
    language = params['language'].split('-')[0]
    if language == 'all':
        language = 'en'
    else:
        language = match_language(params['language'], supported_languages, language_aliases).split('-')[0]

    # wikidata does not support zh-classical (zh_Hans) / zh-TW, zh-HK and zh-CN
    # mapped to zh
    sxng_lang = params['searxng_locale'].split('-')[0]
    language = traits.get_language(sxng_lang, 'en')

    query, attributes = get_query(query, language)
    logger.debug("request --> language %s // len(attributes): %s", language, len(attributes))

    params['method'] = 'POST'
    params['url'] = SPARQL_ENDPOINT_URL
    params['data'] = {'query': query}
    params['headers'] = get_headers()

    params['language'] = language
    params['attributes'] = attributes

    return params


def response(resp):

    results = []
    jsonresponse = loads(resp.content.decode())

    language = resp.search_params['language'].lower()
    language = resp.search_params['language']
    attributes = resp.search_params['attributes']
    logger.debug("request --> language %s // len(attributes): %s", language, len(attributes))

    seen_entities = set()

    for result in jsonresponse.get('results', {}).get('bindings', []):
        attribute_result = {key: value['value'] for key, value in result.items()}
        entity_url = attribute_result['item']
@@ -757,3 +766,15 @@ def init(engine_settings=None): # pylint: disable=unused-argument
        lang = result['name']['xml:lang']
        entity_id = result['item']['value'].replace('http://www.wikidata.org/entity/', '')
        WIKIDATA_PROPERTIES[(entity_id, lang)] = name.capitalize()


def fetch_traits(engine_traits: EngineTraits):
    """Use languages evaluated from :py:obj:`wikipedia.fetch_traits
    <searx.engines.wikipedia.fetch_traits>` except zh-classical (zh_Hans), which
    is not supported by wikidata.

    :param engine_traits: the :py:obj:`EngineTraits` instance to populate
        (mutated in place, nothing is returned).
    """

    _fetch_traits(engine_traits)
    # wikidata does not support zh-classical (zh_Hans); drop it if present.
    # Use a default so a future change in the wikipedia traits can not raise
    # a KeyError here.
    engine_traits.languages.pop('zh_Hans', None)
    # wikidata does not have net-locations for the languages
    engine_traits.custom['wiki_netloc'] = {}
+96 −154
Original line number Diff line number Diff line
# SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint
"""This module implements the Wikipedia engine.  Some of these implementations
are shared by other engines:

- :ref:`wikidata engine`

The list of supported languages is fetched from the article linked by
:py:obj:`wikipedia_article_depth`.  Unlike traditional search engines, wikipedia
does not support one Wikipedia for all the languages, but there is one Wikipedia
for every language (:py:obj:`fetch_traits`).
"""
 Wikipedia (Web)
"""

from urllib.parse import quote
from json import loads
import urllib.parse
import babel

from lxml import html
from searx.utils import match_language, searx_useragent

from searx import network
from searx.locales import language_tag
from searx.enginelib.traits import EngineTraits

engine_traits: EngineTraits
traits: EngineTraits

# about
about = {
@@ -22,32 +32,40 @@ about = {
    "results": 'JSON',
}


send_accept_language_header = True

# search-url
search_url = 'https://{language}.wikipedia.org/api/rest_v1/page/summary/{title}'
supported_languages_url = 'https://meta.wikimedia.org/wiki/List_of_Wikipedias'
language_variants = {"zh": ("zh-cn", "zh-hk", "zh-mo", "zh-my", "zh-sg", "zh-tw")}
wikipedia_article_depth = 'https://meta.wikimedia.org/wiki/Wikipedia_article_depth'
"""The *editing depth* of Wikipedia is one of several possible rough indicators
of the encyclopedia's collaborative quality, showing how frequently its articles
are updated.  The measurement of depth was introduced after some limitations of
the classic measurement of article count were realized.
"""

# example: https://zh-classical.wikipedia.org/api/rest_v1/page/summary/日
rest_v1_summary_url = 'https://{wiki_netloc}/api/rest_v1/page/summary/{title}'
"""`wikipedia rest_v1 summary API`_: The summary response includes an extract of
the first paragraph of the page in plain text and HTML as well as the type of
page. This is useful for page previews (fka. Hovercards, aka. Popups) on the web
and link previews in the apps.

.. _wikipedia rest_v1 summary API: https://en.wikipedia.org/api/rest_v1/#/Page%20content/get_page_summary__title_

# set language in base_url
def url_lang(lang):
    lang_pre = lang.split('-')[0]
    if lang_pre == 'all' or lang_pre not in supported_languages and lang_pre not in language_aliases:
        return 'en'
    return match_language(lang, supported_languages, language_aliases).split('-')[0]
"""


# do search-request
def request(query, params):
    """Assemble a request (`wikipedia rest_v1 summary API`_)."""
    if query.islower():
        query = query.title()

    language = url_lang(params['language'])
    params['url'] = search_url.format(title=quote(query), language=language)
    engine_language = traits.get_language(params['searxng_locale'], 'en')
    wiki_netloc = traits.custom['wiki_netloc'].get(engine_language, 'https://en.wikipedia.org/wiki/')
    title = urllib.parse.quote(query)

    # '!wikipedia 日 :zh-TW' --> https://zh-classical.wikipedia.org/
    # '!wikipedia 日 :zh' --> https://zh.wikipedia.org/
    params['url'] = rest_v1_summary_url.format(wiki_netloc=wiki_netloc, title=title)

    params['headers']['User-Agent'] = searx_useragent()
    params['raise_for_httperror'] = False
    params['soft_max_redirects'] = 2

@@ -56,13 +74,14 @@ def request(query, params):

# get response from search-request
def response(resp):

    results = []
    if resp.status_code == 404:
        return []

    if resp.status_code == 400:
        try:
            api_result = loads(resp.text)
        except:
            api_result = resp.json()
        except Exception:  # pylint: disable=broad-except
            pass
        else:
            if (
@@ -73,18 +92,12 @@ def response(resp):

    network.raise_for_httperror(resp)

    results = []
    api_result = loads(resp.text)

    # skip disambiguation pages
    if api_result.get('type') != 'standard':
        return []

    api_result = resp.json()
    title = api_result['title']
    wikipedia_link = api_result['content_urls']['desktop']['page']
    results.append({'url': wikipedia_link, 'title': title, 'content': api_result.get('description', '')})

    results.append({'url': wikipedia_link, 'title': title})

    if api_result.get('type') == 'standard':
        results.append(
            {
                'infobox': title,
@@ -98,27 +111,6 @@ def response(resp):
    return results


# get supported languages from their site
def _fetch_supported_languages(resp):
    """Parse the *List of Wikipedias* page and return a mapping of language
    code to ``{"name": ..., "english_name": ...}`` for every Wikipedia with
    at least 100 articles."""
    languages = {}
    dom = html.fromstring(resp.text)
    for table in dom.xpath('//table[contains(@class,"sortable")]'):
        # the first <tr> is the header row -- skip it
        for row in table.xpath('.//tr')[1:]:
            cells = row.xpath('./td')
            lang_code = cells[3].xpath('./a')[0].text
            # NOTE(review): name and english_name both read column 1 --
            # presumably one of them should come from a different column;
            # kept as-is to preserve behavior, confirm against the table layout
            lang_name = cells[1].xpath('./a')[0].text
            english = cells[1].xpath('./a')[0].text
            article_count = int(cells[4].xpath('./a')[0].text.replace(',', ''))
            if article_count >= 100:
                # languages with fewer than 100 articles are excluded
                languages[lang_code] = {"name": lang_name, "english_name": english}
    return languages


# Nonstandard language codes
#
# These Wikipedias use language codes that do not conform to the ISO 639
@@ -135,104 +127,57 @@ lang_map = {
    'nrm': 'nrf',
    'roa-rup': 'rup',
    'nds-nl': 'nds',
    #'roa-tara: – invented code used for the Tarantino Wikipedia (again, roa is the standard code for the large family of Romance languages that the Tarantino dialect falls within)
    #'simple: – invented code used for the Simple English Wikipedia (not the official IETF code en-simple)
    'zh-classical': 'zh_Hant',
    'zh-min-nan': 'nan',
    'zh-yue': 'yue',
    'an': 'arg',
    'zh-classical': 'zh-Hant',  # babel maps classical to zh-Hans (for whatever reason)
}

unknown_langs = [
    'ab',  # Abkhazian
    'alt',  # Southern Altai
    'an',  # Aragonese
    'ang',  # Anglo-Saxon
    'arc',  # Aramaic
    'ary',  # Moroccan Arabic
    'av',  # Avar
    'ba',  # Bashkir
    'be-tarask',
    'bar',  # Bavarian
    'bcl',  # Central Bicolano
    'bh',  # Bhojpuri
    'bi',  # Bislama
    'bjn',  # Banjar
    'blk',  # Pa'O
    'bpy',  # Bishnupriya Manipuri
    'bxr',  # Buryat
    'cbk-zam',  # Zamboanga Chavacano
    'co',  # Corsican
    'cu',  # Old Church Slavonic
    'dty',  # Doteli
    'dv',  # Divehi
    'ext',  # Extremaduran
    'fj',  # Fijian
    'frp',  # Franco-Provençal
    'gan',  # Gan
    'gom',  # Goan Konkani
    'be-tarask',  # Belarusian variant / Belarusian is already covered by 'be'
    'bpy',  # Bishnupriya Manipuri is unknown by babel
    'hif',  # Fiji Hindi
    'ilo',  # Ilokano
    'inh',  # Ingush
    'jbo',  # Lojban
    'kaa',  # Karakalpak
    'kbd',  # Kabardian Circassian
    'kg',  # Kongo
    'koi',  # Komi-Permyak
    'krc',  # Karachay-Balkar
    'kv',  # Komi
    'lad',  # Ladino
    'lbe',  # Lak
    'lez',  # Lezgian
    'li',  # Limburgish
    'ltg',  # Latgalian
    'mdf',  # Moksha
    'mnw',  # Mon
    'mwl',  # Mirandese
    'myv',  # Erzya
    'na',  # Nauruan
    'nah',  # Nahuatl
    'nov',  # Novial
    'nrm',  # Norman
    'pag',  # Pangasinan
    'pam',  # Kapampangan
    'pap',  # Papiamentu
    'pdc',  # Pennsylvania German
    'pfl',  # Palatinate German
    'roa-rup',  # Aromanian
    'sco',  # Scots
    'sco',  # Scots (https://sco.wikipedia.org) is not known by babel, Scottish Gaelic (https://gd.wikipedia.org) is known by babel
    'sco',  # Scots (sco) is not known by babel, Scottish Gaelic (gd) is known by babel
    'sh',  # Serbo-Croatian
    'simple',  # simple english is not know as a natural language different to english (babel)
    'sm',  # Samoan
    'srn',  # Sranan
    'stq',  # Saterland Frisian
    'szy',  # Sakizaya
    'tcy',  # Tulu
    'tet',  # Tetum
    'tpi',  # Tok Pisin
    'trv',  # Seediq
    'ty',  # Tahitian
    'tyv',  # Tuvan
    'udm',  # Udmurt
    'vep',  # Vepsian
    'vls',  # West Flemish
    'vo',  # Volapük
    'wa',  # Walloon
    'xal',  # Kalmyk
]


def fetch_traits(engine_traits: EngineTraits):
    """Fetch languages from Wikipedia"""
    # pylint: disable=import-outside-toplevel
    """Fetch languages from Wikipedia.

    engine_traits.data_type = 'supported_languages'  # deprecated
    The location of the Wikipedia address of a language is mapped in a
    :py:obj:`custom field <searx.enginelib.traits.EngineTraits.custom>`
    (``wiki_netloc``).  Here is a reduced example:

    import babel
    from searx.locales import language_tag
    .. code:: python

       traits.custom['wiki_netloc'] = {
           "en": "en.wikipedia.org",
           ..
           "gsw": "als.wikipedia.org",
           ..
           "zh": "zh.wikipedia.org",
           "zh-classical": "zh-classical.wikipedia.org"
       }

    """

    engine_traits.custom['wiki_netloc'] = {}

    resp = network.get('https://meta.wikimedia.org/wiki/List_of_Wikipedias')
    # insert alias to map from a region like zh-CN to a language zh_Hans
    engine_traits.languages['zh_Hans'] = 'zh'

    resp = network.get(wikipedia_article_depth)
    if not resp.ok:
        print("ERROR: response from Wikipedia is not OK.")

@@ -242,34 +187,31 @@ def fetch_traits(engine_traits: EngineTraits):
        cols = row.xpath('./td')
        if not cols:
            continue

        cols = [c.text_content().strip() for c in cols]
        articles = int(cols[4].replace(',', '').replace('-', '0'))
        users = int(cols[8].replace(',', '').replace('-', '0'))
        depth = cols[11].strip('-')

        if articles < 1000:
        depth = float(cols[3].replace('-', '0').replace(',', ''))
        articles = int(cols[4].replace(',', '').replace(',', ''))

        if articles < 10000:
            # exclude languages with too few articles
            continue

        # depth: rough indicator of a Wikipedia’s quality, showing how
        #        frequently its articles are updated.
        if depth == '':
            if users < 1000:
                # depth is not calculated --> at least 1000 user should registered
                continue
        elif int(depth) < 20:
        if int(depth) < 20:
            # Rough indicator of a Wikipedia’s quality, showing how frequently
            # its articles are updated.
            continue

        eng_tag = cols[3]
        eng_tag = cols[2]
        wiki_url = row.xpath('./td[3]/a/@href')[0]
        wiki_url = urllib.parse.urlparse(wiki_url)

        if eng_tag in unknown_langs:
            continue

        try:
            sxng_tag = language_tag(babel.Locale.parse(lang_map.get(eng_tag, eng_tag)))
            sxng_tag = language_tag(babel.Locale.parse(lang_map.get(eng_tag, eng_tag), sep='-'))
        except babel.UnknownLocaleError:
            print("ERROR: %s -> %s is unknown by babel" % (cols[1], eng_tag))
            print("ERROR: %s [%s] is unknown by babel" % (cols[0], eng_tag))
            continue

        conflict = engine_traits.languages.get(sxng_tag)
@@ -277,6 +219,6 @@ def fetch_traits(engine_traits: EngineTraits):
            if conflict != eng_tag:
                print("CONFLICT: babel %s --> %s, %s" % (sxng_tag, conflict, eng_tag))
            continue
        engine_traits.languages[sxng_tag] = eng_tag

    engine_traits.languages['zh_Hans'] = 'zh'
        engine_traits.languages[sxng_tag] = eng_tag
        engine_traits.custom['wiki_netloc'][eng_tag] = wiki_url.netloc