Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit 7daf4f95 authored by Markus Heiser's avatar Markus Heiser
Browse files

[mod] Wikipedia: fetch engine traits (data_type: supported_languages)



Implements a fetch_traits function for the Wikipedia engines.

.. note::

   Does not include migration of the request method from 'supported_languages'
   to the 'traits' (EngineTraits) object!

Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>
parent f78f9083
Loading
Loading
Loading
Loading
+220 −2
Original line number Diff line number Diff line
@@ -5121,7 +5121,116 @@
    "all_locale": null,
    "custom": {},
    "data_type": "supported_languages",
    "languages": {},
    "languages": {
      "af": "af",
      "ak": "tw",
      "am": "am",
      "ar": "ar",
      "as": "as",
      "az": "az",
      "be": "be",
      "bg": "bg",
      "bn": "bn",
      "bo": "bo",
      "bs": "bs",
      "ca": "ca",
      "chr": "chr",
      "ckb": "ckb",
      "cs": "cs",
      "da": "da",
      "de": "de",
      "dsb": "dsb",
      "el": "el",
      "en": "en",
      "es": "es",
      "et": "et",
      "fa": "fa",
      "fi": "fi",
      "fil": "tl",
      "fo": "fo",
      "fr": "fr",
      "fur": "fur",
      "fy": "fy",
      "gl": "gl",
      "gsw": "als",
      "gu": "gu",
      "gv": "gv",
      "haw": "haw",
      "he": "he",
      "hi": "hi",
      "hsb": "hsb",
      "hu": "hu",
      "hy": "hy",
      "id": "id",
      "is": "is",
      "it": "it",
      "ja": "ja",
      "jv": "jv",
      "ka": "ka",
      "km": "km",
      "kn": "kn",
      "ko": "ko",
      "ks": "ks",
      "ksh": "ksh",
      "kw": "kw",
      "lb": "lb",
      "lg": "lg",
      "ln": "ln",
      "lo": "lo",
      "lt": "lt",
      "lv": "lv",
      "mai": "mai",
      "mk": "mk",
      "ml": "ml",
      "mn": "mn",
      "mr": "mr",
      "ms": "ms",
      "mt": "mt",
      "nds": "nds-nl",
      "ne": "ne",
      "no": "no",
      "om": "om",
      "or": "or",
      "os": "os",
      "pa": "pa",
      "pl": "pl",
      "ps": "ps",
      "pt": "pt",
      "qu": "qu",
      "rm": "rm",
      "ro": "ro",
      "ru": "ru",
      "rw": "rw",
      "sa": "sa",
      "sah": "sah",
      "sd": "sd",
      "se": "se",
      "shi": "shi",
      "si": "si",
      "sk": "sk",
      "sl": "sl",
      "smn": "smn",
      "so": "so",
      "sq": "sq",
      "sr": "sr",
      "ta": "ta",
      "te": "te",
      "th": "th",
      "tk": "tk",
      "to": "to",
      "tr": "tr",
      "ug": "ug",
      "uk": "uk",
      "ur": "ur",
      "uz": "uz",
      "vi": "vi",
      "wo": "wo",
      "xh": "xh",
      "yi": "yi",
      "zh": "zh",
      "zh_Hans": "zh",
      "zh_Hant": "zh-classical"
    },
    "regions": {},
    "supported_languages": {
      "ab": {
@@ -6402,7 +6511,116 @@
    "all_locale": null,
    "custom": {},
    "data_type": "supported_languages",
    "languages": {},
    "languages": {
      "af": "af",
      "ak": "tw",
      "am": "am",
      "ar": "ar",
      "as": "as",
      "az": "az",
      "be": "be",
      "bg": "bg",
      "bn": "bn",
      "bo": "bo",
      "bs": "bs",
      "ca": "ca",
      "chr": "chr",
      "ckb": "ckb",
      "cs": "cs",
      "da": "da",
      "de": "de",
      "dsb": "dsb",
      "el": "el",
      "en": "en",
      "es": "es",
      "et": "et",
      "fa": "fa",
      "fi": "fi",
      "fil": "tl",
      "fo": "fo",
      "fr": "fr",
      "fur": "fur",
      "fy": "fy",
      "gl": "gl",
      "gsw": "als",
      "gu": "gu",
      "gv": "gv",
      "haw": "haw",
      "he": "he",
      "hi": "hi",
      "hsb": "hsb",
      "hu": "hu",
      "hy": "hy",
      "id": "id",
      "is": "is",
      "it": "it",
      "ja": "ja",
      "jv": "jv",
      "ka": "ka",
      "km": "km",
      "kn": "kn",
      "ko": "ko",
      "ks": "ks",
      "ksh": "ksh",
      "kw": "kw",
      "lb": "lb",
      "lg": "lg",
      "ln": "ln",
      "lo": "lo",
      "lt": "lt",
      "lv": "lv",
      "mai": "mai",
      "mk": "mk",
      "ml": "ml",
      "mn": "mn",
      "mr": "mr",
      "ms": "ms",
      "mt": "mt",
      "nds": "nds-nl",
      "ne": "ne",
      "no": "no",
      "om": "om",
      "or": "or",
      "os": "os",
      "pa": "pa",
      "pl": "pl",
      "ps": "ps",
      "pt": "pt",
      "qu": "qu",
      "rm": "rm",
      "ro": "ro",
      "ru": "ru",
      "rw": "rw",
      "sa": "sa",
      "sah": "sah",
      "sd": "sd",
      "se": "se",
      "shi": "shi",
      "si": "si",
      "sk": "sk",
      "sl": "sl",
      "smn": "smn",
      "so": "so",
      "sq": "sq",
      "sr": "sr",
      "ta": "ta",
      "te": "te",
      "th": "th",
      "tk": "tk",
      "to": "to",
      "tr": "tr",
      "ug": "ug",
      "uk": "uk",
      "ur": "ur",
      "uz": "uz",
      "vi": "vi",
      "wo": "wo",
      "xh": "xh",
      "yi": "yi",
      "zh": "zh",
      "zh_Hans": "zh",
      "zh_Hant": "zh-classical"
    },
    "regions": {},
    "supported_languages": {
      "ab": {
+1 −0
Original line number Diff line number Diff line
@@ -16,6 +16,7 @@ from searx.network import post, get
from searx.utils import match_language, searx_useragent, get_string_replaces_function
from searx.external_urls import get_external_url, get_earth_coordinates_url, area_to_osm_zoom
from searx.engines.wikipedia import (  # pylint: disable=unused-import
    fetch_traits,
    _fetch_supported_languages,
    supported_languages_url,
)
+170 −4
Original line number Diff line number Diff line
@@ -5,9 +5,12 @@

from urllib.parse import quote
from json import loads
from lxml.html import fromstring
from lxml import html
from searx.utils import match_language, searx_useragent
from searx.network import raise_for_httperror
from searx import network
from searx.enginelib.traits import EngineTraits

engine_traits: EngineTraits

# about
about = {
@@ -68,7 +71,7 @@ def response(resp):
            ):
                return []

    raise_for_httperror(resp)
    network.raise_for_httperror(resp)

    results = []
    api_result = loads(resp.text)
@@ -98,7 +101,7 @@ def response(resp):
# get supported languages from their site
def _fetch_supported_languages(resp):
    supported_languages = {}
    dom = fromstring(resp.text)
    dom = html.fromstring(resp.text)
    tables = dom.xpath('//table[contains(@class,"sortable")]')
    for table in tables:
        # exclude header row
@@ -114,3 +117,166 @@ def _fetch_supported_languages(resp):
                supported_languages[code] = {"name": name, "english_name": english_name}

    return supported_languages


# Nonstandard language codes
#
# These Wikipedias use language codes that do not conform to the ISO 639
# standard (which is how wiki subdomains are chosen nowadays).
#
# The mapping translates such a Wikipedia code (key) into the code (value) that
# fetch_traits() hands over to babel.Locale.parse() instead of the Wikipedia
# code itself.  Codes without an entry here are passed to babel unchanged.

lang_map = {
    'be-tarask': 'bel',
    'ak': 'aka',
    'als': 'gsw',
    'bat-smg': 'sgs',
    'cbk-zam': 'cbk',
    'fiu-vro': 'vro',
    'map-bms': 'map',
    'nrm': 'nrf',
    'roa-rup': 'rup',
    'nds-nl': 'nds',
    # Not mapped: 'roa-tara' -- invented code used for the Tarantino Wikipedia
    # (roa is the standard code for the large family of Romance languages that
    # the Tarantino dialect falls within).
    # Not mapped: 'simple' -- invented code used for the Simple English
    # Wikipedia (not the official IETF code en-simple).
    'zh-classical': 'zh_Hant',
    'zh-min-nan': 'nan',
    'zh-yue': 'yue',
    'an': 'arg',
}

# Wikipedia language codes that fetch_traits() skips without trying to resolve
# them.  Judging by the inline notes these are codes babel does not know as a
# locale -- NOTE(review): confirm against the babel version in use.
#
# Each code must be listed only once.
unknown_langs = [
    'ab',  # Abkhazian
    'alt',  # Southern Altai
    'an',  # Aragonese
    'ang',  # Anglo-Saxon
    'arc',  # Aramaic
    'ary',  # Moroccan Arabic
    'av',  # Avar
    'ba',  # Bashkir
    'be-tarask',
    'bar',  # Bavarian
    'bcl',  # Central Bicolano
    'bh',  # Bhojpuri
    'bi',  # Bislama
    'bjn',  # Banjar
    'blk',  # Pa'O
    'bpy',  # Bishnupriya Manipuri
    'bxr',  # Buryat
    'cbk-zam',  # Zamboanga Chavacano
    'co',  # Corsican
    'cu',  # Old Church Slavonic
    'dty',  # Doteli
    'dv',  # Divehi
    'ext',  # Extremaduran
    'fj',  # Fijian
    'frp',  # Franco-Provençal
    'gan',  # Gan
    'gom',  # Goan Konkani
    'hif',  # Fiji Hindi
    'ilo',  # Ilokano
    'inh',  # Ingush
    'jbo',  # Lojban
    'kaa',  # Karakalpak
    'kbd',  # Kabardian Circassian
    'kg',  # Kongo
    'koi',  # Komi-Permyak
    'krc',  # Karachay-Balkar
    'kv',  # Komi
    'lad',  # Ladino
    'lbe',  # Lak
    'lez',  # Lezgian
    'li',  # Limburgish
    'ltg',  # Latgalian
    'mdf',  # Moksha
    'mnw',  # Mon
    'mwl',  # Mirandese
    'myv',  # Erzya
    'na',  # Nauruan
    'nah',  # Nahuatl
    'nov',  # Novial
    'nrm',  # Norman
    'pag',  # Pangasinan
    'pam',  # Kapampangan
    'pap',  # Papiamentu
    'pdc',  # Pennsylvania German
    'pfl',  # Palatinate German
    'roa-rup',  # Aromanian
    # Scots (https://sco.wikipedia.org) is not known by babel, Scottish Gaelic
    # (https://gd.wikipedia.org) is known by babel
    'sco',  # Scots
    'sh',  # Serbo-Croatian
    'simple',  # simple english is not known as a natural language different to english (babel)
    'sm',  # Samoan
    'srn',  # Sranan
    'stq',  # Saterland Frisian
    'szy',  # Sakizaya
    'tcy',  # Tulu
    'tet',  # Tetum
    'tpi',  # Tok Pisin
    'trv',  # Seediq
    'ty',  # Tahitian
    'tyv',  # Tuvan
    'udm',  # Udmurt
    'vep',  # Vepsian
    'vls',  # West Flemish
    'vo',  # Volapük
    'wa',  # Walloon
    'xal',  # Kalmyk
]


def fetch_traits(engine_traits: EngineTraits):
    """Fetch languages from Wikipedia and store them in ``engine_traits``.

    Scans the rows of the sortable tables on the *List of Wikipedias* page and,
    for every Wikipedia that passes the filters below, adds an entry
    ``engine_traits.languages[<SearXNG language tag>] = <Wikipedia code>``.

    A Wikipedia is skipped when

    - it has fewer than 1000 articles,
    - its *depth* value is below 20 or, if no depth is given, it has fewer than
      1000 users,
    - its code is listed in ``unknown_langs`` or
    - babel can not build a locale from its code (after ``lang_map`` has been
      applied).

    If two Wikipedias resolve to the same SearXNG tag, the first one seen is
    kept and the conflict is reported on stdout.

    :param engine_traits: traits object that is filled in place.
    :return: ``None``.  When the HTTP response is not OK, an error is printed
        and ``engine_traits.languages`` is left untouched.
    """
    # pylint: disable=import-outside-toplevel

    engine_traits.data_type = 'supported_languages'  # deprecated

    import babel
    from searx.locales import language_tag

    resp = network.get('https://meta.wikimedia.org/wiki/List_of_Wikipedias')
    if not resp.ok:
        print("ERROR: response from Wikipedia is not OK.")
        # Do not try to parse an error page: nothing useful can come out of it.
        return

    dom = html.fromstring(resp.text)
    for row in dom.xpath('//table[contains(@class,"sortable")]//tbody/tr'):

        cols = row.xpath('./td')
        if not cols:
            # rows without <td> cells (e.g. header rows) carry no data
            continue

        cols = [c.text_content().strip() for c in cols]
        # Numbers use ',' as thousands separator; '-' is treated as zero.
        articles = int(cols[4].replace(',', '').replace('-', '0'))
        users = int(cols[8].replace(',', '').replace('-', '0'))
        # A depth cell consisting only of '-' characters becomes an empty string.
        depth = cols[11].strip('-')

        if articles < 1000:
            # exclude languages with too few articles
            continue

        # depth: rough indicator of a Wikipedia’s quality, showing how
        #        frequently its articles are updated.
        if depth == '':
            if users < 1000:
                # depth is not calculated --> at least 1000 users should be registered
                continue
        elif int(depth) < 20:
            continue

        eng_tag = cols[3]

        if eng_tag in unknown_langs:
            continue

        try:
            # Nonstandard Wikipedia codes are translated by lang_map before
            # they are handed to babel.
            sxng_tag = language_tag(babel.Locale.parse(lang_map.get(eng_tag, eng_tag)))
        except babel.UnknownLocaleError:
            print(f"ERROR: {cols[1]} -> {eng_tag} is unknown by babel")
            continue

        conflict = engine_traits.languages.get(sxng_tag)
        if conflict:
            # first come, first served: keep the tag that is already registered
            if conflict != eng_tag:
                print(f"CONFLICT: babel {sxng_tag} --> {conflict}, {eng_tag}")
            continue
        engine_traits.languages[sxng_tag] = eng_tag

    # Always (re)set this entry once the page has been processed.
    engine_traits.languages['zh_Hans'] = 'zh'