
Unverified commit 5c8d56e7 authored by Markus Heiser, committed by GitHub

Merge pull request #2316 from return42/fix-2314-upd-desc

[fix] searxng_extra/update/update_engine_descriptions.py 
parents 0adfed19 09295a3f
+346 −638

File changed; preview size limit exceeded, changes collapsed.

+353 −8
@@ -3480,6 +3480,7 @@
      "es-US": "es_US",
      "es-US": "es_US",
      "es-UY": "es_UY",
      "es-UY": "es_UY",
      "es-VE": "es_VE",
      "es-VE": "es_VE",
      "et-EE": "et_EE",
      "fi-FI": "fi_FI",
      "fi-FI": "fi_FI",
      "fil-PH": "fil_PH",
      "fil-PH": "fil_PH",
      "fr-BE": "fr_BE",
      "fr-BE": "fr_BE",
@@ -3487,6 +3488,7 @@
      "fr-CH": "fr_CH",
      "fr-CH": "fr_CH",
      "fr-FR": "fr_FR",
      "fr-FR": "fr_FR",
      "hi-IN": "hi_IN",
      "hi-IN": "hi_IN",
      "hu-HU": "hu_HU",
      "id-ID": "id_ID",
      "id-ID": "id_ID",
      "it-CH": "it_CH",
      "it-CH": "it_CH",
      "it-IT": "it_IT",
      "it-IT": "it_IT",
@@ -3514,6 +3516,7 @@
  "wikidata": {
  "wikidata": {
    "all_locale": null,
    "all_locale": null,
    "custom": {
    "custom": {
      "WIKIPEDIA_LANGUAGES": [],
      "wiki_netloc": {}
      "wiki_netloc": {}
    },
    },
    "data_type": "traits_v1",
    "data_type": "traits_v1",
@@ -3556,6 +3559,7 @@
      "ja": "ja",
      "ja": "ja",
      "jv": "jv",
      "jv": "jv",
      "ka": "ka",
      "ka": "ka",
      "km": "km",
      "kn": "kn",
      "kn": "kn",
      "ko": "ko",
      "ko": "ko",
      "lb": "lb",
      "lb": "lb",
@@ -3566,8 +3570,8 @@
      "ml": "ml",
      "ml": "ml",
      "mn": "mn",
      "mn": "mn",
      "mr": "mr",
      "mr": "mr",
      "nb": "no",
      "ne": "ne",
      "ne": "ne",
      "no": "no",
      "or": "or",
      "or": "or",
      "os": "os",
      "os": "os",
      "pa": "pa",
      "pa": "pa",
@@ -3595,13 +3599,345 @@
      "vi": "vi",
      "vi": "vi",
      "yi": "yi",
      "yi": "yi",
      "zh": "zh",
      "zh": "zh",
      "zh_Hant": "zh-classical"
      "zh_Hans": "zh",
      "zh_Hant": "zh"
    },
    "regions": {}
    "regions": {
      "zh-CN": "zh",
      "zh-HK": "zh",
      "zh-MO": "zh",
      "zh-MY": "zh",
      "zh-SG": "zh",
      "zh-TW": "zh",
      "zh-classical": "zh-classical"
    }
  },
  "wikipedia": {
    "all_locale": null,
    "custom": {
      "WIKIPEDIA_LANGUAGES": [
        "ab",
        "ace",
        "ady",
        "af",
        "ak",
        "als",
        "alt",
        "am",
        "ami",
        "an",
        "ang",
        "anp",
        "ar",
        "arc",
        "ary",
        "arz",
        "as",
        "ast",
        "atj",
        "av",
        "avk",
        "awa",
        "ay",
        "az",
        "azb",
        "ba",
        "ban",
        "bar",
        "bat-smg",
        "bcl",
        "be",
        "be-tarask",
        "bg",
        "bh",
        "bi",
        "bjn",
        "blk",
        "bm",
        "bn",
        "bo",
        "bpy",
        "br",
        "bs",
        "bug",
        "bxr",
        "ca",
        "cbk-zam",
        "cdo",
        "ce",
        "ceb",
        "ch",
        "chr",
        "chy",
        "ckb",
        "co",
        "cr",
        "crh",
        "cs",
        "csb",
        "cu",
        "cv",
        "cy",
        "da",
        "dag",
        "de",
        "din",
        "diq",
        "dsb",
        "dty",
        "dv",
        "dz",
        "ee",
        "el",
        "eml",
        "en",
        "eo",
        "es",
        "et",
        "eu",
        "ext",
        "fa",
        "ff",
        "fi",
        "fiu-vro",
        "fj",
        "fo",
        "fr",
        "frp",
        "frr",
        "fur",
        "fy",
        "ga",
        "gag",
        "gan",
        "gcr",
        "gd",
        "gl",
        "glk",
        "gn",
        "gom",
        "gor",
        "got",
        "gu",
        "guc",
        "gur",
        "guw",
        "gv",
        "ha",
        "hak",
        "haw",
        "he",
        "hi",
        "hif",
        "hr",
        "hsb",
        "ht",
        "hu",
        "hy",
        "hyw",
        "ia",
        "id",
        "ie",
        "ig",
        "ik",
        "ilo",
        "inh",
        "io",
        "is",
        "it",
        "iu",
        "ja",
        "jam",
        "jbo",
        "jv",
        "ka",
        "kaa",
        "kab",
        "kbd",
        "kbp",
        "kcg",
        "kg",
        "ki",
        "kk",
        "kl",
        "km",
        "kn",
        "ko",
        "koi",
        "krc",
        "ks",
        "ksh",
        "ku",
        "kv",
        "kw",
        "ky",
        "la",
        "lad",
        "lb",
        "lbe",
        "lez",
        "lfn",
        "lg",
        "li",
        "lij",
        "lld",
        "lmo",
        "ln",
        "lo",
        "lt",
        "ltg",
        "lv",
        "mad",
        "mai",
        "map-bms",
        "mdf",
        "mg",
        "mhr",
        "mi",
        "min",
        "mk",
        "ml",
        "mn",
        "mni",
        "mnw",
        "mr",
        "mrj",
        "ms",
        "mt",
        "mwl",
        "my",
        "myv",
        "mzn",
        "na",
        "nah",
        "nap",
        "nds",
        "nds-nl",
        "ne",
        "new",
        "nia",
        "nl",
        "nn",
        "no",
        "nov",
        "nqo",
        "nrm",
        "nso",
        "nv",
        "ny",
        "oc",
        "olo",
        "om",
        "or",
        "os",
        "pa",
        "pag",
        "pam",
        "pap",
        "pcd",
        "pcm",
        "pdc",
        "pfl",
        "pi",
        "pih",
        "pl",
        "pms",
        "pnb",
        "pnt",
        "ps",
        "pt",
        "pwn",
        "qu",
        "rm",
        "rmy",
        "rn",
        "ro",
        "roa-rup",
        "roa-tara",
        "ru",
        "rue",
        "rw",
        "sa",
        "sah",
        "sat",
        "sc",
        "scn",
        "sco",
        "sd",
        "se",
        "sg",
        "sh",
        "shi",
        "shn",
        "si",
        "simple",
        "sk",
        "skr",
        "sl",
        "sm",
        "smn",
        "sn",
        "so",
        "sq",
        "sr",
        "srn",
        "ss",
        "st",
        "stq",
        "su",
        "sv",
        "sw",
        "szl",
        "szy",
        "ta",
        "tay",
        "tcy",
        "te",
        "tet",
        "tg",
        "th",
        "ti",
        "tk",
        "tl",
        "tn",
        "to",
        "tpi",
        "tr",
        "trv",
        "ts",
        "tt",
        "tum",
        "tw",
        "ty",
        "tyv",
        "udm",
        "ug",
        "uk",
        "ur",
        "uz",
        "ve",
        "vec",
        "vep",
        "vi",
        "vls",
        "vo",
        "wa",
        "war",
        "wo",
        "wuu",
        "xal",
        "xh",
        "xmf",
        "yi",
        "yo",
        "za",
        "zea",
        "zh",
        "zh-classical",
        "zh-min-nan",
        "zh-yue",
        "zu"
      ],
      "wiki_netloc": {
      "wiki_netloc": {
        "af": "af.wikipedia.org",
        "af": "af.wikipedia.org",
        "als": "als.wikipedia.org",
        "als": "als.wikipedia.org",
@@ -3640,6 +3976,7 @@
        "ja": "ja.wikipedia.org",
        "ja": "ja.wikipedia.org",
        "jv": "jv.wikipedia.org",
        "jv": "jv.wikipedia.org",
        "ka": "ka.wikipedia.org",
        "ka": "ka.wikipedia.org",
        "km": "km.wikipedia.org",
        "kn": "kn.wikipedia.org",
        "kn": "kn.wikipedia.org",
        "ko": "ko.wikipedia.org",
        "ko": "ko.wikipedia.org",
        "lb": "lb.wikipedia.org",
        "lb": "lb.wikipedia.org",
@@ -3679,8 +4016,7 @@
        "uz": "uz.wikipedia.org",
        "uz": "uz.wikipedia.org",
        "vi": "vi.wikipedia.org",
        "vi": "vi.wikipedia.org",
        "yi": "yi.wikipedia.org",
        "yi": "yi.wikipedia.org",
        "zh": "zh.wikipedia.org",
        "zh": "zh.wikipedia.org"
        "zh-classical": "zh-classical.wikipedia.org"
      }
      }
    },
    },
    "data_type": "traits_v1",
    "data_type": "traits_v1",
@@ -3723,6 +4059,7 @@
      "ja": "ja",
      "ja": "ja",
      "jv": "jv",
      "jv": "jv",
      "ka": "ka",
      "ka": "ka",
      "km": "km",
      "kn": "kn",
      "kn": "kn",
      "ko": "ko",
      "ko": "ko",
      "lb": "lb",
      "lb": "lb",
@@ -3733,8 +4070,8 @@
      "ml": "ml",
      "ml": "ml",
      "mn": "mn",
      "mn": "mn",
      "mr": "mr",
      "mr": "mr",
      "nb": "no",
      "ne": "ne",
      "ne": "ne",
      "no": "no",
      "or": "or",
      "or": "or",
      "os": "os",
      "os": "os",
      "pa": "pa",
      "pa": "pa",
@@ -3763,9 +4100,17 @@
      "yi": "yi",
      "yi": "yi",
      "zh": "zh",
      "zh": "zh",
      "zh_Hans": "zh",
      "zh_Hans": "zh",
      "zh_Hant": "zh-classical"
      "zh_Hant": "zh"
    },
    "regions": {}
    "regions": {
      "zh-CN": "zh",
      "zh-HK": "zh",
      "zh-MO": "zh",
      "zh-MY": "zh",
      "zh-SG": "zh",
      "zh-TW": "zh",
      "zh-classical": "zh-classical"
    }
  },
  "yahoo": {
    "all_locale": "any",
+2 −2
@@ -13,7 +13,7 @@ used.
from __future__ import annotations
import json
import dataclasses
from typing import Dict, Union, Callable, Optional, TYPE_CHECKING
from typing import Dict, Iterable, Union, Callable, Optional, TYPE_CHECKING
from typing_extensions import Literal, Self

from searx import locales
@@ -81,7 +81,7 @@ class EngineTraits:
    """Data type, default is 'traits_v1'.
    """Data type, default is 'traits_v1'.
    """
    """


    custom: Dict[str, Dict] = dataclasses.field(default_factory=dict)
    custom: Dict[str, Union[Dict[str, Dict], Iterable[str]]] = dataclasses.field(default_factory=dict)
    """A place to store engine's custom traits, not related to the SearXNG core
    """A place to store engine's custom traits, not related to the SearXNG core


    """
    """
+20 −15
@@ -18,7 +18,10 @@ from searx.data import WIKIDATA_UNITS
from searx.network import post, get
from searx.utils import searx_useragent, get_string_replaces_function
from searx.external_urls import get_external_url, get_earth_coordinates_url, area_to_osm_zoom
from searx.engines.wikipedia import fetch_traits as _fetch_traits
from searx.engines.wikipedia import (
    fetch_wikimedia_traits,
    get_wiki_params,
)
from searx.enginelib.traits import EngineTraits

if TYPE_CHECKING:
@@ -165,17 +168,15 @@ def request(query, params):


    # wikidata does not support zh-classical (zh_Hans) / zh-TW, zh-HK and zh-CN
    # mapped to zh
    sxng_lang = params['searxng_locale'].split('-')[0]
    eng_tag, _wiki_netloc = get_wiki_params(params['searxng_locale'], traits)
    language = traits.get_language(sxng_lang, 'en')
    query, attributes = get_query(query, eng_tag)

    logger.debug("request --> language %s // len(attributes): %s", eng_tag, len(attributes))
    query, attributes = get_query(query, language)
    logger.debug("request --> language %s // len(attributes): %s", language, len(attributes))

    params['method'] = 'POST'
    params['url'] = SPARQL_ENDPOINT_URL
    params['data'] = {'query': query}
    params['headers'] = get_headers()
    params['language'] = language
    params['language'] = eng_tag
    params['attributes'] = attributes

    return params
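For illustration, a self-contained sketch (not the engine code; the public endpoint URL and the sample query are assumptions made for this example) of a Wikidata SPARQL request where the resolved ``eng_tag`` selects the label language, which is the role it plays in ``get_query()`` above:

import requests  # assumption: requests is available; the engine uses searx.network

eng_tag = 'zh'   # any zh-* SearXNG locale resolves to 'zh' via get_wiki_params()
sparql = """
SELECT ?item ?itemLabel WHERE {
  ?item rdfs:label "出租車"@%s .
  SERVICE wikibase:label { bd:serviceParam wikibase:language "%s". }
} LIMIT 3
""" % (eng_tag, eng_tag)

resp = requests.post(
    'https://query.wikidata.org/sparql',   # public endpoint, assumed for this sketch
    data={'query': sparql},
    headers={'Accept': 'application/sparql-results+json'},
)
print(resp.json()['results']['bindings'])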
@@ -769,12 +770,16 @@ def init(engine_settings=None): # pylint: disable=unused-argument


def fetch_traits(engine_traits: EngineTraits):
    """Use languages evaluated from :py:obj:`wikipedia.fetch_traits
    """Uses languages evaluated from :py:obj:`wikipedia.fetch_wikimedia_traits
    <searx.engines.wikipedia.fetch_traits>` except zh-classical (zh_Hans) what
    <searx.engines.wikipedia.fetch_wikimedia_traits>` and removes
    is not supported by wikidata."""

    - ``traits.custom['wiki_netloc']``: wikidata does not have net-locations for
    _fetch_traits(engine_traits)
      the languages and the list of all
    # wikidata does not support zh-classical (zh_Hans)

    engine_traits.languages.pop('zh_Hans')
    - ``traits.custom['WIKIPEDIA_LANGUAGES']``: not used in the wikipedia engine
    # wikidata does not have net-locations for the languages

    """

    fetch_wikimedia_traits(engine_traits)
    engine_traits.custom['wiki_netloc'] = {}
    engine_traits.custom['WIKIPEDIA_LANGUAGES'] = []
+167 −74
@@ -5,10 +5,54 @@ are shared by other engines:

- :ref:`wikidata engine`

The list of supported languages is fetched from the article linked by
The list of supported languages is :py:obj:`fetched <fetch_wikimedia_traits>` from
:py:obj:`wikipedia_article_depth`.  Unlike traditional search engines, wikipedia
the article linked by :py:obj:`list_of_wikipedias`.
does not support one Wikipedia for all the languages, but there is one Wikipedia

for every language (:py:obj:`fetch_traits`).
Unlike traditional search engines, wikipedia does not support one Wikipedia for
all languages, but there is one Wikipedia for each supported language. Some of
these Wikipedias have a LanguageConverter_ enabled
(:py:obj:`rest_v1_summary_url`).

A LanguageConverter_ (LC) is a system based on language variants that
automatically converts the content of a page into a different variant. A variant
is mostly the same language in a different script.

- `Wikipedias in multiple writing systems`_
- `Automatic conversion between traditional and simplified Chinese characters`_

PR-2554_:
  The Wikipedia link returned by the API is still the same in all cases
  (`https://zh.wikipedia.org/wiki/出租車`_) but if your browser's
  ``Accept-Language`` is set to any of ``zh``, ``zh-CN``, ``zh-TW``, ``zh-HK``
  or .. Wikipedia's LC automatically returns the desired script in their
  web-page.

  - You can test the API here: https://reqbin.com/gesg2kvx

.. _https://zh.wikipedia.org/wiki/出租車:
   https://zh.wikipedia.org/wiki/%E5%87%BA%E7%A7%9F%E8%BB%8A

To support Wikipedia's LanguageConverter_, a SearXNG request to Wikipedia uses
:py:obj:`get_wiki_params` and :py:obj:`wiki_lc_locale_variants` in the
:py:obj:`fetch_wikimedia_traits` function.

To test in SearXNG, query for ``!wp 出租車`` with each of the available Chinese
options:

- ``!wp 出租車 :zh``    should show 出租車
- ``!wp 出租車 :zh-CN`` should show 出租车
- ``!wp 出租車 :zh-TW`` should show 計程車
- ``!wp 出租車 :zh-HK`` should show 的士
- ``!wp 出租車 :zh-SG`` should show 德士

.. _LanguageConverter:
   https://www.mediawiki.org/wiki/Writing_systems#LanguageConverter
.. _Wikipedias in multiple writing systems:
   https://meta.wikimedia.org/wiki/Wikipedias_in_multiple_writing_systems
.. _Automatic conversion between traditional and simplified Chinese characters:
   https://en.wikipedia.org/wiki/Chinese_Wikipedia#Automatic_conversion_between_traditional_and_simplified_Chinese_characters
.. _PR-2554: https://github.com/searx/searx/pull/2554

"""
"""


import urllib.parse
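A minimal sketch (assuming the ``requests`` package, not the engine's ``searx.network`` layer) of the LanguageConverter behaviour the new docstring describes: the same REST summary URL returns a different script depending on the ``Accept-Language`` value:

import requests

url = 'https://zh.wikipedia.org/api/rest_v1/page/summary/%E5%87%BA%E7%A7%9F%E8%BB%8A'
for variant in ('zh-CN', 'zh-TW', 'zh-HK'):
    resp = requests.get(url, headers={'Accept-Language': variant})
    # per the test queries above, the display title should come back as
    # 出租车 (zh-CN), 計程車 (zh-TW) or 的士 (zh-HK)
    print(variant, resp.json().get('titles', {}).get('display'))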
@@ -16,8 +60,9 @@ import babel

from lxml import html

from searx import utils
from searx import network
from searx.locales import language_tag
from searx import locales
from searx.enginelib.traits import EngineTraits

traits: EngineTraits
@@ -33,6 +78,12 @@ about = {
}

send_accept_language_header = True
"""The HTTP ``Accept-Language`` header is needed for wikis where
LanguageConverter_ is enabled."""

list_of_wikipedias = 'https://meta.wikimedia.org/wiki/List_of_Wikipedias'
"""`List of all wikipedias <https://meta.wikimedia.org/wiki/List_of_Wikipedias>`_
"""

wikipedia_article_depth = 'https://meta.wikimedia.org/wiki/Wikipedia_article_depth'
"""The *editing depth* of Wikipedia is one of several possible rough indicators
@@ -41,16 +92,59 @@ are updated. The measurement of depth was introduced after some limitations of
the classic measurement of article count were realized.
"""

# example: https://zh-classical.wikipedia.org/api/rest_v1/page/summary/日
rest_v1_summary_url = 'https://{wiki_netloc}/api/rest_v1/page/summary/{title}'
"""`wikipedia rest_v1 summary API`_: The summary response includes an extract of
"""
the first paragraph of the page in plain text and HTML as well as the type of
`wikipedia rest_v1 summary API`_:
page. This is useful for page previews (fka. Hovercards, aka. Popups) on the web
  The summary response includes an extract of the first paragraph of the page in
and link previews in the apps.
  plain text and HTML as well as the type of page. This is useful for page
  previews (fka. Hovercards, aka. Popups) on the web and link previews in the
  apps.

HTTP ``Accept-Language`` header (:py:obj:`send_accept_language_header`):
  The desired language variant code for wikis where LanguageConverter_ is
  enabled.

.. _wikipedia rest_v1 summary API:
   https://en.wikipedia.org/api/rest_v1/#/Page%20content/get_page_summary__title_

"""

wiki_lc_locale_variants = {
    "zh": (
        "zh-CN",
        "zh-HK",
        "zh-MO",
        "zh-MY",
        "zh-SG",
        "zh-TW",
    ),
    "zh-classical": ("zh-classical",),
}
"""Mapping rule of the LanguageConverter_ to map a language and its variants to
a Locale (used in the HTTP ``Accept-Language`` header). For example see `LC
Chinese`_.

.. _LC Chinese:
   https://meta.wikimedia.org/wiki/Wikipedias_in_multiple_writing_systems#Chinese
"""

wikipedia_script_variants = {
    "zh": (
        "zh_Hant",
        "zh_Hans",
    )
}


.. _wikipedia rest_v1 summary API: https://en.wikipedia.org/api/rest_v1/#/Page%20content/get_page_summary__title_

def get_wiki_params(sxng_locale, eng_traits):
    """Returns the Wikipedia language tag and the netloc that fits to the
    ``sxng_locale``.  To support LanguageConverter_ this function rates a locale
    (region) higher than a language (compare :py:obj:`wiki_lc_locale_variants`).


    """
    """
    eng_tag = eng_traits.get_region(sxng_locale, eng_traits.get_language(sxng_locale, 'en'))
    wiki_netloc = eng_traits.custom['wiki_netloc'].get(eng_tag, 'en.wikipedia.org')
    return eng_tag, wiki_netloc




def request(query, params):
def request(query, params):
@@ -58,12 +152,8 @@ def request(query, params):
    if query.islower():
        query = query.title()

    engine_language = traits.get_language(params['searxng_locale'], 'en')
    _eng_tag, wiki_netloc = get_wiki_params(params['searxng_locale'], traits)
    wiki_netloc = traits.custom['wiki_netloc'].get(engine_language, 'https://en.wikipedia.org/wiki/')
    title = urllib.parse.quote(query)

    # '!wikipedia 日 :zh-TW' --> https://zh-classical.wikipedia.org/
    # '!wikipedia 日 :zh' --> https://zh.wikipedia.org/
    params['url'] = rest_v1_summary_url.format(wiki_netloc=wiki_netloc, title=title)

    params['raise_for_httperror'] = False
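A small, self-contained illustration of the URL built by ``request()`` above, assuming ``get_wiki_params()`` resolved a ``zh-TW`` request to the ``zh.wikipedia.org`` netloc (as the traits data in this PR does):

import urllib.parse

rest_v1_summary_url = 'https://{wiki_netloc}/api/rest_v1/page/summary/{title}'
title = urllib.parse.quote('出租車')
print(rest_v1_summary_url.format(wiki_netloc='zh.wikipedia.org', title=title))
# -> https://zh.wikipedia.org/api/rest_v1/page/summary/%E5%87%BA%E7%A7%9F%E8%BB%8A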
@@ -93,7 +183,7 @@ def response(resp):
    network.raise_for_httperror(resp)

    api_result = resp.json()
    title = api_result['title']
    title = utils.html_to_text(api_result.get('titles', {}).get('display') or api_result.get('title'))
    wikipedia_link = api_result['content_urls']['desktop']['page']
    results.append({'url': wikipedia_link, 'title': title, 'content': api_result.get('description', '')})

@@ -116,7 +206,9 @@ def response(resp):
# These Wikipedias use language codes that do not conform to the ISO 639
# standard (which is how wiki subdomains are chosen nowadays).

lang_map = {
lang_map = locales.LOCALE_BEST_MATCH.copy()
lang_map.update(
    {
        'be-tarask': 'bel',
        'ak': 'aka',
        'als': 'gsw',
@@ -124,6 +216,7 @@ lang_map = {
        'cbk-zam': 'cbk',
        'fiu-vro': 'vro',
        'map-bms': 'map',
        'no': 'nb-NO',
        'nrm': 'nrf',
        'roa-rup': 'rup',
        'nds-nl': 'nds',
@@ -131,29 +224,20 @@ lang_map = {
        'zh-min-nan': 'nan',
        'zh-yue': 'yue',
        'an': 'arg',
        'zh-classical': 'zh-Hant',  # babel maps classical to zh-Hans (for whatever reason)
    }

)
unknown_langs = [
    'an',  # Aragonese
    'ba',  # Bashkir
    'bar',  # Bavarian
    'bcl',  # Central Bicolano
    'be-tarask',  # Belarusian variant / Belarusian is already covered by 'be'
    'bpy',  # Bishnupriya Manipuri is unknown by babel
    'hif',  # Fiji Hindi
    'ilo',  # Ilokano
    'li',  # Limburgish
    'sco',  # Scots (sco) is not known by babel, Scottish Gaelic (gd) is known by babel
    'sh',  # Serbo-Croatian
    'simple',  # simple english is not know as a natural language different to english (babel)
    'vo',  # Volapük
    'wa',  # Walloon
]


def fetch_traits(engine_traits: EngineTraits):
    """Fetch languages from Wikipedia.
    fetch_wikimedia_traits(engine_traits)
    print("WIKIPEDIA_LANGUAGES: %s" % len(engine_traits.custom['WIKIPEDIA_LANGUAGES']))


def fetch_wikimedia_traits(engine_traits: EngineTraits):
    """Fetch languages from Wikipedia.  Not all languages from the
    :py:obj:`list_of_wikipedias` are supported by SearXNG locales, only those
    known from :py:obj:`searx.locales.LOCALE_NAMES` or those with a minimal
    :py:obj:`editing depth <wikipedia_article_depth>`.

    The location of the Wikipedia address of a language is mapped in a
    :py:obj:`custom field <searx.enginelib.traits.EngineTraits.custom>`
@@ -169,15 +253,21 @@ def fetch_traits(engine_traits: EngineTraits):
           "zh": "zh.wikipedia.org",
           "zh": "zh.wikipedia.org",
           "zh-classical": "zh-classical.wikipedia.org"
           "zh-classical": "zh-classical.wikipedia.org"
       }
       }

    """
    """

    # pylint: disable=too-many-branches
    engine_traits.custom['wiki_netloc'] = {}
    engine_traits.custom['wiki_netloc'] = {}
    engine_traits.custom['WIKIPEDIA_LANGUAGES'] = []

    # insert alias to map from a script or region to a wikipedia variant


    # insert alias to map from a region like zh-CN to a language zh_Hans
    for eng_tag, sxng_tag_list in wikipedia_script_variants.items():
    engine_traits.languages['zh_Hans'] = 'zh'
        for sxng_tag in sxng_tag_list:
            engine_traits.languages[sxng_tag] = eng_tag
    for eng_tag, sxng_tag_list in wiki_lc_locale_variants.items():
        for sxng_tag in sxng_tag_list:
            engine_traits.regions[sxng_tag] = eng_tag


    resp = network.get(wikipedia_article_depth)
    resp = network.get(list_of_wikipedias)
    if not resp.ok:
    if not resp.ok:
        print("ERROR: response from Wikipedia is not OK.")
        print("ERROR: response from Wikipedia is not OK.")


@@ -189,29 +279,30 @@ def fetch_traits(engine_traits: EngineTraits):
            continue
        cols = [c.text_content().strip() for c in cols]

        depth = float(cols[3].replace('-', '0').replace(',', ''))
        depth = float(cols[11].replace('-', '0').replace(',', ''))
        articles = int(cols[4].replace(',', '').replace(',', ''))

        if articles < 10000:
        eng_tag = cols[3]
            # exclude languages with too few articles
        wiki_url = row.xpath('./td[4]/a/@href')[0]
            continue
        wiki_url = urllib.parse.urlparse(wiki_url)

        if int(depth) < 20:
        try:
            # Rough indicator of a Wikipedia’s quality, showing how frequently
            sxng_tag = locales.language_tag(babel.Locale.parse(lang_map.get(eng_tag, eng_tag), sep='-'))
            # its articles are updated.
        except babel.UnknownLocaleError:
            # print("ERROR: %s [%s] is unknown by babel" % (cols[0], eng_tag))
            continue
            continue
        finally:
            engine_traits.custom['WIKIPEDIA_LANGUAGES'].append(eng_tag)

        eng_tag = cols[2]
        if sxng_tag not in locales.LOCALE_NAMES:
        wiki_url = row.xpath('./td[3]/a/@href')[0]
        wiki_url = urllib.parse.urlparse(wiki_url)

        if eng_tag in unknown_langs:
            if articles < 10000:
                # exclude languages with too few articles
                continue
                continue

        try:
            if int(depth) < 20:
            sxng_tag = language_tag(babel.Locale.parse(lang_map.get(eng_tag, eng_tag), sep='-'))
                # Rough indicator of a Wikipedia’s quality, showing how
        except babel.UnknownLocaleError:
                # frequently its articles are updated.
            print("ERROR: %s [%s] is unknown by babel" % (cols[0], eng_tag))
                continue
                continue

        conflict = engine_traits.languages.get(sxng_tag)
@@ -222,3 +313,5 @@ def fetch_traits(engine_traits: EngineTraits):


        engine_traits.languages[sxng_tag] = eng_tag
        engine_traits.custom['wiki_netloc'][eng_tag] = wiki_url.netloc

    engine_traits.custom['WIKIPEDIA_LANGUAGES'].sort()
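A short, self-contained sketch (babel only) of the normalisation step above: ``lang_map`` rewrites the non-ISO wiki codes before ``babel.Locale.parse``, and codes babel does not know (for example ``simple``, formerly listed in ``unknown_langs``) are expected to raise ``UnknownLocaleError`` and be skipped:

import babel

# a few of the lang_map entries from the diff above
lang_map = {'als': 'gsw', 'no': 'nb-NO', 'zh-classical': 'zh-Hant'}

for eng_tag in ('als', 'no', 'zh-classical', 'simple'):
    try:
        locale = babel.Locale.parse(lang_map.get(eng_tag, eng_tag), sep='-')
    except babel.UnknownLocaleError:
        print(eng_tag, '-> skipped (unknown to babel)')
        continue
    print(eng_tag, '->', locale)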