Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit 13e8a21f authored by Israel Yago Pereira's avatar Israel Yago Pereira
Browse files

Merge branch '4308-improve-accuracy' into 'master'

Improve search language accuracy

Closes e/backlog#4308

See merge request e/cloud/my-spot!90
parents afa07cf5 34f46595
Loading
Loading
Loading
Loading
+3 −5
Original line number Diff line number Diff line
@@ -57,12 +57,10 @@ Then go to http://localhost:8088.

### For developer

You can directly run spot, with a python command inside a docker container which
contains all dependencies. After running the docker-compose stack you can stop
the default spot service and run the debug one with Flask server.
You can run only spot, using a python command inside a docker container which
contains all dependencies.

```
docker-compose rm -sf spot
docker run -it --rm -v $(pwd):/ws -w /ws --hostname spot --network=my-spot_default --env-file .env registry.gitlab.e.foundation:5000/e/cloud/my-spot/env bash
docker run -it --rm -v $(pwd):/ws -w /ws -e SEARX_UI_DEFAULT_THEME=etheme -p 8088:80 registry.gitlab.e.foundation:5000/e/cloud/my-spot/env bash
PYTHONPATH=$(pwd) SEARX_DEBUG=1 python -X dev searx/webapp.py
```
+83 −27
Original line number Diff line number Diff line
@@ -134,42 +134,113 @@ spelling_suggestion_xpath = '//div[@class="med"]/p/a'


def get_lang_info(params, lang_list, custom_aliases, supported_any_language):
    ret_val = {}
    """Composing various language properties for the google engines.

    This function is called by the various google engines (google itself,
    google-images, -news, -scholar, -videos).

    :param dict params: request parameters of the engine

    :param list lang_list: list of supported languages of the engine
        :py:obj:`ENGINES_LANGUAGES[engine-name] <searx.data.ENGINES_LANGUAGES>`

    :param dict custom_aliases: custom aliases for non standard language codes
        (used when calling :py:func:`searx.utils.match_language`)

    :param bool supported_any_language: When a language is not specified, the
        language interpretation is left up to Google to decide how the search
        results should be delivered.  This argument is ``True`` for the google
        engine and ``False`` for the other engines (google-images, -news,
        -scholar, -videos).

    :rtype: dict
    :returns:
        Py-Dictionary with the key/value pairs:

        language:
            Return value from :py:func:`searx.utils.match_language`

        country:
            The country code (e.g. US, AT, CA, FR, DE ..)

        subdomain:
            Google subdomain :py:obj:`google_domains` that fits to the country
            code.

        params:
            Py-Dictionary with additional request arguments (can be passed to
            :py:func:`urllib.parse.urlencode`).

        headers:
            Py-Dictionary with additional HTTP headers (can be passed to
            request's headers)
    """
    ret_val = {
        'language' : None,
        'country' : None,
        'subdomain' : None,
        'params' : {},
        'headers' : {},
    }

    # language ...

    _lang = params['language']
    _any_language = _lang.lower() == 'all'
    if _any_language:
        _lang = 'en-US'

    language = match_language(_lang, lang_list, custom_aliases)
    ret_val['language'] = language

    # the requested language from params (en, en-US, de, de-AT, fr, fr-CA, ...)
    _l = _lang.split('-')
    # country ...

    # the country code (US, AT, CA)
    _l = _lang.split('-')
    if len(_l) == 2:
        country = _l[1]
    else:
        country = _l[0].upper()
        if country == 'EN':
            country = 'US'

    ret_val['country'] = country

    # the combination (en-US, en-EN, de-DE, de-AU, fr-FR, fr-FR)
    lang_country = '%s-%s' % (language, country)
    # subdomain ...

    # subdomain
    ret_val['subdomain']  = 'www.' + google_domains.get(country.upper(), 'google.com')

    ret_val['params'] = {}
    ret_val['headers'] = {}
    # params & headers

    lang_country = '%s-%s' % (language, country)  # (en-US, en-EN, de-DE, de-AU, fr-FR ..)

    # hl parameter:
    #   https://developers.google.com/custom-search/docs/xml_results#hlsp
    # The Interface Language:
    #   https://developers.google.com/custom-search/docs/xml_results_appendices#interfaceLanguages

    ret_val['params']['hl'] = lang_list.get(lang_country, language)

    # lr parameter:
    #   The lr (language restrict) parameter restricts search results to
    #   documents written in a particular language.
    #   https://developers.google.com/custom-search/docs/xml_results#lrsp
    #   Language Collection Values:
    #   https://developers.google.com/custom-search/docs/xml_results_appendices#languageCollections

    if _any_language and supported_any_language:
        # based on whoogle

        # interpretation is left up to Google (based on whoogle)
        #
        # - add parameter ``source=lnt``
        # - don't use parameter ``lr``
        # - don't add an ``Accept-Language`` HTTP header.

        ret_val['params']['source'] = 'lnt'

    else:

        # restricts search results to documents written in a particular
        # language.
        ret_val['params']['lr'] = "lang_" + lang_country if lang_country in lang_list else language

        # Accept-Language: fr-CH, fr;q=0.8, en;q=0.6, *;q=0.5
        ret_val['headers']['Accept-Language'] = ','.join([
            lang_country,
@@ -178,18 +249,6 @@ def get_lang_info(params, lang_list, custom_aliases, supported_any_language):
            '*;q=0.5',
        ])

        # lr parameter:
        #   https://developers.google.com/custom-search/docs/xml_results#lrsp
        # Language Collection Values:
        #   https://developers.google.com/custom-search/docs/xml_results_appendices#languageCollections
        ret_val['params']['lr'] = "lang_" + lang_country if lang_country in lang_list else language

    ret_val['params']['hl'] = lang_country if lang_country in lang_list else language

    # hl parameter:
    #   https://developers.google.com/custom-search/docs/xml_results#hlsp The
    # Interface Language:
    #   https://developers.google.com/custom-search/docs/xml_results_appendices#interfaceLanguages
    return ret_val

def detect_google_sorry(resp):
@@ -228,11 +287,8 @@ def request(query, params):
        query_url += '&' + urlencode({'tbs': 'qdr:' + time_range_dict[params['time_range']]})
    if params['safesearch']:
        query_url += '&' + urlencode({'safe': filter_mapping[params['safesearch']]})

    logger.debug("query_url --> %s", query_url)
    params['url'] = query_url

    logger.debug("HTTP header Accept-Language --> %s", lang_info.get('Accept-Language'))
    params['headers'].update(lang_info['headers'])
    if use_mobile_ui:
        params['headers']['Accept'] = '*/*'
+1 −0
Original line number Diff line number Diff line
@@ -1154,6 +1154,7 @@ def run():
    app.run(
        debug=searx_debug,
        use_debugger=searx_debug,
        load_dotenv=False,
        port=settings['server']['port'],
        host=settings['server']['bind_address'],
        threaded=True,