Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit 9ae025b6 authored by Arnau Vàzquez's avatar Arnau Vàzquez
Browse files

Merge branch 'dev/fix-google-engine' into 'master'

pull changes from searxng for google engine

See merge request !160
parents ea784a02 1bc54fc4
Loading
Loading
Loading
Loading
+189 −157
Original line number Diff line number Diff line
# SPDX-License-Identifier: AGPL-3.0-or-later
"""Google (Web)
For detailed description of the *REST-full* API see: `Query Parameter
Definitions`_.
.. _Query Parameter Definitions:
   https://developers.google.com/custom-search/docs/xml_results#WebSearch_Query_Parameter_Definitions
"""
# lint: pylint
"""This is the implementation of the google WEB engine.  Some of this
implementations are shared by other engines:

- :ref:`google images engine`
- :ref:`google news engine`
- :ref:`google videos engine`

The google WEB engine itself has a special setup option:

.. code:: yaml

# pylint: disable=invalid-name, missing-function-docstring, too-many-branches
  - name: google
    ...
    use_mobile_ui: false

from urllib.parse import urlencode, urlparse
from random import random
``use_mobile_ui``: (default: ``false``)
  Enables to use *mobile endpoint* to bypass the google blocking (see
  :issue:`159`).  On the mobile UI of Google Search, the button :guilabel:`More
  results` is not affected by Google rate limiting and we can still do requests
  while actively blocked by the original Google search.  By activate
  ``use_mobile_ui`` this behavior is simulated by adding the parameter
  ``async=use_ac:true,_fmt:pc`` to the :py:func:`request`.

"""

from urllib.parse import urlencode
from lxml import html
from searx import logger
from searx.utils import match_language, extract_text, eval_xpath, eval_xpath_list, eval_xpath_getindex
@@ -28,10 +44,11 @@ about = {
}

# engine dependent config
categories = ['general']
categories = ['general', 'web']
paging = True
time_range_support = True
safesearch = True
send_accept_language_header = True
use_mobile_ui = False
supported_languages_url = 'https://www.google.com/preferences?#languages'

@@ -87,119 +104,146 @@ google_domains = {
    'UA': 'google.com.ua',  # Ukraine
    'CN': 'google.com.hk',  # There is no google.cn, we use .com.hk for zh-CN
    'HK': 'google.com.hk',  # Hong Kong
    'TW': 'google.com.tw'   # Taiwan
    'TW': 'google.com.tw',  # Taiwan
}

time_range_dict = {
    'day': 'd',
    'week': 'w',
    'month': 'm',
    'year': 'y'
}
time_range_dict = {'day': 'd', 'week': 'w', 'month': 'm', 'year': 'y'}

# Filter results. 0: None, 1: Moderate, 2: Strict
filter_mapping = {
    0: 'off',
    1: 'medium',
    2: 'high'
}
filter_mapping = {0: 'off', 1: 'medium', 2: 'high'}

# specific xpath variables
# ------------------------

# google results are grouped into <div class="jtfYYd ..." ../>
results_xpath = '//div[contains(@class, "jtfYYd")]'
results_xpath_mobile_ui = '//div[contains(@class, "g ")]'
results_xpath = './/div[@data-sokoban-container]'
title_xpath = './/a/h3[1]'
href_xpath = './/a[h3]/@href'
content_xpath = './/div[@data-content-feature=1]'

# google *sections* are no usual *results*, we ignore them
g_section_with_header = './g-section-with-header'

# the title is a h3 tag relative to the result group
title_xpath = './/h3[1]'

# in the result group there is <div class="yuRUbf" ../> it's first child is a <a
# href=...>
href_xpath = './/div[@class="yuRUbf"]//a/@href'

# in the result group there is <div class="VwiC3b ..." ../> containing the *content*
content_xpath = './/div[contains(@class, "VwiC3b")]'

# Suggestions are links placed in a *card-section*, we extract only the text
# from the links not the links itself.
suggestion_xpath = '//div[contains(@class, "card-section")]//a'

# Since google does *auto-correction* on the first query these are not really
# *spelling suggestions*, we use them anyway.
spelling_suggestion_xpath = '//div[@class="med"]/p/a'
suggestion_xpath = '//div[contains(@class, "EIaa9b")]//a'


def get_lang_info(params, lang_list, custom_aliases, supported_any_language):
    ret_val = {}
    """Composing various language properties for the google engines.

    This function is called by the various google engines (:ref:`google web
    engine`, :ref:`google images engine`, :ref:`google news engine` and
    :ref:`google videos engine`).

    :param dict param: request parameters of the engine

    :param list lang_list: list of supported languages of the engine
        :py:obj:`ENGINES_LANGUAGES[engine-name] <searx.data.ENGINES_LANGUAGES>`

    :param dict lang_list: custom aliases for non standard language codes
        (used when calling :py:func:`searx.utils.match_language`)

    :param bool supported_any_language: When a language is not specified, the
        language interpretation is left up to Google to decide how the search
        results should be delivered.  This argument is ``True`` for the google
        engine and ``False`` for the other engines (google-images, -news,
        -scholar, -videos).

    :rtype: dict
    :returns:
        Py-Dictionary with the key/value pairs:

        language:
            Return value from :py:func:`searx.utils.match_language`

        country:
            The country code (e.g. US, AT, CA, FR, DE ..)

        subdomain:
            Google subdomain :py:obj:`google_domains` that fits to the country
            code.

        params:
            Py-Dictionary with additional request arguments (can be passed to
            :py:func:`urllib.parse.urlencode`).

        headers:
            Py-Dictionary with additional HTTP headers (can be passed to
            request's headers)
    """
    ret_val = {
        'language': None,
        'country': None,
        'subdomain': None,
        'params': {},
        'headers': {},
    }

    # language ...

    _lang = params['language']
    _any_language = _lang.lower() == 'all'
    if _any_language:
        _lang = 'en-US'

    language = match_language(_lang, lang_list, custom_aliases)
    ret_val['language'] = language

    # the requested language from params (en, en-US, de, de-AT, fr, fr-CA, ...)
    _l = _lang.split('-')
    # country ...

    # the country code (US, AT, CA)
    _l = _lang.split('-')
    if len(_l) == 2:
        country = _l[1]
    else:
        country = _l[0].upper()
        if country == 'EN':
            country = 'US'

    ret_val['country'] = country

    # the combination (en-US, en-EN, de-DE, de-AU, fr-FR, fr-FR)
    lang_country = '%s-%s' % (language, country)
    # subdomain ...

    # subdomain
    ret_val['subdomain'] = 'www.' + google_domains.get(country.upper(), 'google.com')

    ret_val['params'] = {}
    ret_val['headers'] = {}
    # params & headers

    if _any_language and supported_any_language:
        # based on whoogle
        ret_val['params']['source'] = 'lnt'
    else:
        # Accept-Language: fr-CH, fr;q=0.8, en;q=0.6, *;q=0.5
        ret_val['headers']['Accept-Language'] = ','.join([
            lang_country,
            language + ';q=0.8,',
            'en;q=0.6',
            '*;q=0.5',
        ])
    lang_country = '%s-%s' % (language, country)  # (en-US, en-EN, de-DE, de-AU, fr-FR ..)

    # hl parameter:
    #   https://developers.google.com/custom-search/docs/xml_results#hlsp The
    # Interface Language:
    #   https://developers.google.com/custom-search/docs/xml_results_appendices#interfaceLanguages

    ret_val['params']['hl'] = lang_list.get(lang_country, language)

    # lr parameter:
    #   The lr (language restrict) parameter restricts search results to
    #   documents written in a particular language.
    #   https://developers.google.com/custom-search/docs/xml_results#lrsp
    #   Language Collection Values:
    #   https://developers.google.com/custom-search/docs/xml_results_appendices#languageCollections
        if lang_country in lang_list:
            ret_val['params']['lr'] = "lang_" + lang_country
        elif language in lang_country:
            ret_val['params']['lr'] = "lang_" + language

    if _any_language and supported_any_language:

        # interpretation is left up to Google (based on whoogle)
        #
        # - add parameter ``source=lnt``
        # - don't use parameter ``lr``
        # - don't add a ``Accept-Language`` HTTP header.

        ret_val['params']['source'] = 'lnt'

    else:
            ret_val['params']['lr'] = language

    ret_val['params']['hl'] = lang_country if lang_country in lang_list else language
        # restricts search results to documents written in a particular
        # language.
        ret_val['params']['lr'] = "lang_" + lang_list.get(lang_country, language)

    # hl parameter:
    #   https://developers.google.com/custom-search/docs/xml_results#hlsp The
    # Interface Language:
    #   https://developers.google.com/custom-search/docs/xml_results_appendices#interfaceLanguages
    return ret_val


def detect_google_sorry(resp):
    resp_url = urlparse(resp.url)
    if resp_url.netloc == 'sorry.google.com' or resp_url.path.startswith('/sorry'):
    """Detect when ratelimited"""
    if resp.url == 'sorry.google.com' or resp.url.startswith('/sorry'):
        raise SearxEngineCaptchaException()


@@ -208,46 +252,47 @@ def request(query, params):

    offset = (params['pageno'] - 1) * 10

    lang_info = get_lang_info(
    # pylint: disable=undefined-variable
        params, supported_languages, language_aliases, True
    )
    lang_info = get_lang_info(params, supported_languages, language_aliases, True)

    additional_parameters = {}
    if use_mobile_ui:
        additional_parameters = {
            'async': 'use_ac:true,_fmt:pc',
            'asearch': 'arc',
            'async': 'use_ac:true,_fmt:prog',
        }

    # https://www.google.de/search?q=corona&hl=de&lr=lang_de&start=0&tbs=qdr%3Ad&safe=medium
    query_url = 'https://' + lang_info['subdomain'] + '/search' + "?" + urlencode({
    query_url = (
        'https://'
        + lang_info['subdomain']
        + '/search'
        + "?"
        + urlencode(
            {
                'q': query,
                **lang_info['params'],
                'ie': "utf8",
                'oe': "utf8",
                'start': offset,
                'filter': '0',
        'ucbcb': 1,
                **additional_parameters,
    })
            }
        )
    )

    if params['time_range'] in time_range_dict:
        query_url += '&' + urlencode({'tbs': 'qdr:' + time_range_dict[params['time_range']]})
    if params['safesearch']:
        query_url += '&' + urlencode({'safe': filter_mapping[params['safesearch']]})

    logger.debug("query_url --> %s", query_url)
    params['url'] = query_url

    logger.debug("HTTP header Accept-Language --> %s", lang_info.get('Accept-Language'))
    params['cookies']['CONSENT'] = "PENDING+" + str(random()*100)
    params['cookies']['CONSENT'] = "YES+"
    params['headers'].update(lang_info['headers'])
    if use_mobile_ui:
        params['headers']['Accept'] = '*/*'
    else:
        params['headers']['Accept'] = (
            'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'
        )
        params['headers']['Accept'] = 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'

    return params

@@ -261,7 +306,6 @@ def response(resp):

    # convert the text to dom
    dom = html.fromstring(resp.text)

    # results --> answer
    answer_list = eval_xpath(dom, '//div[contains(@class, "LGOjhe")]')
    if answer_list:
@@ -283,22 +327,18 @@ def response(resp):

    # parse results

    _results_xpath = results_xpath
    if use_mobile_ui:
        _results_xpath = results_xpath_mobile_ui

    for result in eval_xpath_list(dom, _results_xpath):
    for result in eval_xpath_list(dom, results_xpath):

        # google *sections*
        if extract_text(eval_xpath(result, g_section_with_header)):
            logger.debug("ingoring <g-section-with-header>")
            logger.debug("ignoring <g-section-with-header>")
            continue

        try:
            title_tag = eval_xpath_getindex(result, title_xpath, 0, default=None)
            if title_tag is None:
                # this not one of the common google results *section*
                logger.debug('ingoring item from the result_xpath list: missing title')
                logger.debug('ignoring item from the result_xpath list: missing title')
                continue
            title = extract_text(title_tag)
            url = eval_xpath_getindex(result, href_xpath, 0, None)
@@ -306,16 +346,11 @@ def response(resp):
                continue
            content = extract_text(eval_xpath_getindex(result, content_xpath, 0, default=None), allow_none=True)
            if content is None:
                logger.debug('ingoring item from the result_xpath list: missing content of title "%s"', title)
                logger.debug('ignoring item from the result_xpath list: missing content of title "%s"', title)
                continue

            logger.debug('add link to results: %s', title)

            results.append({
                'url': url,
                'title': title,
                'content': content
            })
            results.append({'url': url, 'title': title, 'content': content})

        except Exception as e:  # pylint: disable=broad-except
            logger.error(e, exc_info=True)
@@ -326,9 +361,6 @@ def response(resp):
        # append suggestion
        results.append({'suggestion': extract_text(suggestion)})

    for correction in eval_xpath_list(dom, spelling_suggestion_xpath):
        results.append({'correction': extract_text(correction)})

    # return results
    return results