Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit 9ae025b6 authored by Arnau Vàzquez's avatar Arnau Vàzquez
Browse files

Merge branch 'dev/fix-google-engine' into 'master'

pull changes from searxng for google engine

See merge request !160
parents ea784a02 1bc54fc4
Loading
Loading
Loading
Loading
+189 −157
Original line number Original line Diff line number Diff line
# SPDX-License-Identifier: AGPL-3.0-or-later
# SPDX-License-Identifier: AGPL-3.0-or-later
"""Google (Web)
# lint: pylint
For detailed description of the *REST-full* API see: `Query Parameter
"""This is the implementation of the google WEB engine.  Some of this
Definitions`_.
implementations are shared by other engines:
.. _Query Parameter Definitions:

   https://developers.google.com/custom-search/docs/xml_results#WebSearch_Query_Parameter_Definitions
- :ref:`google images engine`
"""
- :ref:`google news engine`
- :ref:`google videos engine`

The google WEB engine itself has a special setup option:

.. code:: yaml


# pylint: disable=invalid-name, missing-function-docstring, too-many-branches
  - name: google
    ...
    use_mobile_ui: false


from urllib.parse import urlencode, urlparse
``use_mobile_ui``: (default: ``false``)
from random import random
  Enables use of the *mobile endpoint* to bypass Google's blocking (see
  :issue:`159`).  On the mobile UI of Google Search, the button :guilabel:`More
  results` is not affected by Google rate limiting and we can still do requests
  while actively blocked by the original Google search.  By activating
  ``use_mobile_ui`` this behavior is simulated by adding the parameters
  ``asearch=arc`` and ``async=use_ac:true,_fmt:prog`` to the :py:func:`request`.

"""

from urllib.parse import urlencode
from lxml import html
from lxml import html
from searx import logger
from searx import logger
from searx.utils import match_language, extract_text, eval_xpath, eval_xpath_list, eval_xpath_getindex
from searx.utils import match_language, extract_text, eval_xpath, eval_xpath_list, eval_xpath_getindex
@@ -28,10 +44,11 @@ about = {
}
}


# engine dependent config
# engine dependent config
categories = ['general']
categories = ['general', 'web']
paging = True
paging = True
time_range_support = True
time_range_support = True
safesearch = True
safesearch = True
send_accept_language_header = True
use_mobile_ui = False
use_mobile_ui = False
supported_languages_url = 'https://www.google.com/preferences?#languages'
supported_languages_url = 'https://www.google.com/preferences?#languages'


@@ -87,119 +104,146 @@ google_domains = {
    'UA': 'google.com.ua',  # Ukraine
    'UA': 'google.com.ua',  # Ukraine
    'CN': 'google.com.hk',  # There is no google.cn, we use .com.hk for zh-CN
    'CN': 'google.com.hk',  # There is no google.cn, we use .com.hk for zh-CN
    'HK': 'google.com.hk',  # Hong Kong
    'HK': 'google.com.hk',  # Hong Kong
    'TW': 'google.com.tw'   # Taiwan
    'TW': 'google.com.tw',  # Taiwan
}
}


time_range_dict = {
time_range_dict = {'day': 'd', 'week': 'w', 'month': 'm', 'year': 'y'}
    'day': 'd',
    'week': 'w',
    'month': 'm',
    'year': 'y'
}


# Filter results. 0: None, 1: Moderate, 2: Strict
# Filter results. 0: None, 1: Moderate, 2: Strict
filter_mapping = {
filter_mapping = {0: 'off', 1: 'medium', 2: 'high'}
    0: 'off',
    1: 'medium',
    2: 'high'
}


# specific xpath variables
# specific xpath variables
# ------------------------
# ------------------------


# google results are grouped into <div class="jtfYYd ..." ../>
results_xpath = './/div[@data-sokoban-container]'
results_xpath = '//div[contains(@class, "jtfYYd")]'
title_xpath = './/a/h3[1]'
results_xpath_mobile_ui = '//div[contains(@class, "g ")]'
href_xpath = './/a[h3]/@href'
content_xpath = './/div[@data-content-feature=1]'


# google *sections* are no usual *results*, we ignore them
# google *sections* are no usual *results*, we ignore them
g_section_with_header = './g-section-with-header'
g_section_with_header = './g-section-with-header'


# the title is a h3 tag relative to the result group
title_xpath = './/h3[1]'

# in the result group there is <div class="yuRUbf" ../> it's first child is a <a
# href=...>
href_xpath = './/div[@class="yuRUbf"]//a/@href'

# in the result group there is <div class="VwiC3b ..." ../> containing the *content*
content_xpath = './/div[contains(@class, "VwiC3b")]'


# Suggestions are links placed in a *card-section*, we extract only the text
# Suggestions are links placed in a *card-section*, we extract only the text
# from the links not the links itself.
# from the links not the links itself.
suggestion_xpath = '//div[contains(@class, "card-section")]//a'
suggestion_xpath = '//div[contains(@class, "EIaa9b")]//a'

# Since google does *auto-correction* on the first query these are not really
# *spelling suggestions*, we use them anyway.
spelling_suggestion_xpath = '//div[@class="med"]/p/a'




def get_lang_info(params, lang_list, custom_aliases, supported_any_language):
def get_lang_info(params, lang_list, custom_aliases, supported_any_language):
    ret_val = {}
    """Composing various language properties for the google engines.

    This function is called by the various google engines (:ref:`google web
    engine`, :ref:`google images engine`, :ref:`google news engine` and
    :ref:`google videos engine`).

    :param dict params: request parameters of the engine

    :param list lang_list: list of supported languages of the engine
        :py:obj:`ENGINES_LANGUAGES[engine-name] <searx.data.ENGINES_LANGUAGES>`

    :param dict custom_aliases: custom aliases for non-standard language codes
        (used when calling :py:func:`searx.utils.match_language`)

    :param bool supported_any_language: When a language is not specified, the
        language interpretation is left up to Google to decide how the search
        results should be delivered.  This argument is ``True`` for the google
        engine and ``False`` for the other engines (google-images, -news,
        -scholar, -videos).

    :rtype: dict
    :returns:
        Py-Dictionary with the key/value pairs:

        language:
            Return value from :py:func:`searx.utils.match_language`

        country:
            The country code (e.g. US, AT, CA, FR, DE ..)

        subdomain:
            Google subdomain :py:obj:`google_domains` that fits to the country
            code.

        params:
            Py-Dictionary with additional request arguments (can be passed to
            :py:func:`urllib.parse.urlencode`).

        headers:
            Py-Dictionary with additional HTTP headers (can be passed to
            request's headers)
    """
    ret_val = {
        'language': None,
        'country': None,
        'subdomain': None,
        'params': {},
        'headers': {},
    }

    # language ...


    _lang = params['language']
    _lang = params['language']
    _any_language = _lang.lower() == 'all'
    _any_language = _lang.lower() == 'all'
    if _any_language:
    if _any_language:
        _lang = 'en-US'
        _lang = 'en-US'

    language = match_language(_lang, lang_list, custom_aliases)
    language = match_language(_lang, lang_list, custom_aliases)
    ret_val['language'] = language
    ret_val['language'] = language


    # the requested language from params (en, en-US, de, de-AT, fr, fr-CA, ...)
    # country ...
    _l = _lang.split('-')


    # the country code (US, AT, CA)
    _l = _lang.split('-')
    if len(_l) == 2:
    if len(_l) == 2:
        country = _l[1]
        country = _l[1]
    else:
    else:
        country = _l[0].upper()
        country = _l[0].upper()
        if country == 'EN':
        if country == 'EN':
            country = 'US'
            country = 'US'

    ret_val['country'] = country
    ret_val['country'] = country


    # the combination (en-US, en-EN, de-DE, de-AU, fr-FR, fr-FR)
    # subdomain ...
    lang_country = '%s-%s' % (language, country)


    # subdomain
    ret_val['subdomain'] = 'www.' + google_domains.get(country.upper(), 'google.com')
    ret_val['subdomain'] = 'www.' + google_domains.get(country.upper(), 'google.com')


    ret_val['params'] = {}
    # params & headers
    ret_val['headers'] = {}


    if _any_language and supported_any_language:
    lang_country = '%s-%s' % (language, country)  # (en-US, en-EN, de-DE, de-AU, fr-FR ..)
        # based on whoogle

        ret_val['params']['source'] = 'lnt'
    # hl parameter:
    else:
    #   https://developers.google.com/custom-search/docs/xml_results#hlsp The
        # Accept-Language: fr-CH, fr;q=0.8, en;q=0.6, *;q=0.5
    # Interface Language:
        ret_val['headers']['Accept-Language'] = ','.join([
    #   https://developers.google.com/custom-search/docs/xml_results_appendices#interfaceLanguages
            lang_country,

            language + ';q=0.8,',
    ret_val['params']['hl'] = lang_list.get(lang_country, language)
            'en;q=0.6',
            '*;q=0.5',
        ])


    # lr parameter:
    # lr parameter:
    #   The lr (language restrict) parameter restricts search results to
    #   documents written in a particular language.
    #   https://developers.google.com/custom-search/docs/xml_results#lrsp
    #   https://developers.google.com/custom-search/docs/xml_results#lrsp
    #   Language Collection Values:
    #   Language Collection Values:
    #   https://developers.google.com/custom-search/docs/xml_results_appendices#languageCollections
    #   https://developers.google.com/custom-search/docs/xml_results_appendices#languageCollections
        if lang_country in lang_list:

            ret_val['params']['lr'] = "lang_" + lang_country
    if _any_language and supported_any_language:
        elif language in lang_country:

            ret_val['params']['lr'] = "lang_" + language
        # interpretation is left up to Google (based on whoogle)
        #
        # - add parameter ``source=lnt``
        # - don't use parameter ``lr``
        # - don't add a ``Accept-Language`` HTTP header.

        ret_val['params']['source'] = 'lnt'

    else:
    else:
            ret_val['params']['lr'] = language


    ret_val['params']['hl'] = lang_country if lang_country in lang_list else language
        # restricts search results to documents written in a particular
        # language.
        ret_val['params']['lr'] = "lang_" + lang_list.get(lang_country, language)


    # hl parameter:
    #   https://developers.google.com/custom-search/docs/xml_results#hlsp The
    # Interface Language:
    #   https://developers.google.com/custom-search/docs/xml_results_appendices#interfaceLanguages
    return ret_val
    return ret_val



def detect_google_sorry(resp):
def detect_google_sorry(resp):
    resp_url = urlparse(resp.url)
    """Detect when ratelimited"""
    if resp_url.netloc == 'sorry.google.com' or resp_url.path.startswith('/sorry'):
    if resp.url == 'sorry.google.com' or resp.url.startswith('/sorry'):
        raise SearxEngineCaptchaException()
        raise SearxEngineCaptchaException()




@@ -208,46 +252,47 @@ def request(query, params):


    offset = (params['pageno'] - 1) * 10
    offset = (params['pageno'] - 1) * 10


    lang_info = get_lang_info(
    # pylint: disable=undefined-variable
    # pylint: disable=undefined-variable
        params, supported_languages, language_aliases, True
    lang_info = get_lang_info(params, supported_languages, language_aliases, True)
    )


    additional_parameters = {}
    additional_parameters = {}
    if use_mobile_ui:
    if use_mobile_ui:
        additional_parameters = {
        additional_parameters = {
            'async': 'use_ac:true,_fmt:pc',
            'asearch': 'arc',
            'async': 'use_ac:true,_fmt:prog',
        }
        }


    # https://www.google.de/search?q=corona&hl=de&lr=lang_de&start=0&tbs=qdr%3Ad&safe=medium
    # https://www.google.de/search?q=corona&hl=de&lr=lang_de&start=0&tbs=qdr%3Ad&safe=medium
    query_url = 'https://' + lang_info['subdomain'] + '/search' + "?" + urlencode({
    query_url = (
        'https://'
        + lang_info['subdomain']
        + '/search'
        + "?"
        + urlencode(
            {
                'q': query,
                'q': query,
                **lang_info['params'],
                **lang_info['params'],
                'ie': "utf8",
                'ie': "utf8",
                'oe': "utf8",
                'oe': "utf8",
                'start': offset,
                'start': offset,
                'filter': '0',
                'filter': '0',
        'ucbcb': 1,
                **additional_parameters,
                **additional_parameters,
    })
            }
        )
    )


    if params['time_range'] in time_range_dict:
    if params['time_range'] in time_range_dict:
        query_url += '&' + urlencode({'tbs': 'qdr:' + time_range_dict[params['time_range']]})
        query_url += '&' + urlencode({'tbs': 'qdr:' + time_range_dict[params['time_range']]})
    if params['safesearch']:
    if params['safesearch']:
        query_url += '&' + urlencode({'safe': filter_mapping[params['safesearch']]})
        query_url += '&' + urlencode({'safe': filter_mapping[params['safesearch']]})

    logger.debug("query_url --> %s", query_url)
    params['url'] = query_url
    params['url'] = query_url


    logger.debug("HTTP header Accept-Language --> %s", lang_info.get('Accept-Language'))
    params['cookies']['CONSENT'] = "YES+"
    params['cookies']['CONSENT'] = "PENDING+" + str(random()*100)
    params['headers'].update(lang_info['headers'])
    params['headers'].update(lang_info['headers'])
    if use_mobile_ui:
    if use_mobile_ui:
        params['headers']['Accept'] = '*/*'
        params['headers']['Accept'] = '*/*'
    else:
    else:
        params['headers']['Accept'] = (
        params['headers']['Accept'] = 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'
            'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'
        )


    return params
    return params


@@ -261,7 +306,6 @@ def response(resp):


    # convert the text to dom
    # convert the text to dom
    dom = html.fromstring(resp.text)
    dom = html.fromstring(resp.text)

    # results --> answer
    # results --> answer
    answer_list = eval_xpath(dom, '//div[contains(@class, "LGOjhe")]')
    answer_list = eval_xpath(dom, '//div[contains(@class, "LGOjhe")]')
    if answer_list:
    if answer_list:
@@ -283,22 +327,18 @@ def response(resp):


    # parse results
    # parse results


    _results_xpath = results_xpath
    for result in eval_xpath_list(dom, results_xpath):
    if use_mobile_ui:
        _results_xpath = results_xpath_mobile_ui

    for result in eval_xpath_list(dom, _results_xpath):


        # google *sections*
        # google *sections*
        if extract_text(eval_xpath(result, g_section_with_header)):
        if extract_text(eval_xpath(result, g_section_with_header)):
            logger.debug("ingoring <g-section-with-header>")
            logger.debug("ignoring <g-section-with-header>")
            continue
            continue


        try:
        try:
            title_tag = eval_xpath_getindex(result, title_xpath, 0, default=None)
            title_tag = eval_xpath_getindex(result, title_xpath, 0, default=None)
            if title_tag is None:
            if title_tag is None:
                # this not one of the common google results *section*
                # this not one of the common google results *section*
                logger.debug('ingoring item from the result_xpath list: missing title')
                logger.debug('ignoring item from the result_xpath list: missing title')
                continue
                continue
            title = extract_text(title_tag)
            title = extract_text(title_tag)
            url = eval_xpath_getindex(result, href_xpath, 0, None)
            url = eval_xpath_getindex(result, href_xpath, 0, None)
@@ -306,16 +346,11 @@ def response(resp):
                continue
                continue
            content = extract_text(eval_xpath_getindex(result, content_xpath, 0, default=None), allow_none=True)
            content = extract_text(eval_xpath_getindex(result, content_xpath, 0, default=None), allow_none=True)
            if content is None:
            if content is None:
                logger.debug('ingoring item from the result_xpath list: missing content of title "%s"', title)
                logger.debug('ignoring item from the result_xpath list: missing content of title "%s"', title)
                continue
                continue


            logger.debug('add link to results: %s', title)
            logger.debug('add link to results: %s', title)

            results.append({'url': url, 'title': title, 'content': content})
            results.append({
                'url': url,
                'title': title,
                'content': content
            })


        except Exception as e:  # pylint: disable=broad-except
        except Exception as e:  # pylint: disable=broad-except
            logger.error(e, exc_info=True)
            logger.error(e, exc_info=True)
@@ -326,9 +361,6 @@ def response(resp):
        # append suggestion
        # append suggestion
        results.append({'suggestion': extract_text(suggestion)})
        results.append({'suggestion': extract_text(suggestion)})


    for correction in eval_xpath_list(dom, spelling_suggestion_xpath):
        results.append({'correction': extract_text(correction)})

    # return results
    # return results
    return results
    return results