Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit 1bc54fc4 authored by Nivesh Krishna's avatar Nivesh Krishna Committed by Arnau Vàzquez
Browse files

pull changes from searxng for google engine

parent ea784a02
Loading
Loading
Loading
Loading
+189 −157
Original line number Original line Diff line number Diff line
# SPDX-License-Identifier: AGPL-3.0-or-later
# SPDX-License-Identifier: AGPL-3.0-or-later
"""Google (Web)
# lint: pylint
For detailed description of the *REST-full* API see: `Query Parameter
"""This is the implementation of the google WEB engine.  Some of these
Definitions`_.
implementations are shared by other engines:
.. _Query Parameter Definitions:

   https://developers.google.com/custom-search/docs/xml_results#WebSearch_Query_Parameter_Definitions
- :ref:`google images engine`
"""
- :ref:`google news engine`
- :ref:`google videos engine`

The google WEB engine itself has a special setup option:

.. code:: yaml


# pylint: disable=invalid-name, missing-function-docstring, too-many-branches
  - name: google
    ...
    use_mobile_ui: false


from urllib.parse import urlencode, urlparse
``use_mobile_ui``: (default: ``false``)
from random import random
  Enables the use of the *mobile endpoint* to bypass the google blocking (see
  :issue:`159`).  On the mobile UI of Google Search, the button :guilabel:`More
  results` is not affected by Google rate limiting and we can still do requests
  while actively blocked by the original Google search.  By activating
  ``use_mobile_ui`` this behavior is simulated by adding the parameter
  ``async=use_ac:true,_fmt:pc`` to the :py:func:`request`.

"""

from urllib.parse import urlencode
from lxml import html
from lxml import html
from searx import logger
from searx import logger
from searx.utils import match_language, extract_text, eval_xpath, eval_xpath_list, eval_xpath_getindex
from searx.utils import match_language, extract_text, eval_xpath, eval_xpath_list, eval_xpath_getindex
@@ -28,10 +44,11 @@ about = {
}
}


# engine dependent config
# engine dependent config
categories = ['general']
categories = ['general', 'web']
paging = True
paging = True
time_range_support = True
time_range_support = True
safesearch = True
safesearch = True
send_accept_language_header = True
use_mobile_ui = False
use_mobile_ui = False
supported_languages_url = 'https://www.google.com/preferences?#languages'
supported_languages_url = 'https://www.google.com/preferences?#languages'


@@ -87,119 +104,146 @@ google_domains = {
    'UA': 'google.com.ua',  # Ukraine
    'UA': 'google.com.ua',  # Ukraine
    'CN': 'google.com.hk',  # There is no google.cn, we use .com.hk for zh-CN
    'CN': 'google.com.hk',  # There is no google.cn, we use .com.hk for zh-CN
    'HK': 'google.com.hk',  # Hong Kong
    'HK': 'google.com.hk',  # Hong Kong
    'TW': 'google.com.tw'   # Taiwan
    'TW': 'google.com.tw',  # Taiwan
}
}


time_range_dict = {
time_range_dict = {'day': 'd', 'week': 'w', 'month': 'm', 'year': 'y'}
    'day': 'd',
    'week': 'w',
    'month': 'm',
    'year': 'y'
}


# Filter results. 0: None, 1: Moderate, 2: Strict
# Filter results. 0: None, 1: Moderate, 2: Strict
filter_mapping = {
filter_mapping = {0: 'off', 1: 'medium', 2: 'high'}
    0: 'off',
    1: 'medium',
    2: 'high'
}


# specific xpath variables
# specific xpath variables
# ------------------------
# ------------------------


# google results are grouped into <div class="jtfYYd ..." ../>
results_xpath = './/div[@data-sokoban-container]'
results_xpath = '//div[contains(@class, "jtfYYd")]'
title_xpath = './/a/h3[1]'
results_xpath_mobile_ui = '//div[contains(@class, "g ")]'
href_xpath = './/a[h3]/@href'
content_xpath = './/div[@data-content-feature=1]'


# google *sections* are not usual *results*, we ignore them
# google *sections* are not usual *results*, we ignore them
g_section_with_header = './g-section-with-header'
g_section_with_header = './g-section-with-header'


# the title is a h3 tag relative to the result group
title_xpath = './/h3[1]'

# in the result group there is <div class="yuRUbf" ../> its first child is an <a
# href=...>
href_xpath = './/div[@class="yuRUbf"]//a/@href'

# in the result group there is <div class="VwiC3b ..." ../> containing the *content*
content_xpath = './/div[contains(@class, "VwiC3b")]'


# Suggestions are links placed in a *card-section*, we extract only the text
# Suggestions are links placed in a *card-section*, we extract only the text
# from the links, not the links themselves.
# from the links, not the links themselves.
suggestion_xpath = '//div[contains(@class, "card-section")]//a'
suggestion_xpath = '//div[contains(@class, "EIaa9b")]//a'

# Since google does *auto-correction* on the first query these are not really
# *spelling suggestions*, we use them anyway.
spelling_suggestion_xpath = '//div[@class="med"]/p/a'




def get_lang_info(params, lang_list, custom_aliases, supported_any_language):
def get_lang_info(params, lang_list, custom_aliases, supported_any_language):
    ret_val = {}
    """Composing various language properties for the google engines.

    This function is called by the various google engines (:ref:`google web
    engine`, :ref:`google images engine`, :ref:`google news engine` and
    :ref:`google videos engine`).

    :param dict params: request parameters of the engine

    :param list lang_list: list of supported languages of the engine
        :py:obj:`ENGINES_LANGUAGES[engine-name] <searx.data.ENGINES_LANGUAGES>`

    :param dict custom_aliases: custom aliases for non standard language codes
        (used when calling :py:func:`searx.utils.match_language`)

    :param bool supported_any_language: When a language is not specified, the
        language interpretation is left up to Google to decide how the search
        results should be delivered.  This argument is ``True`` for the google
        engine and ``False`` for the other engines (google-images, -news,
        -scholar, -videos).

    :rtype: dict
    :returns:
        Py-Dictionary with the key/value pairs:

        language:
            Return value from :py:func:`searx.utils.match_language`

        country:
            The country code (e.g. US, AT, CA, FR, DE ..)

        subdomain:
            Google subdomain :py:obj:`google_domains` that fits to the country
            code.

        params:
            Py-Dictionary with additional request arguments (can be passed to
            :py:func:`urllib.parse.urlencode`).

        headers:
            Py-Dictionary with additional HTTP headers (can be passed to
            request's headers)
    """
    ret_val = {
        'language': None,
        'country': None,
        'subdomain': None,
        'params': {},
        'headers': {},
    }

    # language ...


    _lang = params['language']
    _lang = params['language']
    _any_language = _lang.lower() == 'all'
    _any_language = _lang.lower() == 'all'
    if _any_language:
    if _any_language:
        _lang = 'en-US'
        _lang = 'en-US'

    language = match_language(_lang, lang_list, custom_aliases)
    language = match_language(_lang, lang_list, custom_aliases)
    ret_val['language'] = language
    ret_val['language'] = language


    # the requested language from params (en, en-US, de, de-AT, fr, fr-CA, ...)
    # country ...
    _l = _lang.split('-')


    # the country code (US, AT, CA)
    _l = _lang.split('-')
    if len(_l) == 2:
    if len(_l) == 2:
        country = _l[1]
        country = _l[1]
    else:
    else:
        country = _l[0].upper()
        country = _l[0].upper()
        if country == 'EN':
        if country == 'EN':
            country = 'US'
            country = 'US'

    ret_val['country'] = country
    ret_val['country'] = country


    # the combination (en-US, en-EN, de-DE, de-AU, fr-FR, fr-FR)
    # subdomain ...
    lang_country = '%s-%s' % (language, country)


    # subdomain
    ret_val['subdomain'] = 'www.' + google_domains.get(country.upper(), 'google.com')
    ret_val['subdomain'] = 'www.' + google_domains.get(country.upper(), 'google.com')


    ret_val['params'] = {}
    # params & headers
    ret_val['headers'] = {}


    if _any_language and supported_any_language:
    lang_country = '%s-%s' % (language, country)  # (en-US, en-EN, de-DE, de-AU, fr-FR ..)
        # based on whoogle

        ret_val['params']['source'] = 'lnt'
    # hl parameter:
    else:
    #   https://developers.google.com/custom-search/docs/xml_results#hlsp The
        # Accept-Language: fr-CH, fr;q=0.8, en;q=0.6, *;q=0.5
    # Interface Language:
        ret_val['headers']['Accept-Language'] = ','.join([
    #   https://developers.google.com/custom-search/docs/xml_results_appendices#interfaceLanguages
            lang_country,

            language + ';q=0.8,',
    ret_val['params']['hl'] = lang_list.get(lang_country, language)
            'en;q=0.6',
            '*;q=0.5',
        ])


    # lr parameter:
    # lr parameter:
    #   The lr (language restrict) parameter restricts search results to
    #   documents written in a particular language.
    #   https://developers.google.com/custom-search/docs/xml_results#lrsp
    #   https://developers.google.com/custom-search/docs/xml_results#lrsp
    #   Language Collection Values:
    #   Language Collection Values:
    #   https://developers.google.com/custom-search/docs/xml_results_appendices#languageCollections
    #   https://developers.google.com/custom-search/docs/xml_results_appendices#languageCollections
        if lang_country in lang_list:

            ret_val['params']['lr'] = "lang_" + lang_country
    if _any_language and supported_any_language:
        elif language in lang_country:

            ret_val['params']['lr'] = "lang_" + language
        # interpretation is left up to Google (based on whoogle)
        #
        # - add parameter ``source=lnt``
        # - don't use parameter ``lr``
        # - don't add a ``Accept-Language`` HTTP header.

        ret_val['params']['source'] = 'lnt'

    else:
    else:
            ret_val['params']['lr'] = language


    ret_val['params']['hl'] = lang_country if lang_country in lang_list else language
        # restricts search results to documents written in a particular
        # language.
        ret_val['params']['lr'] = "lang_" + lang_list.get(lang_country, language)


    # hl parameter:
    #   https://developers.google.com/custom-search/docs/xml_results#hlsp The
    # Interface Language:
    #   https://developers.google.com/custom-search/docs/xml_results_appendices#interfaceLanguages
    return ret_val
    return ret_val



def detect_google_sorry(resp):
def detect_google_sorry(resp):
    resp_url = urlparse(resp.url)
    """Detect when ratelimited"""
    if resp_url.netloc == 'sorry.google.com' or resp_url.path.startswith('/sorry'):
    if resp.url == 'sorry.google.com' or resp.url.startswith('/sorry'):
        raise SearxEngineCaptchaException()
        raise SearxEngineCaptchaException()




@@ -208,46 +252,47 @@ def request(query, params):


    offset = (params['pageno'] - 1) * 10
    offset = (params['pageno'] - 1) * 10


    lang_info = get_lang_info(
    # pylint: disable=undefined-variable
    # pylint: disable=undefined-variable
        params, supported_languages, language_aliases, True
    lang_info = get_lang_info(params, supported_languages, language_aliases, True)
    )


    additional_parameters = {}
    additional_parameters = {}
    if use_mobile_ui:
    if use_mobile_ui:
        additional_parameters = {
        additional_parameters = {
            'async': 'use_ac:true,_fmt:pc',
            'asearch': 'arc',
            'async': 'use_ac:true,_fmt:prog',
        }
        }


    # https://www.google.de/search?q=corona&hl=de&lr=lang_de&start=0&tbs=qdr%3Ad&safe=medium
    # https://www.google.de/search?q=corona&hl=de&lr=lang_de&start=0&tbs=qdr%3Ad&safe=medium
    query_url = 'https://' + lang_info['subdomain'] + '/search' + "?" + urlencode({
    query_url = (
        'https://'
        + lang_info['subdomain']
        + '/search'
        + "?"
        + urlencode(
            {
                'q': query,
                'q': query,
                **lang_info['params'],
                **lang_info['params'],
                'ie': "utf8",
                'ie': "utf8",
                'oe': "utf8",
                'oe': "utf8",
                'start': offset,
                'start': offset,
                'filter': '0',
                'filter': '0',
        'ucbcb': 1,
                **additional_parameters,
                **additional_parameters,
    })
            }
        )
    )


    if params['time_range'] in time_range_dict:
    if params['time_range'] in time_range_dict:
        query_url += '&' + urlencode({'tbs': 'qdr:' + time_range_dict[params['time_range']]})
        query_url += '&' + urlencode({'tbs': 'qdr:' + time_range_dict[params['time_range']]})
    if params['safesearch']:
    if params['safesearch']:
        query_url += '&' + urlencode({'safe': filter_mapping[params['safesearch']]})
        query_url += '&' + urlencode({'safe': filter_mapping[params['safesearch']]})

    logger.debug("query_url --> %s", query_url)
    params['url'] = query_url
    params['url'] = query_url


    logger.debug("HTTP header Accept-Language --> %s", lang_info.get('Accept-Language'))
    params['cookies']['CONSENT'] = "YES+"
    params['cookies']['CONSENT'] = "PENDING+" + str(random()*100)
    params['headers'].update(lang_info['headers'])
    params['headers'].update(lang_info['headers'])
    if use_mobile_ui:
    if use_mobile_ui:
        params['headers']['Accept'] = '*/*'
        params['headers']['Accept'] = '*/*'
    else:
    else:
        params['headers']['Accept'] = (
        params['headers']['Accept'] = 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'
            'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'
        )


    return params
    return params


@@ -261,7 +306,6 @@ def response(resp):


    # convert the text to dom
    # convert the text to dom
    dom = html.fromstring(resp.text)
    dom = html.fromstring(resp.text)

    # results --> answer
    # results --> answer
    answer_list = eval_xpath(dom, '//div[contains(@class, "LGOjhe")]')
    answer_list = eval_xpath(dom, '//div[contains(@class, "LGOjhe")]')
    if answer_list:
    if answer_list:
@@ -283,22 +327,18 @@ def response(resp):


    # parse results
    # parse results


    _results_xpath = results_xpath
    for result in eval_xpath_list(dom, results_xpath):
    if use_mobile_ui:
        _results_xpath = results_xpath_mobile_ui

    for result in eval_xpath_list(dom, _results_xpath):


        # google *sections*
        # google *sections*
        if extract_text(eval_xpath(result, g_section_with_header)):
        if extract_text(eval_xpath(result, g_section_with_header)):
            logger.debug("ingoring <g-section-with-header>")
            logger.debug("ignoring <g-section-with-header>")
            continue
            continue


        try:
        try:
            title_tag = eval_xpath_getindex(result, title_xpath, 0, default=None)
            title_tag = eval_xpath_getindex(result, title_xpath, 0, default=None)
            if title_tag is None:
            if title_tag is None:
                # this is not one of the common google result *sections*
                # this is not one of the common google result *sections*
                logger.debug('ingoring item from the result_xpath list: missing title')
                logger.debug('ignoring item from the result_xpath list: missing title')
                continue
                continue
            title = extract_text(title_tag)
            title = extract_text(title_tag)
            url = eval_xpath_getindex(result, href_xpath, 0, None)
            url = eval_xpath_getindex(result, href_xpath, 0, None)
@@ -306,16 +346,11 @@ def response(resp):
                continue
                continue
            content = extract_text(eval_xpath_getindex(result, content_xpath, 0, default=None), allow_none=True)
            content = extract_text(eval_xpath_getindex(result, content_xpath, 0, default=None), allow_none=True)
            if content is None:
            if content is None:
                logger.debug('ingoring item from the result_xpath list: missing content of title "%s"', title)
                logger.debug('ignoring item from the result_xpath list: missing content of title "%s"', title)
                continue
                continue


            logger.debug('add link to results: %s', title)
            logger.debug('add link to results: %s', title)

            results.append({'url': url, 'title': title, 'content': content})
            results.append({
                'url': url,
                'title': title,
                'content': content
            })


        except Exception as e:  # pylint: disable=broad-except
        except Exception as e:  # pylint: disable=broad-except
            logger.error(e, exc_info=True)
            logger.error(e, exc_info=True)
@@ -326,9 +361,6 @@ def response(resp):
        # append suggestion
        # append suggestion
        results.append({'suggestion': extract_text(suggestion)})
        results.append({'suggestion': extract_text(suggestion)})


    for correction in eval_xpath_list(dom, spelling_suggestion_xpath):
        results.append({'correction': extract_text(correction)})

    # return results
    # return results
    return results
    return results