From e45395f5013bc6ac46f19c33ad82dd1a6e330dc9 Mon Sep 17 00:00:00 2001
From: Nivesh Krishna <nivesh@e.email>
Date: Wed, 16 Nov 2022 14:04:44 +0530
Subject: [PATCH 1/2] pull changes from searxng for google engine

---
 searx/engines/google.py | 346 ++++++++++++++++++++++------------------
 1 file changed, 189 insertions(+), 157 deletions(-)

diff --git a/searx/engines/google.py b/searx/engines/google.py
index c9bb983b7..a100067f0 100644
--- a/searx/engines/google.py
+++ b/searx/engines/google.py
@@ -1,15 +1,31 @@
 # SPDX-License-Identifier: AGPL-3.0-or-later
-"""Google (Web)
-For detailed description of the *REST-full* API see: `Query Parameter
-Definitions`_.
-.. _Query Parameter Definitions:
-   https://developers.google.com/custom-search/docs/xml_results#WebSearch_Query_Parameter_Definitions
-"""
+# lint: pylint
+"""This is the implementation of the google WEB engine.  Some of this
+implementations are shared by other engines:
+
+- :ref:`google images engine`
+- :ref:`google news engine`
+- :ref:`google videos engine`
+
+The google WEB engine itself has a special setup option:
+
+.. code:: yaml
 
-# pylint: disable=invalid-name, missing-function-docstring, too-many-branches
+  - name: google
+    ...
+    use_mobile_ui: false
+
+``use_mobile_ui``: (default: ``false``)
+  Enables to use *mobile endpoint* to bypass the google blocking (see
+  :issue:`159`).  On the mobile UI of Google Search, the button :guilabel:`More
+  results` is not affected by Google rate limiting and we can still do requests
+  while actively blocked by the original Google search.  By activate
+  ``use_mobile_ui`` this behavior is simulated by adding the parameter
+  ``async=use_ac:true,_fmt:pc`` to the :py:func:`request`.
+
+"""
 
-from urllib.parse import urlencode, urlparse
-from random import random
+from urllib.parse import urlencode
 from lxml import html
 from searx import logger
 from searx.utils import match_language, extract_text, eval_xpath, eval_xpath_list, eval_xpath_getindex
@@ -28,178 +44,205 @@ about = {
 }
 
 # engine dependent config
-categories = ['general']
+categories = ['general', 'web']
 paging = True
 time_range_support = True
 safesearch = True
+send_accept_language_header = True
 use_mobile_ui = False
 supported_languages_url = 'https://www.google.com/preferences?#languages'
 
 # based on https://en.wikipedia.org/wiki/List_of_Google_domains and tests
 google_domains = {
-    'BG': 'google.bg',      # Bulgaria
-    'CZ': 'google.cz',      # Czech Republic
-    'DE': 'google.de',      # Germany
-    'DK': 'google.dk',      # Denmark
-    'AT': 'google.at',      # Austria
-    'CH': 'google.ch',      # Switzerland
-    'GR': 'google.gr',      # Greece
+    'BG': 'google.bg',  # Bulgaria
+    'CZ': 'google.cz',  # Czech Republic
+    'DE': 'google.de',  # Germany
+    'DK': 'google.dk',  # Denmark
+    'AT': 'google.at',  # Austria
+    'CH': 'google.ch',  # Switzerland
+    'GR': 'google.gr',  # Greece
     'AU': 'google.com.au',  # Australia
-    'CA': 'google.ca',      # Canada
-    'GB': 'google.co.uk',   # United Kingdom
-    'ID': 'google.co.id',   # Indonesia
-    'IE': 'google.ie',      # Ireland
-    'IN': 'google.co.in',   # India
+    'CA': 'google.ca',  # Canada
+    'GB': 'google.co.uk',  # United Kingdom
+    'ID': 'google.co.id',  # Indonesia
+    'IE': 'google.ie',  # Ireland
+    'IN': 'google.co.in',  # India
     'MY': 'google.com.my',  # Malaysia
-    'NZ': 'google.co.nz',   # New Zealand
+    'NZ': 'google.co.nz',  # New Zealand
     'PH': 'google.com.ph',  # Philippines
     'SG': 'google.com.sg',  # Singapore
-    'US': 'google.com',     # United States (google.us) redirects to .com
-    'ZA': 'google.co.za',   # South Africa
+    'US': 'google.com',  # United States (google.us) redirects to .com
+    'ZA': 'google.co.za',  # South Africa
     'AR': 'google.com.ar',  # Argentina
-    'CL': 'google.cl',      # Chile
-    'ES': 'google.es',      # Spain
+    'CL': 'google.cl',  # Chile
+    'ES': 'google.es',  # Spain
     'MX': 'google.com.mx',  # Mexico
-    'EE': 'google.ee',      # Estonia
-    'FI': 'google.fi',      # Finland
-    'BE': 'google.be',      # Belgium
-    'FR': 'google.fr',      # France
-    'IL': 'google.co.il',   # Israel
-    'HR': 'google.hr',      # Croatia
-    'HU': 'google.hu',      # Hungary
-    'IT': 'google.it',      # Italy
-    'JP': 'google.co.jp',   # Japan
-    'KR': 'google.co.kr',   # South Korea
-    'LT': 'google.lt',      # Lithuania
-    'LV': 'google.lv',      # Latvia
-    'NO': 'google.no',      # Norway
-    'NL': 'google.nl',      # Netherlands
-    'PL': 'google.pl',      # Poland
+    'EE': 'google.ee',  # Estonia
+    'FI': 'google.fi',  # Finland
+    'BE': 'google.be',  # Belgium
+    'FR': 'google.fr',  # France
+    'IL': 'google.co.il',  # Israel
+    'HR': 'google.hr',  # Croatia
+    'HU': 'google.hu',  # Hungary
+    'IT': 'google.it',  # Italy
+    'JP': 'google.co.jp',  # Japan
+    'KR': 'google.co.kr',  # South Korea
+    'LT': 'google.lt',  # Lithuania
+    'LV': 'google.lv',  # Latvia
+    'NO': 'google.no',  # Norway
+    'NL': 'google.nl',  # Netherlands
+    'PL': 'google.pl',  # Poland
     'BR': 'google.com.br',  # Brazil
-    'PT': 'google.pt',      # Portugal
-    'RO': 'google.ro',      # Romania
-    'RU': 'google.ru',      # Russia
-    'SK': 'google.sk',      # Slovakia
-    'SI': 'google.si',      # Slovenia
-    'SE': 'google.se',      # Sweden
-    'TH': 'google.co.th',   # Thailand
+    'PT': 'google.pt',  # Portugal
+    'RO': 'google.ro',  # Romania
+    'RU': 'google.ru',  # Russia
+    'SK': 'google.sk',  # Slovakia
+    'SI': 'google.si',  # Slovenia
+    'SE': 'google.se',  # Sweden
+    'TH': 'google.co.th',  # Thailand
     'TR': 'google.com.tr',  # Turkey
     'UA': 'google.com.ua',  # Ukraine
     'CN': 'google.com.hk',  # There is no google.cn, we use .com.hk for zh-CN
     'HK': 'google.com.hk',  # Hong Kong
-    'TW': 'google.com.tw'   # Taiwan
+    'TW': 'google.com.tw',  # Taiwan
 }
 
-time_range_dict = {
-    'day': 'd',
-    'week': 'w',
-    'month': 'm',
-    'year': 'y'
-}
+time_range_dict = {'day': 'd', 'week': 'w', 'month': 'm', 'year': 'y'}
 
 # Filter results. 0: None, 1: Moderate, 2: Strict
-filter_mapping = {
-    0: 'off',
-    1: 'medium',
-    2: 'high'
-}
+filter_mapping = {0: 'off', 1: 'medium', 2: 'high'}
 
 # specific xpath variables
 # ------------------------
 
-# google results are grouped into <div class="jtfYYd ..." ../>
-results_xpath = '//div[contains(@class, "jtfYYd")]'
-results_xpath_mobile_ui = '//div[contains(@class, "g ")]'
+results_xpath = './/div[@data-sokoban-container]'
+title_xpath = './/a/h3[1]'
+href_xpath = './/a[h3]/@href'
+content_xpath = './/div[@data-content-feature=1]'
 
 # google *sections* are no usual *results*, we ignore them
 g_section_with_header = './g-section-with-header'
 
-# the title is a h3 tag relative to the result group
-title_xpath = './/h3[1]'
-
-# in the result group there is <div class="yuRUbf" ../> it's first child is a <a
-# href=...>
-href_xpath = './/div[@class="yuRUbf"]//a/@href'
-
-# in the result group there is <div class="VwiC3b ..." ../> containing the *content*
-content_xpath = './/div[contains(@class, "VwiC3b")]'
 
 # Suggestions are links placed in a *card-section*, we extract only the text
 # from the links not the links itself.
-suggestion_xpath = '//div[contains(@class, "card-section")]//a'
-
-# Since google does *auto-correction* on the first query these are not really
-# *spelling suggestions*, we use them anyway.
-spelling_suggestion_xpath = '//div[@class="med"]/p/a'
+suggestion_xpath = '//div[contains(@class, "EIaa9b")]//a'
 
 
 def get_lang_info(params, lang_list, custom_aliases, supported_any_language):
-    ret_val = {}
+    """Composing various language properties for the google engines.
+
+    This function is called by the various google engines (:ref:`google web
+    engine`, :ref:`google images engine`, :ref:`google news engine` and
+    :ref:`google videos engine`).
+
+    :param dict param: request parameters of the engine
+
+    :param list lang_list: list of supported languages of the engine
+        :py:obj:`ENGINES_LANGUAGES[engine-name] <searx.data.ENGINES_LANGUAGES>`
+
+    :param dict lang_list: custom aliases for non standard language codes
+        (used when calling :py:func:`searx.utils.match_language`)
+
+    :param bool supported_any_language: When a language is not specified, the
+        language interpretation is left up to Google to decide how the search
+        results should be delivered.  This argument is ``True`` for the google
+        engine and ``False`` for the other engines (google-images, -news,
+        -scholar, -videos).
+
+    :rtype: dict
+    :returns:
+        Py-Dictionary with the key/value pairs:
+
+        language:
+            Return value from :py:func:`searx.utils.match_language`
+
+        country:
+            The country code (e.g. US, AT, CA, FR, DE ..)
+
+        subdomain:
+            Google subdomain :py:obj:`google_domains` that fits to the country
+            code.
+
+        params:
+            Py-Dictionary with additional request arguments (can be passed to
+            :py:func:`urllib.parse.urlencode`).
+
+        headers:
+            Py-Dictionary with additional HTTP headers (can be passed to
+            request's headers)
+    """
+    ret_val = {
+        'language': None,
+        'country': None,
+        'subdomain': None,
+        'params': {},
+        'headers': {},
+    }
+
+    # language ...
 
     _lang = params['language']
     _any_language = _lang.lower() == 'all'
     if _any_language:
         _lang = 'en-US'
-
     language = match_language(_lang, lang_list, custom_aliases)
     ret_val['language'] = language
 
-    # the requested language from params (en, en-US, de, de-AT, fr, fr-CA, ...)
-    _l = _lang.split('-')
+    # country ...
 
-    # the country code (US, AT, CA)
+    _l = _lang.split('-')
     if len(_l) == 2:
         country = _l[1]
     else:
         country = _l[0].upper()
         if country == 'EN':
             country = 'US'
-
     ret_val['country'] = country
 
-    # the combination (en-US, en-EN, de-DE, de-AU, fr-FR, fr-FR)
-    lang_country = '%s-%s' % (language, country)
+    # subdomain ...
 
-    # subdomain
-    ret_val['subdomain']  = 'www.' + google_domains.get(country.upper(), 'google.com')
+    ret_val['subdomain'] = 'www.' + google_domains.get(country.upper(), 'google.com')
 
-    ret_val['params'] = {}
-    ret_val['headers'] = {}
+    # params & headers
 
-    if _any_language and supported_any_language:
-        # based on whoogle
-        ret_val['params']['source'] = 'lnt'
-    else:
-        # Accept-Language: fr-CH, fr;q=0.8, en;q=0.6, *;q=0.5
-        ret_val['headers']['Accept-Language'] = ','.join([
-            lang_country,
-            language + ';q=0.8,',
-            'en;q=0.6',
-            '*;q=0.5',
-        ])
-
-        # lr parameter:
-        #   https://developers.google.com/custom-search/docs/xml_results#lrsp
-        # Language Collection Values:
-        #   https://developers.google.com/custom-search/docs/xml_results_appendices#languageCollections
-        if lang_country in lang_list:
-            ret_val['params']['lr'] = "lang_" + lang_country
-        elif language in lang_country:
-            ret_val['params']['lr'] = "lang_" + language
-        else:
-            ret_val['params']['lr'] = language
-
-    ret_val['params']['hl'] = lang_country if lang_country in lang_list else language
+    lang_country = '%s-%s' % (language, country)  # (en-US, en-EN, de-DE, de-AU, fr-FR ..)
 
     # hl parameter:
     #   https://developers.google.com/custom-search/docs/xml_results#hlsp The
     # Interface Language:
     #   https://developers.google.com/custom-search/docs/xml_results_appendices#interfaceLanguages
+
+    ret_val['params']['hl'] = lang_list.get(lang_country, language)
+
+    # lr parameter:
+    #   The lr (language restrict) parameter restricts search results to
+    #   documents written in a particular language.
+    #   https://developers.google.com/custom-search/docs/xml_results#lrsp
+    #   Language Collection Values:
+    #   https://developers.google.com/custom-search/docs/xml_results_appendices#languageCollections
+
+    if _any_language and supported_any_language:
+
+        # interpretation is left up to Google (based on whoogle)
+        #
+        # - add parameter ``source=lnt``
+        # - don't use parameter ``lr``
+        # - don't add a ``Accept-Language`` HTTP header.
+
+        ret_val['params']['source'] = 'lnt'
+
+    else:
+
+        # restricts search results to documents written in a particular
+        # language.
+        ret_val['params']['lr'] = "lang_" + lang_list.get(lang_country, language)
+
     return ret_val
 
+
 def detect_google_sorry(resp):
-    resp_url = urlparse(resp.url)
-    if resp_url.netloc == 'sorry.google.com' or resp_url.path.startswith('/sorry'):
+    if resp.url == 'sorry.google.com' or resp.url.startswith('/sorry'):
         raise SearxEngineCaptchaException()
 
 
@@ -208,47 +251,49 @@ def request(query, params):
 
     offset = (params['pageno'] - 1) * 10
 
-    lang_info = get_lang_info(
-        # pylint: disable=undefined-variable
-        params, supported_languages, language_aliases, True
-    )
+    lang_info = get_lang_info(params, supported_languages, language_aliases, True)
 
     additional_parameters = {}
     if use_mobile_ui:
         additional_parameters = {
-            'async': 'use_ac:true,_fmt:pc',
+            'asearch': 'arc',
+            'async': 'use_ac:true,_fmt:prog',
         }
 
     # https://www.google.de/search?q=corona&hl=de&lr=lang_de&start=0&tbs=qdr%3Ad&safe=medium
-    query_url = 'https://' + lang_info['subdomain'] + '/search' + "?" + urlencode({
-        'q': query,
-        **lang_info['params'],
-        'ie': "utf8",
-        'oe': "utf8",
-        'start': offset,
-        'filter': '0',
-        'ucbcb': 1,
-        **additional_parameters,
-    })
+    query_url = (
+        'https://'
+        + lang_info['subdomain']
+        + '/search'
+        + "?"
+        + urlencode(
+            {
+                'q': query,
+                **lang_info['params'],
+                'ie': "utf8",
+                'oe': "utf8",
+                'start': offset,
+                'filter': '0',
+                **additional_parameters,
+            }
+        )
+    )
 
     if params['time_range'] in time_range_dict:
         query_url += '&' + urlencode({'tbs': 'qdr:' + time_range_dict[params['time_range']]})
     if params['safesearch']:
         query_url += '&' + urlencode({'safe': filter_mapping[params['safesearch']]})
-
-    logger.debug("query_url --> %s", query_url)
     params['url'] = query_url
 
-    logger.debug("HTTP header Accept-Language --> %s", lang_info.get('Accept-Language'))
-    params['cookies']['CONSENT'] = "PENDING+" + str(random()*100)
+    params['cookies']['CONSENT'] = "YES+"
     params['headers'].update(lang_info['headers'])
     if use_mobile_ui:
         params['headers']['Accept'] = '*/*'
     else:
-        params['headers']['Accept'] = (
-            'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'
-        )
+        params['headers']['Accept'] = 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'
 
+    logger.debug(params)
+    logger.debug(query_url)
     return params
 
 
@@ -261,7 +306,6 @@ def response(resp):
 
     # convert the text to dom
     dom = html.fromstring(resp.text)
-
     # results --> answer
     answer_list = eval_xpath(dom, '//div[contains(@class, "LGOjhe")]')
     if answer_list:
@@ -270,7 +314,7 @@ def response(resp):
     else:
         logger.debug("did not find 'answer'")
 
-    # results --> number_of_results
+        # results --> number_of_results
         if not use_mobile_ui:
             try:
                 _txt = eval_xpath_getindex(dom, '//div[@id="result-stats"]//text()', 0)
@@ -283,22 +327,18 @@ def response(resp):
 
     # parse results
 
-    _results_xpath = results_xpath
-    if use_mobile_ui:
-        _results_xpath = results_xpath_mobile_ui
-
-    for result in eval_xpath_list(dom, _results_xpath):
+    for result in eval_xpath_list(dom, results_xpath):
 
         # google *sections*
         if extract_text(eval_xpath(result, g_section_with_header)):
-            logger.debug("ingoring <g-section-with-header>")
+            logger.debug("ignoring <g-section-with-header>")
             continue
 
         try:
             title_tag = eval_xpath_getindex(result, title_xpath, 0, default=None)
             if title_tag is None:
                 # this not one of the common google results *section*
-                logger.debug('ingoring item from the result_xpath list: missing title')
+                logger.debug('ignoring item from the result_xpath list: missing title')
                 continue
             title = extract_text(title_tag)
             url = eval_xpath_getindex(result, href_xpath, 0, None)
@@ -306,16 +346,11 @@ def response(resp):
                 continue
             content = extract_text(eval_xpath_getindex(result, content_xpath, 0, default=None), allow_none=True)
             if content is None:
-                logger.debug('ingoring item from the result_xpath list: missing content of title "%s"', title)
+                logger.debug('ignoring item from the result_xpath list: missing content of title "%s"', title)
                 continue
 
             logger.debug('add link to results: %s', title)
-
-            results.append({
-                'url': url,
-                'title': title,
-                'content': content
-            })
+            results.append({'url': url, 'title': title, 'content': content})
 
         except Exception as e:  # pylint: disable=broad-except
             logger.error(e, exc_info=True)
@@ -326,9 +361,6 @@ def response(resp):
         # append suggestion
         results.append({'suggestion': extract_text(suggestion)})
 
-    for correction in eval_xpath_list(dom, spelling_suggestion_xpath):
-        results.append({'correction': extract_text(correction)})
-
     # return results
     return results
 
-- 
GitLab


From 5718ee31e3e138a89bb2d2179e2e7f34e077ab6e Mon Sep 17 00:00:00 2001
From: Nivesh Krishna <nivesh@e.email>
Date: Wed, 16 Nov 2022 14:16:36 +0530
Subject: [PATCH 2/2] lint fixesg

---
 searx/engines/google.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/searx/engines/google.py b/searx/engines/google.py
index a100067f0..3e07ee1ad 100644
--- a/searx/engines/google.py
+++ b/searx/engines/google.py
@@ -242,6 +242,7 @@ def get_lang_info(params, lang_list, custom_aliases, supported_any_language):
 
 
 def detect_google_sorry(resp):
+    """Detect when ratelimited"""
     if resp.url == 'sorry.google.com' or resp.url.startswith('/sorry'):
         raise SearxEngineCaptchaException()
 
@@ -251,6 +252,7 @@ def request(query, params):
 
     offset = (params['pageno'] - 1) * 10
 
+    # pylint: disable=undefined-variable
     lang_info = get_lang_info(params, supported_languages, language_aliases, True)
 
     additional_parameters = {}
@@ -292,8 +294,6 @@ def request(query, params):
     else:
         params['headers']['Accept'] = 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'
 
-    logger.debug(params)
-    logger.debug(query_url)
     return params
 
 
-- 
GitLab