
Commit 24998995 authored by Markus Heiser

[mod] Google: reverse engineered & upgrade to data_type: traits_v1

Partial reverse engineering of the Google engines, including improved language
and region handling based on the engine.traits_v1 data.

Whenever possible the implementations of the Google engines make use of the
async REST APIs.  The get_lang_info() function has been generalized to a
get_google_info() function; in particular, the region handling has been
improved by adding the cr parameter.
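
The generalized region handling can be illustrated with a small sketch.  The
helper below is hypothetical (it is *not* the real get_google_info()); it only
shows how a SearXNG locale such as 'fr-BE' may map to Google's hl / lr / cr
arguments:

```python
# Hypothetical sketch of a region aware parameter set derived from a
# SearXNG locale; names and fallbacks are illustrative, not the actual
# searx.engines.google implementation.

def build_google_params(sxng_locale: str) -> dict:
    """Map an 'en-US' style locale to Google's hl / lr / cr arguments."""
    parts = sxng_locale.replace('_', '-').split('-')
    lang = parts[0].lower()
    region = parts[1].upper() if len(parts) > 1 else ''
    params = {
        'hl': lang,                # UI language
        'lr': 'lang_' + lang,      # restrict results to this language
    }
    if region:
        # the new 'cr' parameter restricts results to a country
        params['cr'] = 'country' + region
    return params

print(build_google_params('fr-BE'))
# -> {'hl': 'fr', 'lr': 'lang_fr', 'cr': 'countryBE'}
```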

searx/data/engine_traits.json
  Add data type "traits_v1" generated by the fetch_traits() functions from:

  - Google (WEB),
  - Google images,
  - Google news,
  - Google scholar and
  - Google videos

  and remove data from obsolete data type "supported_languages".
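
For orientation, a minimal *invented* example of what a traits_v1 record may
look like; the real entries live in searx/data/engine_traits.json and are
generated by fetch_traits(), the concrete values below are illustrative only:

```python
import json

# Invented example of a "traits_v1" record; real data is produced by
# fetch_traits() and is considerably larger.
google_traits = {
    "data_type": "traits_v1",
    "languages": {"de": "lang_de"},     # SearXNG language -> engine language
    "regions": {"de-AT": "AT"},         # SearXNG locale -> engine region
    "custom": {"supported_domains": {"AT": "google.at"}},
}
print(json.dumps(google_traits["regions"]))
```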

  A traits.custom type that maps region codes to *supported_domains* is fetched
  from https://www.google.com/supported_domains
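
A sketch of how such a mapping could be derived from that payload (one domain
per line); the two-letter-TLD heuristic is an assumption — the real
fetch_traits() handles special cases (e.g. google.co.uk -> region GB) itself:

```python
# Sketch: map Google's supported_domains payload to region codes.  The
# heuristic below (two-letter ccTLD == region code) is an assumption.

def parse_supported_domains(text: str) -> dict:
    mapping = {}
    for line in text.split():
        domain = line.lstrip('.')          # '.google.de' -> 'google.de'
        tld = domain.rsplit('.', 1)[-1]
        if len(tld) == 2:                  # keep only two-letter ccTLDs
            mapping[tld.upper()] = domain
    return mapping

sample = ".google.com\n.google.de\n.google.co.uk\n"
print(parse_supported_domains(sample))
# -> {'DE': 'google.de', 'UK': 'google.co.uk'}
```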



searx/autocomplete.py:
  Reverse engineered autocomplete from Google WEB.  Supports Google's languages and
  subdomains.  The old API suggestqueries.google.com/complete has been replaced
  by the async REST API: https://{subdomain}/complete/search?{args}
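
The gws-wiz endpoint wraps its JSON payload in a JavaScript callback; the
extraction step can be sketched as follows (the sample payload is illustrative,
and the real code strips HTML markup with lxml instead of str.replace()):

```python
import json

def extract_suggestions(raw: str) -> list:
    """Cut the JSON array out of a gws-wiz style response and collect the
    suggestion strings."""
    json_txt = raw[raw.find('[') : raw.find(']', -3) + 1]
    data = json.loads(json_txt)
    # each entry is [html_text, ...]; markup removal here is simplified
    return [item[0].replace('<b>', '').replace('</b>', '') for item in data[0]]

raw = 'window.google.ac.h([[["<b>sear</b>xng",0],["searx engine",0]],{"q":"x"}])'
print(extract_suggestions(raw))   # -> ['searxng', 'searx engine']
```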

searx/engines/google.py
  Reverse engineering and extensive testing ..
  - fetch_traits():  Fetch languages & regions from Google properties.
  - always use the async REST API (formerly known as 'use_mobile_ui')
  - use *supported_domains* from traits
  - improved the result list by fetching './/div[@data-content-feature]'
    and parsing the type of the various *content features*; thumbnails are
    now added
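
The content-feature parsing can be sketched with a toy snippet.  The markup
below is invented, the real engine parses Google's far more complex HTML with
lxml, and the meaning of feature number 1 is an assumption here:

```python
import xml.etree.ElementTree as ET

# Invented, well-formed snippet with *content feature* markers.
SNIPPET = """
<div>
  <div data-content-feature="1"><img src="thumb.jpg"/></div>
  <div data-content-feature="2"><span>published date</span></div>
</div>
"""

def extract_thumbnails(markup: str) -> list:
    dom = ET.fromstring(markup)
    thumbs = []
    for node in dom.findall('.//div[@data-content-feature]'):
        if node.get('data-content-feature') == '1':   # assumed: thumbnail type
            img = node.find('.//img')
            if img is not None:
                thumbs.append(img.get('src'))
    return thumbs

print(extract_thumbnails(SNIPPET))   # -> ['thumb.jpg']
```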

searx/engines/google_images.py
  Reverse engineering and extensive testing ..
  - fetch_traits():  Fetch languages & regions from Google properties.
  - use *supported_domains* from traits
  - if it exists, freshness_date is added to the result
  - issue 1864: result list has been improved a lot (due to the new cr parameter)

searx/engines/google_news.py
  Reverse engineering and extensive testing ..
  - fetch_traits():  Fetch languages & regions from Google properties.
    *supported_domains* is not needed but a ceid list has been added.
  - different region handling compared to Google WEB
  - fixed for various languages & regions (due to the new ceid parameter) /
    avoid CONSENT page
  - Google News no longer supports the time range
  - result list has been fixed: XPath of pub_date and pub_origin
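
Assuming ceid combines region and language as '<REGION>:<lang>' (the real
fetch_traits() collects the valid combinations from Google News itself), the
parameter may be sketched as:

```python
def build_ceid(sxng_locale: str) -> str:
    """Hypothetical helper: 'de-AT' -> 'AT:de'; falls back to US."""
    lang, _, region = sxng_locale.partition('-')
    return '%s:%s' % (region.upper() or 'US', lang.lower())

print(build_ceid('de-AT'))   # -> AT:de
```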

searx/engines/google_videos.py
  - fetch_traits():  Fetch languages & regions from Google properties.
  - use *supported_domains* from traits
  - add paging support
  - implement a async request ('asearch': 'arc' & 'async':
    'use_ac:true,_fmt:html')
  - simplified code (thanks to '_fmt:html' request)
  - issue 1359: fixed XPath of video length data
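
The async request mentioned above boils down to two extra URL arguments; the
endpoint and the query below are placeholders, only the 'asearch' / 'async'
values are taken from this commit:

```python
from urllib.parse import urlencode

args = {
    'q': 'searxng',                      # placeholder query
    'asearch': 'arc',                    # async request marker
    'async': 'use_ac:true,_fmt:html',    # ask for the simple HTML format
}
url = 'https://www.google.com/search?' + urlencode(args)
print(url)
```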

searx/engines/google_scholar.py
  - fetch_traits():  Fetch languages & regions from Google properties.
  - use *supported_domains* from traits
  - request(): include patents & citations
  - response(): fixed CAPTCHA detection (Scholar has its own CAPTCHA manager)
  - hardening XPath to iterate over results
  - fixed XPath of pub_type (has been changed from the gs_ct1 to the gs_cgt2 class)
  - issue 1769 fixed: the new request implementation is no longer incompatible
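
A hedged sketch of a Scholar request including patents & citations; the
as_sdt / as_vis values are assumptions about Scholar's URL parameters — check
searx/engines/google_scholar.py for the values actually used:

```python
from urllib.parse import urlencode

args = urlencode({
    'q': 'information retrieval',   # placeholder query
    'as_sdt': '2007',               # assumed: include patents
    'as_vis': '0',                  # assumed: include citations
    'hl': 'en',
})
url = 'https://scholar.google.com/scholar?' + args
print(url)
```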

Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>
parent c80e82a8
+24 −3
@@ -12,15 +12,21 @@ Google Engines
 
 .. _google API:
 
-google API
+Google API
 ==========
 
 .. _Query Parameter Definitions:
    https://developers.google.com/custom-search/docs/xml_results#WebSearch_Query_Parameter_Definitions
 
+
+SearXNG's implementation of the Google API is mainly done in
+:py:obj:`get_google_info <searx.engines.google.get_google_info>`.
+
 For detailed description of the *REST-full* API see: `Query Parameter
-Definitions`_.  Not all parameters can be appied and some engines are *special*
-(e.g. :ref:`google news engine`).
+Definitions`_.  The linked API documentation can sometimes be helpful during
+reverse engineering.  However, we cannot use it in the freely accessible WEB
+services; not all parameters can be applied and some engines are more *special*
+than other (e.g. :ref:`google news engine`).
 
 
 .. _google web engine:
@@ -30,6 +36,13 @@ Google WEB
 .. automodule:: searx.engines.google
   :members:
 
+.. _google autocomplete:
+
+Google Autocomplete
+====================
+
+.. autofunction:: searx.autocomplete.google_complete
+
 .. _google images engine:
 
 Google Images
@@ -53,3 +66,11 @@ Google News
 
 .. automodule:: searx.engines.google_news
   :members:
+
+.. _google scholar engine:
+
+Google Scholar
+==============
+
+.. automodule:: searx.engines.google_scholar
+  :members:
+32 −15
@@ -5,14 +5,17 @@
 """
 # pylint: disable=use-dict-literal
 
-from json import loads
+import json
 from urllib.parse import urlencode
 
-from lxml import etree
+import lxml
 from httpx import HTTPError
 
 from searx import settings
-from searx.engines import engines
+from searx.engines import (
+    engines,
+    google,
+)
 from searx.network import get as http_get
 from searx.exceptions import SearxEngineResponseException
@@ -55,7 +58,7 @@ def dbpedia(query, _lang):
     results = []
 
     if response.ok:
-        dom = etree.fromstring(response.content)
+        dom = lxml.etree.fromstring(response.content)
         results = dom.xpath('//Result/Label//text()')
 
     return results
@@ -81,18 +84,32 @@ def duckduckgo(query, sxng_locale):
     return ret_val
 
 
-def google(query, lang):
-    # google autocompleter
-    autocomplete_url = 'https://suggestqueries.google.com/complete/search?client=toolbar&'
-
-    response = get(autocomplete_url + urlencode(dict(hl=lang, q=query)))
-
-    results = []
-
-    if response.ok:
-        dom = etree.fromstring(response.text)
-        results = dom.xpath('//suggestion/@data')
-
+def google_complete(query, sxng_locale):
+    """Autocomplete from Google.  Supports Google's languages and subdomains
+    (:py:obj:`searx.engines.google.get_google_info`) by using the async REST
+    API::
+
+        https://{subdomain}/complete/search?{args}
+
+    """
+
+    google_info = google.get_google_info({'searxng_locale': sxng_locale}, engines['google'].traits)
+
+    url = 'https://{subdomain}/complete/search?{args}'
+    args = urlencode(
+        {
+            'q': query,
+            'client': 'gws-wiz',
+            'hl': google_info['params']['hl'],
+        }
+    )
+    results = []
+    resp = get(url.format(subdomain=google_info['subdomain'], args=args))
+    if resp.ok:
+        json_txt = resp.text[resp.text.find('[') : resp.text.find(']', -3) + 1]
+        data = json.loads(json_txt)
+        for item in data[0]:
+            results.append(lxml.html.fromstring(item[0]).text_content())
     return results
 
 
@@ -132,7 +149,7 @@ def swisscows(query, _lang):
     # swisscows autocompleter
     url = 'https://swisscows.ch/api/suggest?{query}&itemsCount=5'
 
-    resp = loads(get(url.format(query=urlencode({'query': query}))).text)
+    resp = json.loads(get(url.format(query=urlencode({'query': query}))).text)
     return resp
 
 
@@ -184,7 +201,7 @@ def yandex(query, _lang):
    # yandex autocompleter
    url = "https://suggest.yandex.com/suggest-ff.cgi?{0}"

-    resp = loads(get(url.format(urlencode(dict(part=query)))).text)
+    resp = json.loads(get(url.format(urlencode(dict(part=query)))).text)
     if len(resp) > 1:
         return resp[1]
     return []
@@ -193,7 +210,7 @@ def yandex(query, _lang):
 backends = {
     'dbpedia': dbpedia,
     'duckduckgo': duckduckgo,
-    'google': google,
+    'google': google_complete,
     'seznam': seznam,
     'startpage': startpage,
     'swisscows': swisscows,
+1888 −1725
  File changed; preview size limit exceeded, changes collapsed.

+244 −196
  File changed; preview size limit exceeded, changes collapsed.
+28 −21
 # SPDX-License-Identifier: AGPL-3.0-or-later
 # lint: pylint
-"""This is the implementation of the google images engine using the google
-internal API used the Google Go Android app.
+"""This is the implementation of the Google Images engine using the internal
+Google API used by the Google Go Android app.
 
 This internal API offer results in
 
-- JSON (_fmt:json)
-- Protobuf (_fmt:pb)
-- Protobuf compressed? (_fmt:pc)
-- HTML (_fmt:html)
-- Protobuf encoded in JSON (_fmt:jspb).
+- JSON (``_fmt:json``)
+- Protobuf_ (``_fmt:pb``)
+- Protobuf_ compressed? (``_fmt:pc``)
+- HTML (``_fmt:html``)
+- Protobuf_ encoded in JSON (``_fmt:jspb``).
+
+.. _Protobuf: https://en.wikipedia.org/wiki/Protocol_Buffers
 """
 
+from typing import TYPE_CHECKING
+
 from urllib.parse import urlencode
 from json import loads
 
+from searx.engines.google import fetch_traits  # pylint: disable=unused-import
 from searx.engines.google import (
-    get_lang_info,
+    get_google_info,
     time_range_dict,
     detect_google_sorry,
 )
 
-# pylint: disable=unused-import
-from searx.engines.google import supported_languages_url, _fetch_supported_languages, fetch_traits
-
-# pylint: enable=unused-import
+if TYPE_CHECKING:
+    import logging
+    from searx.enginelib.traits import EngineTraits
+
+    logger: logging.Logger
+    traits: EngineTraits
 
 # about
 about = {
@@ -40,7 +47,6 @@ about = {
 # engine dependent config
 categories = ['images', 'web']
 paging = True
-use_locale_domain = True
 time_range_support = True
 safesearch = True
 send_accept_language_header = True
@@ -51,20 +57,18 @@ filter_mapping = {0: 'images', 1: 'active', 2: 'active'}
 def request(query, params):
     """Google-Image search request"""
 
-    lang_info = get_lang_info(params, supported_languages, language_aliases, False)
+    google_info = get_google_info(params, traits)
 
     query_url = (
         'https://'
-        + lang_info['subdomain']
+        + google_info['subdomain']
         + '/search'
         + "?"
         + urlencode(
             {
                 'q': query,
                 'tbm': "isch",
-                **lang_info['params'],
-                'ie': "utf8",
-                'oe': "utf8",
+                **google_info['params'],
                 'asearch': 'isch',
                 'async': '_fmt:json,p:1,ijn:' + str(params['pageno']),
             }
@@ -77,9 +81,8 @@ def request(query, params):
         query_url += '&' + urlencode({'safe': filter_mapping[params['safesearch']]})
     params['url'] = query_url
 
-    params['headers'].update(lang_info['headers'])
-    params['headers']['User-Agent'] = 'NSTN/3.60.474802233.release Dalvik/2.1.0 (Linux; U; Android 12; US) gzip'
-    params['headers']['Accept'] = '*/*'
+    params['cookies'] = google_info['cookies']
+    params['headers'].update(google_info['headers'])
     return params
 
 
@@ -111,7 +114,11 @@ def response(resp):
 
         copyright_notice = item["result"].get('iptc', {}).get('copyright_notice')
         if copyright_notice:
-            result_item['source'] += ' / ' + copyright_notice
+            result_item['source'] += ' | ' + copyright_notice
+
+        freshness_date = item["result"].get("freshness_date")
+        if freshness_date:
+            result_item['source'] += ' | ' + freshness_date
 
         file_size = item.get('gsa', {}).get('file_size')
         if file_size: