Merge branch '83-update-google-engine' into 'master' (dd485f0b) · Commits · e / infra / spot

searx/engines/google.py

+47 −96

Original line number	Diff line number	Diff line
		# SPDX-License-Identifier: AGPL-3.0-or-later
		"""Google (Web)

		For a detailed description of the RESTful API see: `Query Parameter
		Definitions`_.

		.. _Query Parameter Definitions:
		https://developers.google.com/custom-search/docs/xml_results#WebSearch_Query_Parameter_Definitions
		"""

		# pylint: disable=invalid-name, missing-function-docstring
		# pylint: disable=invalid-name, missing-function-docstring, too-many-branches

		from urllib.parse import urlencode
		from lxml import html
		@@ -108,8 +106,9 @@ filter_mapping = {
		# specific xpath variables
		# ------------------------

		# google results are grouped into <div class="g" ../>
		results_xpath = '//div[@class="g"]'
		# google results are grouped into <div class="g ..." ../>
		results_xpath = '//div[@id="search"]//div[contains(@class, "g ")]'
		results_xpath_mobile_ui = '//div[contains(@class, "g ")]'

		# Google sections are not regular results, so we ignore them
		g_section_with_header = './g-section-with-header'
		@@ -121,8 +120,8 @@ title_xpath = './/h3[1]'
		# href=...>
		href_xpath = './/div[@class="yuRUbf"]//a/@href'

		# in the result group there is <div class="IsZvec" ../> containing he content
		content_xpath = './/div[@class="IsZvec"]'
		# in the result group there is <div class="VwiC3b ..." ../> containing the content
		content_xpath = './/div[contains(@class, "VwiC3b")]'

		# Suggestions are links placed in a card-section; we extract only the text
		# of the links, not the links themselves.
		@@ -134,113 +133,42 @@ spelling_suggestion_xpath = '//div[@class="med"]/p/a'


		def get_lang_info(params, lang_list, custom_aliases, supported_any_language):
		"""Composing various language properties for the google engines.

		This function is called by the various google engines (google itself,
		google-images, -news, -scholar, -videos).

		:param dict params: request parameters of the engine

		:param list lang_list: list of supported languages of the engine
		:py:obj:`ENGINES_LANGUAGES[engine-name] <searx.data.ENGINES_LANGUAGES>`

		:param dict custom_aliases: custom aliases for non-standard language codes
		(used when calling :py:func:`searx.utils.match_language`)

		:param bool supported_any_language: When a language is not specified, the
		language interpretation is left up to Google to decide how the search
		results should be delivered. This argument is ``True`` for the google
		engine and ``False`` for the other engines (google-images, -news,
		-scholar, -videos).

		:rtype: dict
		:returns:
		Py-Dictionary with the key/value pairs:

		language:
		Return value from :py:func:`searx.utils.match_language`

		country:
		The country code (e.g. US, AT, CA, FR, DE ..)

		subdomain:
		Google subdomain :py:obj:`google_domains` that fits to the country
		code.

		params:
		Py-Dictionary with additional request arguments (can be passed to
		:py:func:`urllib.parse.urlencode`).

		headers:
		Py-Dictionary with additional HTTP headers (can be passed to
		request's headers)
		"""
		ret_val = {
		'language' : None,
		'country' : None,
		'subdomain' : None,
		'params' : {},
		'headers' : {},
		}

		# language ...
		ret_val = {}

		_lang = params['language']
		_any_language = _lang.lower() == 'all'
		if _any_language:
		_lang = 'en-US'

		language = match_language(_lang, lang_list, custom_aliases)
		ret_val['language'] = language

		# country ...

		# the requested language from params (en, en-US, de, de-AT, fr, fr-CA, ...)
		_l = _lang.split('-')

		# the country code (US, AT, CA)
		if len(_l) == 2:
		country = _l[1]
		else:
		country = _l[0].upper()
		if country == 'EN':
		country = 'US'

		ret_val['country'] = country

		# subdomain ...
		# the combination (en-US, en-EN, de-DE, de-AU, fr-FR, fr-FR)
		lang_country = '%s-%s' % (language, country)

		# subdomain
		ret_val['subdomain'] = 'www.' + google_domains.get(country.upper(), 'google.com')

		# params & headers

		lang_country = '%s-%s' % (language, country) # (en-US, en-EN, de-DE, de-AU, fr-FR ..)

		# hl parameter:
		# https://developers.google.com/custom-search/docs/xml_results#hlsp The
		# Interface Language:
		# https://developers.google.com/custom-search/docs/xml_results_appendices#interfaceLanguages

		ret_val['params']['hl'] = lang_list.get(lang_country, language)

		# lr parameter:
		# The lr (language restrict) parameter restricts search results to
		# documents written in a particular language.
		# https://developers.google.com/custom-search/docs/xml_results#lrsp
		# Language Collection Values:
		# https://developers.google.com/custom-search/docs/xml_results_appendices#languageCollections
		ret_val['params'] = {}
		ret_val['headers'] = {}

		if _any_language and supported_any_language:

		# interpretation is left up to Google (based on whoogle)
		#
		# - add parameter ``source=lnt``
		# - don't use parameter ``lr``
		# - don't add a ``Accept-Language`` HTTP header.

		# based on whoogle
		ret_val['params']['source'] = 'lnt'

		else:

		# restricts search results to documents written in a particular
		# language.
		ret_val['params']['lr'] = "lang_" + lang_country if lang_country in lang_list else language

		# Accept-Language: fr-CH, fr;q=0.8, en;q=0.6, *;q=0.5
		ret_val['headers']['Accept-Language'] = ','.join([
		lang_country,
		@@ -249,6 +177,18 @@ def get_lang_info(params, lang_list, custom_aliases, supported_any_language):
		'*;q=0.5',
		])

		# lr parameter:
		# https://developers.google.com/custom-search/docs/xml_results#lrsp
		# Language Collection Values:
		# https://developers.google.com/custom-search/docs/xml_results_appendices#languageCollections
		ret_val['params']['lr'] = "lang_" + lang_country if lang_country in lang_list else language

		ret_val['params']['hl'] = lang_country if lang_country in lang_list else language

		# hl parameter:
		# https://developers.google.com/custom-search/docs/xml_results#hlsp The
		# Interface Language:
		# https://developers.google.com/custom-search/docs/xml_results_appendices#interfaceLanguages
		return ret_val

		def detect_google_sorry(resp):
		@@ -287,8 +227,11 @@ def request(query, params):
		query_url += '&' + urlencode({'tbs': 'qdr:' + time_range_dict[params['time_range']]})
		if params['safesearch']:
		query_url += '&' + urlencode({'safe': filter_mapping[params['safesearch']]})

		logger.debug("query_url --> %s", query_url)
		params['url'] = query_url

		logger.debug("HTTP header Accept-Language --> %s", lang_info.get('Accept-Language'))
		params['headers'].update(lang_info['headers'])
		if use_mobile_ui:
		params['headers']['Accept'] = '/'
		@@ -330,7 +273,12 @@ def response(resp):
		logger.error(e, exc_info=True)

		# parse results
		for result in eval_xpath_list(dom, results_xpath):

		_results_xpath = results_xpath
		if use_mobile_ui:
		_results_xpath = results_xpath_mobile_ui

		for result in eval_xpath_list(dom, _results_xpath):

		# google sections
		if extract_text(eval_xpath(result, g_section_with_header)):
		@@ -341,24 +289,27 @@ def response(resp):
		title_tag = eval_xpath_getindex(result, title_xpath, 0, default=None)
		if title_tag is None:
		# this is not one of the common Google result sections
		logger.debug('ingoring <div class="g" ../> section: missing title')
		logger.debug('ingoring item from the result_xpath list: missing title')
		continue
		title = extract_text(title_tag)
		url = eval_xpath_getindex(result, href_xpath, 0, None)
		if url is None:
		continue
		content = extract_text(eval_xpath_getindex(result, content_xpath, 0, default=None), allow_none=True)
		if content is None:
		logger.debug('ingoring item from the result_xpath list: missing content of title "%s"', title)
		continue

		logger.debug('add link to results: %s', title)

		results.append({
		'url': url,
		'title': title,
		'content': content
		})

		except Exception as e: # pylint: disable=broad-except
		logger.error(e, exc_info=True)
		# from lxml import etree
		# logger.debug(etree.tostring(result, pretty_print=True))
		# import pdb
		# pdb.set_trace()
		continue

		# parse suggestion

searx/engines/google_scholar.py

+27 −20

Original line number	Diff line number	Diff line
		@@ -82,27 +82,32 @@ def request(query, params):

		params, supported_languages, language_aliases, False
		)
		logger.debug("HTTP header Accept-Language --> %s", lang_info['headers']['Accept-Language'])

		# subdomain is: scholar.google.xy
		lang_info['subdomain'] = lang_info['subdomain'].replace("www.", "scholar.")

		query_url = 'https://'+ lang_info['subdomain'] + '/scholar' + "?" + urlencode({
		query_url = (
		'https://'
		+ lang_info['subdomain']
		+ '/scholar'
		+ "?"
		+ urlencode(
		{
		'q': query,
		**lang_info['params'],
		'ie': "utf8",
		'oe': "utf8",
		'start': offset,
		})
		}
		)
		)

		query_url += time_range_url(params)

		logger.debug("query_url --> %s", query_url)
		params['url'] = query_url

		logger.debug("HTTP header Accept-Language --> %s", lang_info.get('Accept-Language'))
		params['headers'].update(lang_info['headers'])
		params['headers']['Accept'] = (
		'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,/;q=0.8'
		)
		params['headers']['Accept'] = 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,/;q=0.8'

		# params['google_subdomain'] = subdomain
		return params
		@@ -139,11 +144,13 @@ def response(resp):
		if pub_type:
		title = title + " " + pub_type

		results.append({
		results.append(
		{
		'url': url,
		'title': title,
		'content': content,
		})
		}
		)

		# parse suggestion
		for suggestion in eval_xpath(dom, '//div[contains(@class, "gs_qsuggest_wrap")]//li//a'):

searx/engines/google_videos.py

+54 −50

Original line number	Diff line number	Diff line
		@@ -36,13 +36,9 @@ from searx.engines.google import (
		get_lang_info,
		time_range_dict,
		filter_mapping,
		results_xpath,
		g_section_with_header,
		title_xpath,
		href_xpath,
		content_xpath,
		suggestion_xpath,
		spelling_suggestion_xpath,
		detect_google_sorry,
		)

		@@ -53,6 +49,7 @@ from searx.engines.google import (
		)
		# pylint: enable=unused-import

		logger = logger.getChild('google videos')
		# about
		about = {
		"website": 'https://www.google.com',
		@@ -63,11 +60,9 @@ about = {
		"results": 'HTML',
		}

		logger = logger.getChild('google video')

		# engine dependent config

		categories = ['videos']
		categories = ['videos', 'web']
		paging = False
		language_support = True
		use_locale_domain = True
		@@ -76,16 +71,32 @@ safesearch = True

		# Cache of compiled patterns, keyed by the pattern source string.
		RE_CACHE = {}


		def _re(regexpr):
		    """Return the compiled regular expression for ``regexpr``.

		    Compiled patterns are memoized in :py:obj:`RE_CACHE`, so each distinct
		    pattern string is handed to :py:func:`re.compile` only once.

		    :param regexpr: pattern source; it is also used as the cache key
		    :returns: the compiled pattern object stored in :py:obj:`RE_CACHE`
		    :raises re.error: if ``regexpr`` is not a valid regular expression
		    """
		    try:
		        return RE_CACHE[regexpr]
		    except KeyError:
		        # Compile only on a cache miss.  ``RE_CACHE.get(key, re.compile(key))``
		        # would evaluate its default argument -- and therefore call
		        # re.compile -- on every single lookup.
		        compiled = RE_CACHE[regexpr] = re.compile(regexpr)
		        return compiled


		def scrap_out_thumbs_src(dom):
		    r"""Collect thumbnail URLs from the ``google.ldi`` ``<script>`` tags.

		    Every ``<script>`` element whose text contains ``google.ldi={`` is
		    scanned for ``"dimg_<digits>":"http..."`` pairs.

		    :param dom: document tree that is searched with ``eval_xpath_list``
		    :returns: dict mapping an image id (e.g. ``dimg_35``) to its URL, with
		        the literal ``\u003d`` / ``\u0026`` escape sequences replaced by
		        ``=`` / ``&``
		    """
		    ret_val = {}
		    thumb_name = 'dimg_'

		    # Matches e.g.:  "dimg_35":"https://i.ytimg.c....",
		    # The quantifiers matter: the id may have several digits and the URL
		    # runs up to the closing double quote.
		    src_pattern = _re('(' + thumb_name + '[0-9]*)":"(http[^"]*)')

		    for script in eval_xpath_list(dom, '//script[contains(., "google.ldi={")]'):
		        # script.text can be None for an element without direct text
		        for img_id, src in src_pattern.findall(script.text or ''):
		            src = src.replace(r'\u003d', '=')
		            src = src.replace(r'\u0026', '&')
		            ret_val[img_id] = src

		    logger.debug("found %s imgdata for: %s", thumb_name, ret_val.keys())
		    return ret_val


		def scrap_out_thumbs(dom):
		"""Scrap out thumbnail data from <script> tags.
		"""
		"""Scrap out thumbnail data from <script> tags."""
		ret_val = {}
		thumb_name = 'vidthumb'
		thumb_name = 'dimg_'

		for script in eval_xpath_list(dom, '//script[contains(., "_setImagesSrc")]'):
		_script = script.text
		@@ -95,20 +106,11 @@ def scrap_out_thumbs(dom):
		if not _imgdata:
		continue

		# var ii=['vidthumb4','vidthumb7']
		# var ii=['dimg_17']
		for _vidthumb in _re(r"(%s\d+)" % thumb_name).findall(_script):
		# At least the equal sign in the URL needs to be decoded
		ret_val[_vidthumb] = _imgdata[0].replace(r"\x3d", "=")

		# {google.ldidly=-1;google.ldi={"vidthumb8":"https://...
		for script in eval_xpath_list(dom, '//script[contains(., "google.ldi={")]'):
		_script = script.text
		for key_val in _re(r'"%s\d+\":\"[^\"]*"' % thumb_name).findall( _script) :
		match = _re(r'"(%s\d+)":"(.*)"' % thumb_name).search(key_val)
		if match:
		# At least the equal sign in the URL needs to be decoded
		ret_val[match.group(1)] = match.group(2).replace(r"\u003d", "=")

		logger.debug("found %s imgdata for: %s", thumb_name, ret_val.keys())
		return ret_val

		@@ -121,27 +123,30 @@ def request(query, params):
		params, supported_languages, language_aliases, False
		)

		query_url = 'https://' + lang_info['subdomain'] + '/search' + "?" + urlencode({
		query_url = (
		'https://'
		+ lang_info['subdomain']
		+ '/search'
		+ "?"
		+ urlencode(
		{
		'q': query,
		'tbm': "vid",
		**lang_info['params'],
		'ie': "utf8",
		'oe': "utf8",
		})
		}
		)
		)

		if params['time_range'] in time_range_dict:
		query_url += '&' + urlencode({'tbs': 'qdr:' + time_range_dict[params['time_range']]})
		if params['safesearch']:
		query_url += '&' + urlencode({'safe': filter_mapping[params['safesearch']]})

		logger.debug("query_url --> %s", query_url)
		params['url'] = query_url

		logger.debug("HTTP header Accept-Language --> %s", lang_info.get('Accept-Language'))
		params['headers'].update(lang_info['headers'])
		params['headers']['Accept'] = (
		'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,/;q=0.8'
		)
		params['headers']['Accept'] = 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,/;q=0.8'
		return params


		@@ -154,31 +159,33 @@ def response(resp):
		# convert the text to dom
		dom = html.fromstring(resp.text)
		vidthumb_imgdata = scrap_out_thumbs(dom)
		thumbs_src = scrap_out_thumbs_src(dom)
		logger.debug(str(thumbs_src))

		# parse results
		for result in eval_xpath_list(dom, results_xpath):
		for result in eval_xpath_list(dom, '//div[contains(@class, "g ")]'):

		# google sections
		# ignore google sections
		if extract_text(eval_xpath(result, g_section_with_header)):
		logger.debug("ingoring <g-section-with-header>")
		continue

		title = extract_text(eval_xpath_getindex(result, title_xpath, 0))
		url = eval_xpath_getindex(result, href_xpath, 0)
		c_node = eval_xpath_getindex(result, content_xpath, 0)

		# <img id="vidthumb1" ...>
		img_id = eval_xpath_getindex(c_node, './div[1]//a/g-img/img/@id', 0, default=None)
		# ignore articles without an image id, e.g. news articles
		img_id = eval_xpath_getindex(result, './/g-img/img/@id', 0, default=None)
		if img_id is None:
		logger.error("no img_id found in item %s (news article?)", len(results) + 1)
		continue

		img_src = vidthumb_imgdata.get(img_id, None)
		if not img_src:
		logger.error("no vidthumb imgdata for: %s" % img_id)
		img_src = eval_xpath_getindex(c_node, './div[1]//a/g-img/img/@src', 0)
		img_src = thumbs_src.get(img_id, "")

		length = extract_text(eval_xpath(c_node, './/div[1]//a/div[3]'))
		content = extract_text(eval_xpath(c_node, './/div[2]/span'))
		pub_info = extract_text(eval_xpath(c_node, './/div[2]/div'))
		title = extract_text(eval_xpath_getindex(result, title_xpath, 0))
		url = eval_xpath_getindex(result, './/div[@class="dXiKIc"]//a/@href', 0)
		length = extract_text(eval_xpath(result, './/div[contains(@class, "P7xzyf")]/span/span'))
		c_node = eval_xpath_getindex(result, './/div[@class="Uroaid"]', 0)
		content = extract_text(c_node)
		pub_info = extract_text(eval_xpath(result, './/div[@class="Zg1NU"]'))

		results.append({
		'url': url,
		@@ -195,7 +202,4 @@ def response(resp):
		# append suggestion
		results.append({'suggestion': extract_text(suggestion)})

		for correction in eval_xpath_list(dom, spelling_suggestion_xpath):
		results.append({'correction': extract_text(correction)})

		return results