fix google images (3af143b8) · Commits · e / infra / spot

searx/engines/google_images.py

+69 −158

Original line number	Original line	Diff line number	Diff line
	# SPDX-License-Identifier: AGPL-3.0-or-later		# SPDX-License-Identifier: AGPL-3.0-or-later
	"""Google (Images)		# lint: pylint
			"""This is the implementation of the google images engine using the google
			internal API used by the Google Go Android app.

	For detailed description of the REST-full API see: `Query Parameter		This internal API offers results in
	Definitions`_.

	.. _admonition:: Content-Security-Policy (CSP)		- JSON (_fmt:json)
			- Protobuf (_fmt:pb)
			- Protobuf compressed? (_fmt:pc)
			- HTML (_fmt:html)
			- Protobuf encoded in JSON (_fmt:jspb).

	This engine needs to allow images from the `data URLs`_ (prefixed with the
	``data:`` scheme).::

	Header set Content-Security-Policy "img-src 'self' data: ;"

	.. _Query Parameter Definitions:
	https://developers.google.com/custom-search/docs/xml_results#WebSearch_Query_Parameter_Definitions
	.. _data URLs:
	https://developer.mozilla.org/en-US/docs/Web/HTTP/Basics_of_HTTP/Data_URIs
	"""		"""

	from urllib.parse import urlencode, unquote		from urllib.parse import urlencode
	from lxml import html		from json import loads

	from searx import logger
	from searx.utils import (
	eval_xpath,
	eval_xpath_list,
	eval_xpath_getindex,
	extract_text,
	)

	from searx.engines.google import (		from searx.engines.google import (
	get_lang_info,		get_lang_info,
	@@ -35,13 +23,9 @@ from searx.engines.google import (
	)		)

	# pylint: disable=unused-import		# pylint: disable=unused-import
	from searx.engines.google import (		from searx.engines.google import supported_languages_url, _fetch_supported_languages
	supported_languages_url
	, _fetch_supported_languages
	)
	# pylint: enable=unused-import

	logger = logger.getChild('google images')		# pylint: enable=unused-import

	# about		# about
	about = {		about = {
	@@ -50,83 +34,52 @@ about = {
	"official_api_documentation": 'https://developers.google.com/custom-search',		"official_api_documentation": 'https://developers.google.com/custom-search',
	"use_official_api": False,		"use_official_api": False,
	"require_api_key": False,		"require_api_key": False,
	"results": 'HTML',		"results": 'JSON',
	}		}

	# engine dependent config		# engine dependent config
	categories = ['images']		categories = ['images', 'web']
	paging = False		paging = True
	use_locale_domain = True		use_locale_domain = True
	time_range_support = True		time_range_support = True
	safesearch = True		safesearch = True
			send_accept_language_header = True

	filter_mapping = {		filter_mapping = {0: 'images', 1: 'active', 2: 'active'}
	0: 'images',
	1: 'active',
	2: 'active'
	}


	def scrap_out_thumbs(dom):
	"""Scrap out thumbnail data from <script> tags.
	"""
	ret_val = {}
	for script in eval_xpath(dom, '//script[contains(., "_setImgSrc(")]'):
	_script = script.text
	# _setImgSrc('0','data:image\/jpeg;base64,\/9j\/4AAQSkZJR ....');
	_thumb_no, _img_data = _script[len("_setImgSrc("):-2].split(",", 1)
	_thumb_no = _thumb_no.replace("'", "")
	_img_data = _img_data.replace("'", "")
	_img_data = _img_data.replace(r"\/", r"/")
	ret_val[_thumb_no] = _img_data.replace(r"\x3d", "=")
	return ret_val


	def scrap_img_by_id(script, data_id):
	"""Get full image URL by data-id in parent element
	"""
	img_url = ''
	_script = script.split('\n')
	for i, line in enumerate(_script):
	if 'gstatic.com/images' in line and data_id in line and i + 1 < len(_script):
	url_line = _script[i + 1]
	img_url = url_line.split('"')[1]
	img_url = unquote(img_url.replace(r'\u00', r'%'))
	return img_url


	def request(query, params):		def request(query, params):
	"""Google-Video search request"""		"""Google-Image search request"""

	lang_info = get_lang_info(		lang_info = get_lang_info(params, supported_languages, language_aliases, False)
	# pylint: disable=undefined-variable
	params, supported_languages, language_aliases, False
	)

	query_url = 'https://' + lang_info['subdomain'] + '/search' + "?" + urlencode({		query_url = (
			'https://'
			+ lang_info['subdomain']
			+ '/search'
			+ "?"
			+ urlencode(
			{
	'q': query,		'q': query,
	'tbm': "isch",		'tbm': "isch",
	**lang_info['params'],		**lang_info['params'],
	'ie': "utf8",		'ie': "utf8",
	'oe': "utf8",		'oe': "utf8",
	'ucbcd': 1,		'asearch': 'isch',
	'num': 30,		'async': '_fmt:json,p:1,ijn:' + str(params['pageno']),
	})		}
			)
			)

	if params['time_range'] in time_range_dict:		if params['time_range'] in time_range_dict:
	query_url += '&' + urlencode({'tbs': 'qdr:' + time_range_dict[params['time_range']]})		query_url += '&' + urlencode({'tbs': 'qdr:' + time_range_dict[params['time_range']]})
	if params['safesearch']:		if params['safesearch']:
	query_url += '&' + urlencode({'safe': filter_mapping[params['safesearch']]})		query_url += '&' + urlencode({'safe': filter_mapping[params['safesearch']]})

	logger.debug("query_url --> %s", query_url)
	params['url'] = query_url		params['url'] = query_url

	logger.debug("HTTP header Accept-Language --> %s", lang_info.get('Accept-Language'))
	params['cookies']['CONSENT'] = "YES+"
	params['headers'].update(lang_info['headers'])		params['headers'].update(lang_info['headers'])
	params['headers']['Accept'] = (		params['headers']['User-Agent'] = 'NSTN/3.60.474802233.release Dalvik/2.1.0 (Linux; U; Android 12; US) gzip'
	'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'		params['headers']['Accept'] = '*/*'
	)
	return params		return params


	@@ -136,76 +89,34 @@ def response(resp):

	detect_google_sorry(resp)		detect_google_sorry(resp)

	# convert the text to dom		json_start = resp.text.find('{"ischj":')
	dom = html.fromstring(resp.text)		json_data = loads(resp.text[json_start:])
	img_bas64_map = scrap_out_thumbs(dom)
	img_src_script = eval_xpath_getindex(		for item in json_data["ischj"]["metadata"]:
	dom, '//script[contains(., "AF_initDataCallback({key: ")]', 1).text
			result_item = {
	# parse results		'url': item["result"]["referrer_url"],
	#		'title': item["result"]["page_title"],
	# root element::		'content': item["text_in_grid"]["snippet"],
	# <div id="islmp" ..>		'source': item["result"]["site_title"],
	# result div per image::		'img_format': f'{item["original_image"]["width"]} x {item["original_image"]["height"]}',
	# <div jsmodel="tTXmib"> / <div jsaction="..." data-id="..."		'img_src': item["original_image"]["url"],
	# The data-id matches to a item in a json-data structure in::		'thumbnail_src': item["thumbnail"]["url"],
	# <script nonce="I+vqelcy/01CKiBJi5Z1Ow">AF_initDataCallback({key: 'ds:1', ... data:function(){return [ ...		'template': 'images.html',
	# In this structure the link to the origin PNG, JPG or whatever is given		}
	# first link per image-div contains a <img> with the data-iid for bas64 encoded image data::
	# <img class="rg_i Q4LuWd" data-iid="0"		author = item["result"].get('iptc', {}).get('creator')
	# second link per image-div is the target link::		if author:
	# <a class="VFACy kGQAp" href="https://en.wikipedia.org/wiki/The_Sacrament_of_the_Last_Supper">		result_item['author'] = ', '.join(author)
	# the second link also contains two div tags with the description and publisher::
	# <div class="WGvvNb">The Sacrament of the Last Supper ...</div>		copyright_notice = item["result"].get('iptc', {}).get('copyright_notice')
	# <div class="fxgdke">en.wikipedia.org</div>		if copyright_notice:
			result_item['source'] += ' / ' + copyright_notice
	root = eval_xpath(dom, '//div[@id="islmp"]')
	if not root:		file_size = item.get('gsa', {}).get('file_size')
	logger.error("did not find root element id='islmp'")		if file_size:
	return results		result_item['source'] += ' (%s)' % file_size

	root = root[0]		results.append(result_item)
	for img_node in eval_xpath_list(root, './/img[contains(@class, "rg_i")]'):

	img_alt = eval_xpath_getindex(img_node, '@alt', 0)

	img_base64_id = eval_xpath(img_node, '@data-iid')
	if img_base64_id:
	img_base64_id = img_base64_id[0]
	thumbnail_src = img_bas64_map[img_base64_id]
	else:
	thumbnail_src = eval_xpath(img_node, '@src')
	if not thumbnail_src:
	thumbnail_src = eval_xpath(img_node, '@data-src')
	if thumbnail_src:
	thumbnail_src = thumbnail_src[0]
	else:
	thumbnail_src = ''

	link_node = eval_xpath_getindex(img_node, '../../../a[2]', 0)
	url = eval_xpath_getindex(link_node, '@href', 0)

	pub_nodes = eval_xpath(link_node, './div/div')
	pub_descr = img_alt
	pub_source = ''
	if pub_nodes:
	pub_descr = extract_text(pub_nodes[0])
	pub_source = extract_text(pub_nodes[1])

	img_src_id = eval_xpath_getindex(img_node, '../../../@data-id', 0)
	src_url = scrap_img_by_id(img_src_script, img_src_id)
	if not src_url:
	src_url = thumbnail_src

	results.append({
	'url': url,
	'title': img_alt,
	'content': pub_descr,
	'source': pub_source,
	'img_src': src_url,
	# 'img_format': img_format,
	'thumbnail_src': thumbnail_src,
	'template': 'images.html'
	})

	return results		return results