Merge pull request #1456 from return42/fix-1449 (39d0156f) · Commits · e / infra / spot

searx/engines/tineye.py

+153 −31

Original line number	Diff line number	Diff line
		@@ -17,6 +17,7 @@ billion images `[tineye.com] <https://tineye.com/how>`_.

		from urllib.parse import urlencode
		from datetime import datetime
		from flask_babel import gettext

		about = {
		"website": 'https://tineye.com',
		@@ -28,20 +29,41 @@ about = {
		}

		engine_type = 'online_url_search'
		""":py:obj:`searx.search.processors.online_url_search`"""

		categories = ['general']
		paging = True
		safesearch = False
		base_url = 'https://tineye.com'
		search_string = '/result_json/?page={page}&{query}'

		FORMAT_NOT_SUPPORTED = gettext(
		"Could not read that image url. This may be due to an unsupported file"
		" format. TinEye only supports images that are JPEG, PNG, GIF, BMP, TIFF or WebP."
		)
		"""TinEye error message"""

		NO_SIGNATURE_ERROR = gettext(
		"The image is too simple to find matches. TinEye requires a basic level of"
		" visual detail to successfully identify matches."
		)
		"""TinEye error message"""

		DOWNLOAD_ERROR = gettext("The image could not be downloaded.")
		"""TinEye error message"""


		def request(query, params):
		"""Build TinEye HTTP request using ``search_urls`` of a :py:obj:`engine_type`."""

		params['raise_for_httperror'] = False

		if params['search_urls']['data:image']:
		query = params['search_urls']['data:image']
		elif params['search_urls']['http']:
		query = params['search_urls']['http']

		logger.debug("query URL: %s", query)
		query = urlencode({'url': query})

		# see https://github.com/TinEye/pytineye/blob/main/pytineye/api.py
		@@ -59,45 +81,145 @@ def request(query, params):
		return params


		def parse_tineye_match(match_json):
		"""Takes parsed JSON from the API server and turns it into a :py:obj:`dict`
		object.

		Attributes `(class Match) <https://github.com/TinEye/pytineye/blob/main/pytineye/api.py>`__

		- `image_url`, link to the result image.
		- `domain`, domain this result was found on.
		- `score`, a number (0 to 100) that indicates how closely the images match.
		- `width`, image width in pixels.
		- `height`, image height in pixels.
		- `size`, image area in pixels.
		- `format`, image format.
		- `filesize`, image size in bytes.
		- `overlay`, overlay URL.
		- `tags`, whether this match belongs to a collection or stock domain.

		- `backlinks`, a list of Backlink objects pointing to the original websites
		and image URLs. List items are instances of :py:obj:`dict`, (`Backlink
		<https://github.com/TinEye/pytineye/blob/main/pytineye/api.py>`__):

		- `url`, the image URL to the image.
		- `backlink`, the original website URL.
		- `crawl_date`, the date the image was crawled.

		"""

		# HINT: there exists an alternative backlink dict in the domains list / e.g.::
		#
		# match_json['domains'][0]['backlinks']

		backlinks = []
		if "backlinks" in match_json:

		for backlink_json in match_json["backlinks"]:
		if not isinstance(backlink_json, dict):
		continue

		crawl_date = backlink_json.get("crawl_date")
		if crawl_date:
		crawl_date = datetime.fromisoformat(crawl_date[:-3])
		else:
		crawl_date = datetime.min

		backlinks.append(
		{
		'url': backlink_json.get("url"),
		'backlink': backlink_json.get("backlink"),
		'crawl_date': crawl_date,
		'image_name': backlink_json.get("image_name"),
		}
		)

		return {
		'image_url': match_json.get("image_url"),
		'domain': match_json.get("domain"),
		'score': match_json.get("score"),
		'width': match_json.get("width"),
		'height': match_json.get("height"),
		'size': match_json.get("size"),
		'image_format': match_json.get("format"),
		'filesize': match_json.get("filesize"),
		'overlay': match_json.get("overlay"),
		'tags': match_json.get("tags"),
		'backlinks': backlinks,
		}


		def response(resp):
		"""Parse HTTP response from TinEye."""
		results = []

		# Define wanted results
		try:
		json_data = resp.json()
		number_of_results = json_data['num_matches']

		for i in json_data['matches']:
		image_format = i['format']
		width = i['width']
		height = i['height']
		thumbnail_src = i['image_url']
		backlink = i['domains'][0]['backlinks'][0]
		url = backlink['backlink']
		source = backlink['url']
		title = backlink['image_name']
		img_src = backlink['url']

		# Get and convert published date
		api_date = backlink['crawl_date'][:-3]
		publishedDate = datetime.fromisoformat(api_date)

		# Append results
		except Exception as exc: # pylint: disable=broad-except
		msg = "can't parse JSON response // %s" % exc
		logger.error(msg)
		json_data = {'error': msg}

		# handle error codes from Tineye

		if resp.is_error:
		if resp.status_code in (400, 422):

		message = 'HTTP status: %s' % resp.status_code
		error = json_data.get('error')
		s_key = json_data.get('suggestions', {}).get('key', '')

		if error and s_key:
		message = "%s (%s)" % (error, s_key)
		elif error:
		message = error

		if s_key == "Invalid image URL":
		# test https://docs.searxng.org/_static/searxng-wordmark.svg
		message = FORMAT_NOT_SUPPORTED
		elif s_key == 'NO_SIGNATURE_ERROR':
		# test https://pngimg.com/uploads/dot/dot_PNG4.png
		message = NO_SIGNATURE_ERROR
		elif s_key == 'Download Error':
		# test https://notexists
		message = DOWNLOAD_ERROR

		# see https://github.com/searxng/searxng/pull/1456#issuecomment-1193105023
		# results.append({'answer': message})
		logger.error(message)

		return results

		resp.raise_for_status()

		# append results from matches

		for match_json in json_data['matches']:

		tineye_match = parse_tineye_match(match_json)
		if not tineye_match['backlinks']:
		continue

		backlink = tineye_match['backlinks'][0]
		results.append(
		{
		'template': 'images.html',
		'url': url,
		'thumbnail_src': thumbnail_src,
		'source': source,
		'title': title,
		'img_src': img_src,
		'format': image_format,
		'widht': width,
		'height': height,
		'publishedDate': publishedDate,
		'url': backlink['backlink'],
		'thumbnail_src': tineye_match['image_url'],
		'source': backlink['url'],
		'title': backlink['image_name'],
		'img_src': backlink['url'],
		'format': tineye_match['image_format'],
		'widht': tineye_match['width'],
		'height': tineye_match['height'],
		'publishedDate': backlink['crawl_date'],
		}
		)

		# Append number of results
		# append number of results

		number_of_results = json_data.get('num_matches')
		if number_of_results:
		results.append({'number_of_results': number_of_results})

		return results