[mod] implement brave (WEB) engine to replace XPath configuration (460bbe5b) · Commits · e / infra / spot

docs/dev/engines/online/brave.rst

0 → 100644

+13 −0

Original line number	Diff line number	Diff line
		.. _brave engine:

		=============
		Brave Engines
		=============

		.. contents:: Contents
		:depth: 2
		:local:
		:backlinks: entry

		.. automodule:: searx.engines.brave
		:members:

searx/engines/brave.py

+236 −38

Original line number	Diff line number	Diff line
		# SPDX-License-Identifier: AGPL-3.0-or-later
		# lint: pylint
		"""Brave supports the categories listed in :py:obj:`brave_category` (General,
		news, videos, images). The support of :py:obj:`paging` and :py:obj:`time range
		<time_range_support>` is limited (see remarks).

		Configured ``brave`` engines:

		.. code:: yaml

		- name: brave
		engine: brave
		...
		brave_category: search
		time_range_support: true
		paging: true

		- name: brave.images
		engine: brave
		...
		brave_category: images

		- name: brave.videos
		engine: brave
		...
		brave_category: videos

		- name: brave.news
		engine: brave
		...
		brave_category: news


		Implementations
		===============

		"""
		Brave (General, news, videos, images)
		"""
		# pylint: disable=fixme

		from urllib.parse import (
		urlencode,
		urlparse,
		parse_qs,
		)

		from urllib.parse import urlencode
		import chompjs
		from lxml import html

		from searx.utils import (
		extract_text,
		eval_xpath_list,
		eval_xpath_getindex,
		)

		about = {
		"website": 'https://search.brave.com/',
		@@ -14,41 +60,87 @@ about = {
		"require_api_key": False,
		"results": 'HTML',
		}

		base_url = "https://search.brave.com/"
		categories = []
		brave_category = 'search'
		"""Brave supports common web-search, video search, image and video search.

		- ``search``: Common WEB search
		- ``videos``: search for videos
		- ``images``: search for images
		- ``news``: search for news
		"""

		brave_spellcheck = False
		"""Brave supports some kind of spell checking. When activated, Brave tries to
		fix typos, e.g. it searches for ``food`` when the user queries for ``fooh``. In
		the UI of Brave the user gets warned about this, since we can not warn the user
		in SearXNG, the spellchecking is disabled by default.
		"""

		send_accept_language_header = True
		paging = False
		categories = ['images', 'videos', 'news'] # images, videos, news
		"""Brave only supports paging in :py:obj:`brave_category` ``search`` (UI
		category All)."""

		safesearch = True
		safesearch_map = {2: 'strict', 1: 'moderate', 0: 'off'} # cookie: safesearch=off

		time_range_support = False
		"""Brave only supports time-range in :py:obj:`brave_category` ``search`` (UI
		category All)."""

		time_range_map = {
		'day': 'pd',
		'week': 'pw',
		'month': 'pm',
		'year': 'py',
		}


		def request(query, params):

		# Don't accept br encoding / see https://github.com/searxng/searxng/pull/1787
		params['headers']['Accept-Encoding'] = 'gzip, deflate'

		args = {
		'q': query,
		'spellcheck': 1,
		}
		params["url"] = f"{base_url}{categories[0]}?{urlencode(args)}"
		if brave_spellcheck:
		args['spellcheck'] = '1'

		if brave_category == 'search':
		if params.get('pageno', 1) - 1:
		args['offset'] = params.get('pageno', 1) - 1
		if time_range_map.get(params['time_range']):
		args['tf'] = time_range_map.get(params['time_range'])

		def get_video_results(json_data):
		results = []
		params["url"] = f"{base_url}{brave_category}?{urlencode(args)}"

		for result in json_data:
		results.append(
		{
		'template': 'videos.html',
		'url': result['url'],
		'thumbnail_src': result['thumbnail']['src'],
		'img_src': result['properties']['url'],
		'content': result['description'],
		'title': result['title'],
		'source': result['source'],
		'duration': result['video']['duration'],
		}
		)
		# set preferences in cookie
		params['cookies']['safesearch'] = safesearch_map.get(params['safesearch'], 'off')

		return results
		# ToDo: we need a fetch_traits(..) implementation / the ui_lang of Brave are
		# limited and the country handling has it quirks

		eng_locale = params.get('searxng_locale')
		params['cookies']['useLocation'] = '0' # the useLocation is IP based, we use 'country'
		params['cookies']['summarizer'] = '0'

		if not eng_locale or eng_locale == 'all':
		params['cookies']['country'] = 'all' # country=all
		else:
		params['cookies']['country'] = eng_locale.split('-')[-1].lower()
		params['cookies']['ui_lang'] = eng_locale.split('-')[0].lower()

		# logger.debug("cookies %s", params['cookies'])


		def response(resp):
		results = []

		if brave_category == 'search':
		return _parse_search(resp)

		datastr = ""
		for line in resp.text.split("\n"):
		@@ -57,29 +149,135 @@ def response(resp):
		break

		json_data = chompjs.parse_js_object(datastr)

		json_resp = json_data[1]['data']['body']['response']
		if categories[0] == 'news':

		if brave_category == 'news':
		json_resp = json_resp['news']
		return _parse_news(json_resp)

		if brave_category == 'images':
		return _parse_images(json_resp)
		if brave_category == 'videos':
		return _parse_videos(json_resp)

		return []


		def _parse_search(resp):

		result_list = []
		dom = html.fromstring(resp.text)

		answer_tag = eval_xpath_getindex(dom, '//div[@class="answer"]', 0, default=None)
		if answer_tag:
		result_list.append({'answer': extract_text(answer_tag)})

		# xpath_results = '//div[contains(@class, "snippet fdb") and @data-type="web"]'
		xpath_results = '//div[contains(@class, "snippet")]'

		for result in eval_xpath_list(dom, xpath_results):

		url = eval_xpath_getindex(result, './/a[@class="result-header"]/@href', 0, default=None)
		title_tag = eval_xpath_getindex(result, './/span[@class="snippet-title"]', 0, default=None)
		if not (url and title_tag):
		continue

		content_tag = eval_xpath_getindex(result, './/p[@class="snippet-description"]', 0, default='')
		img_src = eval_xpath_getindex(result, './/img[@class="thumb"]/@src', 0, default='')

		item = {
		'url': url,
		'title': extract_text(title_tag),
		'content': extract_text(content_tag),
		'img_src': img_src,
		}

		video_tag = eval_xpath_getindex(
		result, './/div[contains(@class, "video-snippet") and @data-macro="video"]', 0, default=None
		)
		if video_tag:

		# In my tests a video tag in the WEB search was mostoften not a
		# video, except the ones from youtube ..

		iframe_src = _get_iframe_src(url)
		if iframe_src:
		item['iframe_src'] = iframe_src
		item['template'] = 'videos.html'
		item['thumbnail'] = eval_xpath_getindex(video_tag, './/img/@src', 0, default='')
		else:
		item['img_src'] = eval_xpath_getindex(video_tag, './/img/@src', 0, default='')

		result_list.append(item)

		return result_list


		def _get_iframe_src(url):
		parsed_url = urlparse(url)
		if parsed_url.path == '/watch' and parsed_url.query:
		video_id = parse_qs(parsed_url.query).get('v', []) # type: ignore
		if video_id:
		return 'https://www.youtube-nocookie.com/embed/' + video_id[0] # type: ignore
		return None


		def _parse_news(json_resp):
		result_list = []

		for result in json_resp["results"]:
		item = {
		'url': result['url'],
		'title': result['title'],
		'content': result['description'],
		}
		if result['thumbnail'] != "null":
		item['img_src'] = result['thumbnail']['src']
		result_list.append(item)

		return result_list


		def _parse_images(json_resp):
		result_list = []

		for result in json_resp["results"]:
		item = {
		'url': result['url'],
		'title': result['title'],
		'content': result['description'],
		'template': 'images.html',
		'img_format': result['properties']['format'],
		'source': result['source'],
		'img_src': result['properties']['url'],
		}
		result_list.append(item)

		return result_list


		def _parse_videos(json_resp):
		result_list = []

		for result in json_resp["results"]:

		url = result['url']
		item = {
		'url': url,
		'title': result['title'],
		'content': result['description'],
		'template': 'videos.html',
		'length': result['video']['duration'],
		'duration': result['video']['duration'],
		}

		if result['thumbnail'] != "null":
		item['thumbnail'] = result['thumbnail']['src']

		if categories[0] == 'images':
		item['template'] = 'images.html'
		item['img_format'] = result['properties']['format']
		item['source'] = result['source']
		item['img_src'] = result['properties']['url']
		elif categories[0] == 'videos':
		item['template'] = 'videos.html'
		item['length'] = result['video']['duration']
		iframe_src = _get_iframe_src(url)
		if iframe_src:
		item['iframe_src'] = iframe_src

		results.append(item)
		result_list.append(item)

		return results
		return result_list

searx/settings.yml

+16 −32

Original line number	Diff line number	Diff line
		@@ -1816,50 +1816,34 @@ engines:
		timeout: 9.0

		- name: brave
		shortcut: brave
		engine: xpath
		paging: true
		engine: brave
		shortcut: br
		time_range_support: true
		first_page_num: 0
		time_range_url: "&tf={time_range_val}"
		search_url: https://search.brave.com/search?q={query}&offset={pageno}&spellcheck=1{time_range}
		url_xpath: //a[@class="result-header"]/@href
		title_xpath: //span[@class="snippet-title"]
		content_xpath: //p[1][@class="snippet-description"]
		suggestion_xpath: //div[@class="text-gray h6"]/a
		time_range_map:
		day: 'pd'
		week: 'pw'
		month: 'pm'
		year: 'py'
		paging: true
		categories: [general, web]
		disabled: true
		headers:
		Accept-Encoding: gzip, deflate
		about:
		website: https://brave.com/search/
		wikidata_id: Q107355971
		use_official_api: false
		require_api_key: false
		results: HTML
		brave_category: search
		# brave_spellcheck: true

		- name: brave.images
		shortcut: braveimg
		engine: brave
		categories: images
		disabled: true
		network: brave
		shortcut: brimg
		categories: [images, web]
		brave_category: images

		- name: brave.videos
		shortcut: bravevid
		engine: brave
		categories: videos
		disabled: true
		network: brave
		shortcut: brvid
		categories: [videos, web]
		brave_category: videos

		- name: brave.news
		shortcut: bravenews
		engine: brave
		network: brave
		shortcut: brnews
		categories: news
		disabled: true
		brave_category: news

		- name: petalsearch
		shortcut: pts