Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit e9afc4f8 authored by Markus Heiser
Browse files

[mod] Startpage: reverse engineered & upgraded to data_type: traits_v1



One reason for the frequently seen CAPTCHAs on Startpage requests is the
incomplete requests SearXNG sends to startpage.com: this patch is a completely new
implementation of the ``request()`` function, reverse engineered from
Startpage's search form.  The new implementation:

- uses traits of data_type: traits_v1 and drops the deprecated data_type: supported_languages
- adds time-range support
- adds safe-search support
- fixes searxng/searxng/issues 1884
- fixes searxng/searxng/issues 1081 --> improvements to avoid CAPTCHA

In preparation for more categories (News, Images, Videos ..) from Startpage, the
variable ``startpage_categ`` was set up.  The default value is ``web`` and other
categories from Startpage are not yet implemented.

Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>
parent 858aa3e6
Loading
Loading
Loading
Loading
+1 −6
Original line number Original line Diff line number Diff line
@@ -10,9 +10,4 @@ Startpage engines
   :backlinks: entry
   :backlinks: entry


.. automodule:: searx.engines.startpage
.. automodule:: searx.engines.startpage

   :members:
Functions
=========

.. autofunction:: searx.engines.startpage.fetch_traits
.. autofunction:: searx.engines.startpage.get_sc_code
+3 −3
Original line number Original line Diff line number Diff line
@@ -109,9 +109,9 @@ def seznam(query, _lang):
    ]
    ]




def startpage(query, lang):
def startpage(query, sxng_locale):
    # startpage autocompleter
    """Autocomplete from Startpage. Supports Startpage's languages"""
    lui = engines['startpage'].supported_languages.get(lang, 'english')  # vintage / deprecated
    lui = engines['startpage'].traits.get_language(sxng_locale, 'english')
    url = 'https://startpage.com/suggestions?{query}'
    url = 'https://startpage.com/suggestions?{query}'
    resp = get(url.format(query=urlencode({'q': query, 'segment': 'startpage.udog', 'lui': lui})))
    resp = get(url.format(query=urlencode({'q': query, 'segment': 'startpage.udog', 'lui': lui})))
    data = resp.json()
    data = resp.json()
+2 −252
Original line number Original line Diff line number Diff line
@@ -3078,7 +3078,7 @@
  "startpage": {
  "startpage": {
    "all_locale": null,
    "all_locale": null,
    "custom": {},
    "custom": {},
    "data_type": "supported_languages",
    "data_type": "traits_v1",
    "languages": {
    "languages": {
      "af": "afrikaans",
      "af": "afrikaans",
      "am": "amharic",
      "am": "amharic",
@@ -3213,257 +3213,7 @@
      "zh-HK": "zh-TW_HK",
      "zh-HK": "zh-TW_HK",
      "zh-TW": "zh-TW_TW"
      "zh-TW": "zh-TW_TW"
    },
    },
    "supported_languages": {
    "supported_languages": {}
      "af": {
        "alias": "afrikaans"
      },
      "am": {
        "alias": "amharic"
      },
      "ar": {
        "alias": "arabic"
      },
      "az": {
        "alias": "azerbaijani"
      },
      "be": {
        "alias": "belarusian"
      },
      "bg": {
        "alias": "bulgarian"
      },
      "bn": {
        "alias": "bengali"
      },
      "bs": {
        "alias": "bosnian"
      },
      "ca": {
        "alias": "catalan"
      },
      "cs": {
        "alias": "czech"
      },
      "cy": {
        "alias": "welsh"
      },
      "da": {
        "alias": "dansk"
      },
      "de": {
        "alias": "deutsch"
      },
      "el": {
        "alias": "greek"
      },
      "en": {
        "alias": "english"
      },
      "en-GB": {
        "alias": "english_uk"
      },
      "eo": {
        "alias": "esperanto"
      },
      "es": {
        "alias": "espanol"
      },
      "et": {
        "alias": "estonian"
      },
      "eu": {
        "alias": "basque"
      },
      "fa": {
        "alias": "persian"
      },
      "fi": {
        "alias": "suomi"
      },
      "fo": {
        "alias": "faroese"
      },
      "fr": {
        "alias": "francais"
      },
      "fy": {
        "alias": "frisian"
      },
      "ga": {
        "alias": "irish"
      },
      "gd": {
        "alias": "gaelic"
      },
      "gl": {
        "alias": "galician"
      },
      "gu": {
        "alias": "gujarati"
      },
      "he": {
        "alias": "hebrew"
      },
      "hi": {
        "alias": "hindi"
      },
      "hr": {
        "alias": "croatian"
      },
      "hu": {
        "alias": "hungarian"
      },
      "ia": {
        "alias": "interlingua"
      },
      "id": {
        "alias": "indonesian"
      },
      "is": {
        "alias": "icelandic"
      },
      "it": {
        "alias": "italiano"
      },
      "ja": {
        "alias": "nihongo"
      },
      "jv": {
        "alias": "javanese"
      },
      "ka": {
        "alias": "georgian"
      },
      "kn": {
        "alias": "kannada"
      },
      "ko": {
        "alias": "hangul"
      },
      "la": {
        "alias": "latin"
      },
      "lt": {
        "alias": "lithuanian"
      },
      "lv": {
        "alias": "latvian"
      },
      "mai": {
        "alias": "bihari"
      },
      "mk": {
        "alias": "macedonian"
      },
      "ml": {
        "alias": "malayalam"
      },
      "mr": {
        "alias": "marathi"
      },
      "ms": {
        "alias": "malay"
      },
      "mt": {
        "alias": "maltese"
      },
      "ne": {
        "alias": "nepali"
      },
      "nl": {
        "alias": "nederlands"
      },
      "no": {
        "alias": "norsk"
      },
      "oc": {
        "alias": "occitan"
      },
      "pa": {
        "alias": "punjabi"
      },
      "pl": {
        "alias": "polski"
      },
      "pt": {
        "alias": "portugues"
      },
      "ro": {
        "alias": "romanian"
      },
      "ru": {
        "alias": "russian"
      },
      "si": {
        "alias": "sinhalese"
      },
      "sk": {
        "alias": "slovak"
      },
      "sl": {
        "alias": "slovenian"
      },
      "sq": {
        "alias": "albanian"
      },
      "sr": {
        "alias": "serbian"
      },
      "su": {
        "alias": "sudanese"
      },
      "sv": {
        "alias": "svenska"
      },
      "sw": {
        "alias": "swahili"
      },
      "ta": {
        "alias": "tamil"
      },
      "te": {
        "alias": "telugu"
      },
      "th": {
        "alias": "thai"
      },
      "ti": {
        "alias": "tigrinya"
      },
      "tl": {
        "alias": "tagalog"
      },
      "tr": {
        "alias": "turkce"
      },
      "uk": {
        "alias": "ukrainian"
      },
      "ur": {
        "alias": "urdu"
      },
      "uz": {
        "alias": "uzbek"
      },
      "vi": {
        "alias": "vietnamese"
      },
      "xh": {
        "alias": "xhosa"
      },
      "zh": {
        "alias": "jiantizhongwen"
      },
      "zh-HK": {
        "alias": "fantizhengwen"
      },
      "zh-TW": {
        "alias": "fantizhengwen"
      },
      "zu": {
        "alias": "zulu"
      }
    }
  },
  },
  "wikidata": {
  "wikidata": {
    "all_locale": null,
    "all_locale": null,
+213 −150
Original line number Original line Diff line number Diff line
@@ -50,38 +50,58 @@ W3C recommends subtag over macrolanguage [2]_.
Startpage languages
Startpage languages
===================
===================


The displayed name in Startpage's settings page depend on the location of the IP
:py:obj:`send_accept_language_header`:
when the 'Accept-Language' HTTP header is unset (in the language update script
  The displayed name in Startpage's settings page depend on the location of the
we use "en-US,en;q=0.5" to get uniform names independent from the IP).
  IP when ``Accept-Language`` HTTP header is unset.  In :py:obj:`fetch_traits`
  we use::


Each option has a displayed name and a value, either of which may represent the
    'Accept-Language': "en-US,en;q=0.5",
language name in the native script, the language name in English, an English
    ..
transliteration of the native name, the English name of the writing script used

by the language, or occasionally something else entirely.
  to get uniform names independent from the IP).

.. _startpage categories:

Startpage categories
====================

Startpage's category (for Web-search, News, Videos, ..) is set by
:py:obj:`startpage_categ` in  settings.yml::

  - name: startpage
    engine: startpage
    startpage_categ: web
    ...

.. hint::

   The default category is ``web`` .. and other categories than ``web`` are not
   yet implemented.


"""
"""


from typing import TYPE_CHECKING
from collections import OrderedDict
import re
import re
from time import time

from urllib.parse import urlencode
from unicodedata import normalize, combining
from unicodedata import normalize, combining
from time import time
from datetime import datetime, timedelta
from datetime import datetime, timedelta


from dateutil import parser
import dateutil.parser
from lxml import html
import lxml.html
from babel import Locale
import babel
from babel.localedata import locale_identifiers


from searx import network
from searx import network
from searx.utils import extract_text, eval_xpath, match_language
from searx.utils import extract_text, eval_xpath, gen_useragent
from searx.exceptions import (
from searx.exceptions import SearxEngineCaptchaException
    SearxEngineResponseException,
from searx.locales import region_tag
    SearxEngineCaptchaException,
)

from searx.enginelib.traits import EngineTraits
from searx.enginelib.traits import EngineTraits


if TYPE_CHECKING:
    import logging

    logger: logging.Logger

traits: EngineTraits
traits: EngineTraits


# about
# about
@@ -94,18 +114,28 @@ about = {
    "results": 'HTML',
    "results": 'HTML',
}
}


startpage_categ = 'web'
"""Startpage's category, visit :ref:`startpage categories`.
"""

send_accept_language_header = True
"""Startpage tries to guess user's language and territory from the HTTP
``Accept-Language``.  Optional the user can select a search-language (can be
different to the UI language) and a region filter.
"""

# engine dependent config
# engine dependent config
categories = ['general', 'web']
categories = ['general', 'web']
# there is a mechanism to block "bot" search
# (probably the parameter qid), require
# storing of qid's between mulitble search-calls

paging = True
paging = True
supported_languages_url = 'https://www.startpage.com/do/settings'
time_range_support = True
safesearch = True

time_range_dict = {'day': 'd', 'week': 'w', 'month': 'm', 'year': 'y'}
safesearch_dict = {0: '0', 1: '1', 2: '1'}


# search-url
# search-url
base_url = 'https://startpage.com/'
base_url = 'https://www.startpage.com'
search_url = base_url + 'sp/search?'
search_url = base_url + '/sp/search'


# specific xpath variables
# specific xpath variables
# ads xpath //div[@id="results"]/div[@id="sponsored"]//div[@class="result"]
# ads xpath //div[@id="results"]/div[@id="sponsored"]//div[@class="result"]
@@ -113,92 +143,193 @@ search_url = base_url + 'sp/search?'
results_xpath = '//div[@class="w-gl__result__main"]'
results_xpath = '//div[@class="w-gl__result__main"]'
link_xpath = './/a[@class="w-gl__result-title result-link"]'
link_xpath = './/a[@class="w-gl__result-title result-link"]'
content_xpath = './/p[@class="w-gl__description"]'
content_xpath = './/p[@class="w-gl__description"]'
search_form_xpath = '//form[@id="search"]'
"""XPath of Startpage's origin search form

.. code: html

    <form action="/sp/search" method="post">
      <input type="text" name="query"  value="" ..>
      <input type="hidden" name="t" value="device">
      <input type="hidden" name="lui" value="english">
      <input type="hidden" name="sc" value="Q7Mt5TRqowKB00">
      <input type="hidden" name="cat" value="web">
      <input type="hidden" class="abp" id="abp-input" name="abp" value="1">
    </form>
"""


# timestamp of the last fetch of 'sc' code
# timestamp of the last fetch of 'sc' code
sc_code_ts = 0
sc_code_ts = 0
sc_code = ''
sc_code = ''
sc_code_cache_sec = 30
"""Time in seconds the sc-code is cached in memory :py:obj:`get_sc_code`."""




def raise_captcha(resp):
def get_sc_code(searxng_locale, params):
    """Get an actual ``sc`` argument from Startpage's search form (HTML page).


    if str(resp.url).startswith('https://www.startpage.com/sp/captcha'):
    Startpage puts a ``sc`` argument on every HTML :py:obj:`search form
        raise SearxEngineCaptchaException()
    <search_form_xpath>`.  Without this argument Startpage considers the request
    is from a bot.  We do not know what is encoded in the value of the ``sc``
    argument, but it seems to be a kind of a *time-stamp*.


    Startpage's search form generates a new sc-code on each request.  This
    function scrap a new sc-code from Startpage's home page every
    :py:obj:`sc_code_cache_sec` seconds.


def get_sc_code(headers):
    """
    """Get an actual ``sc`` argument from Startpage's home page.


    Startpage puts a ``sc`` argument on every link.  Without this argument
    global sc_code_ts, sc_code  # pylint: disable=global-statement
    Startpage considers the request is from a bot.  We do not know what is
    encoded in the value of the ``sc`` argument, but it seems to be a kind of a
    *time-stamp*.  This *time-stamp* is valid for a few hours.


    This function scrap a new *time-stamp* from startpage's home page every hour
    if sc_code and (time() < (sc_code_ts + sc_code_cache_sec)):
    (3000 sec).
        logger.debug("get_sc_code: reuse '%s'", sc_code)
        return sc_code


    """
    headers = {**params['headers']}
    headers['Origin'] = base_url
    headers['Referer'] = base_url + '/'
    # headers['Connection'] = 'keep-alive'
    # headers['Accept-Encoding'] = 'gzip, deflate, br'
    # headers['Accept'] = 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8'
    # headers['User-Agent'] = 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:105.0) Gecko/20100101 Firefox/105.0'

    # add Accept-Language header
    if searxng_locale == 'all':
        searxng_locale = 'en-US'
    locale = babel.Locale.parse(searxng_locale, sep='-')

    if send_accept_language_header:
        ac_lang = locale.language
        if locale.territory:
            ac_lang = "%s-%s,%s;q=0.9,*;q=0.5" % (
                locale.language,
                locale.territory,
                locale.language,
            )
        headers['Accept-Language'] = ac_lang


    global sc_code_ts, sc_code  # pylint: disable=global-statement
    get_sc_url = base_url + '/?sc=%s' % (sc_code)
    logger.debug("query new sc time-stamp ... %s", get_sc_url)
    logger.debug("headers: %s", headers)
    resp = network.get(get_sc_url, headers=headers)


    if time() > (sc_code_ts + 3000):
    # ?? x = network.get('https://www.startpage.com/sp/cdn/images/filter-chevron.svg', headers=headers)
        logger.debug("query new sc time-stamp ...")
    # ?? https://www.startpage.com/sp/cdn/images/filter-chevron.svg
    # ?? ping-back URL: https://www.startpage.com/sp/pb?sc=TLsB0oITjZ8F21


        resp = network.get(base_url, headers=headers)
    if str(resp.url).startswith('https://www.startpage.com/sp/captcha'):
        raise_captcha(resp)
        raise SearxEngineCaptchaException(
        dom = html.fromstring(resp.text)
            message="get_sc_code: got redirected to https://www.startpage.com/sp/captcha",
        )

    dom = lxml.html.fromstring(resp.text)


    try:
    try:
            # <input type="hidden" name="sc" value="...">
        sc_code = eval_xpath(dom, search_form_xpath + '//input[@name="sc"]/@value')[0]
            sc_code = eval_xpath(dom, '//input[@name="sc"]/@value')[0]
    except IndexError as exc:
    except IndexError as exc:
            # suspend startpage API --> https://github.com/searxng/searxng/pull/695
        logger.debug("suspend startpage API --> https://github.com/searxng/searxng/pull/695")
            raise SearxEngineResponseException(
        raise SearxEngineCaptchaException(
                suspended_time=7 * 24 * 3600, message="PR-695: query new sc time-stamp failed!"
            message="get_sc_code: [PR-695] query new sc time-stamp failed! (%s)" % resp.url,
        ) from exc
        ) from exc


    sc_code_ts = time()
    sc_code_ts = time()
        logger.debug("new value is: %s", sc_code)
    logger.debug("get_sc_code: new value is: %s", sc_code)

    return sc_code
    return sc_code




# do search-request
def request(query, params):
def request(query, params):
    """Assemble a Startpage request.


    # pylint: disable=line-too-long
    To avoid CAPTCHA we need to send a well formed HTTP POST request with a
    # The format string from Startpage's FFox add-on [1]::
    cookie.  We need to form a request that is identical to the request build by
    #
    Startpage's search form:
    #     https://www.startpage.com/do/dsearch?query={searchTerms}&cat=web&pl=ext-ff&language=__MSG_extensionUrlLanguage__&extVersion=1.3.0
    #
    # [1] https://addons.mozilla.org/en-US/firefox/addon/startpage-private-search/


    - in the cookie the **region** is selected
    - in the HTTP POST data the **language** is selected

    Additionally the arguments form Startpage's search form needs to be set in
    HTML POST data / compare ``<input>`` elements: :py:obj:`search_form_xpath`.
    """
    if startpage_categ == 'web':
        return _request_cat_web(query, params)

    logger.error("Startpages's category '%' is not yet implemented.", startpage_categ)
    return params


def _request_cat_web(query, params):

    engine_region = traits.get_region(params['searxng_locale'], 'en-US')
    engine_language = traits.get_language(params['searxng_locale'], 'en')

    # build arguments
    args = {
    args = {
        'query': query,
        'query': query,
        'page': params['pageno'],
        'cat': 'web',
        'cat': 'web',
        # 'pl': 'ext-ff',
        't': 'device',
        # 'extVersion': '1.3.0',
        'sc': get_sc_code(params['searxng_locale'], params),  # hint: this func needs HTTP headers,
        # 'abp': "-1",
        'with_date': time_range_dict.get(params['time_range'], ''),
        'sc': get_sc_code(params['headers']),
    }
    }


    # set language if specified
    if engine_language:
    if params['language'] != 'all':
        args['language'] = engine_language
        lang_code = match_language(params['language'], supported_languages, fallback=None)
        args['lui'] = engine_language
        if lang_code:

            language_name = supported_languages[lang_code]['alias']
    args['abp'] = '1'
            args['language'] = language_name
    if params['pageno'] > 1:
            args['lui'] = language_name
        args['page'] = params['pageno']

    # build cookie
    lang_homepage = 'en'
    cookie = OrderedDict()
    cookie['date_time'] = 'world'
    cookie['disable_family_filter'] = safesearch_dict[params['safesearch']]
    cookie['disable_open_in_new_window'] = '0'
    cookie['enable_post_method'] = '1'  # hint: POST
    cookie['enable_proxy_safety_suggest'] = '1'
    cookie['enable_stay_control'] = '1'
    cookie['instant_answers'] = '1'
    cookie['lang_homepage'] = 's/device/%s/' % lang_homepage
    cookie['num_of_results'] = '10'
    cookie['suggestions'] = '1'
    cookie['wt_unit'] = 'celsius'

    if engine_language:
        cookie['language'] = engine_language
        cookie['language_ui'] = engine_language

    if engine_region:
        cookie['search_results_region'] = engine_region

    params['cookies']['preferences'] = 'N1N'.join(["%sEEE%s" % x for x in cookie.items()])
    logger.debug('cookie preferences: %s', params['cookies']['preferences'])

    # POST request
    logger.debug("data: %s", args)
    params['data'] = args
    params['method'] = 'POST'
    params['url'] = search_url
    params['headers']['Origin'] = base_url
    params['headers']['Referer'] = base_url + '/'
    # is the Accept header needed?
    # params['headers']['Accept'] = 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'


    params['url'] = search_url + urlencode(args)
    return params
    return params




# get response from search-request
# get response from search-request
def response(resp):
def response(resp):
    results = []
    dom = lxml.html.fromstring(resp.text)


    dom = html.fromstring(resp.text)
    if startpage_categ == 'web':
        return _response_cat_web(dom)

    logger.error("Startpages's category '%' is not yet implemented.", startpage_categ)
    return []


def _response_cat_web(dom):
    results = []


    # parse results
    # parse results
    for result in eval_xpath(dom, results_xpath):
    for result in eval_xpath(dom, results_xpath):
@@ -233,7 +364,7 @@ def response(resp):
            content = content[date_pos:]
            content = content[date_pos:]


            try:
            try:
                published_date = parser.parse(date_string, dayfirst=True)
                published_date = dateutil.parser.parse(date_string, dayfirst=True)
            except ValueError:
            except ValueError:
                pass
                pass


@@ -259,78 +390,10 @@ def response(resp):
    return results
    return results




# get supported languages from their site
def _fetch_supported_languages(resp):
    # startpage's language selector is a mess each option has a displayed name
    # and a value, either of which may represent the language name in the native
    # script, the language name in English, an English transliteration of the
    # native name, the English name of the writing script used by the language,
    # or occasionally something else entirely.

    # this cases are so special they need to be hardcoded, a couple of them are misspellings
    language_names = {
        'english_uk': 'en-GB',
        'fantizhengwen': ['zh-TW', 'zh-HK'],
        'hangul': 'ko',
        'malayam': 'ml',
        'norsk': 'nb',
        'sinhalese': 'si',
        'sudanese': 'su',
    }

    # get the English name of every language known by babel
    language_names.update(
        {
            # fmt: off
            name.lower(): lang_code
            # pylint: disable=protected-access
            for lang_code, name in Locale('en')._data['languages'].items()
            # fmt: on
        }
    )

    # get the native name of every language known by babel
    for lang_code in filter(lambda lang_code: lang_code.find('_') == -1, locale_identifiers()):
        native_name = Locale(lang_code).get_language_name().lower()
        # add native name exactly as it is
        language_names[native_name] = lang_code

        # add "normalized" language name (i.e. français becomes francais and español becomes espanol)
        unaccented_name = ''.join(filter(lambda c: not combining(c), normalize('NFKD', native_name)))
        if len(unaccented_name) == len(unaccented_name.encode()):
            # add only if result is ascii (otherwise "normalization" didn't work)
            language_names[unaccented_name] = lang_code

    dom = html.fromstring(resp.text)
    sp_lang_names = []
    for option in dom.xpath('//form[@name="settings"]//select[@name="language"]/option'):
        sp_lang_names.append((option.get('value'), extract_text(option).lower()))

    supported_languages = {}
    for sp_option_value, sp_option_text in sp_lang_names:
        lang_code = language_names.get(sp_option_value) or language_names.get(sp_option_text)
        if isinstance(lang_code, str):
            supported_languages[lang_code] = {'alias': sp_option_value}
        elif isinstance(lang_code, list):
            for _lc in lang_code:
                supported_languages[_lc] = {'alias': sp_option_value}
        else:
            print('Unknown language option in Startpage: {} ({})'.format(sp_option_value, sp_option_text))

    return supported_languages


def fetch_traits(engine_traits: EngineTraits):
def fetch_traits(engine_traits: EngineTraits):
    """Fetch :ref:`languages <startpage languages>` and :ref:`regions <startpage
    """Fetch :ref:`languages <startpage languages>` and :ref:`regions <startpage
    regions>` from Startpage."""
    regions>` from Startpage."""
    # pylint: disable=import-outside-toplevel, too-many-locals, too-many-branches
    # pylint: disable=too-many-branches
    # pylint: disable=too-many-statements

    engine_traits.data_type = 'supported_languages'  # deprecated

    import babel
    from searx.utils import gen_useragent
    from searx.locales import region_tag


    headers = {
    headers = {
        'User-Agent': gen_useragent(),
        'User-Agent': gen_useragent(),
@@ -341,7 +404,7 @@ def fetch_traits(engine_traits: EngineTraits):
    if not resp.ok:
    if not resp.ok:
        print("ERROR: response from Startpage is not OK.")
        print("ERROR: response from Startpage is not OK.")


    dom = html.fromstring(resp.text)
    dom = lxml.html.fromstring(resp.text)


    # regions
    # regions