fix ddg (d0177917) · Commits · e / infra / spot

requirements-dev.txt

+0 −1

Original line number	Original line	Diff line number	Diff line
	@@ -19,5 +19,4 @@ sphinx-autobuild==2021.3.14
	linuxdoc==20211220		linuxdoc==20211220
	aiounittest==1.4.1		aiounittest==1.4.1
	numexpr==2.8.1		numexpr==2.8.1
	werkzeug==2.0.3
	wrapt-timeout-decorator==1.3.8		wrapt-timeout-decorator==1.3.8

requirements.txt

+0 −1

Original line number	Original line	Diff line number	Diff line
	@@ -19,7 +19,6 @@ setproctitle==1.3.1
	redis==3.4.1		redis==3.4.1
	ring==0.7.3		ring==0.7.3
	numexpr==2.8.1		numexpr==2.8.1
	werkzeug==2.0.3
	wrapt-timeout-decorator==1.3.8		wrapt-timeout-decorator==1.3.8
	pyyaml==6.0		pyyaml==6.0
	requests [socks]==2.28.1		requests [socks]==2.28.1

searx/engines/duckduckgo.py

+28 −48

Original line number	Original line	Diff line number	Diff line
	@@ -13,7 +13,7 @@ from searx.network import get
	logger = logger.getChild('ddg engine')		logger = logger.getChild('ddg engine')
	# about		# about
	about = {		about = {
	"website": 'https://lite.duckduckgo.com/lite',		"website": 'https://duckduckgo.com/',
	"wikidata_id": 'Q12805',		"wikidata_id": 'Q12805',
	"official_api_documentation": 'https://duckduckgo.com/api',		"official_api_documentation": 'https://duckduckgo.com/api',
	"use_official_api": False,		"use_official_api": False,
	@@ -22,11 +22,13 @@ about = {
	}		}

	# engine dependent config		# engine dependent config
	categories = ['general', 'web']		categories = ['general']
	paging = True		paging = True
	supported_languages_url = 'https://duckduckgo.com/util/u588.js'		supported_languages_url = 'https://duckduckgo.com/util/u172.js'
			number_of_results = 10
	time_range_support = True		time_range_support = True
			safesearch = True
			VQD_REGEX = r"vqd='(\d+-\d+-\d+)'"
	language_aliases = {		language_aliases = {
	'ca-ES': 'ct-ca',		'ca-ES': 'ct-ca',
	'de-AT': 'de-de',		'de-AT': 'de-de',
	@@ -43,14 +45,16 @@ language_aliases = {
	'ko': 'kr-KR',		'ko': 'kr-KR',
	'sl-SI': 'sl-SL',		'sl-SI': 'sl-SL',
	'zh-TW': 'tzh-TW',		'zh-TW': 'tzh-TW',
	'zh-HK': 'tzh-HK',		'zh-HK': 'tzh-HK'
	}		}

	time_range_dict = {'day': 'd', 'week': 'w', 'month': 'm', 'year': 'y'}

	# search-url		# search-url
	url = 'https://lite.duckduckgo.com/lite'		url = 'https://links.duckduckgo.com/d.js?'
	url_ping = 'https://duckduckgo.com/t/sl_l'		url_ping = 'https://duckduckgo.com/t/sl_h'
			time_range_dict = {'day': 'd',
			'week': 'w',
			'month': 'm',
			'year': 'y'}


	# match query's language to a region code that duckduckgo will accept		# match query's language to a region code that duckduckgo will accept
	@@ -65,16 +69,15 @@ def get_region_code(lang, lang_list=None):
	return lang_parts[1].lower() + '-' + lang_parts[0].lower()		return lang_parts[1].lower() + '-' + lang_parts[0].lower()


	def request(query, params):		def get_vqd(query, headers):
			resp = get(f"https://duckduckgo.com/?q={query}&ia=web", headers=headers)
			resp = re.findall(VQD_REGEX, resp.text)
			return resp[0]

	params['url'] = url
	params['method'] = 'POST'

	params['data']['q'] = query		def request(query, params):

	# The API is not documented, so we do some reverse engineering and emulate		params['method'] = 'GET'
	# what https://lite.duckduckgo.com/lite/ does when you press "next Page"
	# link again and again ..

	vqd = get_vqd(query, params["headers"])		vqd = get_vqd(query, params["headers"])
	dl, ct = match_language(params["language"], supported_languages, language_aliases, 'wt-WT').split("-")		dl, ct = match_language(params["language"], supported_languages, language_aliases, 'wt-WT').split("-")
	@@ -134,40 +137,17 @@ def request(query, params):

	# get response from search-request		# get response from search-request
	def response(resp):		def response(resp):

	headers_ping = dict_subset(resp.request.headers, ['User-Agent', 'Accept-Encoding', 'Accept', 'Cookie'])
	get(url_ping, headers=headers_ping)

	if resp.status_code == 303:		if resp.status_code == 303:
	return []		return []

			# parse the response
	results = []		results = []
	doc = fromstring(resp.text)

	result_table = eval_xpath(doc, '//html/body/form/div[@class="filters"]/table')
	if not len(result_table) >= 3:
	# no more results
	return []
	result_table = result_table[2]

	tr_rows = eval_xpath(result_table, './/tr')		data = re.findall(r"DDG\.pageLayout\.load\('d',(\[.+\])\);DDG\.duckbar\.load\('images'", str(resp.text))
			try:
	# In the last <tr> is the form of the 'previous/next page' links		search_data = loads(data[0].replace('/\t/g', ' '))
	tr_rows = tr_rows[:-1]		except IndexError:
			return
	len_tr_rows = len(tr_rows)
	offset = 0

	while len_tr_rows >= offset + 4:

	# assemble table rows we need to scrape
	tr_title = tr_rows[offset]
	tr_content = tr_rows[offset + 1]
	offset += 4

	# ignore sponsored ads <tr class="result-sponsored">
	if tr_content.get('class') == 'result-sponsored':
	continue

	if len(search_data) == 1 and ('n' not in search_data[0]):		if len(search_data) == 1 and ('n' not in search_data[0]):
	only_result = search_data[0]		only_result = search_data[0]
	@@ -175,8 +155,8 @@ def response(resp):
	only_result.get('a') is not None or only_result.get('d') == 'google.com search'):		only_result.get('a') is not None or only_result.get('d') == 'google.com search'):
	return		return

	td_content = eval_xpath_getindex(tr_content, './/td[@class="result-snippet"]', 0, None)		for search_result in search_data:
	if td_content is None:		if 'n' in search_result:
	continue		continue

	title = HTMLTextExtractor()		title = HTMLTextExtractor()