From fa4eb51c603970bc796b6884e1882665ecc6e7a4 Mon Sep 17 00:00:00 2001 From: Israel Yago Pereira Date: Thu, 23 Dec 2021 16:41:53 -0300 Subject: [PATCH 01/20] Implementing ddg main search engine --- searx/engines/duckduckgo.py | 140 +++++++---------------- searx/engines/duckduckgo_lite.py | 186 +++++++++++++++++++++++++++++++ searx/settings.yml | 4 + 3 files changed, 230 insertions(+), 100 deletions(-) create mode 100644 searx/engines/duckduckgo_lite.py diff --git a/searx/engines/duckduckgo.py b/searx/engines/duckduckgo.py index d871a629f..c8653e916 100644 --- a/searx/engines/duckduckgo.py +++ b/searx/engines/duckduckgo.py @@ -1,24 +1,16 @@ # SPDX-License-Identifier: AGPL-3.0-or-later -# lint: pylint -"""DuckDuckGo Lite """ - -from json import loads + DuckDuckGo (Web) +""" from lxml.html import fromstring - -from searx.utils import ( - dict_subset, - eval_xpath, - eval_xpath_getindex, - extract_text, - match_language, -) +from json import loads +from searx.utils import extract_text, match_language, eval_xpath, dict_subset from searx.network import get # about about = { - "website": 'https://lite.duckduckgo.com/lite', + "website": 'https://duckduckgo.com/', "wikidata_id": 'Q12805', "official_api_documentation": 'https://duckduckgo.com/api', "use_official_api": False, @@ -28,8 +20,8 @@ about = { # engine dependent config categories = ['general'] -paging = True -supported_languages_url = 'https://duckduckgo.com/util/u588.js' +paging = False +supported_languages_url = 'https://duckduckgo.com/util/u172.js' time_range_support = True language_aliases = { @@ -42,16 +34,20 @@ language_aliases = { 'zh-HK': 'tzh-HK' } -time_range_dict = { - 'day': 'd', - 'week': 'w', - 'month': 'm', - 'year': 'y' -} - # search-url -url = 'https://lite.duckduckgo.com/lite' -url_ping = 'https://duckduckgo.com/t/sl_l' +url = 'https://html.duckduckgo.com/html/?q={}' +url_ping = 'https://duckduckgo.com/t/sl_h' +time_range_dict = {'day': 'd', + 'week': 'w', + 'month': 'm', + 'year': 'y'} + +# specific xpath variables +result_xpath = '//div[@class="links_main links_deep result__body"]' # noqa +url_xpath = '//a[@class="result__snippet"]/@href' +title_xpath = '//a[@class="result__a"]' +content_xpath = '//a[@class="result__snippet"]' +correction_xpath = '//a[@id="js-spelling-recourse-link"]' # match query's language to a region code that duckduckgo will accept @@ -67,108 +63,52 @@ def get_region_code(lang, lang_list=None): def request(query, params): + if params['time_range'] is not None and params['time_range'] not in time_range_dict: + return params - params['url'] = url + params['url'] = url.format(query) params['method'] = 'POST' - params['data']['q'] = query - - # The API is not documented, so we do some reverse engineering and emulate - # what https://lite.duckduckgo.com/lite/ does when you press "next Page" - # link again and again .. - - params['headers']['Content-Type'] = 'application/x-www-form-urlencoded' - - # initial page does not have an offset - if params['pageno'] == 2: - # second page does have an offset of 30 - offset = (params['pageno'] - 1) * 30 - params['data']['s'] = offset - params['data']['dc'] = offset + 1 - - elif params['pageno'] > 2: - # third and following pages do have an offset of 30 + n*50 - offset = 30 + (params['pageno'] - 2) * 50 - params['data']['s'] = offset - params['data']['dc'] = offset + 1 - - # initial page does not have additional data in the input form - if params['pageno'] > 1: - # request the second page (and more pages) needs 'o' and 'api' arguments - params['data']['o'] = 'json' - params['data']['api'] = 'd.js' - - # initial page does not have additional data in the input form - if params['pageno'] > 2: - # request the third page (and more pages) some more arguments - params['data']['nextParams'] = '' - params['data']['v'] = '' - params['data']['vqd'] = '' + params['data']['b'] = '' region_code = get_region_code(params['language'], supported_languages) if region_code: params['data']['kl'] = region_code params['cookies']['kl'] = region_code - params['data']['df'] = '' if params['time_range'] in time_range_dict: params['data']['df'] = time_range_dict[params['time_range']] - params['cookies']['df'] = time_range_dict[params['time_range']] + params['allow_redirects'] = False return params # get response from search-request def response(resp): - - headers_ping = dict_subset(resp.request.headers, ['User-Agent', 'Accept-Encoding', 'Accept', 'Cookie']) - get(url_ping, headers=headers_ping) - if resp.status_code == 303: return [] + # parse the response results = [] doc = fromstring(resp.text) + + titles = eval_xpath(doc, title_xpath) + contents = eval_xpath(doc, content_xpath) + urls = eval_xpath(doc, url_xpath) - result_table = eval_xpath(doc, '//html/body/form/div[@class="filters"]/table') - if not len(result_table) >= 3: - # no more results - return [] - result_table = result_table[2] - - tr_rows = eval_xpath(result_table, './/tr') - - # In the last is the form of the 'previous/next page' links - tr_rows = tr_rows[:-1] - - len_tr_rows = len(tr_rows) - offset = 0 - - while len_tr_rows >= offset + 4: - - # assemble table rows we need to scrap - tr_title = tr_rows[offset] - tr_content = tr_rows[offset + 1] - offset += 4 - - # ignore sponsored Adds - if tr_content.get('class') == 'result-sponsored': - continue - - a_tag = eval_xpath_getindex(tr_title, './/td//a[@class="result-link"]', 0, None) - if a_tag is None: - continue + for title, content, url in zip(titles, contents, urls): + print(extract_text(content)) - td_content = eval_xpath_getindex(tr_content, './/td[@class="result-snippet"]', 0, None) - if td_content is None: - continue + results.append({'title': extract_text(title), + 'content': extract_text(content), + 'url': url}) - results.append({ - 'title': a_tag.text_content(), - 'content': extract_text(td_content), - 'url': a_tag.get('href'), - }) + # parse correction + for correction in eval_xpath(doc, correction_xpath): + # append correction + results.append({'correction': extract_text(correction)}) + # return results return results diff --git a/searx/engines/duckduckgo_lite.py b/searx/engines/duckduckgo_lite.py new file mode 100644 index 000000000..d871a629f --- /dev/null +++ b/searx/engines/duckduckgo_lite.py @@ -0,0 +1,186 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later +# lint: pylint +"""DuckDuckGo Lite +""" + +from json import loads + +from lxml.html import fromstring + +from searx.utils import ( + dict_subset, + eval_xpath, + eval_xpath_getindex, + extract_text, + match_language, +) +from searx.network import get + +# about +about = { + "website": 'https://lite.duckduckgo.com/lite', + "wikidata_id": 'Q12805', + "official_api_documentation": 'https://duckduckgo.com/api', + "use_official_api": False, + "require_api_key": False, + "results": 'HTML', +} + +# engine dependent config +categories = ['general'] +paging = True +supported_languages_url = 'https://duckduckgo.com/util/u588.js' +time_range_support = True + +language_aliases = { + 'ar-SA': 'ar-XA', + 'es-419': 'es-XL', + 'ja': 'jp-JP', + 'ko': 'kr-KR', + 'sl-SI': 'sl-SL', + 'zh-TW': 'tzh-TW', + 'zh-HK': 'tzh-HK' +} + +time_range_dict = { + 'day': 'd', + 'week': 'w', + 'month': 'm', + 'year': 'y' +} + +# search-url +url = 'https://lite.duckduckgo.com/lite' +url_ping = 'https://duckduckgo.com/t/sl_l' + + +# match query's language to a region code that duckduckgo will accept +def get_region_code(lang, lang_list=None): + if lang == 'all': + return None + + lang_code = match_language(lang, lang_list or [], language_aliases, 'wt-WT') + lang_parts = lang_code.split('-') + + # country code goes first + return lang_parts[1].lower() + '-' + lang_parts[0].lower() + + +def request(query, params): + + params['url'] = url + params['method'] = 'POST' + + params['data']['q'] = query + + # The API is not documented, so we do some reverse engineering and emulate + # what https://lite.duckduckgo.com/lite/ does when you press "next Page" + # link again and again .. + + params['headers']['Content-Type'] = 'application/x-www-form-urlencoded' + + # initial page does not have an offset + if params['pageno'] == 2: + # second page does have an offset of 30 + offset = (params['pageno'] - 1) * 30 + params['data']['s'] = offset + params['data']['dc'] = offset + 1 + + elif params['pageno'] > 2: + # third and following pages do have an offset of 30 + n*50 + offset = 30 + (params['pageno'] - 2) * 50 + params['data']['s'] = offset + params['data']['dc'] = offset + 1 + + # initial page does not have additional data in the input form + if params['pageno'] > 1: + # request the second page (and more pages) needs 'o' and 'api' arguments + params['data']['o'] = 'json' + params['data']['api'] = 'd.js' + + # initial page does not have additional data in the input form + if params['pageno'] > 2: + # request the third page (and more pages) some more arguments + params['data']['nextParams'] = '' + params['data']['v'] = '' + params['data']['vqd'] = '' + + region_code = get_region_code(params['language'], supported_languages) + if region_code: + params['data']['kl'] = region_code + params['cookies']['kl'] = region_code + + params['data']['df'] = '' + if params['time_range'] in time_range_dict: + params['data']['df'] = time_range_dict[params['time_range']] + params['cookies']['df'] = time_range_dict[params['time_range']] + + return params + + +# get response from search-request +def response(resp): + + headers_ping = dict_subset(resp.request.headers, ['User-Agent', 'Accept-Encoding', 'Accept', 'Cookie']) + get(url_ping, headers=headers_ping) + + if resp.status_code == 303: + return [] + + results = [] + doc = fromstring(resp.text) + + result_table = eval_xpath(doc, '//html/body/form/div[@class="filters"]/table') + if not len(result_table) >= 3: + # no more results + return [] + result_table = result_table[2] + + tr_rows = eval_xpath(result_table, './/tr') + + # In the last is the form of the 'previous/next page' links + tr_rows = tr_rows[:-1] + + len_tr_rows = len(tr_rows) + offset = 0 + + while len_tr_rows >= offset + 4: + + # assemble table rows we need to scrap + tr_title = tr_rows[offset] + tr_content = tr_rows[offset + 1] + offset += 4 + + # ignore sponsored Adds + if tr_content.get('class') == 'result-sponsored': + continue + + a_tag = eval_xpath_getindex(tr_title, './/td//a[@class="result-link"]', 0, None) + if a_tag is None: + continue + + td_content = eval_xpath_getindex(tr_content, './/td[@class="result-snippet"]', 0, None) + if td_content is None: + continue + + results.append({ + 'title': a_tag.text_content(), + 'content': extract_text(td_content), + 'url': a_tag.get('href'), + }) + + return results + + +# get supported languages from their site +def _fetch_supported_languages(resp): + + # response is a js file with regions as an embedded object + response_page = resp.text + response_page = response_page[response_page.find('regions:{') + 8:] + response_page = response_page[:response_page.find('}') + 1] + + regions_json = loads(response_page) + supported_languages = map((lambda x: x[3:] + '-' + x[:2].upper()), regions_json.keys()) + + return list(supported_languages) diff --git a/searx/settings.yml b/searx/settings.yml index 92be3fbeb..b2341f84e 100644 --- a/searx/settings.yml +++ b/searx/settings.yml @@ -408,6 +408,10 @@ engines: engine : duckduckgo shortcut : ddg + - name : duckduckgo (lite) + engine : duckduckgo_lite + shortcut : ddgl + - name : duckduckgo images engine : duckduckgo_images shortcut : ddi -- GitLab From 009e75abe99b2c75d37b2994f0ede13e4c8bf0a1 Mon Sep 17 00:00:00 2001 From: Israel Yago Pereira Date: Mon, 3 Jan 2022 15:12:48 -0300 Subject: [PATCH 02/20] Update ddg url and add safesearch --- searx/engines/duckduckgo.py | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/searx/engines/duckduckgo.py b/searx/engines/duckduckgo.py index c8653e916..f93c7815f 100644 --- a/searx/engines/duckduckgo.py +++ b/searx/engines/duckduckgo.py @@ -23,6 +23,7 @@ categories = ['general'] paging = False supported_languages_url = 'https://duckduckgo.com/util/u172.js' time_range_support = True +safesearch = True language_aliases = { 'ar-SA': 'ar-XA', @@ -35,7 +36,7 @@ language_aliases = { } # search-url -url = 'https://html.duckduckgo.com/html/?q={}' +url = 'https://duckduckgo.com/?q={}' url_ping = 'https://duckduckgo.com/t/sl_h' time_range_dict = {'day': 'd', 'week': 'w', @@ -67,10 +68,19 @@ def request(query, params): return params params['url'] = url.format(query) - params['method'] = 'POST' + params['method'] = 'GET' params['data']['q'] = query params['data']['b'] = '' + safesearch_ddg_value = None + if params['safesearch'] == 0: + safesearch_ddg_value = -2 # OFF + if params['safesearch'] == 2: + safesearch_ddg_value = 1 # STRICT + + if safesearch_ddg_value != None: + params['cookies']['p'] = str(safesearch_ddg_value) + region_code = get_region_code(params['language'], supported_languages) if region_code: params['data']['kl'] = region_code @@ -90,15 +100,14 @@ def response(resp): # parse the response results = [] + doc = fromstring(resp.text) - + titles = eval_xpath(doc, title_xpath) contents = eval_xpath(doc, content_xpath) - urls = eval_xpath(doc, url_xpath) + urls = eval_xpath(doc, url_xpath) for title, content, url in zip(titles, contents, urls): - print(extract_text(content)) - results.append({'title': extract_text(title), 'content': extract_text(content), 'url': url}) -- GitLab From 104d880bf50390503699ef2a951146b233f39992 Mon Sep 17 00:00:00 2001 From: Israel Yago Pereira Date: Mon, 3 Jan 2022 15:22:05 -0300 Subject: [PATCH 03/20] Fix small pep8 issues --- searx/engines/duckduckgo.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/searx/engines/duckduckgo.py b/searx/engines/duckduckgo.py index f93c7815f..a1055767f 100644 --- a/searx/engines/duckduckgo.py +++ b/searx/engines/duckduckgo.py @@ -74,11 +74,11 @@ def request(query, params): safesearch_ddg_value = None if params['safesearch'] == 0: - safesearch_ddg_value = -2 # OFF + safesearch_ddg_value = -2 # OFF if params['safesearch'] == 2: - safesearch_ddg_value = 1 # STRICT + safesearch_ddg_value = 1 # STRICT - if safesearch_ddg_value != None: + if safesearch_ddg_value is not None: params['cookies']['p'] = str(safesearch_ddg_value) region_code = get_region_code(params['language'], supported_languages) -- GitLab From 2859cfa4930192ae9e5d16978ff0b4bdbe1528dd Mon Sep 17 00:00:00 2001 From: Israel Yago Pereira Date: Mon, 3 Jan 2022 15:45:33 -0300 Subject: [PATCH 04/20] Remove unused imports --- searx/engines/duckduckgo.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/searx/engines/duckduckgo.py b/searx/engines/duckduckgo.py index a1055767f..e2b8ea4c8 100644 --- a/searx/engines/duckduckgo.py +++ b/searx/engines/duckduckgo.py @@ -5,8 +5,7 @@ from lxml.html import fromstring from json import loads -from searx.utils import extract_text, match_language, eval_xpath, dict_subset -from searx.network import get +from searx.utils import extract_text, match_language, eval_xpath # about about = { -- GitLab From 0c9cbcd05e227f8880821d9ee455abce2761c6f7 Mon Sep 17 00:00:00 2001 From: nivesh Date: Fri, 11 Feb 2022 01:35:33 +0530 Subject: [PATCH 05/20] inital version of safe search for ddg engine --- searx/engines/duckduckgo.py | 116 +++++++++++++++++++++++------------- 1 file changed, 75 insertions(+), 41 deletions(-) diff --git a/searx/engines/duckduckgo.py b/searx/engines/duckduckgo.py index e2b8ea4c8..80d18c80d 100644 --- a/searx/engines/duckduckgo.py +++ b/searx/engines/duckduckgo.py @@ -6,7 +6,10 @@ from lxml.html import fromstring from json import loads from searx.utils import extract_text, match_language, eval_xpath +from searx import logger +import re +logger = logger.getChild('ddg engine') # about about = { "website": 'https://duckduckgo.com/', @@ -23,7 +26,7 @@ paging = False supported_languages_url = 'https://duckduckgo.com/util/u172.js' time_range_support = True safesearch = True - +VQD_REGEX = r"vqd='(\d+-\d+-\d+)'/"; language_aliases = { 'ar-SA': 'ar-XA', 'es-419': 'es-XL', @@ -35,21 +38,14 @@ language_aliases = { } # search-url -url = 'https://duckduckgo.com/?q={}' +url = 'https://links.duckduckgo.com/d.js?' + url_ping = 'https://duckduckgo.com/t/sl_h' time_range_dict = {'day': 'd', 'week': 'w', 'month': 'm', 'year': 'y'} -# specific xpath variables -result_xpath = '//div[@class="links_main links_deep result__body"]' # noqa -url_xpath = '//a[@class="result__snippet"]/@href' -title_xpath = '//a[@class="result__a"]' -content_xpath = '//a[@class="result__snippet"]' -correction_xpath = '//a[@id="js-spelling-recourse-link"]' - - # match query's language to a region code that duckduckgo will accept def get_region_code(lang, lang_list=None): if lang == 'all': @@ -61,34 +57,65 @@ def get_region_code(lang, lang_list=None): # country code goes first return lang_parts[1].lower() + '-' + lang_parts[0].lower() +# def get_vqd(query): +# resp = requests.get def request(query, params): if params['time_range'] is not None and params['time_range'] not in time_range_dict: return params - params['url'] = url.format(query) params['method'] = 'GET' - params['data']['q'] = query - params['data']['b'] = '' - - safesearch_ddg_value = None - if params['safesearch'] == 0: - safesearch_ddg_value = -2 # OFF - if params['safesearch'] == 2: - safesearch_ddg_value = 1 # STRICT - if safesearch_ddg_value is not None: - params['cookies']['p'] = str(safesearch_ddg_value) - - region_code = get_region_code(params['language'], supported_languages) - if region_code: - params['data']['kl'] = region_code - params['cookies']['kl'] = region_code - - if params['time_range'] in time_range_dict: - params['data']['df'] = time_range_dict[params['time_range']] + logger.debug(params) + + query_dict = { + "q": query, + 't': 'D', + 'l': params["language"], + 'kl': get_region_code(params["language"]), + 's': 0, # TODO + 'dl': 'en', + 'ct': 'US', + 'ss_mkt': get_region_code(params["language"]), + 'df': params['time_range'], + 'vqd' : "3-126340648549743517691069464246778236175-203846832012815914858366468471688211061", + 'ex': -2, + 'sp': '1', + 'bpa': '1', + 'biaexp': 'b', + 'msvrtexp': 'b' + } + if params['safesearch'] == 2: # STRICT + del query_dict['t'] + query_dict['p'] = 1 + query_dict.update({ + 'videxp': 'a', + 'nadse': 'b', + 'eclsexp': 'a', + 'stiaexp': 'a', + 'tjsexp': 'b', + 'related': 'b', + 'msnexp': 'a' + }) + elif params['safesearch'] == 1: # MODERATE + query_dict['ex'] = -1 + query_dict.update({ + 'nadse': 'b', + 'eclsexp': 'b', + 'tjsexp': 'b' + }) + else: # OFF + query_dict['ex'] = -2 + query_dict.update({ + 'nadse': 'b', + 'eclsexp': 'b', + 'tjsexp': 'b' + }) params['allow_redirects'] = False + params["data"] = query_dict + params["url"] = url + logger.debug(params) return params @@ -101,22 +128,29 @@ def response(resp): results = [] doc = fromstring(resp.text) - - titles = eval_xpath(doc, title_xpath) - contents = eval_xpath(doc, content_xpath) - urls = eval_xpath(doc, url_xpath) - - for title, content, url in zip(titles, contents, urls): - results.append({'title': extract_text(title), - 'content': extract_text(content), - 'url': url}) + data = re.findall(r"DDG\.pageLayout\.load\('d',(\[.+\])\);DDG\.duckbar\.load\('images'", str(resp.text)) + search_data = loads(data[0].replace('/\t/g', ' ')) + + if len(search_data) == 1 and ('n' not in search_data[0]): + only_result = search_data[0] + if ((only_result.get("da") is not None and only_result.get("t") == 'EOF') or only_result.get('a') is not None or only_result.get('d') == 'google.com search'): + return + + + for search_result in search_data: + if 'n' in search_result: + continue + results.append({'title': search_result.get("t"), + 'content': extract_text(search_result.get('a')), + 'url': search_result.get('u')}) # parse correction - for correction in eval_xpath(doc, correction_xpath): - # append correction - results.append({'correction': extract_text(correction)}) + # for correction in eval_xpath(doc, correction_xpath): + # # append correction + # results.append({'correction': extract_text(correction)}) # return results + logger.debug(results) return results -- GitLab From 889e56235aa21c61b4d709ae9f701f1b64cd412b Mon Sep 17 00:00:00 2001 From: nivesh Date: Tue, 15 Feb 2022 01:14:01 +0530 Subject: [PATCH 06/20] remove debug logs --- searx/engines/duckduckgo.py | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/searx/engines/duckduckgo.py b/searx/engines/duckduckgo.py index 80d18c80d..f52edf22a 100644 --- a/searx/engines/duckduckgo.py +++ b/searx/engines/duckduckgo.py @@ -8,6 +8,7 @@ from json import loads from searx.utils import extract_text, match_language, eval_xpath from searx import logger import re +import httpx logger = logger.getChild('ddg engine') # about @@ -26,7 +27,7 @@ paging = False supported_languages_url = 'https://duckduckgo.com/util/u172.js' time_range_support = True safesearch = True -VQD_REGEX = r"vqd='(\d+-\d+-\d+)'/"; +VQD_REGEX = r"vqd='(\d+-\d+-\d+)'"; language_aliases = { 'ar-SA': 'ar-XA', 'es-419': 'es-XL', @@ -57,8 +58,10 @@ def get_region_code(lang, lang_list=None): # country code goes first return lang_parts[1].lower() + '-' + lang_parts[0].lower() -# def get_vqd(query): -# resp = requests.get +def get_vqd(query): + resp = httpx.get(f"https://duckduckgo.com/?q={query}&ia=web") + resp = re.findall(VQD_REGEX, resp.text) + return resp[0] def request(query, params): if params['time_range'] is not None and params['time_range'] not in time_range_dict: @@ -66,8 +69,7 @@ def request(query, params): params['method'] = 'GET' - logger.debug(params) - + vqd = get_vqd(query) query_dict = { "q": query, 't': 'D', @@ -78,7 +80,7 @@ def request(query, params): 'ct': 'US', 'ss_mkt': get_region_code(params["language"]), 'df': params['time_range'], - 'vqd' : "3-126340648549743517691069464246778236175-203846832012815914858366468471688211061", + 'vqd' : vqd, 'ex': -2, 'sp': '1', 'bpa': '1', @@ -115,7 +117,6 @@ def request(query, params): params['allow_redirects'] = False params["data"] = query_dict params["url"] = url - logger.debug(params) return params @@ -149,8 +150,6 @@ def response(resp): # # append correction # results.append({'correction': extract_text(correction)}) - # return results - logger.debug(results) return results -- GitLab From 0867163a4603a1ae3a5c4018712d3d022226760a Mon Sep 17 00:00:00 2001 From: Israel Yago Pereira Date: Thu, 23 Dec 2021 16:41:53 -0300 Subject: [PATCH 07/20] Implementing ddg main search engine --- searx/engines/duckduckgo.py | 140 +++++++---------------- searx/engines/duckduckgo_lite.py | 186 +++++++++++++++++++++++++++++++ searx/settings.yml | 4 + 3 files changed, 230 insertions(+), 100 deletions(-) create mode 100644 searx/engines/duckduckgo_lite.py diff --git a/searx/engines/duckduckgo.py b/searx/engines/duckduckgo.py index d871a629f..c8653e916 100644 --- a/searx/engines/duckduckgo.py +++ b/searx/engines/duckduckgo.py @@ -1,24 +1,16 @@ # SPDX-License-Identifier: AGPL-3.0-or-later -# lint: pylint -"""DuckDuckGo Lite """ - -from json import loads + DuckDuckGo (Web) +""" from lxml.html import fromstring - -from searx.utils import ( - dict_subset, - eval_xpath, - eval_xpath_getindex, - extract_text, - match_language, -) +from json import loads +from searx.utils import extract_text, match_language, eval_xpath, dict_subset from searx.network import get # about about = { - "website": 'https://lite.duckduckgo.com/lite', + "website": 'https://duckduckgo.com/', "wikidata_id": 'Q12805', "official_api_documentation": 'https://duckduckgo.com/api', "use_official_api": False, @@ -28,8 +20,8 @@ about = { # engine dependent config categories = ['general'] -paging = True -supported_languages_url = 'https://duckduckgo.com/util/u588.js' +paging = False +supported_languages_url = 'https://duckduckgo.com/util/u172.js' time_range_support = True language_aliases = { @@ -42,16 +34,20 @@ language_aliases = { 'zh-HK': 'tzh-HK' } -time_range_dict = { - 'day': 'd', - 'week': 'w', - 'month': 'm', - 'year': 'y' -} - # search-url -url = 'https://lite.duckduckgo.com/lite' -url_ping = 'https://duckduckgo.com/t/sl_l' +url = 'https://html.duckduckgo.com/html/?q={}' +url_ping = 'https://duckduckgo.com/t/sl_h' +time_range_dict = {'day': 'd', + 'week': 'w', + 'month': 'm', + 'year': 'y'} + +# specific xpath variables +result_xpath = '//div[@class="links_main links_deep result__body"]' # noqa +url_xpath = '//a[@class="result__snippet"]/@href' +title_xpath = '//a[@class="result__a"]' +content_xpath = '//a[@class="result__snippet"]' +correction_xpath = '//a[@id="js-spelling-recourse-link"]' # match query's language to a region code that duckduckgo will accept @@ -67,108 +63,52 @@ def get_region_code(lang, lang_list=None): def request(query, params): + if params['time_range'] is not None and params['time_range'] not in time_range_dict: + return params - params['url'] = url + params['url'] = url.format(query) params['method'] = 'POST' - params['data']['q'] = query - - # The API is not documented, so we do some reverse engineering and emulate - # what https://lite.duckduckgo.com/lite/ does when you press "next Page" - # link again and again .. - - params['headers']['Content-Type'] = 'application/x-www-form-urlencoded' - - # initial page does not have an offset - if params['pageno'] == 2: - # second page does have an offset of 30 - offset = (params['pageno'] - 1) * 30 - params['data']['s'] = offset - params['data']['dc'] = offset + 1 - - elif params['pageno'] > 2: - # third and following pages do have an offset of 30 + n*50 - offset = 30 + (params['pageno'] - 2) * 50 - params['data']['s'] = offset - params['data']['dc'] = offset + 1 - - # initial page does not have additional data in the input form - if params['pageno'] > 1: - # request the second page (and more pages) needs 'o' and 'api' arguments - params['data']['o'] = 'json' - params['data']['api'] = 'd.js' - - # initial page does not have additional data in the input form - if params['pageno'] > 2: - # request the third page (and more pages) some more arguments - params['data']['nextParams'] = '' - params['data']['v'] = '' - params['data']['vqd'] = '' + params['data']['b'] = '' region_code = get_region_code(params['language'], supported_languages) if region_code: params['data']['kl'] = region_code params['cookies']['kl'] = region_code - params['data']['df'] = '' if params['time_range'] in time_range_dict: params['data']['df'] = time_range_dict[params['time_range']] - params['cookies']['df'] = time_range_dict[params['time_range']] + params['allow_redirects'] = False return params # get response from search-request def response(resp): - - headers_ping = dict_subset(resp.request.headers, ['User-Agent', 'Accept-Encoding', 'Accept', 'Cookie']) - get(url_ping, headers=headers_ping) - if resp.status_code == 303: return [] + # parse the response results = [] doc = fromstring(resp.text) + + titles = eval_xpath(doc, title_xpath) + contents = eval_xpath(doc, content_xpath) + urls = eval_xpath(doc, url_xpath) - result_table = eval_xpath(doc, '//html/body/form/div[@class="filters"]/table') - if not len(result_table) >= 3: - # no more results - return [] - result_table = result_table[2] - - tr_rows = eval_xpath(result_table, './/tr') - - # In the last is the form of the 'previous/next page' links - tr_rows = tr_rows[:-1] - - len_tr_rows = len(tr_rows) - offset = 0 - - while len_tr_rows >= offset + 4: - - # assemble table rows we need to scrap - tr_title = tr_rows[offset] - tr_content = tr_rows[offset + 1] - offset += 4 - - # ignore sponsored Adds - if tr_content.get('class') == 'result-sponsored': - continue - - a_tag = eval_xpath_getindex(tr_title, './/td//a[@class="result-link"]', 0, None) - if a_tag is None: - continue + for title, content, url in zip(titles, contents, urls): + print(extract_text(content)) - td_content = eval_xpath_getindex(tr_content, './/td[@class="result-snippet"]', 0, None) - if td_content is None: - continue + results.append({'title': extract_text(title), + 'content': extract_text(content), + 'url': url}) - results.append({ - 'title': a_tag.text_content(), - 'content': extract_text(td_content), - 'url': a_tag.get('href'), - }) + # parse correction + for correction in eval_xpath(doc, correction_xpath): + # append correction + results.append({'correction': extract_text(correction)}) + # return results return results diff --git a/searx/engines/duckduckgo_lite.py b/searx/engines/duckduckgo_lite.py new file mode 100644 index 000000000..d871a629f --- /dev/null +++ b/searx/engines/duckduckgo_lite.py @@ -0,0 +1,186 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later +# lint: pylint +"""DuckDuckGo Lite +""" + +from json import loads + +from lxml.html import fromstring + +from searx.utils import ( + dict_subset, + eval_xpath, + eval_xpath_getindex, + extract_text, + match_language, +) +from searx.network import get + +# about +about = { + "website": 'https://lite.duckduckgo.com/lite', + "wikidata_id": 'Q12805', + "official_api_documentation": 'https://duckduckgo.com/api', + "use_official_api": False, + "require_api_key": False, + "results": 'HTML', +} + +# engine dependent config +categories = ['general'] +paging = True +supported_languages_url = 'https://duckduckgo.com/util/u588.js' +time_range_support = True + +language_aliases = { + 'ar-SA': 'ar-XA', + 'es-419': 'es-XL', + 'ja': 'jp-JP', + 'ko': 'kr-KR', + 'sl-SI': 'sl-SL', + 'zh-TW': 'tzh-TW', + 'zh-HK': 'tzh-HK' +} + +time_range_dict = { + 'day': 'd', + 'week': 'w', + 'month': 'm', + 'year': 'y' +} + +# search-url +url = 'https://lite.duckduckgo.com/lite' +url_ping = 'https://duckduckgo.com/t/sl_l' + + +# match query's language to a region code that duckduckgo will accept +def get_region_code(lang, lang_list=None): + if lang == 'all': + return None + + lang_code = match_language(lang, lang_list or [], language_aliases, 'wt-WT') + lang_parts = lang_code.split('-') + + # country code goes first + return lang_parts[1].lower() + '-' + lang_parts[0].lower() + + +def request(query, params): + + params['url'] = url + params['method'] = 'POST' + + params['data']['q'] = query + + # The API is not documented, so we do some reverse engineering and emulate + # what https://lite.duckduckgo.com/lite/ does when you press "next Page" + # link again and again .. + + params['headers']['Content-Type'] = 'application/x-www-form-urlencoded' + + # initial page does not have an offset + if params['pageno'] == 2: + # second page does have an offset of 30 + offset = (params['pageno'] - 1) * 30 + params['data']['s'] = offset + params['data']['dc'] = offset + 1 + + elif params['pageno'] > 2: + # third and following pages do have an offset of 30 + n*50 + offset = 30 + (params['pageno'] - 2) * 50 + params['data']['s'] = offset + params['data']['dc'] = offset + 1 + + # initial page does not have additional data in the input form + if params['pageno'] > 1: + # request the second page (and more pages) needs 'o' and 'api' arguments + params['data']['o'] = 'json' + params['data']['api'] = 'd.js' + + # initial page does not have additional data in the input form + if params['pageno'] > 2: + # request the third page (and more pages) some more arguments + params['data']['nextParams'] = '' + params['data']['v'] = '' + params['data']['vqd'] = '' + + region_code = get_region_code(params['language'], supported_languages) + if region_code: + params['data']['kl'] = region_code + params['cookies']['kl'] = region_code + + params['data']['df'] = '' + if params['time_range'] in time_range_dict: + params['data']['df'] = time_range_dict[params['time_range']] + params['cookies']['df'] = time_range_dict[params['time_range']] + + return params + + +# get response from search-request +def response(resp): + + headers_ping = dict_subset(resp.request.headers, ['User-Agent', 'Accept-Encoding', 'Accept', 'Cookie']) + get(url_ping, headers=headers_ping) + + if resp.status_code == 303: + return [] + + results = [] + doc = fromstring(resp.text) + + result_table = eval_xpath(doc, '//html/body/form/div[@class="filters"]/table') + if not len(result_table) >= 3: + # no more results + return [] + result_table = result_table[2] + + tr_rows = eval_xpath(result_table, './/tr') + + # In the last is the form of the 'previous/next page' links + tr_rows = tr_rows[:-1] + + len_tr_rows = len(tr_rows) + offset = 0 + + while len_tr_rows >= offset + 4: + + # assemble table rows we need to scrap + tr_title = tr_rows[offset] + tr_content = tr_rows[offset + 1] + offset += 4 + + # ignore sponsored Adds + if tr_content.get('class') == 'result-sponsored': + continue + + a_tag = eval_xpath_getindex(tr_title, './/td//a[@class="result-link"]', 0, None) + if a_tag is None: + continue + + td_content = eval_xpath_getindex(tr_content, './/td[@class="result-snippet"]', 0, None) + if td_content is None: + continue + + results.append({ + 'title': a_tag.text_content(), + 'content': extract_text(td_content), + 'url': a_tag.get('href'), + }) + + return results + + +# get supported languages from their site +def _fetch_supported_languages(resp): + + # response is a js file with regions as an embedded object + response_page = resp.text + response_page = response_page[response_page.find('regions:{') + 8:] + response_page = response_page[:response_page.find('}') + 1] + + regions_json = loads(response_page) + supported_languages = map((lambda x: x[3:] + '-' + x[:2].upper()), regions_json.keys()) + + return list(supported_languages) diff --git a/searx/settings.yml b/searx/settings.yml index a869bba4f..e8fe6aa06 100644 --- a/searx/settings.yml +++ b/searx/settings.yml @@ -415,6 +415,10 @@ engines: engine : duckduckgo shortcut : ddg + - name : duckduckgo (lite) + engine : duckduckgo_lite + shortcut : ddgl + - name : duckduckgo images engine : duckduckgo_images shortcut : ddi -- GitLab From 74624625e6e4d1c63c62c80caf527176718c85cd Mon Sep 17 00:00:00 2001 From: Israel Yago Pereira Date: Mon, 3 Jan 2022 15:12:48 -0300 Subject: [PATCH 08/20] Update ddg url and add safesearch --- searx/engines/duckduckgo.py | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/searx/engines/duckduckgo.py b/searx/engines/duckduckgo.py index c8653e916..f93c7815f 100644 --- a/searx/engines/duckduckgo.py +++ b/searx/engines/duckduckgo.py @@ -23,6 +23,7 @@ categories = ['general'] paging = False supported_languages_url = 'https://duckduckgo.com/util/u172.js' time_range_support = True +safesearch = True language_aliases = { 'ar-SA': 'ar-XA', @@ -35,7 +36,7 @@ language_aliases = { } # search-url -url = 'https://html.duckduckgo.com/html/?q={}' +url = 'https://duckduckgo.com/?q={}' url_ping = 'https://duckduckgo.com/t/sl_h' time_range_dict = {'day': 'd', 'week': 'w', @@ -67,10 +68,19 @@ def request(query, params): return params params['url'] = url.format(query) - params['method'] = 'POST' + params['method'] = 'GET' params['data']['q'] = query params['data']['b'] = '' + safesearch_ddg_value = None + if params['safesearch'] == 0: + safesearch_ddg_value = -2 # OFF + if params['safesearch'] == 2: + safesearch_ddg_value = 1 # STRICT + + if safesearch_ddg_value != None: + params['cookies']['p'] = str(safesearch_ddg_value) + region_code = get_region_code(params['language'], supported_languages) if region_code: params['data']['kl'] = region_code @@ -90,15 +100,14 @@ def response(resp): # parse the response results = [] + doc = fromstring(resp.text) - + titles = eval_xpath(doc, title_xpath) contents = eval_xpath(doc, content_xpath) - urls = eval_xpath(doc, url_xpath) + urls = eval_xpath(doc, url_xpath) for title, content, url in zip(titles, contents, urls): - print(extract_text(content)) - results.append({'title': extract_text(title), 'content': extract_text(content), 'url': url}) -- GitLab From 0579cfb31f2313d6b20e193a1a2e97a044aa3902 Mon Sep 17 00:00:00 2001 From: Israel Yago Pereira Date: Mon, 3 Jan 2022 15:22:05 -0300 Subject: [PATCH 09/20] Fix small pep8 issues --- searx/engines/duckduckgo.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/searx/engines/duckduckgo.py b/searx/engines/duckduckgo.py index f93c7815f..a1055767f 100644 --- a/searx/engines/duckduckgo.py +++ b/searx/engines/duckduckgo.py @@ -74,11 +74,11 @@ def request(query, params): safesearch_ddg_value = None if params['safesearch'] == 0: - safesearch_ddg_value = -2 # OFF + safesearch_ddg_value = -2 # OFF if params['safesearch'] == 2: - safesearch_ddg_value = 1 # STRICT + safesearch_ddg_value = 1 # STRICT - if safesearch_ddg_value != None: + if safesearch_ddg_value is not None: params['cookies']['p'] = str(safesearch_ddg_value) region_code = get_region_code(params['language'], supported_languages) -- GitLab From c70198ae2971b6dbea0568726d7194fa971ca362 Mon Sep 17 00:00:00 2001 From: Israel Yago Pereira Date: Mon, 3 Jan 2022 15:45:33 -0300 Subject: [PATCH 10/20] Remove unused imports --- searx/engines/duckduckgo.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/searx/engines/duckduckgo.py b/searx/engines/duckduckgo.py index a1055767f..e2b8ea4c8 100644 --- a/searx/engines/duckduckgo.py +++ b/searx/engines/duckduckgo.py @@ -5,8 +5,7 @@ from lxml.html import fromstring from json import loads -from searx.utils import extract_text, match_language, eval_xpath, dict_subset -from searx.network import get +from searx.utils import extract_text, match_language, eval_xpath # about about = { -- GitLab From b562bce5fbe1619fa5da45d0304820ddf779df6d Mon Sep 17 00:00:00 2001 From: nivesh Date: Fri, 11 Feb 2022 01:35:33 +0530 Subject: [PATCH 11/20] inital version of safe search for ddg engine --- searx/engines/duckduckgo.py | 116 +++++++++++++++++++++++------------- 1 file changed, 75 insertions(+), 41 deletions(-) diff --git a/searx/engines/duckduckgo.py b/searx/engines/duckduckgo.py index e2b8ea4c8..80d18c80d 100644 --- a/searx/engines/duckduckgo.py +++ b/searx/engines/duckduckgo.py @@ -6,7 +6,10 @@ from lxml.html import fromstring from json import loads from searx.utils import extract_text, match_language, eval_xpath +from searx import logger +import re +logger = logger.getChild('ddg engine') # about about = { "website": 'https://duckduckgo.com/', @@ -23,7 +26,7 @@ paging = False supported_languages_url = 'https://duckduckgo.com/util/u172.js' time_range_support = True safesearch = True - +VQD_REGEX = r"vqd='(\d+-\d+-\d+)'/"; language_aliases = { 'ar-SA': 'ar-XA', 'es-419': 'es-XL', @@ -35,21 +38,14 @@ language_aliases = { } # search-url -url = 'https://duckduckgo.com/?q={}' +url = 'https://links.duckduckgo.com/d.js?' + url_ping = 'https://duckduckgo.com/t/sl_h' time_range_dict = {'day': 'd', 'week': 'w', 'month': 'm', 'year': 'y'} -# specific xpath variables -result_xpath = '//div[@class="links_main links_deep result__body"]' # noqa -url_xpath = '//a[@class="result__snippet"]/@href' -title_xpath = '//a[@class="result__a"]' -content_xpath = '//a[@class="result__snippet"]' -correction_xpath = '//a[@id="js-spelling-recourse-link"]' - - # match query's language to a region code that duckduckgo will accept def get_region_code(lang, lang_list=None): if lang == 'all': @@ -61,34 +57,65 @@ def get_region_code(lang, lang_list=None): # country code goes first return lang_parts[1].lower() + '-' + lang_parts[0].lower() +# def get_vqd(query): +# resp = requests.get def request(query, params): if params['time_range'] is not None and params['time_range'] not in time_range_dict: return params - params['url'] = url.format(query) params['method'] = 'GET' - params['data']['q'] = query - params['data']['b'] = '' - - safesearch_ddg_value = None - if params['safesearch'] == 0: - safesearch_ddg_value = -2 # OFF - if params['safesearch'] == 2: - safesearch_ddg_value = 1 # STRICT - if safesearch_ddg_value is not None: - params['cookies']['p'] = str(safesearch_ddg_value) - - region_code = get_region_code(params['language'], supported_languages) - if region_code: - params['data']['kl'] = region_code - params['cookies']['kl'] = region_code - - if params['time_range'] in time_range_dict: - params['data']['df'] = time_range_dict[params['time_range']] + logger.debug(params) + + query_dict = { + "q": query, + 't': 'D', + 'l': params["language"], + 'kl': get_region_code(params["language"]), + 's': 0, # TODO + 'dl': 'en', + 'ct': 'US', + 'ss_mkt': get_region_code(params["language"]), + 'df': params['time_range'], + 'vqd' : "3-126340648549743517691069464246778236175-203846832012815914858366468471688211061", + 'ex': -2, + 'sp': '1', + 'bpa': '1', + 'biaexp': 'b', + 'msvrtexp': 'b' + } + if params['safesearch'] == 2: # STRICT + del query_dict['t'] + query_dict['p'] = 1 + query_dict.update({ + 'videxp': 'a', + 'nadse': 'b', + 'eclsexp': 'a', + 'stiaexp': 'a', + 'tjsexp': 'b', + 'related': 'b', + 'msnexp': 'a' + }) + elif params['safesearch'] == 1: # MODERATE + query_dict['ex'] = -1 + query_dict.update({ + 'nadse': 'b', + 'eclsexp': 'b', + 'tjsexp': 'b' + }) + else: # OFF + query_dict['ex'] = -2 + query_dict.update({ + 'nadse': 'b', + 'eclsexp': 'b', + 'tjsexp': 'b' + }) params['allow_redirects'] = False + params["data"] = query_dict + params["url"] = url + logger.debug(params) return params @@ -101,22 +128,29 @@ def response(resp): results = [] doc = fromstring(resp.text) - - titles = eval_xpath(doc, title_xpath) - contents = eval_xpath(doc, content_xpath) - urls = eval_xpath(doc, url_xpath) - - for title, content, url in zip(titles, contents, urls): - results.append({'title': extract_text(title), - 'content': extract_text(content), - 'url': url}) + data = re.findall(r"DDG\.pageLayout\.load\('d',(\[.+\])\);DDG\.duckbar\.load\('images'", str(resp.text)) + search_data = loads(data[0].replace('/\t/g', ' ')) + + if len(search_data) == 1 and ('n' not in search_data[0]): + only_result = search_data[0] + if ((only_result.get("da") is not None and only_result.get("t") == 'EOF') or only_result.get('a') is not None or only_result.get('d') == 'google.com search'): + return + + + for search_result in search_data: + if 'n' in search_result: + continue + results.append({'title': search_result.get("t"), + 'content': extract_text(search_result.get('a')), + 'url': search_result.get('u')}) # parse correction - for correction in eval_xpath(doc, correction_xpath): - # append correction - results.append({'correction': extract_text(correction)}) + # for correction in eval_xpath(doc, correction_xpath): + # # append correction + # results.append({'correction': extract_text(correction)}) # return results + logger.debug(results) return results -- GitLab From c7d28ead4bde5293720a1d8f7749347a9df95b6e Mon Sep 17 00:00:00 2001 From: nivesh Date: Tue, 15 Feb 2022 01:14:01 +0530 Subject: [PATCH 12/20] remove debug logs --- searx/engines/duckduckgo.py | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/searx/engines/duckduckgo.py b/searx/engines/duckduckgo.py index 80d18c80d..f52edf22a 100644 --- a/searx/engines/duckduckgo.py +++ b/searx/engines/duckduckgo.py @@ -8,6 +8,7 @@ from json import loads from searx.utils import extract_text, match_language, eval_xpath from searx import logger import re +import httpx logger = logger.getChild('ddg engine') # about @@ -26,7 +27,7 @@ paging = False supported_languages_url = 'https://duckduckgo.com/util/u172.js' time_range_support = True safesearch = True -VQD_REGEX = r"vqd='(\d+-\d+-\d+)'/"; +VQD_REGEX = r"vqd='(\d+-\d+-\d+)'"; language_aliases = { 'ar-SA': 'ar-XA', 'es-419': 'es-XL', @@ -57,8 +58,10 @@ def get_region_code(lang, lang_list=None): # country code goes first return lang_parts[1].lower() + '-' + lang_parts[0].lower() -# def get_vqd(query): -# resp = requests.get +def get_vqd(query): + resp = httpx.get(f"https://duckduckgo.com/?q={query}&ia=web") + resp = re.findall(VQD_REGEX, resp.text) + return resp[0] def request(query, params): if params['time_range'] is not None and params['time_range'] not in time_range_dict: @@ -66,8 +69,7 @@ def request(query, params): params['method'] = 'GET' - logger.debug(params) - + vqd = get_vqd(query) query_dict = { "q": query, 't': 'D', @@ -78,7 +80,7 @@ def request(query, params): 'ct': 'US', 'ss_mkt': get_region_code(params["language"]), 'df': params['time_range'], - 'vqd' : "3-126340648549743517691069464246778236175-203846832012815914858366468471688211061", + 'vqd' : vqd, 'ex': -2, 'sp': '1', 'bpa': '1', @@ -115,7 +117,6 @@ def request(query, params): params['allow_redirects'] = False params["data"] = query_dict params["url"] = url - logger.debug(params) return params @@ -149,8 +150,6 @@ def response(resp): # # append correction # results.append({'correction': extract_text(correction)}) - # return results - logger.debug(results) return results -- GitLab From 085f08bd2b6ba634cec05386750b7ff971783dfb Mon Sep 17 00:00:00 2001 From: Nivesh Date: Mon, 28 Feb 2022 15:40:30 +0530 Subject: [PATCH 13/20] fix pep8 --- searx/engines/duckduckgo.py | 57 +++++++++++++++++-------------------- 1 file changed, 26 insertions(+), 31 deletions(-) diff --git a/searx/engines/duckduckgo.py b/searx/engines/duckduckgo.py index f52edf22a..4a109d2c8 100644 --- a/searx/engines/duckduckgo.py +++ b/searx/engines/duckduckgo.py @@ -3,9 +3,8 @@ DuckDuckGo (Web) """ -from lxml.html import fromstring from json import loads -from searx.utils import extract_text, match_language, eval_xpath +from searx.utils import extract_text, match_language from searx import logger import re import httpx @@ -27,7 +26,7 @@ paging = False supported_languages_url = 'https://duckduckgo.com/util/u172.js' time_range_support = True safesearch = True -VQD_REGEX = r"vqd='(\d+-\d+-\d+)'"; +VQD_REGEX = r"vqd='(\d+-\d+-\d+)'" language_aliases = { 'ar-SA': 'ar-XA', 'es-419': 'es-XL', @@ -47,6 +46,7 @@ time_range_dict = {'day': 'd', 'month': 'm', 'year': 'y'} + # match query's language to a region code that duckduckgo will accept def get_region_code(lang, lang_list=None): if lang == 'all': @@ -58,11 +58,13 @@ def get_region_code(lang, lang_list=None): # country code goes first return lang_parts[1].lower() + '-' + lang_parts[0].lower() + def get_vqd(query): resp = httpx.get(f"https://duckduckgo.com/?q={query}&ia=web") resp = re.findall(VQD_REGEX, resp.text) return resp[0] + def request(query, params): if params['time_range'] is not None and params['time_range'] not in time_range_dict: return params @@ -75,43 +77,43 @@ def request(query, params): 't': 'D', 'l': params["language"], 'kl': get_region_code(params["language"]), - 's': 0, # TODO + 's': 0, 'dl': 'en', 'ct': 'US', 'ss_mkt': get_region_code(params["language"]), 'df': params['time_range'], - 'vqd' : vqd, + 'vqd': vqd, 'ex': -2, 'sp': '1', 'bpa': '1', 'biaexp': 'b', 'msvrtexp': 'b' } - if params['safesearch'] == 2: # STRICT + if params['safesearch'] == 2: # STRICT del query_dict['t'] query_dict['p'] = 1 query_dict.update({ - 'videxp': 'a', - 'nadse': 'b', - 'eclsexp': 'a', - 'stiaexp': 'a', - 'tjsexp': 'b', - 'related': 'b', - 'msnexp': 'a' - }) - elif params['safesearch'] == 1: # MODERATE + 'videxp': 'a', + 'nadse': 'b', + 'eclsexp': 'a', + 'stiaexp': 'a', + 'tjsexp': 'b', + 'related': 'b', + 'msnexp': 'a' + }) + elif params['safesearch'] == 1: # MODERATE query_dict['ex'] = -1 query_dict.update({ - 'nadse': 'b', - 'eclsexp': 'b', - 'tjsexp': 'b' + 'nadse': 'b', + 'eclsexp': 'b', + 'tjsexp': 'b' }) - else: # OFF + else: # OFF query_dict['ex'] = -2 query_dict.update({ - 'nadse': 'b', - 'eclsexp': 'b', - 'tjsexp': 'b' + 'nadse': 'b', + 'eclsexp': 'b', + 'tjsexp': 'b' }) params['allow_redirects'] = False @@ -128,15 +130,14 @@ def response(resp): # parse the response results = [] - doc = fromstring(resp.text) data = re.findall(r"DDG\.pageLayout\.load\('d',(\[.+\])\);DDG\.duckbar\.load\('images'", str(resp.text)) search_data = loads(data[0].replace('/\t/g', ' ')) if len(search_data) == 1 and ('n' not in search_data[0]): only_result = search_data[0] - if ((only_result.get("da") is not None and only_result.get("t") == 'EOF') or only_result.get('a') is not None or only_result.get('d') == 'google.com search'): + if ((only_result.get("da") is not None and only_result.get("t") == 'EOF') or + only_result.get('a') is not None or only_result.get('d') == 'google.com search'): return - for search_result in search_data: if 'n' in search_result: @@ -144,12 +145,6 @@ def response(resp): results.append({'title': search_result.get("t"), 'content': extract_text(search_result.get('a')), 'url': search_result.get('u')}) - - # parse correction - # for correction in eval_xpath(doc, correction_xpath): - # # append correction - # results.append({'correction': extract_text(correction)}) - return results -- GitLab From 594403440528040d89fccba5cbf0e851472ac7cf Mon Sep 17 00:00:00 2001 From: nivesh Date: Thu, 17 Mar 2022 18:40:26 +0530 Subject: [PATCH 14/20] fix paging and offset in ddg engine --- searx/engines/duckduckgo.py | 5 +++-- searx/engines/jstest.js | 24 ++++++++++++++++++++++++ 2 files changed, 27 insertions(+), 2 deletions(-) create mode 100644 searx/engines/jstest.js diff --git a/searx/engines/duckduckgo.py b/searx/engines/duckduckgo.py index 70f1a8f65..876959dcd 100644 --- a/searx/engines/duckduckgo.py +++ b/searx/engines/duckduckgo.py @@ -22,8 +22,9 @@ about = { # engine dependent config categories = ['general'] -paging = False +paging = True supported_languages_url = 'https://duckduckgo.com/util/u172.js' +number_of_results = 10 time_range_support = True safesearch = True VQD_REGEX = r"vqd='(\d+-\d+-\d+)'" @@ -86,7 +87,7 @@ def request(query, params): 't': 'D', 'l': params["language"], 'kl': get_region_code(params["language"]), - 's': 0, + 's': (params['pageno'] - 1) * number_of_results, 'dl': 'en', 'ct': 'US', 'ss_mkt': get_region_code(params["language"]), diff --git a/searx/engines/jstest.js b/searx/engines/jstest.js new file mode 100644 index 000000000..46fccaf2b --- /dev/null +++ b/searx/engines/jstest.js @@ -0,0 +1,24 @@ +const DDG = require('duck-duck-scrape'); +// let x = DDG.search('fuck', { +// safeSearch: DDG.SafeSearchType.STRICT +// }).then((data) => { +// data["results"].forEach(element => { +// console.log(element.title, 'STRICT') +// }); +// }); + +// let y = DDG.search('fuck', { +// safeSearch: DDG.SafeSearchType.MODERATE +// }).then((data) => { +// data["results"].forEach(element => { +// console.log(element.title, 'MODERATE') +// }); +// }); + +let z = DDG.search('fuck', { + safeSearch: DDG.SafeSearchType.OFF +}).then((data) => { + data["results"].forEach(element => { + console.log(element.title, 'OFF') + }); +}); -- GitLab From f2fc86f49ba23799ade566964cd050e9a7bc80be Mon Sep 17 00:00:00 2001 From: nivesh Date: Thu, 17 Mar 2022 18:40:51 +0530 Subject: [PATCH 15/20] remove test file --- searx/engines/jstest.js | 24 ------------------------ 1 file changed, 24 deletions(-) delete mode 100644 searx/engines/jstest.js diff --git a/searx/engines/jstest.js b/searx/engines/jstest.js deleted file mode 100644 index 46fccaf2b..000000000 --- a/searx/engines/jstest.js +++ /dev/null @@ -1,24 +0,0 @@ -const DDG = require('duck-duck-scrape'); -// let x = DDG.search('fuck', { -// safeSearch: DDG.SafeSearchType.STRICT -// }).then((data) => { -// data["results"].forEach(element => { -// console.log(element.title, 'STRICT') -// }); -// }); - -// let y = DDG.search('fuck', { -// safeSearch: DDG.SafeSearchType.MODERATE -// }).then((data) => { -// data["results"].forEach(element => { -// console.log(element.title, 'MODERATE') -// }); -// }); - -let z = DDG.search('fuck', { - safeSearch: DDG.SafeSearchType.OFF -}).then((data) => { - data["results"].forEach(element => { - console.log(element.title, 'OFF') - }); -}); -- GitLab From ff1b14438485f7ce4a25fc7ac27edf2bc763a38e Mon Sep 17 00:00:00 2001 From: nivesh Date: Thu, 17 Mar 2022 18:57:16 +0530 Subject: [PATCH 16/20] fix lint and pep issues --- searx/engines/duckduckgo.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/searx/engines/duckduckgo.py b/searx/engines/duckduckgo.py index 876959dcd..6b8f763c3 100644 --- a/searx/engines/duckduckgo.py +++ b/searx/engines/duckduckgo.py @@ -53,6 +53,7 @@ time_range_dict = {'day': 'd', 'month': 'm', 'year': 'y'} + # match query's language to a region code that duckduckgo will accept def get_region_code(lang, lang_list=None): if lang == 'all': @@ -64,10 +65,6 @@ def get_region_code(lang, lang_list=None): # country code goes first return lang_parts[1].lower() + '-' + lang_parts[0].lower() -def get_vqd(query): - resp = httpx.get(f"https://duckduckgo.com/?q={query}&ia=web") - resp = re.findall(VQD_REGEX, resp.text) - return resp[0] def get_vqd(query): resp = httpx.get(f"https://duckduckgo.com/?q={query}&ia=web") -- GitLab From ce6700dfb53081d7d2156327d70220f447df489d Mon Sep 17 00:00:00 2001 From: Nivesh Krishna Date: Wed, 23 Mar 2022 16:42:43 +0530 Subject: [PATCH 17/20] fix language selection in ddg engine --- searx/engines/duckduckgo.py | 34 +++++++++++++++++++++------------- 1 file changed, 21 insertions(+), 13 deletions(-) diff --git a/searx/engines/duckduckgo.py b/searx/engines/duckduckgo.py index 6b8f763c3..12cbae6ab 100644 --- a/searx/engines/duckduckgo.py +++ b/searx/engines/duckduckgo.py @@ -4,10 +4,11 @@ """ from json import loads -from searx.utils import extract_text, match_language +from urllib.parse import urlencode +from searx.utils import match_language, HTMLTextExtractor from searx import logger import re -import httpx +from searx.network import get logger = logger.getChild('ddg engine') # about @@ -66,8 +67,8 @@ def get_region_code(lang, lang_list=None): return lang_parts[1].lower() + '-' + lang_parts[0].lower() -def get_vqd(query): - resp = httpx.get(f"https://duckduckgo.com/?q={query}&ia=web") +def get_vqd(query, headers): + resp = get(f"https://duckduckgo.com/?q={query}&ia=web", headers=headers) resp = re.findall(VQD_REGEX, resp.text) return resp[0] @@ -78,16 +79,17 @@ def request(query, params): params['method'] = 'GET' - vqd = get_vqd(query) + vqd = get_vqd(query, params["headers"]) + dl,ct = match_language(params["language"], supported_languages, language_aliases, 'wt-WT').split("-") query_dict = { "q": query, 't': 'D', 'l': params["language"], - 'kl': get_region_code(params["language"]), + 'kl': f"{ct}-{dl}", 's': (params['pageno'] - 1) * number_of_results, - 'dl': 'en', - 'ct': 'US', - 'ss_mkt': get_region_code(params["language"]), + 'dl': dl, + 'ct': ct, + 'ss_mkt': get_region_code(params["language"], supported_languages), 'df': params['time_range'], 'vqd': vqd, 'ex': -2, @@ -125,7 +127,8 @@ def request(query, params): params['allow_redirects'] = False params["data"] = query_dict - params["url"] = url + params['cookies']['kl'] = params["data"]["kl"] + params["url"] = url + urlencode(params["data"]) return params @@ -138,19 +141,24 @@ def response(resp): results = [] data = re.findall(r"DDG\.pageLayout\.load\('d',(\[.+\])\);DDG\.duckbar\.load\('images'", str(resp.text)) - search_data = loads(data[0].replace('/\t/g', ' ')) + try: + search_data = loads(data[0].replace('/\t/g', ' ')) + except IndexError: + return if len(search_data) == 1 and ('n' not in search_data[0]): only_result = search_data[0] if ((only_result.get("da") is not None and only_result.get("t") == 'EOF') or only_result.get('a') is not None or only_result.get('d') == 'google.com search'): return - + for search_result in search_data: if 'n' in search_result: continue + html2text = HTMLTextExtractor() + html2text.feed(search_result.get('a')) results.append({'title': search_result.get("t"), - 'content': extract_text(search_result.get('a')), + 'content': html2text.get_text(), 'url': search_result.get('u')}) return results -- GitLab From 37e5148b535b97f686eb7b04d4559cd6b52c3f4c Mon Sep 17 00:00:00 2001 From: Nivesh Krishna Date: Wed, 23 Mar 2022 16:48:27 +0530 Subject: [PATCH 18/20] fix time range in ddg engine --- searx/engines/duckduckgo.py | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/searx/engines/duckduckgo.py b/searx/engines/duckduckgo.py index 12cbae6ab..f6ec4385a 100644 --- a/searx/engines/duckduckgo.py +++ b/searx/engines/duckduckgo.py @@ -41,13 +41,6 @@ language_aliases = { # search-url url = 'https://links.duckduckgo.com/d.js?' - -url_ping = 'https://duckduckgo.com/t/sl_h' -time_range_dict = {'day': 'd', - 'week': 'w', - 'month': 'm', - 'year': 'y'} - url_ping = 'https://duckduckgo.com/t/sl_h' time_range_dict = {'day': 'd', 'week': 'w', @@ -74,8 +67,6 @@ def get_vqd(query, headers): def request(query, params): - if params['time_range'] is not None and params['time_range'] not in time_range_dict: - return params params['method'] = 'GET' @@ -128,6 +119,9 @@ def request(query, params): params['allow_redirects'] = False params["data"] = query_dict params['cookies']['kl'] = params["data"]["kl"] + if params['time_range'] in time_range_dict: + params['data']['df'] = time_range_dict[params['time_range']] + params['cookies']['df'] = time_range_dict[params['time_range']] params["url"] = url + urlencode(params["data"]) return params -- GitLab From 30665b6aed7d642aa292fd84c2441d4d4113dda2 Mon Sep 17 00:00:00 2001 From: Nivesh Krishna Date: Wed, 23 Mar 2022 18:28:04 +0530 Subject: [PATCH 19/20] fix pep8 issues --- searx/engines/duckduckgo.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/searx/engines/duckduckgo.py b/searx/engines/duckduckgo.py index f6ec4385a..8878522f0 100644 --- a/searx/engines/duckduckgo.py +++ b/searx/engines/duckduckgo.py @@ -71,7 +71,7 @@ def request(query, params): params['method'] = 'GET' vqd = get_vqd(query, params["headers"]) - dl,ct = match_language(params["language"], supported_languages, language_aliases, 'wt-WT').split("-") + dl, ct = match_language(params["language"], supported_languages, language_aliases, 'wt-WT').split("-") query_dict = { "q": query, 't': 'D', @@ -118,7 +118,7 @@ def request(query, params): params['allow_redirects'] = False params["data"] = query_dict - params['cookies']['kl'] = params["data"]["kl"] + params['cookies']['kl'] = params["data"]["kl"] if params['time_range'] in time_range_dict: params['data']['df'] = time_range_dict[params['time_range']] params['cookies']['df'] = time_range_dict[params['time_range']] @@ -145,7 +145,7 @@ def response(resp): if ((only_result.get("da") is not None and only_result.get("t") == 'EOF') or only_result.get('a') is not None or only_result.get('d') == 'google.com search'): return - + for search_result in search_data: if 'n' in search_result: continue -- GitLab From 4faebacd7b92d5cb6b8acca636894a404da90d4e Mon Sep 17 00:00:00 2001 From: Nivesh Krishna Date: Wed, 23 Mar 2022 19:50:51 +0530 Subject: [PATCH 20/20] remove ddg lite from default engines --- searx/settings.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/searx/settings.yml b/searx/settings.yml index e8fe6aa06..255b41c7a 100644 --- a/searx/settings.yml +++ b/searx/settings.yml @@ -418,6 +418,7 @@ engines: - name : duckduckgo (lite) engine : duckduckgo_lite shortcut : ddgl + disabled : True - name : duckduckgo images engine : duckduckgo_images -- GitLab