From 22863c3aa0f2d122072d5f6cd20297c79ca43865 Mon Sep 17 00:00:00 2001
From: Israel Yago Pereira <26233246+israelyago@users.noreply.github.com>
Date: Thu, 27 Jan 2022 15:21:41 -0300
Subject: [PATCH 1/3] Fix digg engine

---
 searx/engines/digg.py | 44 +++++++++++++++++++------------------------
 1 file changed, 19 insertions(+), 25 deletions(-)

diff --git a/searx/engines/digg.py b/searx/engines/digg.py
index defcacd20..00a72b3b6 100644
--- a/searx/engines/digg.py
+++ b/searx/engines/digg.py
@@ -9,6 +9,7 @@ from urllib.parse import urlencode
 from datetime import datetime
 
 from lxml import html
+from searx.utils import eval_xpath, extract_text
 
 # about
 about = {
@@ -22,48 +23,41 @@ about = {
 
 # engine dependent config
 categories = ['news', 'social media']
-paging = True
+paging = False
 base_url = 'https://digg.com'
 
 # search-url
 search_url = base_url + (
-    '/api/search/'
+    '/search'
     '?{query}'
-    '&from={position}'
-    '&size=20'
-    '&format=html'
 )
 
 def request(query, params):
-    offset = (params['pageno'] - 1) * 20
     params['url'] = search_url.format(
         query = urlencode({'q': query}),
-        position = offset,
     )
     return params
 
 def response(resp):
     results = []
 
-    # parse results
-    for result in loads(resp.text)['mapped']:
+    dom = html.fromstring(resp.text)
 
-        # strip html tags and superfluous quotation marks from content
-        content = html.document_fromstring(
-            result['excerpt']
-        ).text_content()
+    results_list = eval_xpath(dom, '//section[contains(@class, "search-results")]')
 
-        # 'created': {'ISO': '2020-10-16T14:09:55Z', ...}
-        published = datetime.strptime(
-            result['created']['ISO'], '%Y-%m-%dT%H:%M:%SZ'
-        )
-        results.append({
-            'url': result['url'],
-            'title': result['title'],
-            'content' : content,
-            'template': 'videos.html',
-            'publishedDate': published,
-            'thumbnail': result['images']['thumbImage'],
-        })
+    for result in results_list:
+
+        titles = eval_xpath(result, '//article//header//h2')
+        contents = eval_xpath(result, '//article//p')
+        urls = eval_xpath(result, '//header/a/@href')
+        published_dates = eval_xpath(result, '//article/div/div/time/@datetime')
+
+        for (title, content, url, published_date) in zip(titles, contents, urls, published_dates):
+            results.append({
+                'url': url,
+                'publishedDate': datetime.strptime(published_date, '%Y-%m-%dT%H:%M:%SZ'),
+                'title': extract_text(title),
+                'content' : extract_text(content),
+            })
 
     return results
--
GitLab

From 6f1e27cd0ee9694262fdb44b98218fb85aefb349 Mon Sep 17 00:00:00 2001
From: Israel Yago Pereira <26233246+israelyago@users.noreply.github.com>
Date: Thu, 27 Jan 2022 15:56:02 -0300
Subject: [PATCH 2/3] Remove unused import

---
 searx/engines/digg.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/searx/engines/digg.py b/searx/engines/digg.py
index 00a72b3b6..0fe021815 100644
--- a/searx/engines/digg.py
+++ b/searx/engines/digg.py
@@ -4,7 +4,6 @@
 """
 # pylint: disable=missing-function-docstring
 
-from json import loads
 from urllib.parse import urlencode
 from datetime import datetime
 
--
GitLab

From 7a833ad9f9a8c6a2281f34e168d06ca391907401 Mon Sep 17 00:00:00 2001
From: Israel Yago Pereira <26233246+israelyago@users.noreply.github.com>
Date: Fri, 28 Jan 2022 09:20:49 -0300
Subject: [PATCH 3/3] Adding digg paging

---
 searx/engines/digg.py | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/searx/engines/digg.py b/searx/engines/digg.py
index 0fe021815..4fde49116 100644
--- a/searx/engines/digg.py
+++ b/searx/engines/digg.py
@@ -22,18 +22,24 @@ about = {
 
 # engine dependent config
 categories = ['news', 'social media']
-paging = False
+paging = True
 base_url = 'https://digg.com'
+results_per_page = 10
 
 # search-url
 search_url = base_url + (
     '/search'
     '?{query}'
+    '&size={size}'
+    '&offset={offset}'
)
 
 def request(query, params):
+    offset = (params['pageno'] - 1) * results_per_page + 1
     params['url'] = search_url.format(
         query = urlencode({'q': query}),
+        size = results_per_page,
+        offset = offset,
     )
     return params
 
--
GitLab
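
Note: the XPath extraction introduced in PATCH 1/3 can be exercised outside searx with plain lxml. The sketch below is illustrative only: the sample markup is hypothetical and merely mirrors the structure the patch's XPath expressions expect, and searx's eval_xpath/extract_text helpers are replaced by their lxml equivalents.

    from datetime import datetime
    from lxml import html

    # hypothetical markup shaped like what the patch's XPath expects
    SAMPLE = '''
    <section class="search-results">
      <article>
        <header><a href="https://example.org/story"><h2>Story title</h2></a></header>
        <p>Story excerpt.</p>
        <div><div><time datetime="2022-01-28T12:00:00Z"></time></div></div>
      </article>
    </section>
    '''

    dom = html.fromstring(SAMPLE)
    for section in dom.xpath('//section[contains(@class, "search-results")]'):
        # same expressions as the patch; note '//' searches from the document root
        titles = section.xpath('//article//header//h2')
        contents = section.xpath('//article//p')
        urls = section.xpath('//header/a/@href')
        dates = section.xpath('//article/div/div/time/@datetime')
        # the patch pairs the four lists positionally with zip()
        for title, content, url, date in zip(titles, contents, urls, dates):
            print(url,
                  title.text_content().strip(),
                  content.text_content().strip(),
                  datetime.strptime(date, '%Y-%m-%dT%H:%M:%SZ'))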
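
Note: a minimal standalone sketch of the paging math from PATCH 3/3, showing the 1-based offset the patched request() sends; build_url and the sample query are hypothetical helpers for illustration, not part of the patch.

    from urllib.parse import urlencode

    base_url = 'https://digg.com'
    results_per_page = 10
    search_url = base_url + '/search?{query}&size={size}&offset={offset}'

    def build_url(query, pageno):
        # 1-based offset, as in the patched request():
        # pageno 1 -> offset 1, pageno 2 -> offset 11, pageno 3 -> offset 21
        offset = (pageno - 1) * results_per_page + 1
        return search_url.format(
            query=urlencode({'q': query}),
            size=results_per_page,
            offset=offset,
        )

    print(build_url('searx', 2))
    # -> https://digg.com/search?q=searx&size=10&offset=11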