From 78556dd8f4a3b63b1d22866576482456675384a7 Mon Sep 17 00:00:00 2001 From: Israel Yago Pereira Date: Fri, 29 Oct 2021 17:02:14 -0300 Subject: [PATCH 1/5] WIP: onesearch engine --- searx/settings.yml | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/searx/settings.yml b/searx/settings.yml index d58b86035..f2c93386f 100644 --- a/searx/settings.yml +++ b/searx/settings.yml @@ -1631,6 +1631,22 @@ engines: require_api_key: false results: HTML + - name: onesearch + shortcut: onesearch + engine: xpath + paging: false + search_url: https://www.onesearch.com/yhs/search;?p={query} + url_xpath: //div[contains(@class, "algo")]//h3[contains(@class, "title")]/a/@href + title_xpath: //div[contains(@class, "algo")]//h3[contains(@class, "title")] + content_xpath: //div[contains(@class, "algo")]/div[contains(@class, "compText")]/p//text() + categories: general + about: + website: https://www.onesearch.com/ + wikidata_id: None + use_official_api: false + require_api_key: false + results: HTML + # Doku engine lets you access to any Doku wiki instance: # A public one or a private/corporate one. # - name : ubuntuwiki -- GitLab From fa8874b68e77578002f68d13b35faf7af87e0fa6 Mon Sep 17 00:00:00 2001 From: Israel Yago Pereira Date: Mon, 1 Nov 2021 16:17:01 -0300 Subject: [PATCH 2/5] Onesearch engine without pagination --- searx/engines/onesearch.py | 56 ++++++++++++++++++++++++++++++++++++++ searx/settings.yml | 6 +--- 2 files changed, 57 insertions(+), 5 deletions(-) create mode 100644 searx/engines/onesearch.py diff --git a/searx/engines/onesearch.py b/searx/engines/onesearch.py new file mode 100644 index 000000000..19fabe421 --- /dev/null +++ b/searx/engines/onesearch.py @@ -0,0 +1,56 @@ + +"""Onesearch +""" + +from lxml.html import fromstring + +import re + +from searx.utils import ( + eval_xpath, + extract_text, +) + +from urllib.parse import unquote + +# about +about = { + "website": 'https://www.onesearch.com/', + "wikidata_id": None, + "use_official_api": False, + "require_api_key": False, + "results": 'HTML', +} + +# engine dependent config +categories = ['general'] + +# search-url +URL = 'https://www.onesearch.com/yhs/search;?p=%s' + +def request(query, params): + params['url'] = URL % query + return params + + +# get response from search-request +def response(resp): + + results = [] + doc = fromstring(resp.text) + + titles_tags = eval_xpath(doc, '//div[contains(@class, "algo")]//h3[contains(@class, "title")]') + contents = eval_xpath(doc, '//div[contains(@class, "algo")]/div[contains(@class, "compText")]/p') + onesearch_urls = eval_xpath(doc, '//div[contains(@class, "algo")]//h3[contains(@class, "title")]/a/@href') + + for title_tag, content, onesearch_url in zip(titles_tags, contents, onesearch_urls): + print(f"{title_tag.text_content()} ---> {onesearch_url}") + matches = re.search(r'RU=(.*?)\/', onesearch_url) + results.append({ + 'title': title_tag.text_content(), + 'content': extract_text(content), + 'url': unquote(matches.group(1)), + }) + + return results + diff --git a/searx/settings.yml b/searx/settings.yml index f2c93386f..60084033a 100644 --- a/searx/settings.yml +++ b/searx/settings.yml @@ -1633,12 +1633,8 @@ engines: - name: onesearch shortcut: onesearch - engine: xpath + engine: onesearch paging: false - search_url: https://www.onesearch.com/yhs/search;?p={query} - url_xpath: //div[contains(@class, "algo")]//h3[contains(@class, "title")]/a/@href - title_xpath: //div[contains(@class, "algo")]//h3[contains(@class, "title")] - content_xpath: //div[contains(@class, "algo")]/div[contains(@class, "compText")]/p//text() categories: general about: website: https://www.onesearch.com/ -- GitLab From 6d1188dfcc2ae594f307f8bc60c0b010961edca1 Mon Sep 17 00:00:00 2001 From: Israel Yago Pereira Date: Mon, 1 Nov 2021 16:45:45 -0300 Subject: [PATCH 3/5] Fix code style --- searx/engines/onesearch.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/searx/engines/onesearch.py b/searx/engines/onesearch.py index 19fabe421..7e2f29136 100644 --- a/searx/engines/onesearch.py +++ b/searx/engines/onesearch.py @@ -28,6 +28,7 @@ categories = ['general'] # search-url URL = 'https://www.onesearch.com/yhs/search;?p=%s' + def request(query, params): params['url'] = URL % query return params @@ -53,4 +54,3 @@ def response(resp): }) return results - -- GitLab From 2a234ee100ad2cb83fb8c6459fc95899d545b0ca Mon Sep 17 00:00:00 2001 From: Israel Yago Pereira Date: Tue, 2 Nov 2021 13:41:20 -0300 Subject: [PATCH 4/5] Onesearch pagination --- searx/engines/onesearch.py | 6 ++++-- searx/settings.yml | 2 +- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/searx/engines/onesearch.py b/searx/engines/onesearch.py index 7e2f29136..17a009c21 100644 --- a/searx/engines/onesearch.py +++ b/searx/engines/onesearch.py @@ -24,13 +24,15 @@ about = { # engine dependent config categories = ['general'] +paging = True # search-url -URL = 'https://www.onesearch.com/yhs/search;?p=%s' +URL = 'https://www.onesearch.com/yhs/search;?p=%s&b=%d' def request(query, params): - params['url'] = URL % query + starting_from = (params['pageno'] * 10) - 9 + params['url'] = URL % (query, starting_from) return params diff --git a/searx/settings.yml b/searx/settings.yml index 60084033a..01d00df75 100644 --- a/searx/settings.yml +++ b/searx/settings.yml @@ -1634,7 +1634,7 @@ engines: - name: onesearch shortcut: onesearch engine: onesearch - paging: false + paging: true categories: general about: website: https://www.onesearch.com/ -- GitLab From 4ab9199056d82455d407d75ae62fdb9a6d9c16a9 Mon Sep 17 00:00:00 2001 From: Israel Yago Pereira Date: Thu, 11 Nov 2021 08:54:35 -0300 Subject: [PATCH 5/5] Fix onesearch logs --- searx/engines/onesearch.py | 1 - 1 file changed, 1 deletion(-) diff --git a/searx/engines/onesearch.py b/searx/engines/onesearch.py index 17a009c21..dc3cc1af0 100644 --- a/searx/engines/onesearch.py +++ b/searx/engines/onesearch.py @@ -47,7 +47,6 @@ def response(resp): onesearch_urls = eval_xpath(doc, '//div[contains(@class, "algo")]//h3[contains(@class, "title")]/a/@href') for title_tag, content, onesearch_url in zip(titles_tags, contents, onesearch_urls): - print(f"{title_tag.text_content()} ---> {onesearch_url}") matches = re.search(r'RU=(.*?)\/', onesearch_url) results.append({ 'title': title_tag.text_content(), -- GitLab