Loading Makefile +2 −1 Original line number Diff line number Diff line Loading @@ -223,6 +223,7 @@ test.pylint: pyenvinstall $(call cmd,pylint,\ searx/preferences.py \ searx/testing.py \ searx/engines/gigablast.py \ ) endif Loading @@ -243,7 +244,7 @@ test.sh: test.pep8: pyenvinstall @echo "TEST pep8" $(Q)$(PY_ENV_ACT); pep8 --exclude=searx/static --max-line-length=120 --ignore "E402,W503" searx tests $(Q)$(PY_ENV_ACT); pep8 --exclude='searx/static, searx/engines/gigablast.py' --max-line-length=120 --ignore "E402,W503" searx tests test.unit: pyenvinstall @echo "TEST tests/unit" Loading searx/engines/gigablast.py +82 −85 Original line number Diff line number Diff line # SPDX-License-Identifier: AGPL-3.0-or-later """ Gigablast (Web) Loading @@ -9,121 +10,117 @@ @stable yes @parse url, title, content """ # pylint: disable=missing-function-docstring, invalid-name import random import re from json import loads from time import time from lxml.html import fromstring from searx.poolrequests import get # from searx import logger from searx.url_utils import urlencode from searx.utils import eval_xpath from searx.poolrequests import get # engine dependent config categories = ['general'] paging = True number_of_results = 10 # gigablast's pagination is totally damaged, don't use it paging = False language_support = True safesearch = True # search-url base_url = 'https://gigablast.com/' search_string = 'search?{query}'\ '&n={number_of_results}'\ '&c=main'\ '&s={offset}'\ '&format=json'\ '&langcountry={lang}'\ '&ff={safesearch}'\ '&rand={rxikd}' # specific xpath variables results_xpath = '//response//result' url_xpath = './/url' title_xpath = './/title' content_xpath = './/sum' supported_languages_url = 'https://gigablast.com/search?&rxikd=1' extra_param = '' # gigablast requires a random extra parameter # which can be extracted from the source code of the search page base_url = 'https://gigablast.com' def parse_extra_param(text): global extra_param param_lines = [x for x in text.splitlines() if x.startswith('var url=') or x.startswith('url=url+')] # ugly hack: gigablast requires a random extra parameter which can be extracted # from the source code of the gigablast HTTP client extra_param = '' for l in param_lines: extra_param += l.split("'")[1] extra_param = extra_param.split('&')[-1] extra_param_path='/search?c=main&qlangcountry=en-us&q=south&s=10' def parse_extra_param(text): def init(engine_settings=None): parse_extra_param(get('http://gigablast.com/search?c=main&qlangcountry=en-us&q=south&s=10').text) # example: # # var uxrl='/search?c=main&qlangcountry=en-us&q=south&s=10&rand=1590740241635&n'; # uxrl=uxrl+'sab=730863287'; # # extra_param --> "rand=1590740241635&nsab=730863287" global extra_param # pylint: disable=global-statement re_var= None for line in text.splitlines(): if re_var is None and extra_param_path in line: var = line.split("=")[0].split()[1] # e.g. var --> 'uxrl' re_var = re.compile(var + "\\s*=\\s*" + var + "\\s*\\+\\s*'" + "(.*)" + "'(.*)") extra_param = line.split("'")[1][len(extra_param_path):] continue if re_var is not None and re_var.search(line): extra_param += re_var.search(line).group(1) break # logger.debug('gigablast extra_param="%s"', extra_param) def init(engine_settings=None): # pylint: disable=unused-argument parse_extra_param(get(base_url + extra_param_path).text) # do search-request def request(query, params): print("EXTRAPARAM:", extra_param) offset = (params['pageno'] - 1) * number_of_results def request(query, params): # pylint: disable=unused-argument if params['language'] == 'all': language = 'xx' else: language = params['language'].replace('-', '_').lower() if language.split('-')[0] != 'zh': language = language.split('-')[0] # see API http://www.gigablast.com/api.html#/search # Take into account, that the API has some quirks .. if params['safesearch'] >= 1: safesearch = 1 else: safesearch = 0 query_args = dict( c = 'main' , format = 'json' , q = query , dr = 1 , showgoodimages = 0 ) if params['language'] and params['language'] != 'all': query_args['qlangcountry'] = params['language'] query_args['qlang'] = params['language'].split('-')[0] # rxieu is some kind of hash from the search query, but accepts random atm search_path = search_string.format(query=urlencode({'q': query}), offset=offset, number_of_results=number_of_results, lang=language, rxikd=int(time() * 1000), safesearch=safesearch) if params['safesearch'] >= 1: query_args['ff'] = 1 params['url'] = base_url + search_path + '&' + extra_param search_url = '/search?' + urlencode(query_args) params['url'] = base_url + search_url + extra_param return params # get response from search-request def response(resp): results = [] # parse results try: response_json = loads(resp.text) except: parse_extra_param(resp.text) raise Exception('extra param expired, please reload') # logger.debug('gigablast returns %s results', len(response_json['results'])) for result in response_json['results']: # append result results.append({'url': result['url'], 'title': result['title'], 'content': result['sum']}) # see "Example JSON Output (&format=json)" # at http://www.gigablast.com/api.html#/search # return results return results # sort out meaningless result title = result.get('title') if len(title) < 2: continue url = result.get('url') if len(url) < 9: continue content = result.get('sum') if len(content) < 5: continue # get supported languages from their site def _fetch_supported_languages(resp): supported_languages = [] dom = fromstring(resp.text) links = eval_xpath(dom, '//span[@id="menu2"]/a') for link in links: href = eval_xpath(link, './@href')[0].split('lang%3A') if len(href) == 2: code = href[1].split('_') if len(code) == 2: code = code[0] + '-' + code[1].upper() else: code = code[0] supported_languages.append(code) return supported_languages # extend fields subtitle = result.get('title') if len(subtitle) > 3 and subtitle != title: title += " - " + subtitle results.append(dict( url = url , title = title , content = content )) return results utils/fetch_languages.py +2 −2 Original line number Diff line number Diff line Loading @@ -28,10 +28,10 @@ def fetch_supported_languages(): names.sort() for engine_name in names: print("fetching languages of engine %s" % engine_name) if hasattr(engines[engine_name], 'fetch_supported_languages'): engines_languages[engine_name] = engines[engine_name].fetch_supported_languages() print("fetched %s languages from engine %s" % ( len(engines_languages[engine_name]), engine_name)) if type(engines_languages[engine_name]) == list: engines_languages[engine_name] = sorted(engines_languages[engine_name]) Loading Loading
Makefile +2 −1 Original line number Diff line number Diff line Loading @@ -223,6 +223,7 @@ test.pylint: pyenvinstall $(call cmd,pylint,\ searx/preferences.py \ searx/testing.py \ searx/engines/gigablast.py \ ) endif Loading @@ -243,7 +244,7 @@ test.sh: test.pep8: pyenvinstall @echo "TEST pep8" $(Q)$(PY_ENV_ACT); pep8 --exclude=searx/static --max-line-length=120 --ignore "E402,W503" searx tests $(Q)$(PY_ENV_ACT); pep8 --exclude='searx/static, searx/engines/gigablast.py' --max-line-length=120 --ignore "E402,W503" searx tests test.unit: pyenvinstall @echo "TEST tests/unit" Loading
searx/engines/gigablast.py +82 −85 Original line number Diff line number Diff line # SPDX-License-Identifier: AGPL-3.0-or-later """ Gigablast (Web) Loading @@ -9,121 +10,117 @@ @stable yes @parse url, title, content """ # pylint: disable=missing-function-docstring, invalid-name import random import re from json import loads from time import time from lxml.html import fromstring from searx.poolrequests import get # from searx import logger from searx.url_utils import urlencode from searx.utils import eval_xpath from searx.poolrequests import get # engine dependent config categories = ['general'] paging = True number_of_results = 10 # gigablast's pagination is totally damaged, don't use it paging = False language_support = True safesearch = True # search-url base_url = 'https://gigablast.com/' search_string = 'search?{query}'\ '&n={number_of_results}'\ '&c=main'\ '&s={offset}'\ '&format=json'\ '&langcountry={lang}'\ '&ff={safesearch}'\ '&rand={rxikd}' # specific xpath variables results_xpath = '//response//result' url_xpath = './/url' title_xpath = './/title' content_xpath = './/sum' supported_languages_url = 'https://gigablast.com/search?&rxikd=1' extra_param = '' # gigablast requires a random extra parameter # which can be extracted from the source code of the search page base_url = 'https://gigablast.com' def parse_extra_param(text): global extra_param param_lines = [x for x in text.splitlines() if x.startswith('var url=') or x.startswith('url=url+')] # ugly hack: gigablast requires a random extra parameter which can be extracted # from the source code of the gigablast HTTP client extra_param = '' for l in param_lines: extra_param += l.split("'")[1] extra_param = extra_param.split('&')[-1] extra_param_path='/search?c=main&qlangcountry=en-us&q=south&s=10' def parse_extra_param(text): def init(engine_settings=None): parse_extra_param(get('http://gigablast.com/search?c=main&qlangcountry=en-us&q=south&s=10').text) # example: # # var uxrl='/search?c=main&qlangcountry=en-us&q=south&s=10&rand=1590740241635&n'; # uxrl=uxrl+'sab=730863287'; # # extra_param --> "rand=1590740241635&nsab=730863287" global extra_param # pylint: disable=global-statement re_var= None for line in text.splitlines(): if re_var is None and extra_param_path in line: var = line.split("=")[0].split()[1] # e.g. var --> 'uxrl' re_var = re.compile(var + "\\s*=\\s*" + var + "\\s*\\+\\s*'" + "(.*)" + "'(.*)") extra_param = line.split("'")[1][len(extra_param_path):] continue if re_var is not None and re_var.search(line): extra_param += re_var.search(line).group(1) break # logger.debug('gigablast extra_param="%s"', extra_param) def init(engine_settings=None): # pylint: disable=unused-argument parse_extra_param(get(base_url + extra_param_path).text) # do search-request def request(query, params): print("EXTRAPARAM:", extra_param) offset = (params['pageno'] - 1) * number_of_results def request(query, params): # pylint: disable=unused-argument if params['language'] == 'all': language = 'xx' else: language = params['language'].replace('-', '_').lower() if language.split('-')[0] != 'zh': language = language.split('-')[0] # see API http://www.gigablast.com/api.html#/search # Take into account, that the API has some quirks .. if params['safesearch'] >= 1: safesearch = 1 else: safesearch = 0 query_args = dict( c = 'main' , format = 'json' , q = query , dr = 1 , showgoodimages = 0 ) if params['language'] and params['language'] != 'all': query_args['qlangcountry'] = params['language'] query_args['qlang'] = params['language'].split('-')[0] # rxieu is some kind of hash from the search query, but accepts random atm search_path = search_string.format(query=urlencode({'q': query}), offset=offset, number_of_results=number_of_results, lang=language, rxikd=int(time() * 1000), safesearch=safesearch) if params['safesearch'] >= 1: query_args['ff'] = 1 params['url'] = base_url + search_path + '&' + extra_param search_url = '/search?' + urlencode(query_args) params['url'] = base_url + search_url + extra_param return params # get response from search-request def response(resp): results = [] # parse results try: response_json = loads(resp.text) except: parse_extra_param(resp.text) raise Exception('extra param expired, please reload') # logger.debug('gigablast returns %s results', len(response_json['results'])) for result in response_json['results']: # append result results.append({'url': result['url'], 'title': result['title'], 'content': result['sum']}) # see "Example JSON Output (&format=json)" # at http://www.gigablast.com/api.html#/search # return results return results # sort out meaningless result title = result.get('title') if len(title) < 2: continue url = result.get('url') if len(url) < 9: continue content = result.get('sum') if len(content) < 5: continue # get supported languages from their site def _fetch_supported_languages(resp): supported_languages = [] dom = fromstring(resp.text) links = eval_xpath(dom, '//span[@id="menu2"]/a') for link in links: href = eval_xpath(link, './@href')[0].split('lang%3A') if len(href) == 2: code = href[1].split('_') if len(code) == 2: code = code[0] + '-' + code[1].upper() else: code = code[0] supported_languages.append(code) return supported_languages # extend fields subtitle = result.get('title') if len(subtitle) > 3 and subtitle != title: title += " - " + subtitle results.append(dict( url = url , title = title , content = content )) return results
utils/fetch_languages.py +2 −2 Original line number Diff line number Diff line Loading @@ -28,10 +28,10 @@ def fetch_supported_languages(): names.sort() for engine_name in names: print("fetching languages of engine %s" % engine_name) if hasattr(engines[engine_name], 'fetch_supported_languages'): engines_languages[engine_name] = engines[engine_name].fetch_supported_languages() print("fetched %s languages from engine %s" % ( len(engines_languages[engine_name]), engine_name)) if type(engines_languages[engine_name]) == list: engines_languages[engine_name] = sorted(engines_languages[engine_name]) Loading