searx/engines/__init__.py +1 −162

@@ -19,19 +19,12 @@ along with searx. If not, see < http://www.gnu.org/licenses/ >.

 from os.path import realpath, dirname, splitext, join
 import sys
 from imp import load_source
-from itertools import izip_longest, chain
-from operator import itemgetter
-from urlparse import urlparse, unquote
-from datetime import datetime
-import grequests
 from flask.ext.babel import gettext
+from operator import itemgetter
 from searx import settings
-from searx.utils import gen_useragent

 engine_dir = dirname(realpath(__file__))

-number_of_searches = 0
 engines = {}

 categories = {'general': []}

@@ -114,160 +107,6 @@ for engine_data in settings['engines']:
     engine_shortcuts[engine.shortcut] = engine.name
-
-
-def default_request_params():
-    return {
-        'method': 'GET', 'headers': {}, 'data': {}, 'url': '', 'cookies': {}}
-
-
-def make_callback(engine_name, results, suggestions, callback, params):
-    # creating a callback wrapper for the search engine results
-    def process_callback(response, **kwargs):
-        cb_res = []
-        response.search_params = params
-        engines[engine_name].stats['page_load_time'] += \
-            (datetime.now() - params['started']).total_seconds()
-        try:
-            search_results = callback(response)
-        except Exception, e:
-            engines[engine_name].stats['errors'] += 1
-            results[engine_name] = cb_res
-            print '[E] Error with engine "{0}":\n\t{1}'.format(
-                engine_name, str(e))
-            return
-        for result in search_results:
-            result['engine'] = engine_name
-            if 'suggestion' in result:
-                # TODO type checks
-                suggestions.add(result['suggestion'])
-                continue
-            cb_res.append(result)
-        results[engine_name] = cb_res
-    return process_callback
-
-
-def score_results(results):
-    flat_res = filter(
-        None, chain.from_iterable(izip_longest(*results.values())))
-    flat_len = len(flat_res)
-    engines_len = len(results)
-    results = []
-    # deduplication + scoring
-    for i, res in enumerate(flat_res):
-        res['parsed_url'] = urlparse(res['url'])
-        res['host'] = res['parsed_url'].netloc
-        if res['host'].startswith('www.'):
-            res['host'] = res['host'].replace('www.', '', 1)
-        res['engines'] = [res['engine']]
-        weight = 1.0
-        if hasattr(engines[res['engine']], 'weight'):
-            weight = float(engines[res['engine']].weight)
-        score = int((flat_len - i) / engines_len) * weight + 1
-        duplicated = False
-        for new_res in results:
-            p1 = res['parsed_url'].path[:-1] if res['parsed_url'].path.endswith('/') else res['parsed_url'].path  # noqa
-            p2 = new_res['parsed_url'].path[:-1] if new_res['parsed_url'].path.endswith('/') else new_res['parsed_url'].path  # noqa
-            if res['host'] == new_res['host'] and\
-               unquote(p1) == unquote(p2) and\
-               res['parsed_url'].query == new_res['parsed_url'].query and\
-               res.get('template') == new_res.get('template'):
-                duplicated = new_res
-                break
-        if duplicated:
-            if res.get('content') > duplicated.get('content'):
-                duplicated['content'] = res['content']
-            duplicated['score'] += score
-            duplicated['engines'].append(res['engine'])
-            if duplicated['parsed_url'].scheme == 'https':
-                continue
-            elif res['parsed_url'].scheme == 'https':
-                duplicated['url'] = res['parsed_url'].geturl()
-                duplicated['parsed_url'] = res['parsed_url']
-        else:
-            res['score'] = score
-            results.append(res)
-    return sorted(results, key=itemgetter('score'), reverse=True)
-
-
-def search(query, request, selected_engines, pageno=1, lang='all'):
-    global engines, categories, number_of_searches
-    requests = []
-    results = {}
-    suggestions = set()
-    number_of_searches += 1
-    #user_agent = request.headers.get('User-Agent', '')
-    user_agent = gen_useragent()
-
-    for selected_engine in selected_engines:
-        if selected_engine['name'] not in engines:
-            continue
-
-        engine = engines[selected_engine['name']]
-
-        if pageno > 1 and not engine.paging:
-            continue
-
-        if lang != 'all' and not engine.language_support:
-            continue
-
-        request_params = default_request_params()
-        request_params['headers']['User-Agent'] = user_agent
-        request_params['category'] = selected_engine['category']
-        request_params['started'] = datetime.now()
-        request_params['pageno'] = pageno
-        request_params['language'] = lang
-        request_params = engine.request(query.encode('utf-8'), request_params)
-
-        if request_params['url'] is None:
-            # TODO add support of offline engines
-            pass
-
-        callback = make_callback(
-            selected_engine['name'],
-            results,
-            suggestions,
-            engine.response,
-            request_params
-        )
-
-        request_args = dict(
-            headers=request_params['headers'],
-            hooks=dict(response=callback),
-            cookies=request_params['cookies'],
-            timeout=engine.timeout
-        )
-
-        if request_params['method'] == 'GET':
-            req = grequests.get
-        else:
-            req = grequests.post
-            request_args['data'] = request_params['data']
-
-        # ignoring empty urls
-        if not request_params['url']:
-            continue
-
-        requests.append(req(request_params['url'], **request_args))
-    grequests.map(requests)
-    for engine_name, engine_results in results.items():
-        engines[engine_name].stats['search_count'] += 1
-        engines[engine_name].stats['result_count'] += len(engine_results)
-
-    results = score_results(results)
-
-    for result in results:
-        for res_engine in result['engines']:
-            engines[result['engine']].stats['score_count'] += result['score']
-
-    return results, suggestions


 def get_engines_stats():
     # TODO refactor
     pageloads = []
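A note on what stays behind: searx/engines/__init__.py keeps only the engine bookkeeping, i.e. loading each engine module from the engines directory with imp.load_source and registering it in the engines and categories dicts. A minimal sketch of that loading pattern, under the assumption that engine modules sit next to this file (the load_engine helper and the example filename are illustrative, not part of the diff):

from os.path import realpath, dirname, join, splitext
from imp import load_source  # Python 2; the imp module is deprecated in Python 3

engine_dir = dirname(realpath(__file__))


def load_engine(filename):
    # 'duckduckgo.py' -> module name 'duckduckgo'
    module_name = splitext(filename)[0]
    # import the engine module straight from its file path
    return load_source(module_name, join(engine_dir, filename))

# hypothetical usage:
# engine = load_engine('duckduckgo.py')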
searx/search.py +163 −0

+import grequests
+
+from itertools import izip_longest, chain
+from datetime import datetime
+from operator import itemgetter
+from urlparse import urlparse, unquote
+from searx.engines import (
+    categories, engines, engine_shortcuts
+)
+from searx.languages import language_codes
+from searx.utils import gen_useragent
+
+number_of_searches = 0
+
+
+def default_request_params():
+    return {
+        'method': 'GET', 'headers': {}, 'data': {}, 'url': '', 'cookies': {}}
+
+
+def make_callback(engine_name, results, suggestions, callback, params):
+    # creating a callback wrapper for the search engine results
+    def process_callback(response, **kwargs):
+        cb_res = []
+        response.search_params = params
+        engines[engine_name].stats['page_load_time'] += \
+            (datetime.now() - params['started']).total_seconds()
+        try:
+            search_results = callback(response)
+        except Exception, e:
+            engines[engine_name].stats['errors'] += 1
+            results[engine_name] = cb_res
+            print '[E] Error with engine "{0}":\n\t{1}'.format(
+                engine_name, str(e))
+            return
+        for result in search_results:
+            result['engine'] = engine_name
+            if 'suggestion' in result:
+                # TODO type checks
+                suggestions.add(result['suggestion'])
+                continue
+            cb_res.append(result)
+        results[engine_name] = cb_res
+    return process_callback
+
+
+def score_results(results):
+    flat_res = filter(
+        None, chain.from_iterable(izip_longest(*results.values())))
+    flat_len = len(flat_res)
+    engines_len = len(results)
+    results = []
+    # deduplication + scoring
+    for i, res in enumerate(flat_res):
+        res['parsed_url'] = urlparse(res['url'])
+        res['host'] = res['parsed_url'].netloc
+        if res['host'].startswith('www.'):
+            res['host'] = res['host'].replace('www.', '', 1)
+        res['engines'] = [res['engine']]
+        weight = 1.0
+        if hasattr(engines[res['engine']], 'weight'):
+            weight = float(engines[res['engine']].weight)
+        score = int((flat_len - i) / engines_len) * weight + 1
+        duplicated = False
+        for new_res in results:
+            p1 = res['parsed_url'].path[:-1] if res['parsed_url'].path.endswith('/') else res['parsed_url'].path  # noqa
+            p2 = new_res['parsed_url'].path[:-1] if new_res['parsed_url'].path.endswith('/') else new_res['parsed_url'].path  # noqa
+            if res['host'] == new_res['host'] and\
+               unquote(p1) == unquote(p2) and\
+               res['parsed_url'].query == new_res['parsed_url'].query and\
+               res.get('template') == new_res.get('template'):
+                duplicated = new_res
+                break
+        if duplicated:
+            if res.get('content') > duplicated.get('content'):
+                duplicated['content'] = res['content']
+            duplicated['score'] += score
+            duplicated['engines'].append(res['engine'])
+            if duplicated['parsed_url'].scheme == 'https':
+                continue
+            elif res['parsed_url'].scheme == 'https':
+                duplicated['url'] = res['parsed_url'].geturl()
+                duplicated['parsed_url'] = res['parsed_url']
+        else:
+            res['score'] = score
+            results.append(res)
+    return sorted(results, key=itemgetter('score'), reverse=True)
+
+
 class Search(object):
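For anyone reviewing the ranking logic just moved here: a result's base score is int((flat_len - i) / engines_len) * weight + 1, where i is the result's position in the round-robin interleaving of every engine's result list, so results that all engines rank highly score best; duplicates (same host, same path modulo a trailing slash, same query string and template) merge their scores and prefer the https variant. A small self-contained sketch of the positional part of the formula, with made-up engine names and URLs (Python 2, matching the codebase):

from itertools import izip_longest, chain

# result lists of different lengths, keyed by engine name (made up)
results = {
    'engine_a': [{'url': 'http://a/1'}, {'url': 'http://a/2'}],
    'engine_b': [{'url': 'http://b/1'}],
    'engine_c': [{'url': 'http://c/1'}, {'url': 'http://c/2'}],
}

# round-robin interleave, dropping the Nones izip_longest pads with
flat_res = filter(None, chain.from_iterable(izip_longest(*results.values())))
flat_len = len(flat_res)    # 5
engines_len = len(results)  # 3

for i, res in enumerate(flat_res):
    weight = 1.0  # engines may override this via a module-level 'weight'
    score = int((flat_len - i) / engines_len) * weight + 1
    print res['url'], score  # earlier interleaved positions score higher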
@@ -112,3 +201,77 @@ class Search(object):
         if modified:
             self.query = self.query.replace(query_parts[0], '', 1).strip()
             self.parse_query()
+
+    def search(self, request):
+        global number_of_searches
+        requests = []
+        results = {}
+        suggestions = set()
+        number_of_searches += 1
+        #user_agent = request.headers.get('User-Agent', '')
+        user_agent = gen_useragent()
+
+        for selected_engine in self.engines:
+            if selected_engine['name'] not in engines:
+                continue
+
+            engine = engines[selected_engine['name']]
+
+            if self.pageno > 1 and not engine.paging:
+                continue
+
+            if self.lang != 'all' and not engine.language_support:
+                continue
+
+            request_params = default_request_params()
+            request_params['headers']['User-Agent'] = user_agent
+            request_params['category'] = selected_engine['category']
+            request_params['started'] = datetime.now()
+            request_params['pageno'] = self.pageno
+            request_params['language'] = self.lang
+            request_params = engine.request(self.query.encode('utf-8'),
+                                            request_params)
+
+            if request_params['url'] is None:
+                # TODO add support of offline engines
+                pass
+
+            callback = make_callback(
+                selected_engine['name'],
+                results,
+                suggestions,
+                engine.response,
+                request_params
+            )
+
+            request_args = dict(
+                headers=request_params['headers'],
+                hooks=dict(response=callback),
+                cookies=request_params['cookies'],
+                timeout=engine.timeout
+            )
+
+            if request_params['method'] == 'GET':
+                req = grequests.get
+            else:
+                req = grequests.post
+                request_args['data'] = request_params['data']
+
+            # ignoring empty urls
+            if not request_params['url']:
+                continue
+
+            requests.append(req(request_params['url'], **request_args))
+        grequests.map(requests)
+
+        for engine_name, engine_results in results.items():
+            engines[engine_name].stats['search_count'] += 1
+            engines[engine_name].stats['result_count'] += len(engine_results)
+
+        results = score_results(results)
+
+        for result in results:
+            for res_engine in result['engines']:
+                engines[result['engine']]\
+                    .stats['score_count'] += result['score']
+
+        return results, suggestions
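The dispatch above is fully asynchronous: each engine request is created unsent with a per-engine response hook, and grequests.map sends them all concurrently on gevent greenlets, so the callbacks built by make_callback fill the shared results dict as responses arrive, and stats are only tallied after map returns. A minimal sketch of that hook pattern with placeholder URLs (not searx code):

import grequests

results = {}


def make_hook(name):
    # requests will call this hook with the finished response
    def hook(response, **kwargs):
        results[name] = response.status_code
    return hook

# build unsent requests, each carrying its own response hook and timeout
reqs = [
    grequests.get('http://example.com/',
                  hooks=dict(response=make_hook('a')), timeout=2.0),
    grequests.get('http://example.org/',
                  hooks=dict(response=make_hook('b')), timeout=2.0),
]

grequests.map(reqs)  # send everything concurrently; blocks until done
print results        # e.g. {'a': 200, 'b': 200}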
searx/tests/test_webapp.py +4 −4

@@ -39,7 +39,7 @@ class ViewsTestCase(SearxTestCase):
         self.assertEqual(result.status_code, 200)
         self.assertIn('<div class="title"><h1>searx</h1></div>', result.data)

-    @patch('searx.webapp.do_search')
+    @patch('searx.search.Search.search')
     def test_index_html(self, search):
         search.return_value = (
             self.test_results,
@@ -55,7 +55,7 @@ class ViewsTestCase(SearxTestCase):
             result.data
         )

-    @patch('searx.webapp.do_search')
+    @patch('searx.search.Search.search')
     def test_index_json(self, search):
         search.return_value = (
             self.test_results,
@@ -71,7 +71,7 @@ class ViewsTestCase(SearxTestCase):
         self.assertEqual(
             result_dict['results'][0]['url'], 'http://first.test.xyz')

-    @patch('searx.webapp.do_search')
+    @patch('searx.search.Search.search')
     def test_index_csv(self, search):
         search.return_value = (
             self.test_results,
@@ -86,7 +86,7 @@ class ViewsTestCase(SearxTestCase):
             result.data
         )

-    @patch('searx.webapp.do_search')
+    @patch('searx.search.Search.search')
     def test_index_rss(self, search):
         search.return_value = (
             self.test_results,

searx/webapp.py +2 −8

@@ -39,8 +39,7 @@ from flask import (
 from flask.ext.babel import Babel, gettext, format_date
 from searx import settings, searx_dir
 from searx.engines import (
-    search as do_search, categories, engines, get_engines_stats,
-    engine_shortcuts
+    categories, engines, get_engines_stats, engine_shortcuts
 )
 from searx.utils import (
     UnicodeWriter, highlight_content, html_to_text, get_themes
@@ -191,12 +190,7 @@ def index():
             'index.html',
         )

-    # TODO moar refactor - do_search integration into Search class
-    search.results, search.suggestions = do_search(search.query,
-                                                   request,
-                                                   search.engines,
-                                                   search.pageno,
-                                                   search.lang)
+    search.results, search.suggestions = search.search(request)

     for result in search.results:
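On the test changes: patching the bound method searx.search.Search.search on the class means every Search instance the view creates receives the mock, which is why the four tests only had to swap the patch target. A hedged sketch of the pattern (the base-class import and the test body are abbreviated guesses at the surrounding file, which the diff does not show):

from mock import patch

from searx.testing import SearxTestCase  # assumed base-class import


class ViewsTestCase(SearxTestCase):

    @patch('searx.search.Search.search')
    def test_index_html(self, search):
        # the view's Search instance calls this mock instead of
        # firing real engine requests
        search.return_value = (self.test_results, set())
        result = self.app.post('/', data={'q': 'test'})
        self.assertEqual(result.status_code, 200)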