Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit b0ba367a authored by Adam Tauber's avatar Adam Tauber
Browse files

[enh][mod] search refactor

parent a07b2b51
Loading
Loading
Loading
Loading
+1 −162
Original line number Diff line number Diff line
@@ -19,19 +19,12 @@ along with searx. If not, see < http://www.gnu.org/licenses/ >.
from os.path import realpath, dirname, splitext, join
import sys
from imp import load_source
from itertools import izip_longest, chain
from operator import itemgetter
from urlparse import urlparse, unquote
from datetime import datetime
import grequests
from flask.ext.babel import gettext
from operator import itemgetter
from searx import settings
from searx.utils import gen_useragent

engine_dir = dirname(realpath(__file__))

number_of_searches = 0

engines = {}

categories = {'general': []}
@@ -114,160 +107,6 @@ for engine_data in settings['engines']:
        engine_shortcuts[engine.shortcut] = engine.name


def default_request_params():
    """Build a fresh, empty parameter dict for an outgoing engine request.

    Each call returns new container objects so engines can mutate
    headers/data/cookies without affecting other requests.
    """
    params = {}
    params['method'] = 'GET'
    params['headers'] = {}
    params['data'] = {}
    params['url'] = ''
    params['cookies'] = {}
    return params

def make_callback(engine_name, results, suggestions, callback, params):
    """Wrap an engine's response parser for use as a grequests hook.

    engine_name -- engine key used for the shared results dict and for
                   the per-engine statistics
    results     -- shared dict the wrapper fills: engine name -> list
                   of result dicts
    suggestions -- shared set collecting suggestion strings
    callback    -- the engine's response-parsing function
    params      -- request params dict; 'started' is read to measure
                   the page load time

    Returns the wrapper suitable for hooks=dict(response=...).
    """
    # creating a callback wrapper for the search engine results
    def process_callback(response, **kwargs):
        cb_res = []
        # expose the originating request params to the engine parser
        response.search_params = params
        engines[engine_name].stats['page_load_time'] += \
            (datetime.now() - params['started']).total_seconds()
        try:
            search_results = callback(response)
        # 'as e' (PEP 3110, valid since Python 2.6) instead of the
        # py2-only 'Exception, e' spelling
        except Exception as e:
            # a failing engine must not break the whole search: count
            # the error, publish an empty result list and bail out
            engines[engine_name].stats['errors'] += 1
            results[engine_name] = cb_res
            # single-argument print(...) prints identically under py2
            print('[E] Error with engine "{0}":\n\t{1}'.format(
                engine_name, str(e)))
            return
        for result in search_results:
            result['engine'] = engine_name
            if 'suggestion' in result:
                # TODO type checks
                suggestions.add(result['suggestion'])
                continue
            cb_res.append(result)
        results[engine_name] = cb_res
    return process_callback


def score_results(results):
    """Deduplicate and score the per-engine result lists.

    results -- dict mapping engine name -> list of result dicts; each
    result must carry 'url' and the 'engine' key set by make_callback.

    Returns one flat list of result dicts, sorted by descending 'score'.
    Mutates the result dicts in place (adds 'parsed_url', 'host',
    'engines', 'score').
    """
    # interleave the engine result lists round-robin so the n-th result
    # of every engine gets the same positional score; filter(None, ...)
    # drops the padding that izip_longest inserts for shorter lists
    flat_res = filter(
        None, chain.from_iterable(izip_longest(*results.values())))
    flat_len = len(flat_res)
    engines_len = len(results)
    results = []
    # deduplication + scoring
    for i, res in enumerate(flat_res):

        res['parsed_url'] = urlparse(res['url'])

        res['host'] = res['parsed_url'].netloc

        # normalize the host so www.example.com and example.com dedupe
        if res['host'].startswith('www.'):
            res['host'] = res['host'].replace('www.', '', 1)

        res['engines'] = [res['engine']]
        weight = 1.0

        # an engine may declare an optional weight that scales its scores
        if hasattr(engines[res['engine']], 'weight'):
            weight = float(engines[res['engine']].weight)

        # earlier positions score higher; +1 keeps every score positive
        score = int((flat_len - i) / engines_len) * weight + 1
        duplicated = False

        # a duplicate has the same host, path (ignoring one trailing
        # slash and percent-encoding), query string and result template
        for new_res in results:
            p1 = res['parsed_url'].path[:-1] if res['parsed_url'].path.endswith('/') else res['parsed_url'].path  # noqa
            p2 = new_res['parsed_url'].path[:-1] if new_res['parsed_url'].path.endswith('/') else new_res['parsed_url'].path  # noqa
            if res['host'] == new_res['host'] and\
               unquote(p1) == unquote(p2) and\
               res['parsed_url'].query == new_res['parsed_url'].query and\
               res.get('template') == new_res.get('template'):
                duplicated = new_res
                break
        if duplicated:
            # merge into the earlier result: keep the "greater" content
            # (py2 cross-type comparison — any string > None, otherwise
            # lexicographic), accumulate the score, remember the engine
            if res.get('content') > duplicated.get('content'):
                duplicated['content'] = res['content']
            duplicated['score'] += score
            duplicated['engines'].append(res['engine'])
            # prefer the https variant of the url when available
            if duplicated['parsed_url'].scheme == 'https':
                continue
            elif res['parsed_url'].scheme == 'https':
                duplicated['url'] = res['parsed_url'].geturl()
                duplicated['parsed_url'] = res['parsed_url']
        else:
            res['score'] = score
            results.append(res)
    return sorted(results, key=itemgetter('score'), reverse=True)


def search(query, request, selected_engines, pageno=1, lang='all'):
    """Query the selected engines concurrently and collect the results.

    query            -- unicode search query
    request          -- incoming flask request (currently unused; kept
                        for the commented-out per-user User-Agent)
    selected_engines -- list of dicts with 'name' and 'category' keys
    pageno           -- 1-based result page number
    lang             -- language code, or 'all' for no restriction

    Returns a tuple (results, suggestions): the deduplicated, scored
    result list and a set of suggestion strings. Also updates the
    per-engine statistics as a side effect.
    """
    global engines, categories, number_of_searches
    requests = []
    results = {}
    suggestions = set()
    number_of_searches += 1
    #user_agent = request.headers.get('User-Agent', '')
    user_agent = gen_useragent()

    for selected_engine in selected_engines:
        # skip engines that are unknown ...
        if selected_engine['name'] not in engines:
            continue

        engine = engines[selected_engine['name']]

        # ... that cannot serve pages beyond the first ...
        if pageno > 1 and not engine.paging:
            continue

        # ... or that cannot restrict results to the requested language
        if lang != 'all' and not engine.language_support:
            continue

        request_params = default_request_params()
        request_params['headers']['User-Agent'] = user_agent
        request_params['category'] = selected_engine['category']
        request_params['started'] = datetime.now()
        request_params['pageno'] = pageno
        request_params['language'] = lang
        # the engine fills in url/method/data/... for this query
        request_params = engine.request(query.encode('utf-8'), request_params)

        if request_params['url'] is None:
            # TODO add support of offline engines
            pass

        # wrapper that parses the response into results/suggestions
        callback = make_callback(
            selected_engine['name'],
            results,
            suggestions,
            engine.response,
            request_params
        )

        request_args = dict(
            headers=request_params['headers'],
            hooks=dict(response=callback),
            cookies=request_params['cookies'],
            timeout=engine.timeout
        )

        if request_params['method'] == 'GET':
            req = grequests.get
        else:
            req = grequests.post
            request_args['data'] = request_params['data']

        # ignoring empty urls
        if not request_params['url']:
            continue

        requests.append(req(request_params['url'], **request_args))
    # fire all engine requests concurrently; callbacks fill `results`
    grequests.map(requests)
    for engine_name, engine_results in results.items():
        engines[engine_name].stats['search_count'] += 1
        engines[engine_name].stats['result_count'] += len(engine_results)

    results = score_results(results)

    # NOTE(review): res_engine is never used in the body below — the
    # score is always credited to result['engine'], so for merged
    # duplicates one engine gets credited len(result['engines']) times.
    # Looks like it should be engines[res_engine]; verify intent.
    for result in results:
        for res_engine in result['engines']:
            engines[result['engine']].stats['score_count'] += result['score']

    return results, suggestions


def get_engines_stats():
    # TODO refactor
    pageloads = []
+163 −0
Original line number Diff line number Diff line
import grequests
from itertools import izip_longest, chain
from datetime import datetime
from operator import itemgetter
from urlparse import urlparse, unquote
from searx.engines import (
    categories, engines, engine_shortcuts
)
from searx.languages import language_codes
from searx.utils import gen_useragent

number_of_searches = 0


def default_request_params():
    """Return the default parameters for a new outgoing engine request.

    Fresh header/data/cookie containers are created on every call so a
    request cannot leak state into another.
    """
    return dict(
        method='GET',
        headers={},
        data={},
        url='',
        cookies={},
    )

def make_callback(engine_name, results, suggestions, callback, params):
    """Wrap an engine's response parser for use as a grequests hook.

    engine_name -- engine key used for the shared results dict and for
                   the per-engine statistics
    results     -- shared dict the wrapper fills: engine name -> list
                   of result dicts
    suggestions -- shared set collecting suggestion strings
    callback    -- the engine's response-parsing function
    params      -- request params dict; 'started' is read to measure
                   the page load time

    Returns the wrapper suitable for hooks=dict(response=...).
    """
    # creating a callback wrapper for the search engine results
    def process_callback(response, **kwargs):
        cb_res = []
        # expose the originating request params to the engine parser
        response.search_params = params
        engines[engine_name].stats['page_load_time'] += \
            (datetime.now() - params['started']).total_seconds()
        try:
            search_results = callback(response)
        # 'as e' (PEP 3110, valid since Python 2.6) instead of the
        # py2-only 'Exception, e' spelling
        except Exception as e:
            # a failing engine must not break the whole search: count
            # the error, publish an empty result list and bail out
            engines[engine_name].stats['errors'] += 1
            results[engine_name] = cb_res
            # single-argument print(...) prints identically under py2
            print('[E] Error with engine "{0}":\n\t{1}'.format(
                engine_name, str(e)))
            return
        for result in search_results:
            result['engine'] = engine_name
            if 'suggestion' in result:
                # TODO type checks
                suggestions.add(result['suggestion'])
                continue
            cb_res.append(result)
        results[engine_name] = cb_res
    return process_callback


def score_results(results):
    """Deduplicate and score the per-engine result lists.

    results -- dict mapping engine name -> list of result dicts; each
    result must carry 'url' and the 'engine' key set by make_callback.

    Returns one flat list of result dicts, sorted by descending 'score'.
    Mutates the result dicts in place (adds 'parsed_url', 'host',
    'engines', 'score').
    """
    # interleave the engine result lists round-robin so the n-th result
    # of every engine gets the same positional score; filter(None, ...)
    # drops the padding that izip_longest inserts for shorter lists
    flat_res = filter(
        None, chain.from_iterable(izip_longest(*results.values())))
    flat_len = len(flat_res)
    engines_len = len(results)
    results = []
    # deduplication + scoring
    for i, res in enumerate(flat_res):

        res['parsed_url'] = urlparse(res['url'])

        res['host'] = res['parsed_url'].netloc

        # normalize the host so www.example.com and example.com dedupe
        if res['host'].startswith('www.'):
            res['host'] = res['host'].replace('www.', '', 1)

        res['engines'] = [res['engine']]
        weight = 1.0

        # an engine may declare an optional weight that scales its scores
        if hasattr(engines[res['engine']], 'weight'):
            weight = float(engines[res['engine']].weight)

        # earlier positions score higher; +1 keeps every score positive
        score = int((flat_len - i) / engines_len) * weight + 1
        duplicated = False

        # a duplicate has the same host, path (ignoring one trailing
        # slash and percent-encoding), query string and result template
        for new_res in results:
            p1 = res['parsed_url'].path[:-1] if res['parsed_url'].path.endswith('/') else res['parsed_url'].path  # noqa
            p2 = new_res['parsed_url'].path[:-1] if new_res['parsed_url'].path.endswith('/') else new_res['parsed_url'].path  # noqa
            if res['host'] == new_res['host'] and\
               unquote(p1) == unquote(p2) and\
               res['parsed_url'].query == new_res['parsed_url'].query and\
               res.get('template') == new_res.get('template'):
                duplicated = new_res
                break
        if duplicated:
            # merge into the earlier result: keep the "greater" content
            # (py2 cross-type comparison — any string > None, otherwise
            # lexicographic), accumulate the score, remember the engine
            if res.get('content') > duplicated.get('content'):
                duplicated['content'] = res['content']
            duplicated['score'] += score
            duplicated['engines'].append(res['engine'])
            # prefer the https variant of the url when available
            if duplicated['parsed_url'].scheme == 'https':
                continue
            elif res['parsed_url'].scheme == 'https':
                duplicated['url'] = res['parsed_url'].geturl()
                duplicated['parsed_url'] = res['parsed_url']
        else:
            res['score'] = score
            results.append(res)
    return sorted(results, key=itemgetter('score'), reverse=True)


class Search(object):
@@ -112,3 +201,77 @@ class Search(object):
        if modified:
            self.query = self.query.replace(query_parts[0], '', 1).strip()
            self.parse_query()

    def search(self, request):
        """Query self.engines concurrently and collect the results.

        request -- incoming flask request (currently unused; kept for
                   the commented-out per-user User-Agent)

        Reads self.engines, self.pageno, self.lang and self.query.
        Returns a tuple (results, suggestions): the deduplicated,
        scored result list and a set of suggestion strings. Updates
        the per-engine statistics as a side effect.
        """
        global number_of_searches
        requests = []
        results = {}
        suggestions = set()
        number_of_searches += 1
        #user_agent = request.headers.get('User-Agent', '')
        user_agent = gen_useragent()

        for selected_engine in self.engines:
            # skip engines that are unknown ...
            if selected_engine['name'] not in engines:
                continue

            engine = engines[selected_engine['name']]

            # ... that cannot serve pages beyond the first ...
            if self.pageno > 1 and not engine.paging:
                continue

            # ... or that cannot restrict to the requested language
            if self.lang != 'all' and not engine.language_support:
                continue

            request_params = default_request_params()
            request_params['headers']['User-Agent'] = user_agent
            request_params['category'] = selected_engine['category']
            request_params['started'] = datetime.now()
            request_params['pageno'] = self.pageno
            request_params['language'] = self.lang
            # the engine fills in url/method/data/... for this query
            request_params = engine.request(self.query.encode('utf-8'),
                                            request_params)

            if request_params['url'] is None:
                # TODO add support of offline engines
                pass

            # wrapper that parses the response into results/suggestions
            callback = make_callback(
                selected_engine['name'],
                results,
                suggestions,
                engine.response,
                request_params
            )

            request_args = dict(
                headers=request_params['headers'],
                hooks=dict(response=callback),
                cookies=request_params['cookies'],
                timeout=engine.timeout
            )

            if request_params['method'] == 'GET':
                req = grequests.get
            else:
                req = grequests.post
                request_args['data'] = request_params['data']

            # ignoring empty urls
            if not request_params['url']:
                continue

            requests.append(req(request_params['url'], **request_args))
        # fire all engine requests concurrently; callbacks fill `results`
        grequests.map(requests)
        for engine_name, engine_results in results.items():
            engines[engine_name].stats['search_count'] += 1
            engines[engine_name].stats['result_count'] += len(engine_results)

        results = score_results(results)

        # NOTE(review): res_engine is never used in the body below —
        # the score is always credited to result['engine'], so for
        # merged duplicates one engine is credited multiple times.
        # Looks like it should be engines[res_engine]; verify intent.
        for result in results:
            for res_engine in result['engines']:
                engines[result['engine']]\
                    .stats['score_count'] += result['score']

        return results, suggestions
+4 −4
Original line number Diff line number Diff line
@@ -39,7 +39,7 @@ class ViewsTestCase(SearxTestCase):
        self.assertEqual(result.status_code, 200)
        self.assertIn('<div class="title"><h1>searx</h1></div>', result.data)

    @patch('searx.webapp.do_search')
    @patch('searx.search.Search.search')
    def test_index_html(self, search):
        search.return_value = (
            self.test_results,
@@ -55,7 +55,7 @@ class ViewsTestCase(SearxTestCase):
            result.data
        )

    @patch('searx.webapp.do_search')
    @patch('searx.search.Search.search')
    def test_index_json(self, search):
        search.return_value = (
            self.test_results,
@@ -71,7 +71,7 @@ class ViewsTestCase(SearxTestCase):
        self.assertEqual(
            result_dict['results'][0]['url'], 'http://first.test.xyz')

    @patch('searx.webapp.do_search')
    @patch('searx.search.Search.search')
    def test_index_csv(self, search):
        search.return_value = (
            self.test_results,
@@ -86,7 +86,7 @@ class ViewsTestCase(SearxTestCase):
            result.data
        )

    @patch('searx.webapp.do_search')
    @patch('searx.search.Search.search')
    def test_index_rss(self, search):
        search.return_value = (
            self.test_results,
+2 −8
Original line number Diff line number Diff line
@@ -39,8 +39,7 @@ from flask import (
from flask.ext.babel import Babel, gettext, format_date
from searx import settings, searx_dir
from searx.engines import (
    search as do_search, categories, engines, get_engines_stats,
    engine_shortcuts
    categories, engines, get_engines_stats, engine_shortcuts
)
from searx.utils import (
    UnicodeWriter, highlight_content, html_to_text, get_themes
@@ -191,12 +190,7 @@ def index():
            'index.html',
        )

    # TODO moar refactor - do_search integration into Search class
    search.results, search.suggestions = do_search(search.query,
                                                   request,
                                                   search.engines,
                                                   search.pageno,
                                                   search.lang)
    search.results, search.suggestions = search.search(request)

    for result in search.results: