results.py 11.1 KB
Newer Older
1
2
3
4
5
import re
from collections import defaultdict
from operator import itemgetter
from threading import RLock
from searx.engines import engines
Adam Tauber's avatar
Adam Tauber committed
6
7
from searx.url_utils import urlparse, unquote

8

stepshal's avatar
stepshal committed
9
# Characters that carry no meaning when measuring content length.
# NOTE: the '-' is escaped — unescaped between ')' and '_' it formed the
# character range ')-_' (0x29-0x5F), which also stripped digits and
# upper-case letters instead of just a literal hyphen.
CONTENT_LEN_IGNORED_CHARS_REGEX = re.compile(r'[,;:!?\./\\\\ ()\-_]', re.M | re.U)

# Runs of spaces, tabs and newlines, collapsed to a single space.
WHITESPACE_REGEX = re.compile('( |\t|\n)+', re.M | re.U)


# return the meaningful length of the content for a result
def result_content_len(content):
    """Return the number of meaningful characters in *content*.

    Punctuation, slashes, backslashes, parentheses, hyphens, underscores
    and spaces are ignored; any non-string input counts as length 0.
    """
    if isinstance(content, str):
        return len(CONTENT_LEN_IGNORED_CHARS_REGEX.sub('', content))
    return 0


def compare_urls(url_a, url_b):
    """Return True when the two parsed URLs address the same resource.

    A leading ``www.`` on the hostname and a single trailing ``/`` on
    the path are ignored; query string and fragment must match exactly.
    Both arguments are ``urlparse()`` results.
    """
    # ignore a leading 'www.' when comparing hostnames
    host_a = url_a.netloc[4:] if url_a.netloc.startswith('www.') else url_a.netloc
    host_b = url_b.netloc[4:] if url_b.netloc.startswith('www.') else url_b.netloc

    if host_a != host_b or url_a.query != url_b.query or url_a.fragment != url_b.fragment:
        return False

    # compare the paths, ignoring a single trailing slash and URL quoting
    def strip_trailing_slash(path):
        return path[:-1] if path.endswith('/') else path

    return unquote(strip_trailing_slash(url_a.path)) == unquote(strip_trailing_slash(url_b.path))


def merge_two_infoboxes(infobox1, infobox2):
    """Merge the fields of *infobox2* into *infobox1* in place.

    Engine weight decides conflicts: the heavier engine's name and image
    win.  URLs and attributes from *infobox2* are appended when not
    already present, and the content with the greater meaningful length
    (see ``result_content_len``) is kept.
    """
    # get engines weights (default weight is 1)
    if hasattr(engines[infobox1['engine']], 'weight'):
        weight1 = engines[infobox1['engine']].weight
    else:
        weight1 = 1
    if hasattr(engines[infobox2['engine']], 'weight'):
        weight2 = engines[infobox2['engine']].weight
    else:
        weight2 = 1

    # the heavier engine claims the merged infobox
    if weight2 > weight1:
        infobox1['engine'] = infobox2['engine']

    if 'urls' in infobox2:
        urls1 = infobox1.get('urls', None)
        if urls1 is None:
            urls1 = []

        # append every url of infobox2 that is not already in infobox1
        # NOTE(review): when infobox1 already has 'urls', urls1 aliases
        # that list, so duplicates *within* infobox2 are skipped too;
        # when infobox1 has none, they are not — confirm this asymmetry
        # is intended.
        for url2 in infobox2.get('urls', []):
            unique_url = True
            for url1 in infobox1.get('urls', []):
                if compare_urls(urlparse(url1.get('url', '')), urlparse(url2.get('url', ''))):
                    unique_url = False
                    break
            if unique_url:
                urls1.append(url2)

        infobox1['urls'] = urls1

    # take infobox2's image when infobox1 has none, or when infobox2's
    # engine carries the higher weight
    if 'img_src' in infobox2:
        img1 = infobox1.get('img_src', None)
        img2 = infobox2.get('img_src')
        if img1 is None:
            infobox1['img_src'] = img2
        elif weight2 > weight1:
            infobox1['img_src'] = img2

    if 'attributes' in infobox2:
        attributes1 = infobox1.get('attributes', None)
        if attributes1 is None:
            attributes1 = []
            infobox1['attributes'] = attributes1

        # labels already present in infobox1
        attributeSet = set()
        for attribute in infobox1.get('attributes', []):
            if attribute.get('label', None) not in attributeSet:
                attributeSet.add(attribute.get('label', None))

        # append attributes of infobox2 whose label is not yet known
        # (the set is not updated here, so repeated labels inside
        # infobox2 itself are all appended)
        for attribute in infobox2.get('attributes', []):
            if attribute.get('label', None) not in attributeSet:
                attributes1.append(attribute)

    # keep the content with the greater meaningful length
    if 'content' in infobox2:
        content1 = infobox1.get('content', None)
        content2 = infobox2.get('content', '')
        if content1 is not None:
            if result_content_len(content2) > result_content_len(content1):
                infobox1['content'] = content2
        else:
            infobox1['content'] = content2
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122


def result_score(result):
    """Score a merged result from its engine weights and positions.

    Each occurrence contributes ``(occurrence_count * weight) / position``,
    so results returned by more (and heavier) engines at earlier
    positions score higher.
    """
    # combined weight of every engine that returned this result;
    # engines without an explicit weight contribute a factor of 1
    weight = 1.0
    for engine_name in result['engines']:
        weight *= float(getattr(engines[engine_name], 'weight', 1))

    occurences = len(result['positions'])
    return sum((occurences * weight) / pos for pos in result['positions'])


class ResultContainer(object):
    """Collect, deduplicate and order the results of one search query.

    Engines feed raw result dicts in through :meth:`extend`; special
    entries (suggestions, answers, corrections, infoboxes, result
    counts) are routed into dedicated containers, the remaining results
    are merged by URL and finally ordered by score.
    """

    def __init__(self):
        super(ResultContainer, self).__init__()
        # raw results, keyed by engine name
        self.results = defaultdict(list)
        # deduplicated results; ordered once order_results() has run
        self._merged_results = []
        self.infoboxes = []
        self.suggestions = set()
        self.answers = set()
        self.corrections = set()
        # per-engine reported totals, averaged by results_number()
        self._number_of_results = []
        self._ordered = False
        # True as soon as one contributing engine supports paging
        self.paging = False
        self.unresponsive_engines = set()

    def extend(self, engine_name, results):
        """Add *results* coming from *engine_name* to the container.

        Special single-purpose entries are moved to their own
        containers and removed from *results*; everything left is
        merged into the deduplicated result list.
        """
        # iterate over a copy, because entries are removed on the fly
        for result in list(results):
            result['engine'] = engine_name
            if 'suggestion' in result:
                self.suggestions.add(result['suggestion'])
                results.remove(result)
            elif 'answer' in result:
                self.answers.add(result['answer'])
                results.remove(result)
            elif 'correction' in result:
                self.corrections.add(result['correction'])
                results.remove(result)
            elif 'infobox' in result:
                self._merge_infobox(result)
                results.remove(result)
            elif 'number_of_results' in result:
                self._number_of_results.append(result['number_of_results'])
                results.remove(result)

        # update the engine's global statistics
        # NOTE(review): a fresh RLock() per call does not synchronize
        # anything shared between threads — verify intent
        if engine_name in engines:
            with RLock():
                engines[engine_name].stats['search_count'] += 1
                engines[engine_name].stats['result_count'] += len(results)

        if not results:
            return

        self.results[engine_name].extend(results)

        if not self.paging and engine_name in engines and engines[engine_name].paging:
            self.paging = True

        for i, result in enumerate(results):
            # skip results whose url/title/content is not a plain str
            if 'url' in result and not isinstance(result['url'], str):
                continue
            try:
                # NOTE(review): no-op assignment — presumably a leftover
                # from a py2 unicode() conversion; candidate for removal
                result['url'] = result['url']
            except KeyError:
                pass
            if 'title' in result and not isinstance(result['title'], str):
                continue
            if 'content' in result and not isinstance(result['content'], str):
                continue
            # positions are 1-based (used as divisor in result_score)
            position = i + 1
            self._merge_result(result, position)

    def _merge_infobox(self, infobox):
        """Merge *infobox* into an existing one with the same id, or store it."""
        add_infobox = True
        infobox_id = infobox.get('id', None)
        if infobox_id is not None:
            for existingIndex in self.infoboxes:
                if compare_urls(urlparse(existingIndex.get('id', '')), urlparse(infobox_id)):
                    merge_two_infoboxes(existingIndex, infobox)
                    add_infobox = False

        if add_infobox:
            self.infoboxes.append(infobox)

    def _merge_result(self, result, position):
        """Merge *result* (seen at *position*) into the deduplicated list."""
        result['parsed_url'] = urlparse(result['url'])

        # if the result has no scheme, use http as default
        if not result['parsed_url'].scheme:
            result['parsed_url'] = result['parsed_url']._replace(scheme="http")
            result['url'] = result['parsed_url'].geturl()

        result['engines'] = [result['engine']]

        # strip multiple spaces and cariage returns from content
        if result.get('content'):
            result['content'] = WHITESPACE_REGEX.sub(' ', result['content'])

        # check for duplicates (same URL and same template)
        duplicated = None
        for merged_result in self._merged_results:
            if compare_urls(result['parsed_url'], merged_result['parsed_url'])\
               and result.get('template') == merged_result.get('template'):
                duplicated = merged_result
                break

        # merge duplicates together
        if duplicated:
            # using content with more text
            if result_content_len(result.get('content', '')) >\
                    result_content_len(duplicated.get('content', '')):
                duplicated['content'] = result['content']

            # merge all result's parameters not found in duplicate
            # (falsy values in the duplicate are overwritten too)
            for key in result.keys():
                if not duplicated.get(key):
                    duplicated[key] = result.get(key)

            # add the new position
            duplicated['positions'].append(position)

            # add engine to list of result-engines
            if result['engine'] not in duplicated['engines']:
                duplicated['engines'].append(result['engine'])

            # using https if possible
            if duplicated['parsed_url'].scheme != 'https' and result['parsed_url'].scheme == 'https':
                duplicated['url'] = result['parsed_url'].geturl()
                duplicated['parsed_url'] = result['parsed_url']

        # if there is no duplicate found, append result
        else:
            result['positions'] = [position]
            # NOTE(review): fresh RLock() guards nothing shared — verify intent
            with RLock():
                self._merged_results.append(result)

    def order_results(self):
        """Sort merged results by score, then group them by category/template."""
        # pass 1 : score every merged result and update engine stats
        for result in self._merged_results:
            score = result_score(result)
            result['score'] = score
            with RLock():
                for result_engine in result['engines']:
                    engines[result_engine].stats['score_count'] += score

        results = sorted(self._merged_results, key=itemgetter('score'), reverse=True)

        # pass 2 : group results by category and template
        gresults = []
        categoryPositions = {}

        for i, res in enumerate(results):
            # FIXME : handle more than one category per engine
            res['category'] = engines[res['engine']].categories[0]

            # FIXME : handle more than one category per engine
            category = engines[res['engine']].categories[0]\
                + ':' + res.get('template', '')\
                + ':' + ('img_src' if 'img_src' in res or 'thumbnail' in res else '')

            current = None if category not in categoryPositions\
                else categoryPositions[category]

            # group with previous results using the same category
            # if the group can accept more result and is not too far
            # from the current position
            if current is not None and (current['count'] > 0)\
                    and (len(gresults) - current['index'] < 20):
                # group with the previous results using
                # the same category with this one
                index = current['index']
                gresults.insert(index, res)

                # update every index after the current one
                # (including the current one)
                for k in categoryPositions:
                    v = categoryPositions[k]['index']
                    if v >= index:
                        categoryPositions[k]['index'] = v + 1

                # update this category
                current['count'] -= 1

            else:
                # same category
                gresults.append(res)

                # update categoryIndex (a group accepts up to 8 more results)
                categoryPositions[category] = {'index': len(gresults), 'count': 8}

        # update _merged_results
        self._ordered = True
        self._merged_results = gresults

    def get_ordered_results(self):
        """Return the merged results, ordering them first if needed."""
        if not self._ordered:
            self.order_results()
        return self._merged_results

    def results_length(self):
        """Return the number of deduplicated results."""
        return len(self._merged_results)

    def results_number(self):
        """Return the average of the engines' reported totals (0 if none)."""
        resultnum_sum = sum(self._number_of_results)
        if not resultnum_sum or not self._number_of_results:
            return 0
        return resultnum_sum // len(self._number_of_results)

    def add_unresponsive_engine(self, engine_error):
        """Record an engine that failed to answer for this query."""
        self.unresponsive_engines.add(engine_error)