utils.py 10.8 KB
Newer Older
Adam Tauber's avatar
Adam Tauber committed
1
import csv
Noémi Ványi's avatar
Noémi Ványi committed
2
3
import hashlib
import hmac
Adam Tauber's avatar
Adam Tauber committed
4
5
6
import os
import re

7
from babel.core import get_global
8
from babel.dates import format_date
9
from babel import UnknownLocaleError
asciimoo's avatar
asciimoo committed
10
from codecs import getincrementalencoder
11
from imp import load_source
12
from numbers import Number
13
from os.path import splitext, join
14
from io import open
Matej Cotman's avatar
Matej Cotman committed
15
from random import choice
16
import sys
17
import json
Matej Cotman's avatar
Matej Cotman committed
18

19
from searx import settings
20
from searx.version import VERSION_STRING
potato's avatar
potato committed
21
from searx.languages import language_codes
22
from searx import settings
Adam Tauber's avatar
Adam Tauber committed
23
from searx import logger
24

Nicolas Gelot's avatar
Nicolas Gelot committed
25
26
27
from io import StringIO
from html.parser import HTMLParser

Adam Tauber's avatar
Adam Tauber committed
28
29

# Rebind the imported searx logger to a child logger named 'utils'.
logger = logger.getChild('utils')
asciimoo's avatar
asciimoo committed
30

31
32
33
# Tags whose direct text content is left out when HTML is converted to text.
blocked_tags = ('script', 'style')

34
35
# Load the user-agent data (the 'ua' template plus the 'os' and 'versions'
# choices read by gen_useragent) from the JSON file shipped next to this module.
# The context manager closes the file handle right after parsing instead of
# leaving it open until garbage collection.
with open(os.path.dirname(os.path.realpath(__file__))
          + "/data/useragents.json", 'r', encoding='utf-8') as _useragents_file:
    useragents = json.loads(_useragents_file.read())
asciimoo's avatar
asciimoo committed
36

37

Thomas Pointhuber's avatar
Thomas Pointhuber committed
38
def searx_useragent():
    """Return the User-Agent string that identifies this searx instance.

    The result has the form ``searx/<VERSION_STRING> <suffix>``, where the
    suffix is the ``useragent_suffix`` entry of ``settings['outgoing']`` or
    an empty string when that entry is missing.
    """
    return 'searx/{searx_version} {suffix}'.format(
        searx_version=VERSION_STRING,
        suffix=settings['outgoing'].get('useragent_suffix', ''))
Adam Tauber's avatar
Adam Tauber committed
42
43


44
45
def gen_useragent(os=None):
    """Build a browser-like User-Agent string from the ``useragents`` data.

    ``os`` fills the template's ``os`` field; when it is falsy a random
    entry of ``useragents['os']`` is used. The ``version`` field is always
    a random entry of ``useragents['versions']``.
    """
    template = useragents['ua']
    # Draw the OS first and the version second, as the template call did.
    os_string = os or choice(useragents['os'])
    version_string = choice(useragents['versions'])
    return str(template.format(os=os_string, version=version_string))
46
47


asciimoo's avatar
asciimoo committed
48
49
50
51
52
53
54
55
56
57
def highlight_content(content, query):
    """Wrap occurrences of ``query`` in ``content`` with a highlight span.

    Returns ``None`` for empty content. Content containing ``<`` is treated
    as HTML and returned untouched, and so is any content when the query is
    empty or consists only of whitespace that does not occur in it.

    If the whole query occurs in the content (case-insensitively), every
    occurrence is wrapped. Otherwise each whitespace-separated chunk of the
    query is highlighted on its own; a single-character chunk only matches
    when surrounded by non-word characters, and those characters end up
    inside the span.
    """
    if not content:
        return None
    # ignoring html contents
    # TODO better html content detection
    if content.find('<') != -1:
        return content
    # An empty query would produce the pattern "()", which matches at every
    # position and would inject an empty span between all characters.
    if not query:
        return content

    if content.lower().find(query.lower()) > -1:
        query_regex = '({0})'.format(re.escape(query))
    else:
        regex_parts = []
        for chunk in query.split():
            if len(chunk) == 1:
                regex_parts.append('\\W+{0}\\W+'.format(re.escape(chunk)))
            else:
                regex_parts.append('{0}'.format(re.escape(chunk)))
        # Whitespace-only query: nothing to highlight (same "()" problem).
        if not regex_parts:
            return content
        query_regex = '({0})'.format('|'.join(regex_parts))

    return re.sub(query_regex, '<span class="highlight">\\1</span>',
                  content, flags=re.I | re.U)
asciimoo's avatar
asciimoo committed
73

74

asciimoo's avatar
asciimoo committed
75
class HTMLTextExtractor(HTMLParser):
    """HTML parser that collects the text content of a document.

    Open tags are tracked on a stack. Text is kept unless the innermost open
    tag is listed in ``blocked_tags``. ``get_text`` returns the collected
    pieces joined together and stripped.
    """

    # Void elements have no end tag in HTML. Tracking them on the stack would
    # make the next real closing tag look mismatched ("<p>a<br>b</p>").
    _void_tags = frozenset((
        'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input',
        'link', 'meta', 'param', 'source', 'track', 'wbr',
    ))

    def __init__(self):
        HTMLParser.__init__(self)
        self.result = []  # collected text fragments, in document order
        self.tags = []    # stack of currently open (non-void) tags

    def handle_starttag(self, tag, attrs):
        """Push ``tag`` on the open-tag stack (void elements are ignored)."""
        if tag in self._void_tags:
            return
        self.tags.append(tag)

    def handle_endtag(self, tag):
        """Pop ``tag`` from the stack.

        Raises:
            Exception: if ``tag`` does not close the innermost open tag.
        """
        if tag in self._void_tags:
            return

        if not self.tags:
            return

        if tag != self.tags[-1]:
            raise Exception("invalid html")

        self.tags.pop()

    def is_valid_tag(self):
        """Return True when text at the current position should be kept."""
        return not self.tags or self.tags[-1] not in blocked_tags

    def handle_data(self, d):
        """Collect a run of text unless it sits directly in a blocked tag."""
        if not self.is_valid_tag():
            return
        self.result.append(d)

    def handle_charref(self, number):
        """Collect the character for a numeric reference (``&#38;``, ``&#x26;``).

        NOTE(review): on Python 3.5+ ``convert_charrefs`` defaults to True, in
        which case the parser resolves references itself and is documented
        not to call this handler.
        """
        if not self.is_valid_tag():
            return
        if number[0] in ('x', 'X'):
            codepoint = int(number[1:], 16)
        else:
            codepoint = int(number)
        self.result.append(chr(codepoint))

    def handle_entityref(self, name):
        """Collect a named entity reference.

        The bare entity *name* is appended, not the character it stands for.
        NOTE(review): like ``handle_charref`` this is not invoked while
        ``convert_charrefs`` is True; confirm before resolving the name here.
        """
        if not self.is_valid_tag():
            return
        self.result.append(name)

    def get_text(self):
        """Return all collected text joined together, without outer whitespace."""
        return ''.join(self.result).strip()
asciimoo's avatar
asciimoo committed
120

121

asciimoo's avatar
asciimoo committed
122
def html_to_text(html):
    """Return the text content of an HTML fragment.

    Newlines and runs of whitespace are collapsed to single spaces before
    parsing. Extraction is done by ``HTMLTextExtractor``, so whatever that
    class raises on markup it rejects propagates to the caller.
    """
    html = html.replace('\n', ' ')
    html = ' '.join(html.split())
    extractor = HTMLTextExtractor()
    extractor.feed(html)
    # feed() keeps trailing text buffered when it contains an unterminated
    # "&" (e.g. "AT&T"), waiting for more input; close() flushes that text
    # to the handlers so it is not silently dropped.
    extractor.close()
    return extractor.get_text()
asciimoo's avatar
asciimoo committed
128
129
130
131
132
133
134
135
136
137


class UnicodeWriter:
    """
    A CSV writer which will write rows to CSV file "f",
    which is encoded in the given encoding.

    Rows are first formatted into an in-memory text buffer, then encoded and
    written to "f" as bytes, so "f" must accept ``bytes`` in ``write()``.
    """

    def __init__(self, f, dialect=csv.excel, encoding="utf-8", **kwds):
        # Redirect output to a queue
        self.queue = StringIO()
        self.writer = csv.writer(self.queue, dialect=dialect, **kwds)
        self.stream = f
        self.encoder = getincrementalencoder(encoding)()

    def writerow(self, row):
        """Format one row as CSV, encode it and write it to the stream."""
        self.writer.writerow(row)
        # Fetch the formatted row from the queue ...
        data = self.queue.getvalue()
        # ... and reencode it into the target encoding
        data = self.encoder.encode(data)
        # write to the target stream
        self.stream.write(data)
        # Empty the queue. truncate() does not move the write position, so
        # rewind first; otherwise the next row is written past the end and
        # the gap is padded with NUL characters.
        self.queue.seek(0)
        self.queue.truncate(0)

    def writerows(self, rows):
        """Write every row of ``rows`` with ``writerow``."""
        for row in rows:
            self.writerow(row)
Matej Cotman's avatar
Matej Cotman committed
158
159


160
161
162
163
164
165
def get_resources_directory(searx_directory, subdirectory, resources_directory):
    """Return the directory that holds a kind of resource.

    Args:
        searx_directory: base directory used when no explicit one is given.
        subdirectory: name joined onto ``searx_directory`` for the default.
        resources_directory: explicit directory; used as-is when truthy.

    Raises:
        Exception: if the resulting path is not an existing directory.
    """
    if not resources_directory:
        resources_directory = os.path.join(searx_directory, subdirectory)
    if not os.path.isdir(resources_directory):
        # Report the path that was actually checked (the previous code
        # referenced an undefined name here and died with NameError).
        raise Exception(resources_directory + " is not a directory")
    return resources_directory
Matej Cotman's avatar
Matej Cotman committed
166
167


168
def get_themes(templates_path):
    """Returns available themes list.

    Every entry of ``templates_path`` is reported as a theme, except the
    ``__common__`` entry.
    """
    return [name for name in os.listdir(templates_path) if name != '__common__']
174
175


176
def get_static_files(static_path):
    """Return the set of all files below ``static_path``.

    Each file is given as a path relative to ``static_path``.
    """
    static_files = set()
    for directory, _, files in os.walk(static_path):
        # relpath (unlike slicing off len(static_path) + 1 characters) also
        # works when static_path ends with a path separator.
        relative_dir = os.path.relpath(directory, static_path)
        if relative_dir == os.curdir:
            relative_dir = ''
        for filename in files:
            static_files.add(os.path.join(relative_dir, filename))
    return static_files
184
185


186
def get_result_templates(templates_path):
    """Return the set of files found in ``result_templates`` directories.

    ``templates_path`` is walked recursively; files are collected from every
    directory whose path ends with ``result_templates`` and are given as
    paths relative to ``templates_path``.
    """
    result_templates = set()
    for directory, _, files in os.walk(templates_path):
        if not directory.endswith('result_templates'):
            continue
        # relpath (unlike slicing off len(templates_path) + 1 characters)
        # also works when templates_path ends with a path separator.
        relative_dir = os.path.relpath(directory, templates_path)
        if relative_dir == os.curdir:
            relative_dir = ''
        for filename in files:
            result_templates.add(os.path.join(relative_dir, filename))
    return result_templates
Adam Tauber's avatar
Adam Tauber committed
195
196


197
def format_date_by_locale(date, locale_string):
    """Format ``date`` for display in the given locale.

    Dates up to and including the year 1900 are returned in ISO form
    (``YYYY-MM-DD``). The locale ``'all'`` is replaced by the configured
    ``settings['ui']['default_locale']`` (or ``en_US`` when that is empty).
    If babel does not know the locale, a fixed year-month-day pattern is
    used instead.
    """
    # strftime works only on dates after 1900
    if date.year <= 1900:
        return date.isoformat().split('T')[0]

    if locale_string == 'all':
        locale_string = settings['ui']['default_locale'] or 'en_US'

    # to avoid crashing if locale is not supported by babel
    try:
        formatted_date = format_date(date, locale=locale_string)
    except UnknownLocaleError:
        # Lower-case "yyyy" is the calendar year. Upper-case "YYYY" is the
        # week-numbering year, which is off by one around New Year.
        formatted_date = format_date(date, "yyyy-MM-dd")

    return formatted_date
213
214
215
216
217
218
219
220


def dict_subset(d, properties):
    """Return a new dict with the entries of ``d`` whose key is in ``properties``.

    Keys listed in ``properties`` but missing from ``d`` are skipped.
    """
    return {key: d[key] for key in properties if key in d}
Adam Tauber's avatar
Adam Tauber committed
221
222


Adam Tauber's avatar
Adam Tauber committed
223
224
def prettify_url(url, max_length=74):
    """Shorten ``url`` for display when it is longer than ``max_length``.

    A long URL is reduced to its first and last ``int(max_length / 2 + 1)``
    characters joined by ``[...]``; shorter URLs are returned unchanged.
    """
    if len(url) <= max_length:
        return url
    chunk_len = int(max_length / 2 + 1)
    return '{0}[...]{1}'.format(url[:chunk_len], url[-chunk_len:])
229
230


231
232
233
234
235
236
def list_get(a_list, index, default=None):
    """Return ``a_list[index]``, or ``default`` when ``index`` is past the end."""
    return a_list[index] if index < len(a_list) else default
Noemi Vanyi's avatar
Noemi Vanyi committed
237
238
239
240
241
242
243
244
245
246
247
248
249
250


def get_torrent_size(filesize, filesize_multiplier):
    """Convert a size given as number plus unit into a number of bytes.

    Args:
        filesize: the numeric part; anything ``float()`` accepts.
        filesize_multiplier: the unit, one of ``TB``/``GB``/``MB``/``KB`` or
            ``TiB``/``GiB``/``MiB``/``KiB``.

    Returns:
        The size as ``int`` for a known unit, the plain ``float`` value for
        an unknown unit, or ``None`` if ``filesize`` is not a number.
    """
    # unit -> (base, exponent)
    # NOTE(review): the "B" units use 1024 and the "iB" units use 1000, the
    # reverse of the IEC/SI convention. Kept as-is because results depend
    # on it; confirm with the callers before swapping.
    units = {
        'TB': (1024, 4), 'GB': (1024, 3), 'MB': (1024, 2), 'KB': (1024, 1),
        'TiB': (1000, 4), 'GiB': (1000, 3), 'MiB': (1000, 2), 'KiB': (1000, 1),
    }
    try:
        filesize = float(filesize)
        scale = units.get(filesize_multiplier) if isinstance(filesize_multiplier, str) else None
        if scale is not None:
            base, exponent = scale
            # Multiply step by step so float rounding matches a written-out
            # chain of multiplications.
            for _ in range(exponent):
                filesize = filesize * base
            filesize = int(filesize)
    except (ValueError, TypeError):
        # ValueError: not a number (or NaN); TypeError: e.g. None.
        filesize = None

    return filesize
potato's avatar
potato committed
263
264


Noémi Ványi's avatar
Noémi Ványi committed
265
266
267
268
269
270
271
def convert_str_to_int(number_str):
    """Return ``number_str`` as an int, or 0 when it is not all digits."""
    return int(number_str) if number_str.isdigit() else 0


272
273
274
275
276
277
278
279
280
def int_or_zero(num):
    """Convert a variable to an integer, or return 0 if it is not a number.

    For a list, its first element is converted; an empty list gives 0.
    """
    if not isinstance(num, list):
        return convert_str_to_int(num)
    if len(num) < 1:
        return 0
    return convert_str_to_int(num[0])


potato's avatar
potato committed
281
282
283
284
285
def is_valid_lang(lang):
    """Look up ``lang`` in ``language_codes``.

    A two-character ``lang`` is compared with the first two characters of
    each entry's element 0; any other value is compared case-insensitively
    with element 1.

    Returns:
        ``(True, <first two chars of element 0>, <element 3 lower-cased>)``
        for the first matching entry, otherwise ``False``.
    """
    # presumably entries are (code, name, ..., english name) - verify
    # against searx.languages
    wanted = lang.lower()
    is_abbr = (len(lang) == 2)
    for entry in language_codes:
        candidate = entry[0][:2] if is_abbr else entry[1].lower()
        if candidate == wanted:
            return (True, entry[0][:2], entry[3].lower())
    return False
293
294


295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
def _match_language(lang_code, lang_list=[], custom_aliases={}):
    """Auxiliary function to match ``lang_code`` in ``lang_list``.

    Tries, in order: the code itself (after alias replacement), the code
    built from babel's likely subtags for it, and finally the first entry
    of ``lang_list`` whose language part equals the code. Returns ``None``
    when nothing matches.
    """
    # replace language code with a custom alias if necessary
    lang_code = custom_aliases.get(lang_code, lang_code)
    if lang_code in lang_list:
        return lang_code

    # try to get the most likely country for this language
    likely = get_global('likely_subtags').get(lang_code)
    if likely:
        pieces = likely.split('_')
        candidate = pieces[0] + '-' + pieces[-1]
        candidate = custom_aliases.get(candidate, candidate)
        if candidate in lang_list:
            return candidate

    # try to get any supported country for this language
    return next((lc for lc in lang_list if lc.split('-')[0] == lang_code), None)


def match_language(locale_code, lang_list=[], custom_aliases={}, fallback='en-US'):
    """Get the language code from ``lang_list`` that best matches ``locale_code``.

    Returns ``fallback`` when no entry of ``lang_list`` can be matched.
    """
    # try to get language from given locale_code
    found = _match_language(locale_code, lang_list, custom_aliases)
    if found:
        return found

    parts = locale_code.split('-')
    lang_code = parts[0]

    # try to get language using an equivalent country code
    if len(parts) > 1:
        country_alias = get_global('territory_aliases').get(parts[-1])
        if country_alias:
            found = _match_language(lang_code + '-' + country_alias[0], lang_list, custom_aliases)
            if found:
                return found

    # try to get language using an equivalent language code
    alias = get_global('language_aliases').get(lang_code)
    if alias:
        found = _match_language(alias, lang_list, custom_aliases)
        if found:
            return found

    # nothing left to try when the locale had no country part
    if lang_code == locale_code:
        return fallback

    # try to get language from given language without giving the country
    return _match_language(lang_code, lang_list, custom_aliases) or fallback


354
355
356
357
358
359
360
361
def load_module(filename, module_dir):
    """Load the Python source file ``filename`` from ``module_dir`` as a module.

    The module name is the file name without its extension. A module of
    that name already present in ``sys.modules`` is discarded first, so the
    file is always executed afresh. The loaded module is registered in
    ``sys.modules`` and gets a ``name`` attribute holding the module name.
    """
    # Function-scope imports: importlib is used instead of imp.load_source,
    # because the imp module is deprecated and removed in Python 3.12.
    import importlib.machinery
    import importlib.util

    modname = splitext(filename)[0]
    if modname in sys.modules:
        del sys.modules[modname]
    filepath = join(module_dir, filename)

    # An explicit source loader keeps this independent of the file suffix.
    loader = importlib.machinery.SourceFileLoader(modname, filepath)
    spec = importlib.util.spec_from_file_location(modname, filepath, loader=loader)
    module = importlib.util.module_from_spec(spec)
    sys.modules[modname] = module
    try:
        loader.exec_module(module)
    except BaseException:
        # do not leave a half-initialised module behind
        sys.modules.pop(modname, None)
        raise
    module.name = modname
    return module
Noémi Ványi's avatar
Noémi Ványi committed
362
363
364


def new_hmac(secret_key, url):
    """Return the hex HMAC-SHA256 digest of ``url`` keyed with ``secret_key``.

    Args:
        secret_key: the key as ``str``; it is encoded as UTF-8.
        url: the message as ``bytes``. A ``str`` is accepted as well and is
            encoded as UTF-8 first.
    """
    if isinstance(url, str):
        url = url.encode('utf-8')
    return hmac.new(bytes(secret_key, 'utf-8'), url, hashlib.sha256).hexdigest()
366
367
368


def to_string(obj):
    """Return ``obj`` as a string.

    A ``str`` is returned unchanged and a number is converted with
    ``str()``. Any other object is converted through its ``__str__``
    method, or ``__repr__`` if it has no ``__str__``. ``None`` is returned
    only for an object that offers neither.
    """
    if isinstance(obj, str):
        return obj
    if isinstance(obj, Number):
        return str(obj)
    if hasattr(obj, '__str__'):
        return obj.__str__()
    if hasattr(obj, '__repr__'):
        return obj.__repr__()
    return None