"""
 Google (Videos)

 @website     https://www.google.com
 @provide-api yes (https://developers.google.com/custom-search/)

 @using-api   no
 @results     HTML
 @stable      no
 @parse       url, title, content, thumbnail
"""

from datetime import date, timedelta
from json import loads
from lxml import html
from searx.engines.xpath import extract_text
from searx.url_utils import urlencode
import re

# engine dependent config
categories = ['videos']
paging = True
safesearch = True
time_range_support = True
# results per page; request() multiplies it by the page index to get `start`
number_of_results = 10

# `{query}` is filled with an already url-encoded `q=...` pair (see request()),
# so the template must not add a `q=` prefix of its own.
search_url = 'https://www.google.com/search'\
    '?{query}'\
    '&tbm=vid'\
    '&{search_options}'
# `tbs` value for the predefined ranges listed in time_range_dict
time_range_attr = "qdr:{range}"
# `tbs` value for an explicit date range; every field is `name:value`
time_range_custom_attr = "cdr:1,cd_min:{start},cd_max:{end}"
time_range_dict = {'day': 'd',
                   'week': 'w',
                   'month': 'm'}


# do search-request
def request(query, params):
    """Fill ``params['url']`` with the search URL for ``query`` and return ``params``.

    Reads ``params['pageno']``, ``params['time_range']`` and
    ``params['safesearch']``.  The page number becomes the zero-based ``ijn``
    value and the ``start`` offset (page index times ``number_of_results``).
    A time range found in ``time_range_dict`` is sent through
    ``time_range_attr``; ``'year'`` is sent through ``time_range_custom_attr``
    as the span from 365 days ago to today, both formatted as ``%m/%d/%Y``.
    Any other time range adds no ``tbs`` option.
    """
    page_index = params['pageno'] - 1
    search_options = {
        'ijn': page_index,
        'start': page_index * number_of_results
    }

    time_range = params['time_range']
    if time_range in time_range_dict:
        search_options['tbs'] = time_range_attr.format(range=time_range_dict[time_range])
    elif time_range == 'year':
        today = date.today()
        year_ago = today - timedelta(days=365)
        search_options['tbs'] = time_range_custom_attr.format(start=year_ago.strftime('%m/%d/%Y'),
                                                              end=today.strftime('%m/%d/%Y'))

    # the engine-level switch and the per-request setting must both be truthy
    if safesearch and params['safesearch']:
        search_options['safe'] = 'on'

    encoded_query = urlencode({'q': query})
    encoded_options = urlencode(search_options)
    params['url'] = search_url.format(query=encoded_query, search_options=encoded_options)

    return params


# get response from search-request
def response(resp):
    """Parse the HTML in ``resp.text`` into a list of video result dicts.

    Every ``<div class="g">`` that contains a link yields one dict with the
    keys ``url``, ``title``, ``content``, ``thumbnail`` and ``template``
    (always ``'videos.html'``).  Result blocks without a link are skipped.
    ``thumbnail`` is an empty string when no image data can be found for
    the result.
    """
    results = []

    dom = html.fromstring(resp.text)

    # The thumbnail data is looked up in the first script that mentions
    # "_setImagesSrc".  It is the same script for every result, so fetch it
    # once; a page without such a script simply gives results no thumbnail.
    scripts = dom.xpath('//script[contains(., "_setImagesSrc")]')
    script = str(scripts[0].text) if scripts else ''

    # parse results
    for result in dom.xpath('//div[@class="g"]'):
        hrefs = result.xpath('.//div[@class="r"]/a/@href')
        if not hrefs:
            # a block without a link cannot become a result; skip it
            # instead of failing the whole page with an IndexError
            continue

        title = extract_text(result.xpath('.//h3'))
        url = hrefs[0]
        content = extract_text(result.xpath('.//span[@class="st"]'))

        # get thumbnails
        img_ids = result.xpath('.//div[@class="s"]//img/@id')
        thumbnail = _extract_thumbnail(script, img_ids[0]) if img_ids else ''

        # append result
        results.append({'url': url,
                        'title': title,
                        'content': content,
                        'thumbnail': thumbnail,
                        'template': 'videos.html'})

    return results


def _extract_thumbnail(script, img_id):
    """Return the last JPEG data URI that ``script`` associates with ``img_id``, or ``''``."""
    # Capture the `s='...'` string that precedes the `var ii=[...]` list
    # starting (after optional other entries) with this image id.  The id is
    # escaped so it is always matched literally.
    pattern = (r's=\'(.*?)(?:\\\\[a-z,1-9,\\\\]+\'|\')\;var ii=\[(?:|[\'vidthumb\d+\',]+)\''
               + re.escape(img_id))
    thumbnails_data = re.findall(pattern, script)
    if not thumbnails_data:
        return ''

    data_uris = re.findall(r'(data:image/jpeg;base64,[a-z,A-Z,0-9,/,\+]+)', thumbnails_data[0])
    return data_uris[-1] if data_uris else ''