bing_images.py 3.54 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
"""
 Bing (Images)

 @website     https://www.bing.com/images
 @provide-api yes (http://datamarket.azure.com/dataset/bing/search),
              max. 5000 query/month

 @using-api   no (because of query limit)
 @results     HTML (using search portal)
 @stable      no (HTML can change)
 @parse       url, title, img_src

 @todo        currently up to 35 images are received per page,
              because bing does not honor count=10;
              limit the response to 10 images
"""
Thomas Pointhuber's avatar
Thomas Pointhuber committed
17
18

from lxml import html
Adam Tauber's avatar
Adam Tauber committed
19
from json import loads
Thomas Pointhuber's avatar
Thomas Pointhuber committed
20
import re
Adam Tauber's avatar
Adam Tauber committed
21
from searx.url_utils import urlencode
22
from searx.utils import match_language
Thomas Pointhuber's avatar
Thomas Pointhuber committed
23
24
25
26

# engine dependent config
categories = ['images']
# request() derives the result offset from params['pageno']
paging = True
# request() maps params['safesearch'] to the ADLT cookie value
safesearch = True
# request() appends an age filter when params['time_range'] is a known key
time_range_support = True
# request() resolves params['language'] to a Bing market code
language_support = True
# NOTE(review): presumably the page handed to _fetch_supported_languages()
# -- confirm against the engine loader
supported_languages_url = 'https://www.bing.com/account/general'
Thomas Pointhuber's avatar
Thomas Pointhuber committed
31
32
33
34

# search-url
base_url = 'https://www.bing.com/'
# {query} is an already url-encoded "q=..." pair, {offset} the 1-based index
# of the first result to return
search_string = 'images/search?{query}&count=10&first={offset}'
# optional suffix restricting results by age; {interval} is taken from
# time_range_dict
time_range_string = '&qft=+filterui:age-lt{interval}'
# the values equal the number of minutes in 1, 7, 30 and 365 days
time_range_dict = {
    'day': '1440',
    'week': '10080',
    'month': '43200',
    'year': '525600',
}
Thomas Pointhuber's avatar
Thomas Pointhuber committed
40

41
42
43
44
# safesearch definitions: safesearch level -> value used for the ADLT cookie
safesearch_types = {
    0: 'OFF',
    1: 'DEMOTE',
    2: 'STRICT',
}
Thomas Pointhuber's avatar
Thomas Pointhuber committed
45

Thomas Pointhuber's avatar
Thomas Pointhuber committed
46

Adam Tauber's avatar
Adam Tauber committed
47
48
49
_quote_keys_regex = re.compile('({|,)([a-z][a-z0-9]*):(")', re.I | re.U)


Thomas Pointhuber's avatar
Thomas Pointhuber committed
50
51
52
53
54
55
56
57
# do search-request
def request(query, params):
    """Fill ``params`` with the URL and cookies for a Bing image search.

    Reads ``pageno``, ``language``, ``safesearch`` and ``time_range`` from
    ``params``, writes ``params['url']`` plus the ``SRCHHPGUSR`` and
    ``_EDGE_S`` cookies, and returns the same ``params`` dict.
    """
    # 'first' is 1-based and every page advances it by 10
    first_result = 10 * (params['pageno'] - 1) + 1
    encoded_query = urlencode({'q': query})
    search_path = search_string.format(query=encoded_query, offset=first_result)

    # NOTE(review): supported_languages / language_aliases are not defined in
    # this module; presumably injected by the engine loader -- confirm
    language = match_language(params['language'], supported_languages, language_aliases).lower()

    # unknown safesearch levels fall back to 'DEMOTE'
    adult_filter = safesearch_types.get(params['safesearch'], 'DEMOTE')
    cookies = params['cookies']
    cookies['SRCHHPGUSR'] = 'ADLT=' + adult_filter
    cookies['_EDGE_S'] = 'mkt=' + language + '&ui=' + language + '&F=1'

    params['url'] = base_url + search_path

    # only add the age filter for time ranges listed in time_range_dict
    interval = time_range_dict.get(params['time_range'])
    if interval is not None:
        params['url'] += time_range_string.format(interval=interval)

    return params


# get response from search-request
def response(resp):
    """Parse a Bing image-search HTML page into a list of result dicts.

    ``resp.text`` is parsed as HTML. Every ``div.imgpt`` entry below
    ``#mmComponent_images_1`` is expected to hold an anchor whose ``m``
    attribute carries JSON-like metadata with the page url (``purl``), the
    image url (``murl``) and the thumbnail url (``turl``).

    Entries without an anchor, without usable metadata, or without a page or
    image url are skipped, so one malformed entry does not discard the whole
    page of results.

    Returns a list of dicts with the keys ``template``, ``url``, ``title``,
    ``content``, ``thumbnail_src`` and ``img_src``.
    """
    results = []

    dom = html.fromstring(resp.text)

    # parse results
    for result in dom.xpath('//div[@id="mmComponent_images_1"]/ul/li/div/div[@class="imgpt"]'):
        links = result.xpath('./a')
        if not links:
            continue
        link = links[0]

        metadata = link.attrib.get('m')
        if not metadata:
            continue

        # parse json-data (it is required to add a space, to make it parsable)
        try:
            json_data = loads(_quote_keys_regex.sub(r'\1"\2": \3', metadata))
        except ValueError:
            # metadata that still is not valid json after quoting the keys
            continue
        if not isinstance(json_data, dict):
            continue

        url = json_data.get('purl')
        img_src = json_data.get('murl')
        if not url or not img_src:
            continue
        thumbnail = json_data.get('turl')

        # TODO find actual title
        alt_texts = link.xpath('.//img/@alt')
        title = alt_texts[0] if alt_texts else ''

        # append result
        results.append({'template': 'images.html',
                        'url': url,
                        'title': title,
                        'content': '',
                        'thumbnail_src': thumbnail,
                        'img_src': img_src})

        # TODO stop parsing if 10 images are found

    # return results
    return results
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125


# get supported languages from their site
def _fetch_supported_languages(resp):
    """Extract the market codes offered on Bing's region settings page.

    ``resp.text`` is parsed as HTML and the ``setmkt=<code>`` query value is
    read from every region link. Links without a ``setmkt`` value are
    skipped. The code ``nb-NO`` is reported as ``no-NO``.

    Returns the list of codes in document order.
    """
    dom = html.fromstring(resp.text)

    regions_xpath = '//div[@id="region-section-content"]' \
                    + '//ul[@class="b_vList"]/li/a/@href'

    supported_languages = []
    for region in dom.xpath(regions_xpath):
        # raw string: the former '\&' was an invalid escape sequence
        match = re.search(r'setmkt=([^&]+)', region)
        if match is None:
            continue

        code = match.group(1)
        if code == 'nb-NO':
            code = 'no-NO'

        supported_languages.append(code)

    return supported_languages