duckduckgo.py 3.4 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
"""
 DuckDuckGo (Web)

 @website     https://duckduckgo.com/
 @provide-api yes (https://duckduckgo.com/api),
              but not all results from search-site

 @using-api   no
 @results     HTML (using search portal)
 @stable      no (HTML can change)
 @parse       url, title, content

 @todo        rewrite to api
"""
15

Adam Tauber's avatar
Adam Tauber committed
16
from lxml.html import fromstring
17
from json import loads
Cqoicebordel's avatar
Cqoicebordel committed
18
from searx.engines.xpath import extract_text
19
from searx.poolrequests import get
Adam Tauber's avatar
Adam Tauber committed
20
from searx.url_utils import urlencode
21
from searx.utils import match_language
asciimoo's avatar
asciimoo committed
22

23
24
25
# engine dependent config
categories = ['general']
# request() derives the result offset from params['pageno']
paging = True
# request() maps params['language'] to a DuckDuckGo region code
language_support = True
# presumably the resource whose response is handed to
# _fetch_supported_languages() -- confirm against the engine loader
supported_languages_url = 'https://duckduckgo.com/util/u172.js'
# request() appends a "df" parameter derived from params['time_range']
time_range_support = True
asciimoo's avatar
asciimoo committed
29

30
31
32
33
34
35
36
37
38
39
# alias table passed to match_language() by get_region_code();
# presumably maps standard language codes to the non-standard identifiers
# DuckDuckGo uses for them -- confirm against match_language()
language_aliases = {
    'ar-SA': 'ar-XA',
    'es-419': 'es-XL',
    'ja': 'jp-JP',
    'ko': 'kr-KR',
    'sl-SI': 'sl-SL',
    'zh-TW': 'tzh-TW',
    'zh-HK': 'tzh-HK'
}

40
# search-url
# {query} receives the url-encoded query string; request() fills both
# {offset} and {dc_param} with the same result offset
url = 'https://duckduckgo.com/html?{query}&s={offset}&dc={dc_param}'
# suffix request() appends to the search url for a supported time range
time_range_url = '&df={range}'

# params['time_range'] value -> value substituted for {range}
time_range_dict = {'day': 'd',
                   'week': 'w',
                   'month': 'm'}
47
48

# specific xpath variables
a01200356's avatar
a01200356 committed
49
50
51
52
# selects the result containers; the class attribute is compared as one exact
# string, including its trailing space
result_xpath = '//div[@class="result results_links results_links_deep web-result "]'  # noqa
# the following expressions are evaluated relative to each result container
# in response()
url_xpath = './/a[@class="result__a"]/@href'
title_xpath = './/a[@class="result__a"]'
content_xpath = './/a[@class="result__snippet"]'
Adam Tauber's avatar
Adam Tauber committed
53

54

marc's avatar
marc committed
55
# match query's language to a region code that duckduckgo will accept
56
def get_region_code(lang, lang_list=None):
    """Return the DuckDuckGo region code for the language *lang*.

    :param lang: requested language code; ``'all'`` means no restriction.
    :param lang_list: language codes handed to ``match_language`` as the
        list to match against; ``None`` is treated as an empty list.
    :returns: ``None`` when *lang* is ``'all'``, otherwise the matched code
        with its two parts swapped and lowercased (a matched ``en-US``
        yields ``us-en``).
    """
    if lang == 'all':
        return None

    # `lang_list or []` gives every call its own list instead of sharing one
    # mutable default; 'wt-WT' is passed as match_language's fallback argument
    lang_code = match_language(lang, lang_list or [], language_aliases, 'wt-WT')
    lang_parts = lang_code.split('-')

    # country code goes first
    return lang_parts[1].lower() + '-' + lang_parts[0].lower()
marc's avatar
marc committed
65
66
67
68
69
70


def request(query, params):
    """Fill ``params['url']`` with the DuckDuckGo HTML search url for *query*.

    Reads ``params['time_range']``, ``params['pageno']`` and
    ``params['language']``. If a time range is given that is not a key of
    ``time_range_dict``, *params* is returned untouched (no url is set).

    :param query: the search terms, url-encoded under the ``q`` key.
    :param params: request parameter dict; modified in place.
    :returns: the same *params* dict.
    """
    time_range = params['time_range']
    if time_range and time_range not in time_range_dict:
        return params

    # 30 results per page; the same offset is sent as both `s` and `dc`
    offset = (params['pageno'] - 1) * 30

    query_args = {'q': query}
    region_code = get_region_code(params['language'], supported_languages)
    if region_code:
        # `kl` is only sent when a region code was determined
        query_args['kl'] = region_code

    params['url'] = url.format(
        query=urlencode(query_args), offset=offset, dc_param=offset)

    if time_range in time_range_dict:
        params['url'] += time_range_url.format(range=time_range_dict[time_range])

    return params


87
# get response from search-request
asciimoo's avatar
asciimoo committed
88
def response(resp):
    """Parse a DuckDuckGo HTML result page into a list of result dicts.

    ``resp.text`` is parsed with ``fromstring``; every element matched by
    ``result_xpath`` yields a dict with the keys ``'title'``, ``'content'``
    and ``'url'``. Elements without a non-empty link are skipped.

    :param resp: response object exposing the page markup as ``resp.text``.
    :returns: list of result dicts, in document order.
    """
    results = []

    doc = fromstring(resp.text)

    # parse results
    for r in doc.xpath(result_xpath):
        try:
            # take the last matching href of the result element
            res_url = r.xpath(url_xpath)[-1]
        except IndexError:
            # the element contains no result link
            continue

        if not res_url:
            continue

        title = extract_text(r.xpath(title_xpath))
        content = extract_text(r.xpath(content_xpath))

        # append result
        results.append({'title': title,
                        'content': content,
                        'url': res_url})

    # return results
    return results
113
114
115


# get supported languages from their site
116
def _fetch_supported_languages(resp):
    """Extract the supported language codes from the regions script.

    ``resp.text`` is a js file with the regions as an embedded object
    (``regions:{...}``). The object literal is cut out of the text, parsed
    as JSON, and each key is rearranged: a key ``us-en`` becomes ``en-US``.

    :param resp: response object exposing the script source as ``resp.text``.
    :returns: list of the converted codes, in the order of the object's keys.
    """
    # response is a js file with regions as an embedded object
    response_page = resp.text

    # skip the 8 characters of 'regions:' so the text starts at the opening brace
    response_page = response_page[response_page.find('regions:{') + 8:]
    # keep everything up to and including the first closing brace
    response_page = response_page[:response_page.find('}') + 1]

    regions_json = loads(response_page)

    # keys are assumed to look like 'us-en': the first two characters are
    # uppercased and moved behind the part after the separator
    return [key[3:] + '-' + key[:2].upper() for key in regions_json]