"""
 Wikipedia (Web)

 @website     https://{language}.wikipedia.org
 @provide-api yes

 @using-api   yes
 @results     JSON
 @stable      yes
 @parse       url, infobox
"""

from json import loads
from lxml.html import fromstring
from searx.url_utils import quote, urlencode
from searx.utils import match_language

# search-url
base_url = u'https://{language}.wikipedia.org/'
search_url = base_url + u'w/api.php?'\
    'action=query'\
    '&format=json'\
    '&{query}'\
    '&prop=extracts|pageimages'\
    '&exintro'\
    '&explaintext'\
    '&pithumbsize=300'\
    '&redirects'
supported_languages_url = 'https://meta.wikimedia.org/wiki/List_of_Wikipedias'
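
# Example (illustrative only, shown wrapped here; the real URL is one line):
# for query 'paris|Paris' and language 'en' the template above expands to
#   https://en.wikipedia.org/w/api.php?action=query&format=json
#       &titles=paris%7CParis&prop=extracts|pageimages&exintro&explaintext
#       &pithumbsize=300&redirects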


# map the searx language code to the wikipedia subdomain language
# (supported_languages and language_aliases are set by searx's engine loader)
def url_lang(lang):
    lang_pre = lang.split('-')[0]
    if lang_pre == 'all' or (lang_pre not in supported_languages and lang_pre not in language_aliases):
        return 'en'
    return match_language(lang, supported_languages, language_aliases).split('-')[0]
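
# Illustrative behaviour (assuming 'pt' is present in supported_languages):
#   url_lang('pt-BR') -> 'pt'
#   url_lang('all')   -> 'en'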


# do search-request
def request(query, params):
    # for an all-lowercase query, also ask for the title-cased variant, so
    # that e.g. 'paris' still matches the canonically capitalized 'Paris'
    if query.islower():
        query = u'{0}|{1}'.format(query.decode('utf-8'), query.decode('utf-8').title()).encode('utf-8')

    params['url'] = search_url.format(query=urlencode({'titles': query}),
                                      language=url_lang(params['language']))

    return params
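
# Minimal sketch of how searx exercises this hook (the real `params` dict is
# prepared by searx.search; the values below are illustrative):
#
#     params = {'language': 'en-US'}
#     request(b'paris', params)
#     # params['url'] now points at the en.wikipedia.org query API and asks
#     # for the titles 'paris|Paris'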


# get first meaningful paragraph
# this should filter out disambiguation pages and notes above first paragraph
# "magic numbers" were obtained by fine tuning
def extract_first_paragraph(content, title, image):
    first_paragraph = None

    failed_attempts = 0
    for paragraph in content.split('\n'):

        starts_with_title = paragraph.lower().find(title.lower(), 0, len(title) + 35)
        length = len(paragraph)

        if length >= 200 or (starts_with_title >= 0 and (image or length >= 150)):
            first_paragraph = paragraph
            break

        failed_attempts += 1
        if failed_attempts > 3:
            return None

    return first_paragraph
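
# Illustrative behaviour of the heuristic above:
#   - a paragraph of 200+ characters is always accepted
#   - a shorter paragraph is accepted if it starts with the title (within the
#     first len(title) + 35 characters) and an image exists or it is 150+ long
#   - after more than 3 rejected paragraphs (e.g. disambiguation notes) the
#     search gives up and returns None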


# get response from search-request
def response(resp):
    results = []

    search_result = loads(resp.text)
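
    # the parsed JSON is shaped roughly like this (abridged example with
    # placeholder values, not a verbatim capture):
    #
    #   {"query": {"pages": {"12345": {
    #       "pageid": 12345,
    #       "title": "Paris",
    #       "extract": "Paris is the capital ...",
    #       "thumbnail": {"source": "https://upload.wikimedia.org/..."}}}}}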

    # wikipedia article's unique id
    # first valid id is assumed to be the requested article
    for article_id in search_result['query']['pages']:
        page = search_result['query']['pages'][article_id]
        if int(article_id) > 0:
            break

    if int(article_id) < 0:
        return []

    title = page.get('title')

    image = page.get('thumbnail')
    if image:
        image = image.get('source')

    extract = page.get('extract')

    # some pages come back without an extract; avoid calling .split on None
    summary = extract_first_paragraph(extract, title, image) if extract else None

    # link to wikipedia article
    wikipedia_link = base_url.format(language=url_lang(resp.search_params['language'])) \
        + 'wiki/' + quote(title.replace(' ', '_').encode('utf8'))

    results.append({'url': wikipedia_link, 'title': title})

    results.append({'infobox': title,
                    'id': wikipedia_link,
                    'content': summary,
                    'img_src': image,
                    'urls': [{'title': 'Wikipedia', 'url': wikipedia_link}]})

    return results


# get supported languages from their site
def _fetch_supported_languages(resp):
    supported_languages = {}
    dom = fromstring(resp.text)
    tables = dom.xpath('//table[contains(@class,"sortable")]')
    for table in tables:
        # exclude header row
        trs = table.xpath('.//tr')[1:]
        for tr in trs:
            td = tr.xpath('./td')
            # column order as parsed here: td[1] english name, td[2] local
            # name, td[3] language code, td[4] article count (inside <a><b>)
            code = td[3].xpath('./a')[0].text
            name = td[2].xpath('./a')[0].text
            english_name = td[1].xpath('./a')[0].text
            articles = int(td[4].xpath('./a/b')[0].text.replace(',', ''))
            # exclude languages with too few articles
            if articles >= 100:
                supported_languages[code] = {"name": name, "english_name": english_name, "articles": articles}

    return supported_languages
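
# Shape of the returned mapping (illustrative values, not real counts):
#   {'en': {'name': 'English', 'english_name': 'English', 'articles': 5000000},
#    'de': {'name': 'Deutsch', 'english_name': 'German', 'articles': 2000000},
#    ...}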