# -*- coding: utf-8 -*-
"""
 Wikidata

 @website     https://wikidata.org
 @provide-api yes (https://wikidata.org/w/api.php)

 @using-api   partially (most things require scraping)
 @results     JSON, HTML
 @stable      no (html can change)
 @parse       url, infobox
"""

from searx import logger
from searx.poolrequests import get
from searx.engines.xpath import extract_text
from searx.engines.wikipedia import _fetch_supported_languages, supported_languages_url
from searx.url_utils import urlencode
from searx.utils import match_language

from json import loads
from lxml.html import fromstring

logger = logger.getChild('wikidata')
result_count = 1
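# number of search hits that are expanded into full infoboxes; every extra hit
# costs one more blocking HTTP request (see the TODO in response())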

# urls
wikidata_host = 'https://www.wikidata.org'
url_search = wikidata_host \
    + '/w/index.php?{query}'

wikidata_api = wikidata_host + '/w/api.php'
url_detail = wikidata_api\
    + '?action=parse&format=json&{query}'\
    + '&redirects=1&prop=text%7Cdisplaytitle%7Clanglinks%7Crevid'\
    + '&disableeditsection=1&disabletidy=1&preview=1&sectionpreview=1&disabletoc=1&utf8=1&formatversion=2'
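# e.g. with page=Q42&uselang=en the detail request becomes
# https://www.wikidata.org/w/api.php?action=parse&format=json&page=Q42&uselang=en&redirects=1&...
# (Q42 is only an illustrative entity id)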

url_map = 'https://www.openstreetmap.org/'\
    + '?lat={latitude}&lon={longitude}&zoom={zoom}&layers=M'
url_image = 'https://commons.wikimedia.org/wiki/Special:FilePath/{filename}?width=500&height=400'

# xpaths
wikidata_ids_xpath = '//ul[@class="mw-search-results"]/li//a/@href'
title_xpath = '//*[contains(@class,"wikibase-title-label")]'
description_xpath = '//div[contains(@class,"wikibase-entitytermsview-heading-description")]'
property_xpath = '//div[@id="{propertyid}"]'
label_xpath = './/div[contains(@class,"wikibase-statementgroupview-property-label")]/a'
url_xpath = './/a[contains(@class,"external free") or contains(@class, "wb-external-id")]'
wikilink_xpath = './/ul[contains(@class,"wikibase-sitelinklistview-listview")]'\
    + '/li[contains(@data-wb-siteid,"{wikiid}")]//a/@href'
property_row_xpath = './/div[contains(@class,"wikibase-statementview")]'
preferred_rank_xpath = './/span[contains(@class,"wikibase-rankselector-preferred")]'
value_xpath = './/div[contains(@class,"wikibase-statementview-mainsnak")]'\
    + '/*/div[contains(@class,"wikibase-snakview-value")]'
language_fallback_xpath = '//sup[contains(@class,"wb-language-fallback-indicator")]'
calendar_name_xpath = './/sup[contains(@class,"wb-calendar-name")]'
media_xpath = value_xpath + '//div[contains(@class,"commons-media-caption")]//a'
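# note: all selectors above target the rendered Wikibase UI markup rather than
# a stable API, hence the "@stable no" in the docstring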


def request(query, params):
    params['url'] = url_search.format(
        query=urlencode({'search': query}))
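
    # e.g. a query for "searx" becomes
    # https://www.wikidata.org/w/index.php?search=searx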
    return params


def response(resp):
    results = []
    html = fromstring(resp.text)
    search_results = html.xpath(wikidata_ids_xpath)

    language = match_language(resp.search_params['language'], supported_languages, language_aliases).split('-')[0]

    # TODO: make requests asynchronous to avoid timeout when result_count > 1
    for search_result in search_results[:result_count]:
        wikidata_id = search_result.split('/')[-1]
        url = url_detail.format(query=urlencode({'page': wikidata_id, 'uselang': language}))
        htmlresponse = get(url)
        jsonresponse = loads(htmlresponse.text)
        results += getDetail(jsonresponse, wikidata_id, language, resp.search_params['language'])

    return results


def getDetail(jsonresponse, wikidata_id, language, locale):
    results = []
    urls = []
    attributes = []

    title = jsonresponse.get('parse', {}).get('displaytitle', {})
    result = jsonresponse.get('parse', {}).get('text', {})

    if not title or not result:
        return results

    title = fromstring(title)
    for elem in title.xpath(language_fallback_xpath):
        elem.getparent().remove(elem)
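    # (the superscripts removed above mark labels that fell back to another
    # language; stripping them keeps them out of the extracted text)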
    title = extract_text(title.xpath(title_xpath))

    result = fromstring(result)
    for elem in result.xpath(language_fallback_xpath):
        elem.getparent().remove(elem)

    description = extract_text(result.xpath(description_xpath))

    # URLS

    # official website
    add_url(urls, result, 'P856', results=results)

    # wikipedia
    wikipedia_link_count = 0
    wikipedia_link = get_wikilink(result, language + 'wiki')
    if wikipedia_link:
        wikipedia_link_count += 1
        urls.append({'title': 'Wikipedia (' + language + ')',
                     'url': wikipedia_link})

    if language != 'en':
        wikipedia_en_link = get_wikilink(result, 'enwiki')
        if wikipedia_en_link:
            wikipedia_link_count += 1
            urls.append({'title': 'Wikipedia (en)',
                         'url': wikipedia_en_link})

    # TODO: get_wiki_firstlanguage
    # if wikipedia_link_count == 0:

    # more wikis
    add_url(urls, result, default_label='Wikivoyage (' + language + ')', link_type=language + 'wikivoyage')
    add_url(urls, result, default_label='Wikiquote (' + language + ')', link_type=language + 'wikiquote')
    add_url(urls, result, default_label='Wikimedia Commons', link_type='commonswiki')

    add_url(urls, result, 'P625', 'OpenStreetMap', link_type='geo')

    # musicbrainz
    add_url(urls, result, 'P434', 'MusicBrainz', 'http://musicbrainz.org/artist/')
    add_url(urls, result, 'P435', 'MusicBrainz', 'http://musicbrainz.org/work/')
    add_url(urls, result, 'P436', 'MusicBrainz', 'http://musicbrainz.org/release-group/')
    add_url(urls, result, 'P966', 'MusicBrainz', 'http://musicbrainz.org/label/')

    # IMDb
    add_url(urls, result, 'P345', 'IMDb', 'https://www.imdb.com/', link_type='imdb')
    # source code repository
    add_url(urls, result, 'P1324')
    # blog
    add_url(urls, result, 'P1581')
    # social media links
    add_url(urls, result, 'P2397', 'YouTube', 'https://www.youtube.com/channel/')
    add_url(urls, result, 'P1651', 'YouTube', 'https://www.youtube.com/watch?v=')
    add_url(urls, result, 'P2002', 'Twitter', 'https://twitter.com/')
    add_url(urls, result, 'P2013', 'Facebook', 'https://facebook.com/')
    add_url(urls, result, 'P2003', 'Instagram', 'https://instagram.com/')

    urls.append({'title': 'Wikidata',
                 'url': 'https://www.wikidata.org/wiki/'
                 + wikidata_id + '?uselang=' + language})

    # INFOBOX ATTRIBUTES (ROWS)

    # DATES
    # inception date
    add_attribute(attributes, result, 'P571', date=True)
    # dissolution date
    add_attribute(attributes, result, 'P576', date=True)
    # start date
    add_attribute(attributes, result, 'P580', date=True)
    # end date
    add_attribute(attributes, result, 'P582', date=True)
    # date of birth
    add_attribute(attributes, result, 'P569', date=True)
    # date of death
    add_attribute(attributes, result, 'P570', date=True)
    # date of spacecraft launch
    add_attribute(attributes, result, 'P619', date=True)
    # date of spacecraft landing
    add_attribute(attributes, result, 'P620', date=True)

    # nationality
    add_attribute(attributes, result, 'P27')
    # country of origin
    add_attribute(attributes, result, 'P495')
    # country
    add_attribute(attributes, result, 'P17')
    # headquarters location
    add_attribute(attributes, result, 'P159')

    # PLACES
    # capital
    add_attribute(attributes, result, 'P36', trim=True)
    # head of state
    add_attribute(attributes, result, 'P35', trim=True)
    # head of government
    add_attribute(attributes, result, 'P6', trim=True)
    # type of government
    add_attribute(attributes, result, 'P122')
    # official language
    add_attribute(attributes, result, 'P37')
    # population
    add_attribute(attributes, result, 'P1082', trim=True)
    # area
    add_attribute(attributes, result, 'P2046')
    # currency
    add_attribute(attributes, result, 'P38', trim=True)
    # height (building)
    add_attribute(attributes, result, 'P2048')

    # MEDIA
    # platform (videogames)
    add_attribute(attributes, result, 'P400')
    # author
    add_attribute(attributes, result, 'P50')
    # creator
    add_attribute(attributes, result, 'P170')
    # director
    add_attribute(attributes, result, 'P57')
    # performer
    add_attribute(attributes, result, 'P175')
    # developer
    add_attribute(attributes, result, 'P178')
    # producer
    add_attribute(attributes, result, 'P162')
    # manufacturer
    add_attribute(attributes, result, 'P176')
    # screenwriter
    add_attribute(attributes, result, 'P58')
    # production company
    add_attribute(attributes, result, 'P272')
    # record label
    add_attribute(attributes, result, 'P264')
    # publisher
    add_attribute(attributes, result, 'P123')
    # original network
    add_attribute(attributes, result, 'P449')
    # distributor
    add_attribute(attributes, result, 'P750')
    # composer
    add_attribute(attributes, result, 'P86')
    # publication date
    add_attribute(attributes, result, 'P577', date=True)
    # genre
    add_attribute(attributes, result, 'P136')
    # original language
    add_attribute(attributes, result, 'P364')
    # isbn (ISBN-13)
    add_attribute(attributes, result, 'P212')
    # software license
    add_attribute(attributes, result, 'P275')
    # programming language
    add_attribute(attributes, result, 'P277')
    # version
    add_attribute(attributes, result, 'P348', trim=True)
    # narrative location
    add_attribute(attributes, result, 'P840')

    # LANGUAGES
    # number of speakers
    add_attribute(attributes, result, 'P1098')
    # writing system
    add_attribute(attributes, result, 'P282')
    # regulatory body
    add_attribute(attributes, result, 'P1018')
    # language code
    add_attribute(attributes, result, 'P218')

    # OTHER
    # ceo
    add_attribute(attributes, result, 'P169', trim=True)
    # founder
    add_attribute(attributes, result, 'P112')
    # legal form (company/organization)
    add_attribute(attributes, result, 'P1454')
    # operator
    add_attribute(attributes, result, 'P137')
    # crew members
    add_attribute(attributes, result, 'P1029')
    # taxon
    add_attribute(attributes, result, 'P225')
    # chemical formula
    add_attribute(attributes, result, 'P274')
    # winner (sports/contests)
    add_attribute(attributes, result, 'P1346')
    # number of deaths
    add_attribute(attributes, result, 'P1120')
    # currency code
    add_attribute(attributes, result, 'P498')

    image = add_image(result)

    if len(attributes) == 0 and len(urls) == 2 and len(description) == 0:
        results.append({
                       'url': urls[0]['url'],
                       'title': title,
                       'content': description
                       })
    else:
        results.append({
                       'infobox': title,
                       'id': wikipedia_link,
                       'content': description,
                       'img_src': image,
                       'attributes': attributes,
                       'urls': urls
                       })

    return results


# only returns first match
def add_image(result):
    # P15: route map, P242: locator map, P154: logo, P18: image, P41: flag, P2716: collage, P2910: icon
    property_ids = ['P15', 'P242', 'P154', 'P18', 'P41', 'P2716', 'P2910']
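    # list order sets the priority: the first property yielding an image wins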

    for property_id in property_ids:
        image = result.xpath(property_xpath.replace('{propertyid}', property_id))
        if image:
            image_name = image[0].xpath(media_xpath)
            # skip properties whose statement carries no commons media caption
            if not image_name:
                continue
            image_src = url_image.replace('{filename}', extract_text(image_name[0]))
            return image_src


# setting trim=True returns only the preferred-rank rows, or the first row if none is preferred
def add_attribute(attributes, result, property_id, default_label=None, date=False, trim=False):
    attribute = result.xpath(property_xpath.replace('{propertyid}', property_id))
    if attribute:

        if default_label:
            label = default_label
        else:
            label = extract_text(attribute[0].xpath(label_xpath))
            label = label[0].upper() + label[1:]

        if date:
            trim = True
            # remove calendar name
            calendar_name = attribute[0].xpath(calendar_name_xpath)
            for calendar in calendar_name:
                calendar.getparent().remove(calendar)

        concat_values = ""
        values = []
        first_value = None
        for row in attribute[0].xpath(property_row_xpath):
            if not first_value or not trim or row.xpath(preferred_rank_xpath):

                value = row.xpath(value_xpath)
                if not value:
                    continue
                value = extract_text(value)

                # save first value in case no ranked row is found
                if trim and not first_value:
                    first_value = value
                else:
                    # to avoid duplicate values
                    if value not in values:
                        concat_values += value + ", "
                        values.append(value)

        if trim and not values:
            attributes.append({'label': label,
                               'value': first_value})
        else:
            attributes.append({'label': label,
                               'value': concat_values[:-2]})
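
# e.g. a date-of-birth statement (P569) typically ends up as something like
# {'label': 'Date of birth', 'value': '9 March 1943'} -- the value is whatever
# text the rendered statement contains (illustrative example only)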


# requires property_id unless it's a wiki link (defined in link_type)
def add_url(urls, result, property_id=None, default_label=None, url_prefix=None, results=None, link_type=None):
    links = []

    # wiki links don't have a property id on the wikidata page
    if link_type and 'wiki' in link_type:
        links.append(get_wikilink(result, link_type))
    else:
        dom_element = result.xpath(property_xpath.replace('{propertyid}', property_id))
        if dom_element:
            dom_element = dom_element[0]
            if not default_label:
                label = extract_text(dom_element.xpath(label_xpath))
                label = label[0].upper() + label[1:]

            if link_type == 'geo':
                links.append(get_geolink(dom_element))

            elif link_type == 'imdb':
                links.append(get_imdblink(dom_element, url_prefix))

            else:
                url_results = dom_element.xpath(url_xpath)
                for link in url_results:
                    if link is not None:
                        if url_prefix:
                            link = url_prefix + extract_text(link)
                        else:
                            link = extract_text(link)
                        links.append(link)

    # append urls
    for url in links:
        if url is not None:
            urls.append({'title': default_label or label,
                         'url': url})
            if results is not None:
                results.append({'title': default_label or label,
                                'url': url})


def get_imdblink(result, url_prefix):
    imdb_id = result.xpath(value_xpath)
    if imdb_id:
        imdb_id = extract_text(imdb_id)
        id_prefix = imdb_id[:2]
        if id_prefix == 'tt':
            url = url_prefix + 'title/' + imdb_id
        elif id_prefix == 'nm':
            url = url_prefix + 'name/' + imdb_id
        elif id_prefix == 'ch':
            url = url_prefix + 'character/' + imdb_id
        elif id_prefix == 'co':
            url = url_prefix + 'company/' + imdb_id
        elif id_prefix == 'ev':
            url = url_prefix + 'event/' + imdb_id
        else:
            url = None
        return url
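
# e.g. with url_prefix 'https://www.imdb.com/', an id like 'tt0111161' maps to
# https://www.imdb.com/title/tt0111161 (illustrative id)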


def get_geolink(result):
    coordinates = result.xpath(value_xpath)
    if not coordinates:
        return None

    coordinates = extract_text(coordinates[0])
    latitude, longitude = coordinates.split(',')

    # convert to decimal
    lat = int(latitude[:latitude.find(u'°')])
    if latitude.find('\'') >= 0:
        lat += int(latitude[latitude.find(u'°') + 1:latitude.find('\'')] or 0) / 60.0
    if latitude.find('"') >= 0:
        lat += float(latitude[latitude.find('\'') + 1:latitude.find('"')] or 0) / 3600.0
    if latitude.find('S') >= 0:
        lat *= -1
    lon = int(longitude[:longitude.find(u'°')])
    if longitude.find('\'') >= 0:
        lon += int(longitude[longitude.find(u'°') + 1:longitude.find('\'')] or 0) / 60.0
    if longitude.find('"') >= 0:
        lon += float(longitude[longitude.find('\'') + 1:longitude.find('"')] or 0) / 3600.0
    if longitude.find('W') >= 0:
        lon *= -1
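    # e.g. a value rendered as 48°51'24"N, 2°21'3"E converts to roughly
    # lat = 48.8567 and lon = 2.3508 (illustrative coordinates)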

    # TODO: get precision
    precision = 0.0002
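    # with precision hard-coded to 0.0002 (< 0.0003), the branch below always
    # picks zoom = 19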
    # there is no zoom information, deduce from precision (error prone)
    # samples :
    # 13 --> 5
    # 1 --> 6
    # 0.016666666666667 --> 9
    # 0.00027777777777778 --> 19
    # wolframalpha :
    # quadratic fit { {13, 5}, {1, 6}, {0.0166666, 9}, {0.0002777777,19}}
    # 14.1186 - 8.8322x + 0.625447x^2 (the code below rounds the constant to 15)
    if precision < 0.0003:
        zoom = 19
    else:
        zoom = int(15 - precision * 8.8322 + precision * precision * 0.625447)

    url = url_map\
        .replace('{latitude}', str(lat))\
        .replace('{longitude}', str(lon))\
        .replace('{zoom}', str(zoom))

    return url


def get_wikilink(result, wikiid):
    url = result.xpath(wikilink_xpath.replace('{wikiid}', wikiid))
    if not url:
        return None
    url = url[0]
    if url.startswith('http://'):
        url = url.replace('http://', 'https://')
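    # sitelink urls may also be protocol-relative (e.g. '//en.wikipedia.org/...')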
    elif url.startswith('//'):
        url = 'https:' + url
    return url