Commit ebdfdcde authored by Nivesh Krishna
Browse files

Merge branch '79-fix-ina-engine' into 'master'

Update ina engine xpath values

Closes #79

See merge request e/cloud/my-spot!98
parents 8ebc5827 a45408e8
...@@ -3,11 +3,9 @@ ...@@ -3,11 +3,9 @@
INA (Videos) INA (Videos)
""" """
from json import loads
from html import unescape from html import unescape
from urllib.parse import urlencode from urllib.parse import urlencode
from lxml import html from lxml import html
from dateutil import parser
from searx.utils import extract_text from searx.utils import extract_text
# about # about
...@@ -23,25 +21,23 @@ about = { ...@@ -23,25 +21,23 @@ about = {
# engine dependent config # engine dependent config
categories = ['videos'] categories = ['videos']
paging = True paging = True
page_size = 48 page_size = 12
# search-url # search-url
base_url = 'https://www.ina.fr' base_url = 'https://www.ina.fr'
search_url = base_url + '/layout/set/ajax/recherche/result?autopromote=&hf={ps}&b={start}&type=Video&r=&{query}' search_url = base_url + '/ajax/recherche?{query}&espace=1&sort=pertinence&order=desc&offset={start}&modified=size'
# specific xpath variables # specific xpath variables
results_xpath = '//div[contains(@class,"search-results--list")]//div[@class="media-body"]' results_xpath = '//div[@id="searchHits"]/div'
url_xpath = './/a/@href' url_xpath = './/a/@href'
title_xpath = './/h3[@class="h3--title media-heading"]' title_xpath = './/div[contains(@class,"title-bloc-small")]'
thumbnail_xpath = './/img/@src' thumbnail_xpath = './/img/@data-src'
publishedDate_xpath = './/span[@class="broadcast"]' publishedDate_xpath = '//div[@id="searchHits"]//div[contains(@class,"dateAgenda")]'
content_xpath = './/p[@class="media-body__summary"]'
# do search-request # do search-request
def request(query, params): def request(query, params):
params['url'] = search_url.format(ps=page_size, params['url'] = search_url.format(start=params['pageno'] * page_size,
start=params['pageno'] * page_size,
query=urlencode({'q': query})) query=urlencode({'q': query}))
return params return params
...@@ -51,34 +47,16 @@ def request(query, params): ...@@ -51,34 +47,16 @@ def request(query, params):
def response(resp): def response(resp):
results = [] results = []
# we get html in a JSON container... dom = html.fromstring(resp.text)
response = loads(resp.text)
dom = html.fromstring(response)
# parse results # parse results
for result in dom.xpath(results_xpath): for result in dom.xpath(results_xpath):
videoid = result.xpath(url_xpath)[0] url_relative = result.xpath(url_xpath)[0]
url = base_url + videoid url = base_url + url_relative
title = unescape(extract_text(result.xpath(title_xpath))) title = unescape(extract_text(result.xpath(title_xpath)))
try: thumbnail = extract_text(result.xpath(thumbnail_xpath))
thumbnail = extract_text(result.xpath(thumbnail_xpath)[0])
except:
thumbnail = ''
if thumbnail and thumbnail[0] == '/':
thumbnail = base_url + thumbnail
d = extract_text(result.xpath(publishedDate_xpath)[0])
d = d.split('/')
# force ISO date to avoid wrong parsing
d = "%s-%s-%s" % (d[2], d[1], d[0])
publishedDate = parser.parse(d)
content = extract_text(result.xpath(content_xpath))
# append result
results.append({'url': url, results.append({'url': url,
'title': title, 'title': title,
'content': content,
'template': 'videos.html', 'template': 'videos.html',
'publishedDate': publishedDate,
'thumbnail': thumbnail}) 'thumbnail': thumbnail})
# return results # return results
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment