Merge branch '79-fix-ina-engine' into 'master' (ebdfdcde) · Commits · e / infra / spot

searx/engines/ina.py

+11 −33

Original line number	Original line	Diff line number	Diff line
	@@ -3,11 +3,9 @@
	INA (Videos)		INA (Videos)
	"""		"""

	from json import loads
	from html import unescape		from html import unescape
	from urllib.parse import urlencode		from urllib.parse import urlencode
	from lxml import html		from lxml import html
	from dateutil import parser
	from searx.utils import extract_text		from searx.utils import extract_text

	# about		# about
	@@ -23,25 +21,23 @@ about = {
	# engine dependent config		# engine dependent config
	categories = ['videos']		categories = ['videos']
	paging = True		paging = True
	page_size = 48		page_size = 12

	# search-url		# search-url
	base_url = 'https://www.ina.fr'		base_url = 'https://www.ina.fr'
	search_url = base_url + '/layout/set/ajax/recherche/result?autopromote=&hf={ps}&b={start}&type=Video&r=&{query}'		search_url = base_url + '/ajax/recherche?{query}&espace=1&sort=pertinence&order=desc&offset={start}&modified=size'

	# specific xpath variables		# specific xpath variables
	results_xpath = '//div[contains(@class,"search-results--list")]//div[@class="media-body"]'		results_xpath = '//div[@id="searchHits"]/div'
	url_xpath = './/a/@href'		url_xpath = './/a/@href'
	title_xpath = './/h3[@class="h3--title media-heading"]'		title_xpath = './/div[contains(@class,"title-bloc-small")]'
	thumbnail_xpath = './/img/@src'		thumbnail_xpath = './/img/@data-src'
	publishedDate_xpath = './/span[@class="broadcast"]'		publishedDate_xpath = '//div[@id="searchHits"]//div[contains(@class,"dateAgenda")]'
	content_xpath = './/p[@class="media-body__summary"]'


	# do search-request		# do search-request
	def request(query, params):		def request(query, params):
	params['url'] = search_url.format(ps=page_size,		params['url'] = search_url.format(start=params['pageno'] * page_size,
	start=params['pageno'] * page_size,
	query=urlencode({'q': query}))		query=urlencode({'q': query}))

	return params		return params
	@@ -51,34 +47,16 @@ def request(query, params):
	def response(resp):		def response(resp):
	results = []		results = []

	# we get html in a JSON container...		dom = html.fromstring(resp.text)
	response = loads(resp.text)
	dom = html.fromstring(response)

	# parse results		# parse results
	for result in dom.xpath(results_xpath):		for result in dom.xpath(results_xpath):
	videoid = result.xpath(url_xpath)[0]		url_relative = result.xpath(url_xpath)[0]
	url = base_url + videoid		url = base_url + url_relative
	title = unescape(extract_text(result.xpath(title_xpath)))		title = unescape(extract_text(result.xpath(title_xpath)))
	try:		thumbnail = extract_text(result.xpath(thumbnail_xpath))
	thumbnail = extract_text(result.xpath(thumbnail_xpath)[0])
	except:
	thumbnail = ''
	if thumbnail and thumbnail[0] == '/':
	thumbnail = base_url + thumbnail
	d = extract_text(result.xpath(publishedDate_xpath)[0])
	d = d.split('/')
	# force ISO date to avoid wrong parsing
	d = "%s-%s-%s" % (d[2], d[1], d[0])
	publishedDate = parser.parse(d)
	content = extract_text(result.xpath(content_xpath))

	# append result
	results.append({'url': url,		results.append({'url': url,
	'title': title,		'title': title,
	'content': content,
	'template': 'videos.html',		'template': 'videos.html',
	'publishedDate': publishedDate,
	'thumbnail': thumbnail})		'thumbnail': thumbnail})

	# return results		# return results