Loading searx/engines/ina.py +11 −33 Original line number Original line Diff line number Diff line Loading @@ -3,11 +3,9 @@ INA (Videos) INA (Videos) """ """ from json import loads from html import unescape from html import unescape from urllib.parse import urlencode from urllib.parse import urlencode from lxml import html from lxml import html from dateutil import parser from searx.utils import extract_text from searx.utils import extract_text # about # about Loading @@ -23,25 +21,23 @@ about = { # engine dependent config # engine dependent config categories = ['videos'] categories = ['videos'] paging = True paging = True page_size = 48 page_size = 12 # search-url # search-url base_url = 'https://www.ina.fr' base_url = 'https://www.ina.fr' search_url = base_url + '/layout/set/ajax/recherche/result?autopromote=&hf={ps}&b={start}&type=Video&r=&{query}' search_url = base_url + '/ajax/recherche?{query}&espace=1&sort=pertinence&order=desc&offset={start}&modified=size' # specific xpath variables # specific xpath variables results_xpath = '//div[contains(@class,"search-results--list")]//div[@class="media-body"]' results_xpath = '//div[@id="searchHits"]/div' url_xpath = './/a/@href' url_xpath = './/a/@href' title_xpath = './/h3[@class="h3--title media-heading"]' title_xpath = './/div[contains(@class,"title-bloc-small")]' thumbnail_xpath = './/img/@src' thumbnail_xpath = './/img/@data-src' publishedDate_xpath = './/span[@class="broadcast"]' publishedDate_xpath = '//div[@id="searchHits"]//div[contains(@class,"dateAgenda")]' content_xpath = './/p[@class="media-body__summary"]' # do search-request # do search-request def request(query, params): def request(query, params): params['url'] = search_url.format(ps=page_size, params['url'] = search_url.format(start=params['pageno'] * page_size, start=params['pageno'] * page_size, query=urlencode({'q': query})) query=urlencode({'q': query})) return params return params Loading @@ -51,34 +47,16 @@ def request(query, params): def response(resp): def response(resp): results = [] results = [] # we get html in a JSON container... dom = html.fromstring(resp.text) response = loads(resp.text) dom = html.fromstring(response) # parse results # parse results for result in dom.xpath(results_xpath): for result in dom.xpath(results_xpath): videoid = result.xpath(url_xpath)[0] url_relative = result.xpath(url_xpath)[0] url = base_url + videoid url = base_url + url_relative title = unescape(extract_text(result.xpath(title_xpath))) title = unescape(extract_text(result.xpath(title_xpath))) try: thumbnail = extract_text(result.xpath(thumbnail_xpath)) thumbnail = extract_text(result.xpath(thumbnail_xpath)[0]) except: thumbnail = '' if thumbnail and thumbnail[0] == '/': thumbnail = base_url + thumbnail d = extract_text(result.xpath(publishedDate_xpath)[0]) d = d.split('/') # force ISO date to avoid wrong parsing d = "%s-%s-%s" % (d[2], d[1], d[0]) publishedDate = parser.parse(d) content = extract_text(result.xpath(content_xpath)) # append result results.append({'url': url, results.append({'url': url, 'title': title, 'title': title, 'content': content, 'template': 'videos.html', 'template': 'videos.html', 'publishedDate': publishedDate, 'thumbnail': thumbnail}) 'thumbnail': thumbnail}) # return results # return results Loading Loading
searx/engines/ina.py +11 −33 Original line number Original line Diff line number Diff line Loading @@ -3,11 +3,9 @@ INA (Videos) INA (Videos) """ """ from json import loads from html import unescape from html import unescape from urllib.parse import urlencode from urllib.parse import urlencode from lxml import html from lxml import html from dateutil import parser from searx.utils import extract_text from searx.utils import extract_text # about # about Loading @@ -23,25 +21,23 @@ about = { # engine dependent config # engine dependent config categories = ['videos'] categories = ['videos'] paging = True paging = True page_size = 48 page_size = 12 # search-url # search-url base_url = 'https://www.ina.fr' base_url = 'https://www.ina.fr' search_url = base_url + '/layout/set/ajax/recherche/result?autopromote=&hf={ps}&b={start}&type=Video&r=&{query}' search_url = base_url + '/ajax/recherche?{query}&espace=1&sort=pertinence&order=desc&offset={start}&modified=size' # specific xpath variables # specific xpath variables results_xpath = '//div[contains(@class,"search-results--list")]//div[@class="media-body"]' results_xpath = '//div[@id="searchHits"]/div' url_xpath = './/a/@href' url_xpath = './/a/@href' title_xpath = './/h3[@class="h3--title media-heading"]' title_xpath = './/div[contains(@class,"title-bloc-small")]' thumbnail_xpath = './/img/@src' thumbnail_xpath = './/img/@data-src' publishedDate_xpath = './/span[@class="broadcast"]' publishedDate_xpath = '//div[@id="searchHits"]//div[contains(@class,"dateAgenda")]' content_xpath = './/p[@class="media-body__summary"]' # do search-request # do search-request def request(query, params): def request(query, params): params['url'] = search_url.format(ps=page_size, params['url'] = search_url.format(start=params['pageno'] * page_size, start=params['pageno'] * page_size, query=urlencode({'q': query})) query=urlencode({'q': query})) return params return params Loading @@ -51,34 +47,16 @@ def request(query, params): def response(resp): def response(resp): results = [] results = [] # we get html in a JSON container... dom = html.fromstring(resp.text) response = loads(resp.text) dom = html.fromstring(response) # parse results # parse results for result in dom.xpath(results_xpath): for result in dom.xpath(results_xpath): videoid = result.xpath(url_xpath)[0] url_relative = result.xpath(url_xpath)[0] url = base_url + videoid url = base_url + url_relative title = unescape(extract_text(result.xpath(title_xpath))) title = unescape(extract_text(result.xpath(title_xpath))) try: thumbnail = extract_text(result.xpath(thumbnail_xpath)) thumbnail = extract_text(result.xpath(thumbnail_xpath)[0]) except: thumbnail = '' if thumbnail and thumbnail[0] == '/': thumbnail = base_url + thumbnail d = extract_text(result.xpath(publishedDate_xpath)[0]) d = d.split('/') # force ISO date to avoid wrong parsing d = "%s-%s-%s" % (d[2], d[1], d[0]) publishedDate = parser.parse(d) content = extract_text(result.xpath(content_xpath)) # append result results.append({'url': url, results.append({'url': url, 'title': title, 'title': title, 'content': content, 'template': 'videos.html', 'template': 'videos.html', 'publishedDate': publishedDate, 'thumbnail': thumbnail}) 'thumbnail': thumbnail}) # return results # return results Loading