Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit ebdfdcde authored by Nivesh Krishna
Browse files

Merge branch '79-fix-ina-engine' into 'master'

Update ina engine xpath values

Closes #79

See merge request e/cloud/my-spot!98
parents 8ebc5827 a45408e8
Loading
Loading
Loading
Loading
+11 −33
Original line number Original line Diff line number Diff line
@@ -3,11 +3,9 @@
 INA (Videos)
 INA (Videos)
"""
"""


from json import loads
from html import unescape
from html import unescape
from urllib.parse import urlencode
from urllib.parse import urlencode
from lxml import html
from lxml import html
from dateutil import parser
from searx.utils import extract_text
from searx.utils import extract_text


# about
# about
@@ -23,25 +21,23 @@ about = {
# engine dependent config
# engine dependent config
categories = ['videos']
categories = ['videos']
paging = True
paging = True
page_size = 48
page_size = 12


# search-url
# search-url
base_url = 'https://www.ina.fr'
base_url = 'https://www.ina.fr'
search_url = base_url + '/layout/set/ajax/recherche/result?autopromote=&hf={ps}&b={start}&type=Video&r=&{query}'
search_url = base_url + '/ajax/recherche?{query}&espace=1&sort=pertinence&order=desc&offset={start}&modified=size'


# specific xpath variables
# specific xpath variables
results_xpath = '//div[contains(@class,"search-results--list")]//div[@class="media-body"]'
results_xpath = '//div[@id="searchHits"]/div'
url_xpath = './/a/@href'
url_xpath = './/a/@href'
title_xpath = './/h3[@class="h3--title media-heading"]'
title_xpath = './/div[contains(@class,"title-bloc-small")]'
thumbnail_xpath = './/img/@src'
thumbnail_xpath = './/img/@data-src'
publishedDate_xpath = './/span[@class="broadcast"]'
publishedDate_xpath = '//div[@id="searchHits"]//div[contains(@class,"dateAgenda")]'
content_xpath = './/p[@class="media-body__summary"]'




# do search-request
# do search-request
def request(query, params):
def request(query, params):
    params['url'] = search_url.format(ps=page_size,
    params['url'] = search_url.format(start=params['pageno'] * page_size,
                                      start=params['pageno'] * page_size,
                                      query=urlencode({'q': query}))
                                      query=urlencode({'q': query}))


    return params
    return params
@@ -51,34 +47,16 @@ def request(query, params):
def response(resp):
def response(resp):
    results = []
    results = []


    # we get html in a JSON container...
    dom = html.fromstring(resp.text)
    response = loads(resp.text)
    dom = html.fromstring(response)

    # parse results
    # parse results
    for result in dom.xpath(results_xpath):
    for result in dom.xpath(results_xpath):
        videoid = result.xpath(url_xpath)[0]
        url_relative = result.xpath(url_xpath)[0]
        url = base_url + videoid
        url = base_url + url_relative
        title = unescape(extract_text(result.xpath(title_xpath)))
        title = unescape(extract_text(result.xpath(title_xpath)))
        try:
        thumbnail = extract_text(result.xpath(thumbnail_xpath))
            thumbnail = extract_text(result.xpath(thumbnail_xpath)[0])
        except:
            thumbnail = ''
        if thumbnail and thumbnail[0] == '/':
            thumbnail = base_url + thumbnail
        d = extract_text(result.xpath(publishedDate_xpath)[0])
        d = d.split('/')
        # force ISO date to avoid wrong parsing
        d = "%s-%s-%s" % (d[2], d[1], d[0])
        publishedDate = parser.parse(d)
        content = extract_text(result.xpath(content_xpath))

        # append result
        results.append({'url': url,
        results.append({'url': url,
                        'title': title,
                        'title': title,
                        'content': content,
                        'template': 'videos.html',
                        'template': 'videos.html',
                        'publishedDate': publishedDate,
                        'thumbnail': thumbnail})
                        'thumbnail': thumbnail})


    # return results
    # return results