Skip to content
GitLab
Menu
Projects
Groups
Snippets
/
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Sign in / Register
Toggle navigation
Menu
Open sidebar
e
infra
spot
Commits
ebdfdcde
Commit
ebdfdcde
authored
Jan 25, 2022
by
Nivesh Krishna
Browse files
Merge branch '79-fix-ina-engine' into 'master'
Update ina engine xpath values. Closes #79. See merge request e/cloud/my-spot!98.
parents
8ebc5827
a45408e8
Changes
1
Hide whitespace changes
Inline
Side-by-side
searx/engines/ina.py
View file @
ebdfdcde
...
...
@@ -3,11 +3,9 @@
INA (Videos)
"""
from
json
import
loads
from
html
import
unescape
from
urllib.parse
import
urlencode
from
lxml
import
html
from
dateutil
import
parser
from
searx.utils
import
extract_text
# about
...
...
@@ -23,25 +21,23 @@ about = {
# engine dependent config
categories
=
[
'videos'
]
paging
=
True
page_size
=
48
page_size
=
12
# search-url
base_url
=
'https://www.ina.fr'
search_url
=
base_url
+
'/
layout/set/
ajax/recherche
/result?autopromote=&hf={ps}&b={start}&type=Video&r=&{query}
'
search_url
=
base_url
+
'/ajax/recherche
?{query}&espace=1&sort=pertinence&order=desc&offset={start}&modified=size
'
# specific xpath variables
results_xpath
=
'//div[
contains(@class,"search-results--list")]//div[@class="media-body"]
'
results_xpath
=
'//div[
@id="searchHits"]/div
'
url_xpath
=
'.//a/@href'
title_xpath
=
'.//h3[@class="h3--title media-heading"]'
thumbnail_xpath
=
'.//img/@src'
publishedDate_xpath
=
'.//span[@class="broadcast"]'
content_xpath
=
'.//p[@class="media-body__summary"]'
title_xpath
=
'.//div[contains(@class,"title-bloc-small")]'
thumbnail_xpath
=
'.//img/@data-src'
publishedDate_xpath
=
'//div[@id="searchHits"]//div[contains(@class,"dateAgenda")]'
# do search-request
def
request
(
query
,
params
):
params
[
'url'
]
=
search_url
.
format
(
ps
=
page_size
,
start
=
params
[
'pageno'
]
*
page_size
,
params
[
'url'
]
=
search_url
.
format
(
start
=
params
[
'pageno'
]
*
page_size
,
query
=
urlencode
({
'q'
:
query
}))
return
params
...
...
@@ -51,34 +47,16 @@ def request(query, params):
def
response
(
resp
):
results
=
[]
# we get html in a JSON container...
response
=
loads
(
resp
.
text
)
dom
=
html
.
fromstring
(
response
)
dom
=
html
.
fromstring
(
resp
.
text
)
# parse results
for
result
in
dom
.
xpath
(
results_xpath
):
videoid
=
result
.
xpath
(
url_xpath
)[
0
]
url
=
base_url
+
videoid
url_relative
=
result
.
xpath
(
url_xpath
)[
0
]
url
=
base_url
+
url_relative
title
=
unescape
(
extract_text
(
result
.
xpath
(
title_xpath
)))
try
:
thumbnail
=
extract_text
(
result
.
xpath
(
thumbnail_xpath
)[
0
])
except
:
thumbnail
=
''
if
thumbnail
and
thumbnail
[
0
]
==
'/'
:
thumbnail
=
base_url
+
thumbnail
d
=
extract_text
(
result
.
xpath
(
publishedDate_xpath
)[
0
])
d
=
d
.
split
(
'/'
)
# force ISO date to avoid wrong parsing
d
=
"%s-%s-%s"
%
(
d
[
2
],
d
[
1
],
d
[
0
])
publishedDate
=
parser
.
parse
(
d
)
content
=
extract_text
(
result
.
xpath
(
content_xpath
))
# append result
thumbnail
=
extract_text
(
result
.
xpath
(
thumbnail_xpath
))
results
.
append
({
'url'
:
url
,
'title'
:
title
,
'content'
:
content
,
'template'
:
'videos.html'
,
'publishedDate'
:
publishedDate
,
'thumbnail'
:
thumbnail
})
# return results
...
...
Write
Preview
Supports
Markdown
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment