Skip to content
GitLab
Menu
Projects
Groups
Snippets
/
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Sign in / Register
Toggle navigation
Menu
Open sidebar
e
infra
spot
Commits
a45408e8
Commit
a45408e8
authored
Jan 25, 2022
by
Israel Yago Pereira
Committed by
Nivesh Krishna
Jan 25, 2022
Browse files
Update ina engine xpath values
parent
b7f8aadc
Changes
1
Hide whitespace changes
Inline
Side-by-side
searx/engines/ina.py
View file @
a45408e8
...
...
@@ -3,11 +3,9 @@
INA (Videos)
"""
from
json
import
loads
from
html
import
unescape
from
urllib.parse
import
urlencode
from
lxml
import
html
from
dateutil
import
parser
from
searx.utils
import
extract_text
# about
...
...
@@ -23,25 +21,23 @@ about = {
# engine dependent config
categories
=
[
'videos'
]
paging
=
True
page_size
=
48
page_size
=
12
# search-url
base_url
=
'https://www.ina.fr'
search_url
=
base_url
+
'/
layout/set/
ajax/recherche
/result?autopromote=&hf={ps}&b={start}&type=Video&r=&{query}
'
search_url
=
base_url
+
'/ajax/recherche
?{query}&espace=1&sort=pertinence&order=desc&offset={start}&modified=size
'
# specific xpath variables
results_xpath
=
'//div[
contains(@class,"search-results--list")]//div[@class="media-body"]
'
results_xpath
=
'//div[
@id="searchHits"]/div
'
url_xpath
=
'.//a/@href'
title_xpath
=
'.//h3[@class="h3--title media-heading"]'
thumbnail_xpath
=
'.//img/@src'
publishedDate_xpath
=
'.//span[@class="broadcast"]'
content_xpath
=
'.//p[@class="media-body__summary"]'
title_xpath
=
'.//div[contains(@class,"title-bloc-small")]'
thumbnail_xpath
=
'.//img/@data-src'
publishedDate_xpath
=
'//div[@id="searchHits"]//div[contains(@class,"dateAgenda")]'
# do search-request
def
request
(
query
,
params
):
params
[
'url'
]
=
search_url
.
format
(
ps
=
page_size
,
start
=
params
[
'pageno'
]
*
page_size
,
params
[
'url'
]
=
search_url
.
format
(
start
=
params
[
'pageno'
]
*
page_size
,
query
=
urlencode
({
'q'
:
query
}))
return
params
...
...
@@ -51,34 +47,16 @@ def request(query, params):
def
response
(
resp
):
results
=
[]
# we get html in a JSON container...
response
=
loads
(
resp
.
text
)
dom
=
html
.
fromstring
(
response
)
dom
=
html
.
fromstring
(
resp
.
text
)
# parse results
for
result
in
dom
.
xpath
(
results_xpath
):
videoid
=
result
.
xpath
(
url_xpath
)[
0
]
url
=
base_url
+
videoid
url_relative
=
result
.
xpath
(
url_xpath
)[
0
]
url
=
base_url
+
url_relative
title
=
unescape
(
extract_text
(
result
.
xpath
(
title_xpath
)))
try
:
thumbnail
=
extract_text
(
result
.
xpath
(
thumbnail_xpath
)[
0
])
except
:
thumbnail
=
''
if
thumbnail
and
thumbnail
[
0
]
==
'/'
:
thumbnail
=
base_url
+
thumbnail
d
=
extract_text
(
result
.
xpath
(
publishedDate_xpath
)[
0
])
d
=
d
.
split
(
'/'
)
# force ISO date to avoid wrong parsing
d
=
"%s-%s-%s"
%
(
d
[
2
],
d
[
1
],
d
[
0
])
publishedDate
=
parser
.
parse
(
d
)
content
=
extract_text
(
result
.
xpath
(
content_xpath
))
# append result
thumbnail
=
extract_text
(
result
.
xpath
(
thumbnail_xpath
))
results
.
append
({
'url'
:
url
,
'title'
:
title
,
'content'
:
content
,
'template'
:
'videos.html'
,
'publishedDate'
:
publishedDate
,
'thumbnail'
:
thumbnail
})
# return results
...
...
Write
Preview
Supports
Markdown
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment