Unverified commit 491792c1 authored by Noémi Ványi, committed by GitHub

Merge pull request #1446 from MarcAbonce/language_aliases_fix

[fix] Fix queries in Hebrew and Norwegian so they give results in the right language
parents 35d82ed6 1a850cf1
One file's source diff is omitted here because it is too large to display.
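For context before the hunks: every engine change below threads a new third argument, language_aliases, into match_language. Some engines use nonstandard language codes (Google expects 'iw' for Hebrew; the Norwegian Bokmål Wikipedia lives at no.wikipedia.org, not nb.wikipedia.org), so the matcher needs a per-engine alias map. A minimal sketch of that behavior, assuming a simplified signature (the real helper in searx/utils.py also handles region fallbacks in more detail):

def match_language(locale_code, lang_list, custom_aliases=None, fallback='en-US'):
    """Sketch: return the engine's code for locale_code, resolving aliases."""
    custom_aliases = custom_aliases or {}
    if locale_code in custom_aliases:
        return custom_aliases[locale_code]
    if locale_code in lang_list:
        return locale_code
    # bare language fallback: 'fr-CA' -> 'fr'
    base = locale_code.split('-')[0]
    if base in custom_aliases:
        return custom_aliases[base]
    if base in lang_list:
        return base
    return fallback

assert match_language('he', ['en', 'fr', 'iw'], {'he': 'iw'}) == 'iw'   # Hebrew -> Google's 'iw'
assert match_language('nb', ['fr', 'en', 'no'], {'nb': 'no'}) == 'no'   # Bokmål -> Wikipedia's 'no'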
@@ -113,8 +113,7 @@ def load_engine(engine_data):
                iso_lang not in getattr(engine, 'supported_languages'):
            language_aliases[iso_lang] = engine_lang
-    if language_aliases:
-        setattr(engine, 'language_aliases', language_aliases)
+    setattr(engine, 'language_aliases', language_aliases)

     # assign language fetching method if auxiliary method exists
     if hasattr(engine, '_fetch_supported_languages'):
......
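Note the dropped `if language_aliases:` guard: load_engine now sets the attribute unconditionally, so every engine module ends up with language_aliases defined, even as an empty dict. That invariant is why each test further down stubs it explicitly. A tiny illustration, using a hypothetical stand-in for a loaded engine module:

import types

# Hypothetical engine module, as load_engine would see it.
engine = types.ModuleType('bing')
engine.supported_languages = ['fr-FR', 'en-US']

language_aliases = {}  # built by the loop above; may stay empty
setattr(engine, 'language_aliases', language_aliases)  # now unconditional

# request() implementations can rely on the attribute existing:
assert engine.language_aliases == {}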
@@ -55,7 +55,7 @@ def request(query, params):
         query=urlencode({'q': query}),
         offset=offset)

-    language = match_language(params['language'], supported_languages).lower()
+    language = match_language(params['language'], supported_languages, language_aliases).lower()

     params['cookies']['SRCHHPGUSR'] = \
         'ADLT=' + safesearch_types.get(params['safesearch'], 'DEMOTE')
......
@@ -48,7 +48,7 @@ def request(query, params):
         'ADLT=' + safesearch_types.get(params['safesearch'], 'DEMOTE')

     # language cookie
-    language = match_language(params['language'], supported_languages).lower()
+    language = match_language(params['language'], supported_languages, language_aliases).lower()
     params['cookies']['_EDGE_S'] = 'mkt=' + language + '&F=1'

     # query and paging
......
@@ -166,7 +166,7 @@ def extract_text_from_dom(result, xpath):
 def request(query, params):
     offset = (params['pageno'] - 1) * 10

-    language = match_language(params['language'], supported_languages)
+    language = match_language(params['language'], supported_languages, language_aliases)
     language_array = language.split('-')
     if params['language'].find('-') > 0:
         country = params['language'].split('-')[1]
@@ -381,10 +381,10 @@ def attributes_to_html(attributes):
 def _fetch_supported_languages(resp):
     supported_languages = {}
     dom = html.fromstring(resp.text)
-    options = dom.xpath('//table//td/font/label/span')
+    options = dom.xpath('//*[@id="langSec"]//input[@name="lr"]')
     for option in options:
-        code = option.xpath('./@id')[0][1:]
-        name = option.text.title()
+        code = option.xpath('./@value')[0].split('_')[-1]
+        name = option.xpath('./@data-name')[0].title()
         supported_languages[code] = {"name": name}

     return supported_languages
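Google's preferences page markup changed, so the scraper now reads the <input name="lr"> elements instead of the old <span> labels. A standalone sketch of the rewritten parsing, run against a fragment shaped like the new markup (same shape as the updated test fixture further down; lxml assumed available):

from lxml import html

# Fragment mirroring the new Google preferences markup.
page = '''
<div id="langSec"><div>
    <input name="lr" data-name="english" value="lang_en" />
    <input name="lr" data-name="中文 (简体)" value="lang_zh-CN" />
</div></div>'''

dom = html.fromstring(page)
supported_languages = {}
for option in dom.xpath('//*[@id="langSec"]//input[@name="lr"]'):
    code = option.xpath('./@value')[0].split('_')[-1]   # 'lang_zh-CN' -> 'zh-CN'
    name = option.xpath('./@data-name')[0].title()      # 'english' -> 'English'
    supported_languages[code] = {"name": name}

print(supported_languages)
# {'en': {'name': 'English'}, 'zh-CN': {'name': '中文 (简体)'}}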
@@ -51,7 +51,7 @@ def request(query, params):
     params['url'] = search_url.format(query=urlencode({'q': query}),
                                       search_options=urlencode(search_options))

-    language = match_language(params['language'], supported_languages).split('-')[0]
+    language = match_language(params['language'], supported_languages, language_aliases).split('-')[0]
     if language:
         params['url'] += '&lr=lang_' + language
......
@@ -46,7 +46,7 @@ def request(query, params):
         offset=offset)

     # add language tag
-    language = match_language(params['language'], supported_languages)
+    language = match_language(params['language'], supported_languages, language_aliases)
     params['url'] += '&locale=' + language.replace('-', '_').lower()

     return params
......
@@ -36,7 +36,7 @@ regex_img_url_remove_start = re.compile(b'^https?://i\.swisscows\.ch/\?link=')
 # do search-request
 def request(query, params):
-    region = match_language(params['language'], supported_languages)
+    region = match_language(params['language'], supported_languages, language_aliases)
     ui_language = region.split('-')[0]
     search_path = search_string.format(
......
@@ -68,7 +68,7 @@ def response(resp):
     html = fromstring(resp.text)
     search_results = html.xpath(wikidata_ids_xpath)

-    language = match_language(resp.search_params['language'], supported_languages).split('-')[0]
+    language = match_language(resp.search_params['language'], supported_languages, language_aliases).split('-')[0]

     # TODO: make requests asynchronous to avoid timeout when result_count > 1
     for search_result in search_results[:result_count]:
......
@@ -31,7 +31,7 @@ supported_languages_url = 'https://meta.wikimedia.org/wiki/List_of_Wikipedias'
 # set language in base_url
 def url_lang(lang):
-    return match_language(lang, supported_languages).split('-')[0]
+    return match_language(lang, supported_languages, language_aliases).split('-')[0]

 # do search-request
......
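The practical effect for Wikipedia, sketched with the values from the updated test further down (a simplified, self-contained illustration, not searx's exact helper): a Bokmål ('nb') query now resolves to the 'no' code and thus to no.wikipedia.org.

supported_languages = ['fr', 'en', 'no']
language_aliases = {'nb': 'no'}

def url_lang(lang):
    # resolve the alias first, then fall back to the code itself,
    # mirroring the patched helper above (simplified sketch)
    code = language_aliases.get(lang, lang)
    return code.split('-')[0] if code in supported_languages else 'en'

assert url_lang('nb') == 'no'   # Bokmål -> https://no.wikipedia.org/
assert url_lang('xx') == 'en'   # unknown codes fall back, per the test below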
@@ -9,6 +9,7 @@ class TestBingImagesEngine(SearxTestCase):
     def test_request(self):
         bing_images.supported_languages = ['fr-FR', 'en-US']
+        bing_images.language_aliases = {}
         query = 'test_query'
         dicto = defaultdict(dict)
         dicto['pageno'] = 1
......
@@ -9,6 +9,7 @@ class TestBingVideosEngine(SearxTestCase):
     def test_request(self):
         bing_videos.supported_languages = ['fr-FR', 'en-US']
+        bing_videos.language_aliases = {}
         query = 'test_query'
         dicto = defaultdict(dict)
         dicto['pageno'] = 1
......
@@ -15,7 +15,8 @@ class TestGoogleEngine(SearxTestCase):
         return response

     def test_request(self):
-        google.supported_languages = ['en', 'fr', 'zh-CN']
+        google.supported_languages = ['en', 'fr', 'zh-CN', 'iw']
+        google.language_aliases = {'he': 'iw'}
         query = 'test_query'
         dicto = defaultdict(dict)
@@ -41,6 +42,12 @@ class TestGoogleEngine(SearxTestCase):
         self.assertIn('zh-CN', params['url'])
         self.assertIn('zh-CN', params['headers']['Accept-Language'])

+        dicto['language'] = 'he'
+        params = google.request(query, dicto)
+        self.assertIn('google.com', params['url'])
+        self.assertIn('iw', params['url'])
+        self.assertIn('iw', params['headers']['Accept-Language'])

     def test_response(self):
         self.assertRaises(AttributeError, google.response, None)
         self.assertRaises(AttributeError, google.response, [])
@@ -198,29 +205,13 @@ class TestGoogleEngine(SearxTestCase):
         html = u"""
         <html>
             <body>
-                <table>
-                    <tbody>
-                        <tr>
-                            <td>
-                                <font>
-                                    <label>
-                                        <span id="ten">English</span>
-                                    </label>
-                                </font>
-                            </td>
-                            <td>
-                                <font>
-                                    <label>
-                                        <span id="tzh-CN">中文 (简体)</span>
-                                    </label>
-                                    <label>
-                                        <span id="tzh-TW">中文 (繁體)</span>
-                                    </label>
-                                </font>
-                            </td>
-                        </tr>
-                    </tbody>
-                </table>
+                <div id="langSec">
+                    <div>
+                        <input name="lr" data-name="english" value="lang_en" />
+                        <input name="lr" data-name="中文 (简体)" value="lang_zh-CN" />
+                        <input name="lr" data-name="中文 (繁體)" value="lang_zh-TW" />
+                    </div>
+                </div>
             </body>
         </html>
         """
......
@@ -10,6 +10,7 @@ class TestGoogleNewsEngine(SearxTestCase):
     def test_request(self):
         google_news.supported_languages = ['en-US', 'fr-FR']
+        google_news.language_aliases = {}
         query = 'test_query'
         dicto = defaultdict(dict)
         dicto['pageno'] = 1
......
@@ -8,6 +8,7 @@ class TestQwantEngine(SearxTestCase):
     def test_request(self):
         qwant.supported_languages = ['en-US', 'fr-CA', 'fr-FR']
+        qwant.language_aliases = {}
         query = 'test_query'
         dicto = defaultdict(dict)
         dicto['pageno'] = 0
......
@@ -8,6 +8,7 @@ class TestSwisscowsEngine(SearxTestCase):
     def test_request(self):
         swisscows.supported_languages = ['de-AT', 'de-DE']
+        swisscows.language_aliases = {}
         query = 'test_query'
         dicto = defaultdict(dict)
         dicto['pageno'] = 1
......
@@ -27,6 +27,7 @@ class TestWikidataEngine(SearxTestCase):
         self.assertRaises(AttributeError, wikidata.response, '[]')

         wikidata.supported_languages = ['en', 'es']
+        wikidata.language_aliases = {}
         response = mock.Mock(text='<html></html>', search_params={"language": "en"})
         self.assertEqual(wikidata.response(response), [])
......
@@ -8,7 +8,8 @@ from searx.testing import SearxTestCase
 class TestWikipediaEngine(SearxTestCase):

     def test_request(self):
-        wikipedia.supported_languages = ['fr', 'en']
+        wikipedia.supported_languages = ['fr', 'en', 'no']
+        wikipedia.language_aliases = {'nb': 'no'}
         query = 'test_query'
         dicto = defaultdict(dict)
@@ -25,9 +26,13 @@ class TestWikipediaEngine(SearxTestCase):
         self.assertIn('Test_Query', params['url'])
         self.assertNotIn('test_query', params['url'])

+        dicto['language'] = 'nb'
+        params = wikipedia.request(query, dicto)
+        self.assertIn('no.wikipedia.org', params['url'])

         dicto['language'] = 'xx'
         params = wikipedia.request(query, dicto)
-        self.assertIn('en', params['url'])
+        self.assertIn('en.wikipedia.org', params['url'])

     def test_response(self):
         dicto = defaultdict(dict)
......