fetch_languages.py 7.89 KB
Newer Older
1
2
3
4
5
6
7
# -*- coding: utf-8 -*-

# This script generates languages.py by intersecting each engine's supported languages.
#
# Output files (engines_languages.json and languages.py)
# are written in current directory to avoid overwriting in case something goes wrong.

8
from json import dump
9
10
import io
from sys import path
11
12
13
from babel import Locale, UnknownLocaleError
from babel.languages import get_global

14
path.append('../searx')  # noqa
marc's avatar
marc committed
15
16
from searx import settings
from searx.engines import initialize_engines, engines
17
18
19
20
21
22
23
24

# Output files.
engines_languages_file = 'engines_languages.json'
languages_file = 'languages.py'


# Fetches the supported languages of each engine and writes them to a JSON file.
def fetch_supported_languages():
    """Query every engine for its supported languages and dump them as JSON.

    Only engines exposing a ``fetch_supported_languages`` attribute are
    queried.  A result that is a list is stored sorted; any other result is
    stored exactly as the engine returned it.  If an engine raises, the
    exception is printed and processing continues with the next engine.

    The collected mapping is written to ``engines_languages_file`` as
    UTF-8 encoded, indented JSON.

    Returns:
        dict mapping engine name to the languages reported by that engine.
    """
    engines_languages = {}
    for engine_name, engine in engines.items():
        if not hasattr(engine, 'fetch_supported_languages'):
            continue

        try:
            # Store the raw result first so it is kept even if sorting fails.
            supported = engine.fetch_supported_languages()
            engines_languages[engine_name] = supported
            if isinstance(supported, list):
                engines_languages[engine_name] = sorted(supported)
        except Exception as e:
            # Deliberately best-effort: one failing engine must not abort
            # the whole run.
            print(e)

    # write json file
    with io.open(engines_languages_file, "w", encoding="utf-8") as f:
        dump(engines_languages, f, ensure_ascii=False, indent=4, separators=(',', ': '))

    return engines_languages


# Get babel Locale object from lang_code if possible.
def get_locale(lang_code):
    """Parse a '-'-separated language code into a babel ``Locale``.

    Returns the parsed ``Locale``, or ``None`` when parsing raises
    ``UnknownLocaleError`` or ``ValueError``.
    """
    try:
        parsed = Locale.parse(lang_code, sep='-')
    except (UnknownLocaleError, ValueError):
        parsed = None
    return parsed


# Append engine_name to list of engines that support locale.
def add_engine_counter(lang_code, engine_name, languages):
    """Record that ``engine_name`` supports ``lang_code``.

    ``languages[lang_code]['counter']`` is the list of engines supporting
    that code; it is created on first use and never holds duplicates.
    Codes missing from ``languages`` are ignored.
    """
    if lang_code not in languages:
        return

    supporters = languages[lang_code].setdefault('counter', [])
    if engine_name not in supporters:
        supporters.append(engine_name)
58
59


60
61
62
63
# Join all language lists.
# TODO: Add language names from engine's language list if name not known by babel.
def join_language_lists(engines_languages):
    """Merge the per-engine language lists into a single dictionary.

    Args:
        engines_languages: mapping of engine name to an iterable of the
            language codes that engine reported.

    Returns:
        dict keyed by language code.  For codes babel can parse, the value
        holds 'name', 'english_name' and (for the full code) 'country'; a
        countryless entry for the same language is added as well.  Codes
        babel cannot parse get an empty dict.  Each entry additionally gets
        a 'counter' list naming the engines that support it.
    """
    language_list = {}
    for engine_name, engine_codes in engines_languages.items():
        # The alias table maps the code used in the joined list to the
        # engine's own spelling; it does not change per language, so look
        # it up once per engine.
        aliases = getattr(engines[engine_name], 'language_aliases', {})

        for lang_code in engine_codes:

            # apply custom fixes if necessary: translate an engine-specific
            # alias back to the code it stands for
            if lang_code in aliases.values():
                lang_code = next(code for code, alias in aliases.items()
                                 if lang_code == alias)

            locale = get_locale(lang_code)

            # ensure that lang_code uses standard language and country codes
            if locale and locale.territory:
                lang_code = locale.language + '-' + locale.territory

            # add locale if it's not in list
            if lang_code not in language_list:
                if locale:
                    language_name = locale.get_language_name().title()
                    language_list[lang_code] = {'name': language_name,
                                                'english_name': locale.english_name,
                                                'country': locale.get_territory_name() or ''}

                    # also add language without country
                    if locale.language not in language_list:
                        language_list[locale.language] = {'name': language_name,
                                                          'english_name': locale.english_name}
                else:
                    # unknown to babel: keep the code but without any names
                    language_list[lang_code] = {}

            # count engine for both language_country combination and language alone
            add_engine_counter(lang_code, engine_name, language_list)
            add_engine_counter(lang_code.split('-')[0], engine_name, language_list)

    return language_list


# Filter language list so it only includes the most supported languages and countries.
def filter_language_list(all_languages):
    """Keep only the widely supported entries of ``all_languages``.

    An entry is kept when its 'counter' list names at least ten engines, or
    when it names every "main" engine: an engine in the 'general' category
    that has supported languages and is not disabled.

    Returns a new dict; ``all_languages`` itself is not modified.
    """
    min_supported_engines = 10

    # enabled general-purpose engines that declare supported languages
    main_engines = []
    for engine_name in engines.keys():
        engine = engines[engine_name]
        if 'general' in engine.categories and engine.supported_languages and not engine.disabled:
            main_engines.append(engine_name)

    # include only languages supported by most engines or all default general engines
    filtered_languages = {}
    for code, lang in all_languages.items():
        if len(lang.get('counter', [])) >= min_supported_engines:
            filtered_languages[code] = lang
        elif all(main_engine in lang.get('counter', []) for main_engine in main_engines):
            filtered_languages[code] = lang

    return filtered_languages


# Add country codes to languages without one and filter out language codes.
def assign_country_codes(filtered_languages, all_languages):
    """Add country codes to languages without one and drop redundant bare codes.

    ``filtered_languages`` is modified in place; nothing is returned.

    The filtered codes are walked in sorted order, which groups all codes of
    one language together.  Once a group is complete it is settled:

    * a language with no country code gets the country-qualified code with
      the most supporting engines from ``all_languages``, or failing that
      the likely code according to babel;
    * a language that ends up with exactly one country code loses its
      countryless entry, and that single entry's 'country' is blanked.

    Args:
        filtered_languages: dict of language code -> info dict to adjust.
        all_languages: dict of every known language code -> info dict.
    """
    # every known code, most widely supported first
    sorted_languages = sorted(all_languages,
                              key=lambda lang: len(all_languages[lang].get('counter', [])),
                              reverse=True)
    previous_lang = None
    previous_code = None
    countries = 0
    for current_code in sorted(filtered_languages):
        current_lang = current_code.split('-')[0]

        # count country codes per language; the first code of a group is not
        # counted (presumably because the bare language code sorts first)
        if current_lang == previous_lang:
            countries += 1
        else:
            if previous_lang is not None:
                _settle_language_group(previous_lang, previous_code, countries,
                                       filtered_languages, all_languages, sorted_languages)
            countries = 0
            previous_lang = current_lang

        previous_code = current_code

    # The loop only settles a group when the next language starts, so the
    # final group has to be settled explicitly.
    if previous_lang is not None:
        _settle_language_group(previous_lang, previous_code, countries,
                               filtered_languages, all_languages, sorted_languages)


def _settle_language_group(lang, last_code, countries,
                           filtered_languages, all_languages, sorted_languages):
    """Apply the country-code rules to one fully counted language group."""
    # if language has no single country code
    if countries == 0:
        # try to get country code with most supported engines
        for code in sorted_languages:
            code_parts = code.split('-')
            if len(code_parts) == 2 and code_parts[0] == lang:
                filtered_languages[code] = all_languages[code]
                filtered_languages[code]['country'] = ''
                countries = 1
                break

        if countries == 0:
            # get most likely country code from babel
            subtags = get_global('likely_subtags').get(lang)
            if subtags:
                subtag_parts = subtags.split('_')
                new_code = subtag_parts[0] + '-' + subtag_parts[-1]
                filtered_languages[new_code] = all_languages[lang]
                countries = 1

    if countries == 1:
        # remove countryless version of language if there's only one country;
        # pop() tolerates a group that never had a countryless entry
        filtered_languages.pop(lang, None)
        if last_code in filtered_languages:
            filtered_languages[last_code]['country'] = ''
164
165
166


# Write languages.py.
167
def write_languages_file(languages, output_path=None):
    """Write the generated ``language_codes`` tuple as a Python source file.

    Each language becomes one ``(code, name, country, english_name)`` tuple
    of unicode literals, ordered by code.  Anything from ' (' onwards is cut
    from 'name' and 'english_name'.  'country' and 'english_name' default to
    an empty string when absent; 'name' is required.

    Args:
        languages: dict of language code -> info dict.
        output_path: file to write; defaults to the module-level
            ``languages_file``.
    """
    if output_path is None:
        output_path = languages_file

    # NOTE(review): the script name in this header may be stale - confirm
    # against the real location of this script before changing the text.
    file_header = '# -*- coding: utf-8 -*-\n'\
                  + '# list of language codes\n'\
                  + '# this file is generated automatically by utils/update_search_languages.py\n'\
                  + '\nlanguage_codes = ('

    entries = []
    for code in sorted(languages):
        lang = languages[code]
        entries.append('\n    (u"' + code + '"'
                       + ', u"' + lang['name'].split(' (')[0] + '"'
                       + ', u"' + lang.get('country', '') + '"'
                       + ', u"' + lang.get('english_name', '').split(' (')[0] + '")')

    # Joining puts commas only between entries, so there is no trailing comma
    # to strip and an empty language list still yields valid Python.
    file_content = file_header + ','.join(entries) + '\n)\n'

    # binary mode plus explicit encoding; the context manager closes the file
    # even if the write fails
    with open(output_path, 'wb') as new_file:
        new_file.write(file_content.encode('utf8'))


if __name__ == "__main__":
    # Load the engines configured in the settings so they can be queried.
    initialize_engines(settings['engines'])

    # Pipeline: fetch per-engine lists -> merge them -> keep the widely
    # supported ones -> settle country codes (in place) -> write languages.py.
    engines_languages = fetch_supported_languages()
    all_languages = join_language_lists(engines_languages)
    filtered_languages = filter_language_list(all_languages)
    assign_country_codes(filtered_languages, all_languages)
    write_languages_file(filtered_languages)