fetch_languages.py 7.9KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201
  1. # -*- coding: utf-8 -*-
  2. # This script generates languages.py from intersecting each engine's supported languages.
  3. #
  4. # Output files (engines_languages.json and languages.py)
  5. # are written in current directory to avoid overwriting in case something goes wrong.
  6. from json import dump
  7. import io
  8. from sys import path
  9. from babel import Locale, UnknownLocaleError
  10. from babel.languages import get_global
  11. path.append('../searx') # noqa
  12. from searx import settings
  13. from searx.engines import initialize_engines, engines
  14. # Output files.
  15. engines_languages_file = 'engines_languages.json'
  16. languages_file = 'languages.py'
  17. # custom fixes for non standard locale codes
  18. # sl-SL is technically not invalid, but still a mistake
  19. # TODO: move to respective engines
  20. locale_fixes = {
  21. 'sl-sl': 'sl-SI',
  22. 'ar-xa': 'ar-SA',
  23. 'es-xl': 'es-419',
  24. 'zh-chs': 'zh-Hans-CN',
  25. 'zh-cht': 'zh-Hant-TW',
  26. 'tzh-tw': 'zh-Hant-TW',
  27. 'tzh-hk': 'zh-Hant-HK'
  28. }
  29. # Fetchs supported languages for each engine and writes json file with those.
  30. def fetch_supported_languages():
  31. engines_languages = {}
  32. for engine_name in engines:
  33. if hasattr(engines[engine_name], 'fetch_supported_languages'):
  34. try:
  35. engines_languages[engine_name] = engines[engine_name].fetch_supported_languages()
  36. except Exception as e:
  37. print(e)
  38. # write json file
  39. with io.open(engines_languages_file, "w", encoding="utf-8") as f:
  40. dump(engines_languages, f, ensure_ascii=False)
  41. return engines_languages
  42. # Get babel Locale object from lang_code if possible.
  43. def get_locale(lang_code):
  44. try:
  45. locale = Locale.parse(lang_code, sep='-')
  46. return locale
  47. except (UnknownLocaleError, ValueError):
  48. return None
  49. # Append engine_name to list of engines that support locale.
  50. def add_engine_counter(lang_code, engine_name, languages):
  51. if lang_code in languages:
  52. if 'counter' not in languages[lang_code]:
  53. languages[lang_code]['counter'] = [engine_name]
  54. elif engine_name not in languages[lang_code]['counter']:
  55. languages[lang_code]['counter'].append(engine_name)
  56. # Join all language lists.
  57. # TODO: Add language names from engine's language list if name not known by babel.
  58. def join_language_lists(engines_languages):
  59. language_list = {}
  60. for engine_name in engines_languages:
  61. for lang_code in engines_languages[engine_name]:
  62. # apply custom fixes if necessary
  63. if lang_code.lower() in locale_fixes:
  64. lang_code = locale_fixes[lang_code.lower()]
  65. locale = get_locale(lang_code)
  66. # ensure that lang_code uses standard language and country codes
  67. if locale and locale.territory:
  68. lang_code = locale.language + '-' + locale.territory
  69. # add locale if it's not in list
  70. if lang_code not in language_list:
  71. if locale:
  72. language_list[lang_code] = {'name': locale.get_language_name().title(),
  73. 'english_name': locale.english_name,
  74. 'country': locale.get_territory_name() or ''}
  75. # also add language without country
  76. if locale.language not in language_list:
  77. language_list[locale.language] = {'name': locale.get_language_name().title(),
  78. 'english_name': locale.english_name}
  79. else:
  80. language_list[lang_code] = {}
  81. # count engine for both language_country combination and language alone
  82. add_engine_counter(lang_code, engine_name, language_list)
  83. add_engine_counter(lang_code.split('-')[0], engine_name, language_list)
  84. return language_list
  85. # Filter language list so it only includes the most supported languages and countries.
  86. def filter_language_list(all_languages):
  87. min_supported_engines = 10
  88. main_engines = [engine_name for engine_name in engines.keys()
  89. if 'general' in engines[engine_name].categories and
  90. engines[engine_name].supported_languages and
  91. not engines[engine_name].disabled]
  92. # filter list to include only languages supported by most engines or all default general engines
  93. filtered_languages = {code: lang for code, lang
  94. in all_languages.items()
  95. if (len(lang.get('counter', [])) >= min_supported_engines or
  96. all(main_engine in lang.get('counter', [])
  97. for main_engine in main_engines))}
  98. return filtered_languages
  99. # Add country codes to languages without one and filter out language codes.
  100. def assign_country_codes(filtered_languages, all_languages):
  101. sorted_languages = sorted(all_languages,
  102. key=lambda lang: len(all_languages[lang].get('counter', [])),
  103. reverse=True)
  104. previous_lang = None
  105. previous_code = None
  106. countries = 0
  107. for current_code in sorted(filtered_languages):
  108. current_lang = current_code.split('-')[0]
  109. # count country codes per language
  110. if current_lang == previous_lang:
  111. countries += 1
  112. else:
  113. if previous_lang is not None:
  114. # if language has no single country code
  115. if countries == 0:
  116. # try to get country code with most supported engines
  117. for l in sorted_languages:
  118. l_parts = l.split('-')
  119. if len(l_parts) == 2 and l_parts[0] == previous_lang:
  120. filtered_languages[l] = all_languages[l]
  121. filtered_languages[l]['country'] = ''
  122. countries = 1
  123. break
  124. if countries == 0:
  125. # get most likely country code from babel
  126. subtags = get_global('likely_subtags').get(previous_lang)
  127. if subtags:
  128. subtag_parts = subtags.split('_')
  129. new_code = subtag_parts[0] + '-' + subtag_parts[-1]
  130. filtered_languages[new_code] = all_languages[previous_lang]
  131. countries = 1
  132. if countries == 1:
  133. # remove countryless version of language if there's only one country
  134. del filtered_languages[previous_lang]
  135. if previous_code in filtered_languages:
  136. filtered_languages[previous_code]['country'] = ''
  137. countries = 0
  138. previous_lang = current_lang
  139. previous_code = current_code
  140. # Write languages.py.
  141. def write_languages_file(languages):
  142. new_file = open(languages_file, 'wb')
  143. file_content = '# -*- coding: utf-8 -*-\n'\
  144. + '# list of language codes\n'\
  145. + '# this file is generated automatically by utils/update_search_languages.py\n'\
  146. + '\nlanguage_codes = ('
  147. for code in sorted(languages):
  148. file_content += '\n (u"' + code + '"'\
  149. + ', u"' + languages[code]['name'].split(' (')[0] + '"'\
  150. + ', u"' + languages[code].get('country', '') + '"'\
  151. + ', u"' + languages[code].get('english_name', '').split(' (')[0] + '"),'
  152. # remove last comma
  153. file_content = file_content[:-1]
  154. file_content += '\n)\n'
  155. new_file.write(file_content.encode('utf8'))
  156. new_file.close()
  157. if __name__ == "__main__":
  158. initialize_engines(settings['engines'])
  159. engines_languages = fetch_supported_languages()
  160. all_languages = join_language_lists(engines_languages)
  161. filtered_languages = filter_language_list(all_languages)
  162. assign_country_codes(filtered_languages, all_languages)
  163. write_languages_file(filtered_languages)