fetch_languages.py 6.8KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189
# -*- coding: utf-8 -*-
# This script generates languages.py by intersecting each engine's supported languages.
#
# The country names are obtained from http://api.geonames.org, which requires registering as a user.
#
# Output files (engines_languages.json and languages.py)
# are written to the current directory to avoid overwriting in case something goes wrong.
  8. from requests import get
  9. from lxml.html import fromstring
  10. from json import loads, dump
  11. import io
  12. from sys import path
  13. path.append('../searx') # noqa
  14. from searx import settings
  15. from searx.url_utils import urlencode
  16. from searx.engines import initialize_engines, engines
# Geonames API for country names.
# While the user name is left empty, get_country_name() skips the lookup and
# returns an empty string.
geonames_user = ''  # ADD USER NAME HERE
# '{parameters}' is replaced with the URL-encoded query string.
country_names_url = 'http://api.geonames.org/countryInfoJSON?{parameters}'
# Output files.
engines_languages_file = 'engines_languages.json'
languages_file = 'languages.py'
# Maps engine name -> that engine's supported languages;
# filled in by fetch_supported_languages().
engines_languages = {}
  24. # To filter out invalid codes and dialects.
  25. def valid_code(lang_code):
  26. # filter invalid codes
  27. # sl-SL is technically not invalid, but still a mistake
  28. invalid_codes = ['sl-SL', 'wt-WT', 'jw']
  29. invalid_countries = ['UK', 'XA', 'XL']
  30. if lang_code[:2] == 'xx'\
  31. or lang_code in invalid_codes\
  32. or lang_code[-2:] in invalid_countries\
  33. or is_dialect(lang_code):
  34. return False
  35. return True
  36. # Language codes with any additional tags other than language and country.
  37. def is_dialect(lang_code):
  38. lang_code = lang_code.split('-')
  39. if len(lang_code) > 2 or len(lang_code[0]) > 3:
  40. return True
  41. if len(lang_code) == 2 and len(lang_code[1]) > 2:
  42. return True
  43. return False
  44. # Get country name in specified language.
  45. def get_country_name(locale):
  46. if geonames_user is '':
  47. return ''
  48. locale = locale.split('-')
  49. if len(locale) != 2:
  50. return ''
  51. url = country_names_url.format(parameters=urlencode({'lang': locale[0],
  52. 'country': locale[1],
  53. 'username': geonames_user}))
  54. response = get(url)
  55. json = loads(response.text)
  56. content = json.get('geonames', None)
  57. if content is None or len(content) != 1:
  58. print("No country name found for " + locale[0] + "-" + locale[1])
  59. return ''
  60. return content[0].get('countryName', '')
  61. # Fetchs supported languages for each engine and writes json file with those.
  62. def fetch_supported_languages():
  63. initialize_engines(settings['engines'])
  64. for engine_name in engines:
  65. if hasattr(engines[engine_name], 'fetch_supported_languages'):
  66. try:
  67. engines_languages[engine_name] = engines[engine_name].fetch_supported_languages()
  68. except Exception as e:
  69. print(e)
  70. # write json file
  71. with io.open(engines_languages_file, "w", encoding="utf-8") as f:
  72. dump(engines_languages, f, ensure_ascii=False)
  73. # Join all language lists.
  74. # Iterate all languages supported by each engine.
  75. def join_language_lists():
  76. global languages
  77. # include wikipedia first for more accurate language names
  78. languages = {code: lang for code, lang
  79. in engines_languages['wikipedia'].items()
  80. if valid_code(code)}
  81. for engine_name in engines_languages:
  82. for locale in engines_languages[engine_name]:
  83. if valid_code(locale):
  84. # if language is not on list or if it has no name yet
  85. if locale not in languages or not languages[locale].get('name'):
  86. if isinstance(engines_languages[engine_name], dict):
  87. languages[locale] = engines_languages[engine_name][locale]
  88. else:
  89. languages[locale] = {}
  90. # add to counter of engines that support given language
  91. lang = locale.split('-')[0]
  92. if lang in languages:
  93. if 'counter' not in languages[lang]:
  94. languages[lang]['counter'] = [engine_name]
  95. elif engine_name not in languages[lang]['counter']:
  96. languages[lang]['counter'].append(engine_name)
  97. # filter list to include only languages supported by most engines
  98. min_supported_engines = int(0.70 * len(engines_languages))
  99. languages = {code: lang for code, lang
  100. in languages.items()
  101. if len(lang.get('counter', [])) >= min_supported_engines or
  102. len(languages.get(code.split('-')[0], {}).get('counter', [])) >= min_supported_engines}
  103. # get locales that have no name or country yet
  104. for locale in languages.keys():
  105. # try to get language names
  106. if not languages[locale].get('name'):
  107. name = languages.get(locale.split('-')[0], {}).get('name', None)
  108. if name:
  109. languages[locale]['name'] = name
  110. else:
  111. # filter out locales with no name
  112. del languages[locale]
  113. continue
  114. # try to get language name in english
  115. if not languages[locale].get('english_name'):
  116. languages[locale]['english_name'] = languages.get(locale.split('-')[0], {}).get('english_name', '')
  117. # try to get country name
  118. if locale.find('-') > 0 and not languages[locale].get('country'):
  119. languages[locale]['country'] = get_country_name(locale) or ''
  120. # Remove countryless language if language is featured in only one country.
  121. def filter_single_country_languages():
  122. prev_lang = None
  123. prev_code = None
  124. for code in sorted(languages):
  125. lang = code.split('-')[0]
  126. if lang == prev_lang:
  127. countries += 1
  128. else:
  129. if prev_lang is not None and countries == 1:
  130. del languages[prev_lang]
  131. languages[prev_code]['country'] = ''
  132. countries = 0
  133. prev_lang = lang
  134. prev_code = code
  135. # Write languages.py.
  136. def write_languages_file():
  137. new_file = open(languages_file, 'wb')
  138. file_content = '# -*- coding: utf-8 -*-\n'\
  139. + '# list of language codes\n'\
  140. + '# this file is generated automatically by utils/update_search_languages.py\n'\
  141. + '\nlanguage_codes = ('
  142. for code in sorted(languages):
  143. file_content += '\n (u"' + code + '"'\
  144. + ', u"' + languages[code]['name'].split(' (')[0] + '"'\
  145. + ', u"' + languages[code].get('country', '') + '"'\
  146. + ', u"' + languages[code].get('english_name', '').split(' (')[0] + '"),'
  147. # remove last comma
  148. file_content = file_content[:-1]
  149. file_content += '\n)\n'
  150. new_file.write(file_content.encode('utf8'))
  151. new_file.close()
if __name__ == "__main__":
    # Run the steps in order; each one relies on state left by the previous.
    fetch_supported_languages()  # fills engines_languages, writes the json file
    join_language_lists()  # builds the global languages dict
    filter_single_country_languages()  # prunes entries from languages
    write_languages_file()  # writes languages.py