Browse Source

Merge pull request #1252 from MarcAbonce/search-languages

[mod] Refactor engine's search language handling
Adam Tauber 6 years ago
parent
commit
283f6c9053
No account linked to committer's email
43 changed files with 420 additions and 312 deletions
  1. 1
    1
      searx/data/engines_languages.json
  2. 20
    1
      searx/engines/__init__.py
  3. 2
    2
      searx/engines/archlinux.py
  4. 4
    2
      searx/engines/bing.py
  5. 2
    21
      searx/engines/bing_images.py
  6. 3
    3
      searx/engines/bing_news.py
  7. 4
    3
      searx/engines/bing_videos.py
  8. 2
    1
      searx/engines/dailymotion.py
  9. 19
    30
      searx/engines/duckduckgo.py
  10. 4
    3
      searx/engines/duckduckgo_definitions.py
  11. 4
    1
      searx/engines/duckduckgo_images.py
  12. 15
    16
      searx/engines/google.py
  13. 4
    2
      searx/engines/google_news.py
  14. 3
    10
      searx/engines/qwant.py
  15. 3
    5
      searx/engines/swisscows.py
  16. 3
    2
      searx/engines/wikidata.py
  17. 2
    7
      searx/engines/wikipedia.py
  18. 12
    12
      searx/engines/yahoo.py
  19. 5
    2
      searx/engines/yahoo_news.py
  20. 16
    24
      searx/languages.py
  21. 0
    4
      searx/preferences.py
  22. 7
    3
      searx/query.py
  23. 2
    2
      searx/templates/oscar/preferences.html
  24. 61
    0
      searx/utils.py
  25. 15
    7
      searx/webapp.py
  26. 6
    1
      tests/unit/engines/test_archlinux.py
  27. 1
    0
      tests/unit/engines/test_bing.py
  28. 0
    1
      tests/unit/engines/test_bing_images.py
  29. 2
    1
      tests/unit/engines/test_bing_news.py
  30. 0
    1
      tests/unit/engines/test_bing_videos.py
  31. 2
    1
      tests/unit/engines/test_dailymotion.py
  32. 12
    6
      tests/unit/engines/test_duckduckgo.py
  33. 1
    0
      tests/unit/engines/test_duckduckgo_definitions.py
  34. 0
    1
      tests/unit/engines/test_duckduckgo_images.py
  35. 7
    0
      tests/unit/engines/test_google.py
  36. 1
    0
      tests/unit/engines/test_google_news.py
  37. 1
    1
      tests/unit/engines/test_qwant.py
  38. 1
    0
      tests/unit/engines/test_swisscows.py
  39. 1
    0
      tests/unit/engines/test_wikidata.py
  40. 14
    3
      tests/unit/engines/test_yahoo.py
  41. 2
    1
      tests/unit/engines/test_yahoo_news.py
  42. 25
    0
      tests/unit/test_utils.py
  43. 131
    131
      utils/fetch_languages.py

+ 1
- 1
searx/data/engines_languages.json
File diff suppressed because it is too large
View File


+ 20
- 1
searx/engines/__init__.py View File

20
 import threading
20
 import threading
21
 from os.path import realpath, dirname
21
 from os.path import realpath, dirname
22
 from io import open
22
 from io import open
23
+from babel.localedata import locale_identifiers
23
 from flask_babel import gettext
24
 from flask_babel import gettext
24
 from operator import itemgetter
25
 from operator import itemgetter
25
 from json import loads
26
 from json import loads
26
 from requests import get
27
 from requests import get
27
 from searx import settings
28
 from searx import settings
28
 from searx import logger
29
 from searx import logger
29
-from searx.utils import load_module
30
+from searx.utils import load_module, match_language
30
 
31
 
31
 
32
 
32
 logger = logger.getChild('engines')
33
 logger = logger.getChild('engines')
38
 categories = {'general': []}
39
 categories = {'general': []}
39
 
40
 
40
 languages = loads(open(engine_dir + '/../data/engines_languages.json', 'r', encoding='utf-8').read())
41
 languages = loads(open(engine_dir + '/../data/engines_languages.json', 'r', encoding='utf-8').read())
42
+babel_langs = [lang_parts[0] + '-' + lang_parts[-1] if len(lang_parts) > 1 else lang_parts[0]
43
+               for lang_parts in (lang_code.split('_') for lang_code in locale_identifiers())]
41
 
44
 
42
 engine_shortcuts = {}
45
 engine_shortcuts = {}
43
 engine_default_args = {'paging': False,
46
 engine_default_args = {'paging': False,
97
     if engine_data['name'] in languages:
100
     if engine_data['name'] in languages:
98
         setattr(engine, 'supported_languages', languages[engine_data['name']])
101
         setattr(engine, 'supported_languages', languages[engine_data['name']])
99
 
102
 
103
+    # find custom aliases for non-standard language codes
104
+    if hasattr(engine, 'supported_languages'):
105
+        if hasattr(engine, 'language_aliases'):
106
+            language_aliases = getattr(engine, 'language_aliases')
107
+        else:
108
+            language_aliases = {}
109
+
110
+        for engine_lang in getattr(engine, 'supported_languages'):
111
+            iso_lang = match_language(engine_lang, babel_langs, fallback=None)
112
+            if iso_lang and iso_lang != engine_lang and not engine_lang.startswith(iso_lang) and \
113
+               iso_lang not in getattr(engine, 'supported_languages'):
114
+                language_aliases[iso_lang] = engine_lang
115
+
116
+        if language_aliases:
117
+            setattr(engine, 'language_aliases', language_aliases)
118
+
100
     # assign language fetching method if auxiliary method exists
119
     # assign language fetching method if auxiliary method exists
101
     if hasattr(engine, '_fetch_supported_languages'):
120
     if hasattr(engine, '_fetch_supported_languages'):
102
         setattr(engine, 'fetch_supported_languages',
121
         setattr(engine, 'fetch_supported_languages',

+ 2
- 2
searx/engines/archlinux.py View File

99
 
99
 
100
 # do search-request
100
 # do search-request
101
 def request(query, params):
101
 def request(query, params):
102
-    # translate the locale (e.g. 'en_US') to language code ('en')
102
+    # translate the locale (e.g. 'en-US') to language code ('en')
103
     language = locale_to_lang_code(params['language'])
103
     language = locale_to_lang_code(params['language'])
104
 
104
 
105
     # if our language is hosted on the main site, we need to add its name
105
     # if our language is hosted on the main site, we need to add its name
106
     # to the query in order to narrow the results to that language
106
     # to the query in order to narrow the results to that language
107
     if language in main_langs:
107
     if language in main_langs:
108
-        query += '(' + main_langs[language] + ')'
108
+        query += b' (' + main_langs[language] + b')'
109
 
109
 
110
     # prepare the request parameters
110
     # prepare the request parameters
111
     query = urlencode({'search': query})
111
     query = urlencode({'search': query})

+ 4
- 2
searx/engines/bing.py View File

16
 from lxml import html
16
 from lxml import html
17
 from searx.engines.xpath import extract_text
17
 from searx.engines.xpath import extract_text
18
 from searx.url_utils import urlencode
18
 from searx.url_utils import urlencode
19
+from searx.utils import match_language
19
 
20
 
20
 # engine dependent config
21
 # engine dependent config
21
 categories = ['general']
22
 categories = ['general']
22
 paging = True
23
 paging = True
23
 language_support = True
24
 language_support = True
24
 supported_languages_url = 'https://www.bing.com/account/general'
25
 supported_languages_url = 'https://www.bing.com/account/general'
26
+language_aliases = {'zh-CN': 'zh-CHS', 'zh-TW': 'zh-CHT', 'zh-HK': 'zh-CHT'}
25
 
27
 
26
 # search-url
28
 # search-url
27
 base_url = 'https://www.bing.com/'
29
 base_url = 'https://www.bing.com/'
32
 def request(query, params):
34
 def request(query, params):
33
     offset = (params['pageno'] - 1) * 10 + 1
35
     offset = (params['pageno'] - 1) * 10 + 1
34
 
36
 
35
-    lang = params['language'].split('-')[0].upper()
37
+    lang = match_language(params['language'], supported_languages, language_aliases)
36
 
38
 
37
-    query = u'language:{} {}'.format(lang, query.decode('utf-8')).encode('utf-8')
39
+    query = u'language:{} {}'.format(lang.split('-')[0].upper(), query.decode('utf-8')).encode('utf-8')
38
 
40
 
39
     search_path = search_string.format(
41
     search_path = search_string.format(
40
         query=urlencode({'q': query}),
42
         query=urlencode({'q': query}),

+ 2
- 21
searx/engines/bing_images.py View File

19
 from json import loads
19
 from json import loads
20
 import re
20
 import re
21
 from searx.url_utils import urlencode
21
 from searx.url_utils import urlencode
22
+from searx.utils import match_language
22
 
23
 
23
 # engine dependent config
24
 # engine dependent config
24
 categories = ['images']
25
 categories = ['images']
46
 _quote_keys_regex = re.compile('({|,)([a-z][a-z0-9]*):(")', re.I | re.U)
47
 _quote_keys_regex = re.compile('({|,)([a-z][a-z0-9]*):(")', re.I | re.U)
47
 
48
 
48
 
49
 
49
-# get supported region code
50
-def get_region_code(lang, lang_list=None):
51
-    region = None
52
-    if lang in (lang_list or supported_languages):
53
-        region = lang
54
-    elif lang.startswith('no'):
55
-        region = 'nb-NO'
56
-    else:
57
-        # try to get a supported country code with language
58
-        lang = lang.split('-')[0]
59
-        for lc in (lang_list or supported_languages):
60
-            if lang == lc.split('-')[0]:
61
-                region = lc
62
-                break
63
-    if region:
64
-        return region.lower()
65
-    else:
66
-        return 'en-us'
67
-
68
-
69
 # do search-request
50
 # do search-request
70
 def request(query, params):
51
 def request(query, params):
71
     offset = (params['pageno'] - 1) * 10 + 1
52
     offset = (params['pageno'] - 1) * 10 + 1
74
         query=urlencode({'q': query}),
55
         query=urlencode({'q': query}),
75
         offset=offset)
56
         offset=offset)
76
 
57
 
77
-    language = get_region_code(params['language'])
58
+    language = match_language(params['language'], supported_languages).lower()
78
 
59
 
79
     params['cookies']['SRCHHPGUSR'] = \
60
     params['cookies']['SRCHHPGUSR'] = \
80
         'ADLT=' + safesearch_types.get(params['safesearch'], 'DEMOTE')
61
         'ADLT=' + safesearch_types.get(params['safesearch'], 'DEMOTE')

+ 3
- 3
searx/engines/bing_news.py View File

14
 from datetime import datetime
14
 from datetime import datetime
15
 from dateutil import parser
15
 from dateutil import parser
16
 from lxml import etree
16
 from lxml import etree
17
-from searx.utils import list_get
18
-from searx.engines.bing import _fetch_supported_languages, supported_languages_url
17
+from searx.utils import list_get, match_language
18
+from searx.engines.bing import _fetch_supported_languages, supported_languages_url, language_aliases
19
 from searx.url_utils import urlencode, urlparse, parse_qsl
19
 from searx.url_utils import urlencode, urlparse, parse_qsl
20
 
20
 
21
 # engine dependent config
21
 # engine dependent config
71
 
71
 
72
     offset = (params['pageno'] - 1) * 10 + 1
72
     offset = (params['pageno'] - 1) * 10 + 1
73
 
73
 
74
-    language = params['language']
74
+    language = match_language(params['language'], supported_languages, language_aliases)
75
 
75
 
76
     params['url'] = _get_url(query, language, offset, params['time_range'])
76
     params['url'] = _get_url(query, language, offset, params['time_range'])
77
 
77
 

+ 4
- 3
searx/engines/bing_videos.py View File

12
 
12
 
13
 from json import loads
13
 from json import loads
14
 from lxml import html
14
 from lxml import html
15
-from searx.engines.bing_images import _fetch_supported_languages, supported_languages_url, get_region_code
15
+from searx.engines.bing_images import _fetch_supported_languages, supported_languages_url
16
 from searx.engines.xpath import extract_text
16
 from searx.engines.xpath import extract_text
17
 from searx.url_utils import urlencode
17
 from searx.url_utils import urlencode
18
+from searx.utils import match_language
18
 
19
 
19
 
20
 
20
 categories = ['videos']
21
 categories = ['videos']
47
         'ADLT=' + safesearch_types.get(params['safesearch'], 'DEMOTE')
48
         'ADLT=' + safesearch_types.get(params['safesearch'], 'DEMOTE')
48
 
49
 
49
     # language cookie
50
     # language cookie
50
-    region = get_region_code(params['language'], lang_list=supported_languages)
51
-    params['cookies']['_EDGE_S'] = 'mkt=' + region + '&F=1'
51
+    language = match_language(params['language'], supported_languages).lower()
52
+    params['cookies']['_EDGE_S'] = 'mkt=' + language + '&F=1'
52
 
53
 
53
     # query and paging
54
     # query and paging
54
     params['url'] = search_url.format(query=urlencode({'q': query}),
55
     params['url'] = search_url.format(query=urlencode({'q': query}),

+ 2
- 1
searx/engines/dailymotion.py View File

15
 from json import loads
15
 from json import loads
16
 from datetime import datetime
16
 from datetime import datetime
17
 from searx.url_utils import urlencode
17
 from searx.url_utils import urlencode
18
+from searx.utils import match_language
18
 
19
 
19
 # engine dependent config
20
 # engine dependent config
20
 categories = ['videos']
21
 categories = ['videos']
32
 
33
 
33
 # do search-request
34
 # do search-request
34
 def request(query, params):
35
 def request(query, params):
35
-    locale = params['language']
36
+    locale = match_language(params['language'], supported_languages)
36
 
37
 
37
     params['url'] = search_url.format(
38
     params['url'] = search_url.format(
38
         query=urlencode({'search': query, 'localization': locale}),
39
         query=urlencode({'search': query, 'localization': locale}),

+ 19
- 30
searx/engines/duckduckgo.py View File

18
 from searx.engines.xpath import extract_text
18
 from searx.engines.xpath import extract_text
19
 from searx.poolrequests import get
19
 from searx.poolrequests import get
20
 from searx.url_utils import urlencode
20
 from searx.url_utils import urlencode
21
+from searx.utils import match_language
21
 
22
 
22
 # engine dependent config
23
 # engine dependent config
23
 categories = ['general']
24
 categories = ['general']
24
 paging = True
25
 paging = True
25
 language_support = True
26
 language_support = True
26
-supported_languages_url = 'https://duckduckgo.com/d2030.js'
27
+supported_languages_url = 'https://duckduckgo.com/util/u172.js'
27
 time_range_support = True
28
 time_range_support = True
28
 
29
 
30
+language_aliases = {
31
+    'ar-SA': 'ar-XA',
32
+    'es-419': 'es-XL',
33
+    'ja': 'jp-JP',
34
+    'ko': 'kr-KR',
35
+    'sl-SI': 'sl-SL',
36
+    'zh-TW': 'tzh-TW',
37
+    'zh-HK': 'tzh-HK'
38
+}
39
+
29
 # search-url
40
 # search-url
30
 url = 'https://duckduckgo.com/html?{query}&s={offset}&dc={dc_param}'
41
 url = 'https://duckduckgo.com/html?{query}&s={offset}&dc={dc_param}'
31
 time_range_url = '&df={range}'
42
 time_range_url = '&df={range}'
42
 
53
 
43
 
54
 
44
 # match query's language to a region code that duckduckgo will accept
55
 # match query's language to a region code that duckduckgo will accept
45
-def get_region_code(lang, lang_list=None):
46
-    # custom fixes for languages
47
-    if lang[:2] == 'ja':
48
-        region_code = 'jp-jp'
49
-    elif lang[:2] == 'sl':
50
-        region_code = 'sl-sl'
51
-    elif lang == 'zh-TW':
52
-        region_code = 'tw-tzh'
53
-    elif lang == 'zh-HK':
54
-        region_code = 'hk-tzh'
55
-    elif lang[-2:] == 'SA':
56
-        region_code = 'xa-' + lang.split('-')[0]
57
-    elif lang[-2:] == 'GB':
58
-        region_code = 'uk-' + lang.split('-')[0]
59
-    else:
60
-        region_code = lang.split('-')
61
-        if len(region_code) == 2:
62
-            # country code goes first
63
-            region_code = region_code[1].lower() + '-' + region_code[0].lower()
64
-        else:
65
-            # tries to get a country code from language
66
-            region_code = region_code[0].lower()
67
-            for lc in (lang_list or supported_languages):
68
-                lc = lc.split('-')
69
-                if region_code == lc[0]:
70
-                    region_code = lc[1].lower() + '-' + lc[0].lower()
71
-                    break
72
-    return region_code
56
+def get_region_code(lang, lang_list=[]):
57
+    lang_code = match_language(lang, lang_list, language_aliases, 'wt-WT')
58
+    lang_parts = lang_code.split('-')
59
+
60
+    # country code goes first
61
+    return lang_parts[1].lower() + '-' + lang_parts[0].lower()
73
 
62
 
74
 
63
 
75
 # do search-request
64
 # do search-request
79
 
68
 
80
     offset = (params['pageno'] - 1) * 30
69
     offset = (params['pageno'] - 1) * 30
81
 
70
 
82
-    region_code = get_region_code(params['language'])
71
+    region_code = get_region_code(params['language'], supported_languages)
83
     params['url'] = url.format(
72
     params['url'] = url.format(
84
         query=urlencode({'q': query, 'kl': region_code}), offset=offset, dc_param=offset)
73
         query=urlencode({'q': query, 'kl': region_code}), offset=offset, dc_param=offset)
85
 
74
 

+ 4
- 3
searx/engines/duckduckgo_definitions.py View File

2
 from lxml import html
2
 from lxml import html
3
 from re import compile
3
 from re import compile
4
 from searx.engines.xpath import extract_text
4
 from searx.engines.xpath import extract_text
5
-from searx.engines.duckduckgo import _fetch_supported_languages, supported_languages_url
5
+from searx.engines.duckduckgo import _fetch_supported_languages, supported_languages_url, language_aliases
6
 from searx.url_utils import urlencode
6
 from searx.url_utils import urlencode
7
-from searx.utils import html_to_text
7
+from searx.utils import html_to_text, match_language
8
 
8
 
9
 url = 'https://api.duckduckgo.com/'\
9
 url = 'https://api.duckduckgo.com/'\
10
     + '?{query}&format=json&pretty=0&no_redirect=1&d=1'
10
     + '?{query}&format=json&pretty=0&no_redirect=1&d=1'
24
 
24
 
25
 def request(query, params):
25
 def request(query, params):
26
     params['url'] = url.format(query=urlencode({'q': query}))
26
     params['url'] = url.format(query=urlencode({'q': query}))
27
-    params['headers']['Accept-Language'] = params['language'].split('-')[0]
27
+    language = match_language(params['language'], supported_languages, language_aliases)
28
+    params['headers']['Accept-Language'] = language.split('-')[0]
28
     return params
29
     return params
29
 
30
 
30
 
31
 

+ 4
- 1
searx/engines/duckduckgo_images.py View File

15
 
15
 
16
 from json import loads
16
 from json import loads
17
 from searx.engines.xpath import extract_text
17
 from searx.engines.xpath import extract_text
18
-from searx.engines.duckduckgo import _fetch_supported_languages, supported_languages_url, get_region_code
18
+from searx.engines.duckduckgo import (
19
+    _fetch_supported_languages, supported_languages_url,
20
+    get_region_code, language_aliases
21
+)
19
 from searx.poolrequests import get
22
 from searx.poolrequests import get
20
 from searx.url_utils import urlencode
23
 from searx.url_utils import urlencode
21
 
24
 

+ 15
- 16
searx/engines/google.py View File

14
 from searx.engines.xpath import extract_text, extract_url
14
 from searx.engines.xpath import extract_text, extract_url
15
 from searx import logger
15
 from searx import logger
16
 from searx.url_utils import urlencode, urlparse, parse_qsl
16
 from searx.url_utils import urlencode, urlparse, parse_qsl
17
+from searx.utils import match_language
17
 
18
 
18
 logger = logger.getChild('google engine')
19
 logger = logger.getChild('google engine')
19
 
20
 
72
     'RO': 'www.google.ro',  # Romania
73
     'RO': 'www.google.ro',  # Romania
73
     'RU': 'www.google.ru',  # Russia
74
     'RU': 'www.google.ru',  # Russia
74
     'SK': 'www.google.sk',  # Slovakia
75
     'SK': 'www.google.sk',  # Slovakia
75
-    'SL': 'www.google.si',  # Slovenia (SL -> si)
76
+    'SI': 'www.google.si',  # Slovenia
76
     'SE': 'www.google.se',  # Sweden
77
     'SE': 'www.google.se',  # Sweden
77
     'TH': 'www.google.co.th',  # Thailand
78
     'TH': 'www.google.co.th',  # Thailand
78
     'TR': 'www.google.com.tr',  # Turkey
79
     'TR': 'www.google.com.tr',  # Turkey
165
 def request(query, params):
166
 def request(query, params):
166
     offset = (params['pageno'] - 1) * 10
167
     offset = (params['pageno'] - 1) * 10
167
 
168
 
169
+    language = match_language(params['language'], supported_languages)
170
+    language_array = language.split('-')
171
+    if params['language'].find('-') > 0:
172
+        country = params['language'].split('-')[1]
173
+    elif len(language_array) == 2:
174
+        country = language_array[1]
175
+    else:
176
+        country = 'US'
177
+
168
     # temporary fix until a way of supporting en-US is found
178
     # temporary fix until a way of supporting en-US is found
169
-    if params['language'] == 'en-US':
170
-        params['language'] = 'en-GB'
179
+    if language == 'en-US':
180
+        country = 'GB'
171
 
181
 
172
-    if params['language'][:2] == 'jv':
173
-        language = 'jw'
174
-        country = 'ID'
175
-        url_lang = 'lang_jw'
176
-    else:
177
-        language_array = params['language'].lower().split('-')
178
-        if len(language_array) == 2:
179
-            country = language_array[1]
180
-        else:
181
-            country = 'US'
182
-        language = language_array[0] + ',' + language_array[0] + '-' + country
183
-        url_lang = 'lang_' + language_array[0]
182
+    url_lang = 'lang_' + language
184
 
183
 
185
     if use_locale_domain:
184
     if use_locale_domain:
186
         google_hostname = country_to_hostname.get(country.upper(), default_hostname)
185
         google_hostname = country_to_hostname.get(country.upper(), default_hostname)
196
     if params['time_range'] in time_range_dict:
195
     if params['time_range'] in time_range_dict:
197
         params['url'] += time_range_search.format(range=time_range_dict[params['time_range']])
196
         params['url'] += time_range_search.format(range=time_range_dict[params['time_range']])
198
 
197
 
199
-    params['headers']['Accept-Language'] = language
198
+    params['headers']['Accept-Language'] = language + ',' + language + '-' + country
200
     params['headers']['Accept'] = 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'
199
     params['headers']['Accept'] = 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'
201
 
200
 
202
     params['google_hostname'] = google_hostname
201
     params['google_hostname'] = google_hostname

+ 4
- 2
searx/engines/google_news.py View File

13
 from lxml import html
13
 from lxml import html
14
 from searx.engines.google import _fetch_supported_languages, supported_languages_url
14
 from searx.engines.google import _fetch_supported_languages, supported_languages_url
15
 from searx.url_utils import urlencode
15
 from searx.url_utils import urlencode
16
+from searx.utils import match_language
16
 
17
 
17
 # search-url
18
 # search-url
18
 categories = ['news']
19
 categories = ['news']
50
     params['url'] = search_url.format(query=urlencode({'q': query}),
51
     params['url'] = search_url.format(query=urlencode({'q': query}),
51
                                       search_options=urlencode(search_options))
52
                                       search_options=urlencode(search_options))
52
 
53
 
53
-    language_array = params['language'].lower().split('-')
54
-    params['url'] += '&lr=lang_' + language_array[0]
54
+    language = match_language(params['language'], supported_languages).split('-')[0]
55
+    if language:
56
+        params['url'] += '&lr=lang_' + language
55
 
57
 
56
     return params
58
     return params
57
 
59
 

+ 3
- 10
searx/engines/qwant.py View File

14
 from json import loads
14
 from json import loads
15
 from searx.utils import html_to_text
15
 from searx.utils import html_to_text
16
 from searx.url_utils import urlencode
16
 from searx.url_utils import urlencode
17
+from searx.utils import match_language
17
 
18
 
18
 # engine dependent config
19
 # engine dependent config
19
 categories = None
20
 categories = None
45
                                    offset=offset)
46
                                    offset=offset)
46
 
47
 
47
     # add language tag
48
     # add language tag
48
-    if params['language'] == 'no' or params['language'].startswith('no-'):
49
-        params['language'] = params['language'].replace('no', 'nb', 1)
50
-    if params['language'].find('-') < 0:
51
-        # tries to get a country code from language
52
-        for lang in supported_languages:
53
-            lc = lang.split('-')
54
-            if params['language'] == lc[0]:
55
-                params['language'] = lang
56
-                break
57
-    params['url'] += '&locale=' + params['language'].replace('-', '_').lower()
49
+    language = match_language(params['language'], supported_languages)
50
+    params['url'] += '&locale=' + language.replace('-', '_').lower()
58
 
51
 
59
     return params
52
     return params
60
 
53
 

+ 3
- 5
searx/engines/swisscows.py View File

14
 import re
14
 import re
15
 from lxml.html import fromstring
15
 from lxml.html import fromstring
16
 from searx.url_utils import unquote, urlencode
16
 from searx.url_utils import unquote, urlencode
17
+from searx.utils import match_language
17
 
18
 
18
 # engine dependent config
19
 # engine dependent config
19
 categories = ['general', 'images']
20
 categories = ['general', 'images']
35
 
36
 
36
 # do search-request
37
 # do search-request
37
 def request(query, params):
38
 def request(query, params):
38
-    if params['language'].split('-')[0] == 'no':
39
-        region = 'nb-NO'
40
-    else:
41
-        region = params['language']
42
-        ui_language = params['language'].split('-')[0]
39
+    region = match_language(params['language'], supported_languages)
40
+    ui_language = region.split('-')[0]
43
 
41
 
44
     search_path = search_string.format(
42
     search_path = search_string.format(
45
         query=urlencode({'query': query, 'uiLanguage': ui_language, 'region': region}),
43
         query=urlencode({'query': query, 'uiLanguage': ui_language, 'region': region}),

+ 3
- 2
searx/engines/wikidata.py View File

16
 from searx.engines.xpath import extract_text
16
 from searx.engines.xpath import extract_text
17
 from searx.engines.wikipedia import _fetch_supported_languages, supported_languages_url
17
 from searx.engines.wikipedia import _fetch_supported_languages, supported_languages_url
18
 from searx.url_utils import urlencode
18
 from searx.url_utils import urlencode
19
+from searx.utils import match_language
19
 
20
 
20
 from json import loads
21
 from json import loads
21
 from lxml.html import fromstring
22
 from lxml.html import fromstring
56
 
57
 
57
 
58
 
58
 def request(query, params):
59
 def request(query, params):
59
-    language = params['language'].split('-')[0]
60
+    language = match_language(params['language'], supported_languages).split('-')[0]
60
 
61
 
61
     params['url'] = url_search.format(
62
     params['url'] = url_search.format(
62
         query=urlencode({'label': query, 'language': language}))
63
         query=urlencode({'label': query, 'language': language}))
68
     html = fromstring(resp.text)
69
     html = fromstring(resp.text)
69
     wikidata_ids = html.xpath(wikidata_ids_xpath)
70
     wikidata_ids = html.xpath(wikidata_ids_xpath)
70
 
71
 
71
-    language = resp.search_params['language'].split('-')[0]
72
+    language = match_language(resp.search_params['language'], supported_languages).split('-')[0]
72
 
73
 
73
     # TODO: make requests asynchronous to avoid timeout when result_count > 1
74
     # TODO: make requests asynchronous to avoid timeout when result_count > 1
74
     for wikidata_id in wikidata_ids[:result_count]:
75
     for wikidata_id in wikidata_ids[:result_count]:

+ 2
- 7
searx/engines/wikipedia.py View File

13
 from json import loads
13
 from json import loads
14
 from lxml.html import fromstring
14
 from lxml.html import fromstring
15
 from searx.url_utils import quote, urlencode
15
 from searx.url_utils import quote, urlencode
16
+from searx.utils import match_language
16
 
17
 
17
 # search-url
18
 # search-url
18
 base_url = u'https://{language}.wikipedia.org/'
19
 base_url = u'https://{language}.wikipedia.org/'
30
 
31
 
31
 # set language in base_url
32
 # set language in base_url
32
 def url_lang(lang):
33
 def url_lang(lang):
33
-    lang = lang.split('-')[0]
34
-    if lang not in supported_languages:
35
-        language = 'en'
36
-    else:
37
-        language = lang
38
-
39
-    return language
34
+    return match_language(lang, supported_languages).split('-')[0]
40
 
35
 
41
 
36
 
42
 # do search-request
37
 # do search-request

+ 12
- 12
searx/engines/yahoo.py View File

14
 from lxml import html
14
 from lxml import html
15
 from searx.engines.xpath import extract_text, extract_url
15
 from searx.engines.xpath import extract_text, extract_url
16
 from searx.url_utils import unquote, urlencode
16
 from searx.url_utils import unquote, urlencode
17
+from searx.utils import match_language
17
 
18
 
18
 # engine dependent config
19
 # engine dependent config
19
 categories = ['general']
20
 categories = ['general']
39
                    'week': ['1w', 'w'],
40
                    'week': ['1w', 'w'],
40
                    'month': ['1m', 'm']}
41
                    'month': ['1m', 'm']}
41
 
42
 
43
+language_aliases = {'zh-CN': 'zh-CHS', 'zh-TW': 'zh-CHT', 'zh-HK': 'zh-CHT'}
44
+
42
 
45
 
43
 # remove yahoo-specific tracking-url
46
 # remove yahoo-specific tracking-url
44
 def parse_url(url_string):
47
 def parse_url(url_string):
70
                                         lang=language)
73
                                         lang=language)
71
 
74
 
72
 
75
 
73
-def _get_language(params):
74
-    if params['language'][:2] == 'zh':
75
-        if params['language'] == 'zh' or params['language'] == 'zh-CH':
76
-            return 'szh'
77
-        else:
78
-            return 'tzh'
79
-    else:
80
-        return params['language'].split('-')[0]
81
-
82
-
83
 # do search-request
76
 # do search-request
84
 def request(query, params):
77
 def request(query, params):
85
     if params['time_range'] and params['time_range'] not in time_range_dict:
78
     if params['time_range'] and params['time_range'] not in time_range_dict:
86
         return params
79
         return params
87
 
80
 
88
     offset = (params['pageno'] - 1) * 10 + 1
81
     offset = (params['pageno'] - 1) * 10 + 1
89
-    language = _get_language(params)
82
+    language = match_language(params['language'], supported_languages, language_aliases)
83
+    if language not in language_aliases.values():
84
+        language = language.split('-')[0]
85
+    language = language.replace('-', '_').lower()
90
 
86
 
91
     params['url'] = _get_url(query, offset, language, params['time_range'])
87
     params['url'] = _get_url(query, offset, language, params['time_range'])
92
 
88
 
145
     dom = html.fromstring(resp.text)
141
     dom = html.fromstring(resp.text)
146
     options = dom.xpath('//div[@id="yschlang"]/span/label/input')
142
     options = dom.xpath('//div[@id="yschlang"]/span/label/input')
147
     for option in options:
143
     for option in options:
148
-        code = option.xpath('./@value')[0][5:].replace('_', '-')
144
+        code_parts = option.xpath('./@value')[0][5:].split('_')
145
+        if len(code_parts) == 2:
146
+            code = code_parts[0] + '-' + code_parts[1].upper()
147
+        else:
148
+            code = code_parts[0]
149
         supported_languages.append(code)
149
         supported_languages.append(code)
150
 
150
 
151
     return supported_languages
151
     return supported_languages

+ 5
- 2
searx/engines/yahoo_news.py View File

13
 from datetime import datetime, timedelta
13
 from datetime import datetime, timedelta
14
 from lxml import html
14
 from lxml import html
15
 from searx.engines.xpath import extract_text, extract_url
15
 from searx.engines.xpath import extract_text, extract_url
16
-from searx.engines.yahoo import parse_url, _fetch_supported_languages, supported_languages_url
16
+from searx.engines.yahoo import (
17
+    parse_url, _fetch_supported_languages, supported_languages_url, language_aliases
18
+)
17
 from dateutil import parser
19
 from dateutil import parser
18
 from searx.url_utils import urlencode
20
 from searx.url_utils import urlencode
21
+from searx.utils import match_language
19
 
22
 
20
 # engine dependent config
23
 # engine dependent config
21
 categories = ['news']
24
 categories = ['news']
38
 def request(query, params):
41
 def request(query, params):
39
     offset = (params['pageno'] - 1) * 10 + 1
42
     offset = (params['pageno'] - 1) * 10 + 1
40
 
43
 
41
-    language = params['language'].split('-')[0]
44
+    language = match_language(params['language'], supported_languages, language_aliases).split('-')[0]
42
 
45
 
43
     params['url'] = search_url.format(offset=offset,
46
     params['url'] = search_url.format(offset=offset,
44
                                       query=urlencode({'p': query}),
47
                                       query=urlencode({'p': query}),

+ 16
- 24
searx/languages.py View File

5
 language_codes = (
5
 language_codes = (
6
     (u"ar-SA", u"العربية", u"", u"Arabic"),
6
     (u"ar-SA", u"العربية", u"", u"Arabic"),
7
     (u"bg-BG", u"Български", u"", u"Bulgarian"),
7
     (u"bg-BG", u"Български", u"", u"Bulgarian"),
8
-    (u"ca", u"Català", u"", u"Catalan"),
9
-    (u"ca-AD", u"Català", u"Andorra", u"Catalan"),
10
-    (u"ca-CT", u"Català", u"", u"Catalan"),
11
-    (u"ca-ES", u"Català", u"Espanya", u"Catalan"),
12
-    (u"ca-FR", u"Català", u"França", u"Catalan"),
8
+    (u"ca-ES", u"Català", u"", u"Catalan"),
13
     (u"cs-CZ", u"Čeština", u"", u"Czech"),
9
     (u"cs-CZ", u"Čeština", u"", u"Czech"),
14
     (u"da-DK", u"Dansk", u"", u"Danish"),
10
     (u"da-DK", u"Dansk", u"", u"Danish"),
15
     (u"de", u"Deutsch", u"", u"German"),
11
     (u"de", u"Deutsch", u"", u"German"),
21
     (u"en-AU", u"English", u"Australia", u"English"),
17
     (u"en-AU", u"English", u"Australia", u"English"),
22
     (u"en-CA", u"English", u"Canada", u"English"),
18
     (u"en-CA", u"English", u"Canada", u"English"),
23
     (u"en-GB", u"English", u"United Kingdom", u"English"),
19
     (u"en-GB", u"English", u"United Kingdom", u"English"),
24
-    (u"en-ID", u"English", u"Indonesia", u"English"),
25
-    (u"en-IE", u"English", u"Ireland", u"English"),
26
     (u"en-IN", u"English", u"India", u"English"),
20
     (u"en-IN", u"English", u"India", u"English"),
27
     (u"en-MY", u"English", u"Malaysia", u"English"),
21
     (u"en-MY", u"English", u"Malaysia", u"English"),
28
-    (u"en-NZ", u"English", u"New Zealand", u"English"),
29
-    (u"en-PH", u"English", u"Philippines", u"English"),
30
-    (u"en-SG", u"English", u"Singapore", u"English"),
31
     (u"en-US", u"English", u"United States", u"English"),
22
     (u"en-US", u"English", u"United States", u"English"),
32
-    (u"en-ZA", u"English", u"South Africa", u"English"),
33
     (u"es", u"Español", u"", u"Spanish"),
23
     (u"es", u"Español", u"", u"Spanish"),
34
-    (u"es-AD", u"Español", u"Andorra", u"Spanish"),
35
     (u"es-AR", u"Español", u"Argentina", u"Spanish"),
24
     (u"es-AR", u"Español", u"Argentina", u"Spanish"),
36
-    (u"es-CL", u"Español", u"Chile", u"Spanish"),
37
-    (u"es-CO", u"Español", u"Colombia", u"Spanish"),
38
     (u"es-ES", u"Español", u"España", u"Spanish"),
25
     (u"es-ES", u"Español", u"España", u"Spanish"),
39
     (u"es-MX", u"Español", u"México", u"Spanish"),
26
     (u"es-MX", u"Español", u"México", u"Spanish"),
40
-    (u"es-PE", u"Español", u"Perú", u"Spanish"),
41
-    (u"es-US", u"Español", u"Estados Unidos", u"Spanish"),
42
     (u"et-EE", u"Eesti", u"", u"Estonian"),
27
     (u"et-EE", u"Eesti", u"", u"Estonian"),
28
+    (u"fa-IR", u"فارسی", u"", u"Persian"),
43
     (u"fi-FI", u"Suomi", u"", u"Finnish"),
29
     (u"fi-FI", u"Suomi", u"", u"Finnish"),
44
     (u"fr", u"Français", u"", u"French"),
30
     (u"fr", u"Français", u"", u"French"),
45
-    (u"fr-AD", u"Français", u"Andorre", u"French"),
46
     (u"fr-BE", u"Français", u"Belgique", u"French"),
31
     (u"fr-BE", u"Français", u"Belgique", u"French"),
47
     (u"fr-CA", u"Français", u"Canada", u"French"),
32
     (u"fr-CA", u"Français", u"Canada", u"French"),
48
     (u"fr-CH", u"Français", u"Suisse", u"French"),
33
     (u"fr-CH", u"Français", u"Suisse", u"French"),
49
     (u"fr-FR", u"Français", u"France", u"French"),
34
     (u"fr-FR", u"Français", u"France", u"French"),
50
     (u"he-IL", u"עברית", u"", u"Hebrew"),
35
     (u"he-IL", u"עברית", u"", u"Hebrew"),
36
+    (u"hr-HR", u"Hrvatski", u"", u"Croatian"),
51
     (u"hu-HU", u"Magyar", u"", u"Hungarian"),
37
     (u"hu-HU", u"Magyar", u"", u"Hungarian"),
52
-    (u"it", u"Italiano", u"", u"Italian"),
53
-    (u"it-CH", u"Italiano", u"Svizzera", u"Italian"),
54
-    (u"it-IT", u"Italiano", u"Italia", u"Italian"),
38
+    (u"id-ID", u"Indonesia", u"", u"Indonesian"),
39
+    (u"is-IS", u"Íslenska", u"", u"Icelandic"),
40
+    (u"it-IT", u"Italiano", u"", u"Italian"),
55
     (u"ja-JP", u"日本語", u"", u"Japanese"),
41
     (u"ja-JP", u"日本語", u"", u"Japanese"),
56
     (u"ko-KR", u"한국어", u"", u"Korean"),
42
     (u"ko-KR", u"한국어", u"", u"Korean"),
43
+    (u"lt-LT", u"Lietuvių", u"", u"Lithuanian"),
44
+    (u"lv-LV", u"Latviešu", u"", u"Latvian"),
45
+    (u"ms-MY", u"Bahasa Melayu", u"", u"Malay"),
46
+    (u"nb-NO", u"Norsk Bokmål", u"", u"Norwegian Bokmål"),
57
     (u"nl", u"Nederlands", u"", u"Dutch"),
47
     (u"nl", u"Nederlands", u"", u"Dutch"),
58
     (u"nl-BE", u"Nederlands", u"België", u"Dutch"),
48
     (u"nl-BE", u"Nederlands", u"België", u"Dutch"),
59
     (u"nl-NL", u"Nederlands", u"Nederland", u"Dutch"),
49
     (u"nl-NL", u"Nederlands", u"Nederland", u"Dutch"),
60
-    (u"no-NO", u"Norsk", u"", u"Norwegian"),
61
     (u"pl-PL", u"Polski", u"", u"Polish"),
50
     (u"pl-PL", u"Polski", u"", u"Polish"),
62
     (u"pt", u"Português", u"", u"Portuguese"),
51
     (u"pt", u"Português", u"", u"Portuguese"),
63
-    (u"pt-AD", u"Português", u"Andorra", u"Portuguese"),
64
     (u"pt-BR", u"Português", u"Brasil", u"Portuguese"),
52
     (u"pt-BR", u"Português", u"Brasil", u"Portuguese"),
65
     (u"pt-PT", u"Português", u"Portugal", u"Portuguese"),
53
     (u"pt-PT", u"Português", u"Portugal", u"Portuguese"),
66
     (u"ro-RO", u"Română", u"", u"Romanian"),
54
     (u"ro-RO", u"Română", u"", u"Romanian"),
67
     (u"ru-RU", u"Русский", u"", u"Russian"),
55
     (u"ru-RU", u"Русский", u"", u"Russian"),
56
+    (u"sk-SK", u"Slovenčina", u"", u"Slovak"),
57
+    (u"sl-SI", u"Slovenščina", u"", u"Slovenian"),
58
+    (u"sr-RS", u"Српски", u"", u"Serbian"),
68
     (u"sv-SE", u"Svenska", u"", u"Swedish"),
59
     (u"sv-SE", u"Svenska", u"", u"Swedish"),
69
     (u"th-TH", u"ไทย", u"", u"Thai"),
60
     (u"th-TH", u"ไทย", u"", u"Thai"),
70
     (u"tr-TR", u"Türkçe", u"", u"Turkish"),
61
     (u"tr-TR", u"Türkçe", u"", u"Turkish"),
62
+    (u"uk-UA", u"Українська", u"", u"Ukrainian"),
63
+    (u"vi-VN", u"Tiếng Việt", u"", u"Vietnamese"),
71
     (u"zh", u"中文", u"", u"Chinese"),
64
     (u"zh", u"中文", u"", u"Chinese"),
72
     (u"zh-CN", u"中文", u"中国", u"Chinese"),
65
     (u"zh-CN", u"中文", u"中国", u"Chinese"),
73
-    (u"zh-HK", u"中文", u"香港", u"Chinese"),
74
-    (u"zh-TW", u"中文", u"台湾", u"Chinese")
66
+    (u"zh-TW", u"中文", u"台灣", u"Chinese")
75
 )
67
 )

+ 0
- 4
searx/preferences.py View File

115
                 pass
115
                 pass
116
             elif lang in self.choices:
116
             elif lang in self.choices:
117
                 data = lang
117
                 data = lang
118
-            elif data == 'nb-NO':
119
-                data = 'no-NO'
120
-            elif data == 'ar-XA':
121
-                data = 'ar-SA'
122
             else:
118
             else:
123
                 data = self.value
119
                 data = self.value
124
         self.value = data
120
         self.value = data

+ 7
- 3
searx/query.py View File

96
                                 break
96
                                 break
97
 
97
 
98
                 # user may set a valid, yet not selectable language
98
                 # user may set a valid, yet not selectable language
99
-                if not self.languages and VALID_LANGUAGE_CODE.match(lang):
100
-                    self.languages.append(lang)
101
-                    parse_next = True
99
+                if VALID_LANGUAGE_CODE.match(lang):
100
+                    lang_parts = lang.split('-')
101
+                    if len(lang_parts) > 1:
102
+                        lang = lang_parts[0].lower() + '-' + lang_parts[1].upper()
103
+                    if lang not in self.languages:
104
+                        self.languages.append(lang)
105
+                        parse_next = True
102
 
106
 
103
             # this forces an engine or category
107
             # this forces an engine or category
104
             if query_part[0] == '!' or query_part[0] == '?':
108
             if query_part[0] == '!' or query_part[0] == '?':

+ 2
- 2
searx/templates/oscar/preferences.html View File

187
                                     </td>
187
                                     </td>
188
                                     <th>{{ search_engine.name }}</th>
188
                                     <th>{{ search_engine.name }}</th>
189
 				    <td class="name">{{ shortcuts[search_engine.name] }}</td>
189
 				    <td class="name">{{ shortcuts[search_engine.name] }}</td>
190
-					<td>{{ support_toggle(current_language in search_engine.supported_languages or current_language.split('-')[0] in search_engine.supported_languages) }}</td>
190
+					<td>{{ support_toggle(stats[search_engine.name].supports_selected_language) }}</td>
191
 					<td>{{ support_toggle(search_engine.safesearch==True) }}</td>
191
 					<td>{{ support_toggle(search_engine.safesearch==True) }}</td>
192
 					<td>{{ support_toggle(search_engine.time_range_support==True) }}</td>
192
 					<td>{{ support_toggle(search_engine.time_range_support==True) }}</td>
193
 					<td class="{{ 'danger' if stats[search_engine.name]['warn_time'] else '' }}">{{ 'N/A' if stats[search_engine.name].time==None else stats[search_engine.name].time }}</td>
193
 					<td class="{{ 'danger' if stats[search_engine.name]['warn_time'] else '' }}">{{ 'N/A' if stats[search_engine.name].time==None else stats[search_engine.name].time }}</td>
197
 					<td class="{{ 'danger' if stats[search_engine.name]['warn_time'] else '' }}">{{ 'N/A' if stats[search_engine.name].time==None else stats[search_engine.name].time }}</td>
197
 					<td class="{{ 'danger' if stats[search_engine.name]['warn_time'] else '' }}">{{ 'N/A' if stats[search_engine.name].time==None else stats[search_engine.name].time }}</td>
198
 					<td>{{ support_toggle(search_engine.time_range_support==True) }}</td>
198
 					<td>{{ support_toggle(search_engine.time_range_support==True) }}</td>
199
 					<td>{{ support_toggle(search_engine.safesearch==True) }}</td>
199
 					<td>{{ support_toggle(search_engine.safesearch==True) }}</td>
200
-					<td>{{ support_toggle(current_language in search_engine.supported_languages or current_language.split('-')[0] in search_engine.supported_languages) }}</td>
200
+					<td>{{ support_toggle(stats[search_engine.name].supports_selected_language) }}</td>
201
 					<td>{{ shortcuts[search_engine.name] }}</td>
201
 					<td>{{ shortcuts[search_engine.name] }}</td>
202
                                     <th>{{ search_engine.name }}</th>
202
                                     <th>{{ search_engine.name }}</th>
203
                                     <td class="onoff-checkbox">
203
                                     <td class="onoff-checkbox">

+ 61
- 0
searx/utils.py View File

4
 import os
4
 import os
5
 import re
5
 import re
6
 
6
 
7
+from babel.core import get_global
7
 from babel.dates import format_date
8
 from babel.dates import format_date
8
 from codecs import getincrementalencoder
9
 from codecs import getincrementalencoder
9
 from imp import load_source
10
 from imp import load_source
12
 from random import choice
13
 from random import choice
13
 import sys
14
 import sys
14
 
15
 
16
+from searx import settings
15
 from searx.version import VERSION_STRING
17
 from searx.version import VERSION_STRING
16
 from searx.languages import language_codes
18
 from searx.languages import language_codes
17
 from searx import settings
19
 from searx import settings
322
         return False
324
         return False
323
 
325
 
324
 
326
 
327
+# auxiliary function to match lang_code in lang_list
328
+def _match_language(lang_code, lang_list=[], custom_aliases={}):
329
+    # replace language code with a custom alias if necessary
330
+    if lang_code in custom_aliases:
331
+        lang_code = custom_aliases[lang_code]
332
+
333
+    if lang_code in lang_list:
334
+        return lang_code
335
+
336
+    # try to get the most likely country for this language
337
+    subtags = get_global('likely_subtags').get(lang_code)
338
+    if subtags:
339
+        subtag_parts = subtags.split('_')
340
+        new_code = subtag_parts[0] + '-' + subtag_parts[-1]
341
+        if new_code in custom_aliases:
342
+            new_code = custom_aliases[new_code]
343
+        if new_code in lang_list:
344
+            return new_code
345
+
346
+    # try to get any supported country for this language
347
+    for lc in lang_list:
348
+        if lang_code == lc.split('-')[0]:
349
+            return lc
350
+
351
+    return None
352
+
353
+
354
+# get the language code from lang_list that best matches locale_code
355
+def match_language(locale_code, lang_list=[], custom_aliases={}, fallback='en-US'):
356
+    # try to get language from given locale_code
357
+    language = _match_language(locale_code, lang_list, custom_aliases)
358
+    if language:
359
+        return language
360
+
361
+    locale_parts = locale_code.split('-')
362
+    lang_code = locale_parts[0]
363
+
364
+    # try to get language using an equivalent country code
365
+    if len(locale_parts) > 1:
366
+        country_alias = get_global('territory_aliases').get(locale_parts[-1])
367
+        if country_alias:
368
+            language = _match_language(lang_code + '-' + country_alias[0], lang_list, custom_aliases)
369
+            if language:
370
+                return language
371
+
372
+    # try to get language using an equivalent language code
373
+    alias = get_global('language_aliases').get(lang_code)
374
+    if alias:
375
+        language = _match_language(alias, lang_list, custom_aliases)
376
+        if language:
377
+            return language
378
+
379
+    if lang_code != locale_code:
380
+        # try to get language from given language without giving the country
381
+        language = _match_language(lang_code, lang_list, custom_aliases)
382
+
383
+    return language or fallback
384
+
385
+
325
 def load_module(filename, module_dir):
386
 def load_module(filename, module_dir):
326
     modname = splitext(filename)[0]
387
     modname = splitext(filename)[0]
327
     if modname in sys.modules:
388
     if modname in sys.modules:

+ 15
- 7
searx/webapp.py View File

58
 from searx.utils import (
58
 from searx.utils import (
59
     UnicodeWriter, highlight_content, html_to_text, get_resources_directory,
59
     UnicodeWriter, highlight_content, html_to_text, get_resources_directory,
60
     get_static_files, get_result_templates, get_themes, gen_useragent,
60
     get_static_files, get_result_templates, get_themes, gen_useragent,
61
-    dict_subset, prettify_url
61
+    dict_subset, prettify_url, match_language
62
 )
62
 )
63
 from searx.version import VERSION_STRING
63
 from searx.version import VERSION_STRING
64
-from searx.languages import language_codes
64
+from searx.languages import language_codes as languages
65
 from searx.search import SearchWithPlugins, get_search_query_from_webapp
65
 from searx.search import SearchWithPlugins, get_search_query_from_webapp
66
 from searx.query import RawTextQuery
66
 from searx.query import RawTextQuery
67
 from searx.autocomplete import searx_bang, backends as autocomplete_backends
67
 from searx.autocomplete import searx_bang, backends as autocomplete_backends
68
 from searx.plugins import plugins
68
 from searx.plugins import plugins
69
 from searx.plugins.oa_doi_rewrite import get_doi_resolver
69
 from searx.plugins.oa_doi_rewrite import get_doi_resolver
70
-from searx.preferences import Preferences, ValidationException
70
+from searx.preferences import Preferences, ValidationException, LANGUAGE_CODES
71
 from searx.answerers import answerers
71
 from searx.answerers import answerers
72
 from searx.url_utils import urlencode, urlparse, urljoin
72
 from searx.url_utils import urlencode, urlparse, urljoin
73
 from searx.utils import new_hmac
73
 from searx.utils import new_hmac
133
 babel = Babel(app)
133
 babel = Babel(app)
134
 
134
 
135
 rtl_locales = ['ar', 'arc', 'bcc', 'bqi', 'ckb', 'dv', 'fa', 'glk', 'he',
135
 rtl_locales = ['ar', 'arc', 'bcc', 'bqi', 'ckb', 'dv', 'fa', 'glk', 'he',
136
-               'ku', 'mzn', 'pnb'', ''ps', 'sd', 'ug', 'ur', 'yi']
136
+               'ku', 'mzn', 'pnb', 'ps', 'sd', 'ug', 'ur', 'yi']
137
 
137
 
138
 # used when translating category names
138
 # used when translating category names
139
 _category_names = (gettext('files'),
139
 _category_names = (gettext('files'),
352
 
352
 
353
     kwargs['safesearch'] = str(request.preferences.get_value('safesearch'))
353
     kwargs['safesearch'] = str(request.preferences.get_value('safesearch'))
354
 
354
 
355
-    kwargs['language_codes'] = language_codes
355
+    kwargs['language_codes'] = languages
356
     if 'current_language' not in kwargs:
356
     if 'current_language' not in kwargs:
357
-        kwargs['current_language'] = request.preferences.get_value('language')
357
+        kwargs['current_language'] = match_language(request.preferences.get_value('language'),
358
+                                                    LANGUAGE_CODES,
359
+                                                    fallback=settings['search']['language'])
358
 
360
 
359
     # override url_for function in templates
361
     # override url_for function in templates
360
     kwargs['url_for'] = url_for_theme
362
     kwargs['url_for'] = url_for_theme
590
         infoboxes=result_container.infoboxes,
592
         infoboxes=result_container.infoboxes,
591
         paging=result_container.paging,
593
         paging=result_container.paging,
592
         unresponsive_engines=result_container.unresponsive_engines,
594
         unresponsive_engines=result_container.unresponsive_engines,
593
-        current_language=search_query.lang,
595
+        current_language=match_language(search_query.lang,
596
+                                        LANGUAGE_CODES,
597
+                                        fallback=settings['search']['language']),
594
         base_url=get_base_url(),
598
         base_url=get_base_url(),
595
         theme=get_current_theme_name(),
599
         theme=get_current_theme_name(),
596
         favicons=global_favicons[themes.index(get_current_theme_name())]
600
         favicons=global_favicons[themes.index(get_current_theme_name())]
687
                              'warn_time': False}
691
                              'warn_time': False}
688
             if e.timeout > settings['outgoing']['request_timeout']:
692
             if e.timeout > settings['outgoing']['request_timeout']:
689
                 stats[e.name]['warn_timeout'] = True
693
                 stats[e.name]['warn_timeout'] = True
694
+            if match_language(request.preferences.get_value('language'),
695
+                              getattr(e, 'supported_languages', []),
696
+                              getattr(e, 'language_aliases', {}), None):
697
+                stats[e.name]['supports_selected_language'] = True
690
 
698
 
691
     # get first element [0], the engine time,
699
     # get first element [0], the engine time,
692
     # and then the second element [1] : the time (the first one is the label)
700
     # and then the second element [1] : the time (the first one is the label)

+ 6
- 1
tests/unit/engines/test_archlinux.py View File

19
         query = 'test_query'
19
         query = 'test_query'
20
         dic = defaultdict(dict)
20
         dic = defaultdict(dict)
21
         dic['pageno'] = 1
21
         dic['pageno'] = 1
22
-        dic['language'] = 'en_US'
22
+        dic['language'] = 'en-US'
23
         params = archlinux.request(query, dic)
23
         params = archlinux.request(query, dic)
24
         self.assertTrue('url' in params)
24
         self.assertTrue('url' in params)
25
         self.assertTrue(query in params['url'])
25
         self.assertTrue(query in params['url'])
26
         self.assertTrue('wiki.archlinux.org' in params['url'])
26
         self.assertTrue('wiki.archlinux.org' in params['url'])
27
 
27
 
28
+        for lang, name in archlinux.main_langs:
29
+            dic['language'] = lang
30
+            params = archlinux.request(query, dic)
31
+            self.assertTrue(name in params['url'])
32
+
28
         for lang, domain in domains.items():
33
         for lang, domain in domains.items():
29
             dic['language'] = lang
34
             dic['language'] = lang
30
             params = archlinux.request(query, dic)
35
             params = archlinux.request(query, dic)

+ 1
- 0
tests/unit/engines/test_bing.py View File

7
 class TestBingEngine(SearxTestCase):
7
 class TestBingEngine(SearxTestCase):
8
 
8
 
9
     def test_request(self):
9
     def test_request(self):
10
+        bing.supported_languages = ['en', 'fr', 'zh-CHS', 'zh-CHT', 'pt-PT', 'pt-BR']
10
         query = u'test_query'
11
         query = u'test_query'
11
         dicto = defaultdict(dict)
12
         dicto = defaultdict(dict)
12
         dicto['pageno'] = 0
13
         dicto['pageno'] = 0

+ 0
- 1
tests/unit/engines/test_bing_images.py View File

9
 
9
 
10
     def test_request(self):
10
     def test_request(self):
11
         bing_images.supported_languages = ['fr-FR', 'en-US']
11
         bing_images.supported_languages = ['fr-FR', 'en-US']
12
-
13
         query = 'test_query'
12
         query = 'test_query'
14
         dicto = defaultdict(dict)
13
         dicto = defaultdict(dict)
15
         dicto['pageno'] = 1
14
         dicto['pageno'] = 1

+ 2
- 1
tests/unit/engines/test_bing_news.py View File

8
 class TestBingNewsEngine(SearxTestCase):
8
 class TestBingNewsEngine(SearxTestCase):
9
 
9
 
10
     def test_request(self):
10
     def test_request(self):
11
+        bing_news.supported_languages = ['en', 'fr']
11
         query = 'test_query'
12
         query = 'test_query'
12
         dicto = defaultdict(dict)
13
         dicto = defaultdict(dict)
13
         dicto['pageno'] = 1
14
         dicto['pageno'] = 1
14
-        dicto['language'] = 'fr_FR'
15
+        dicto['language'] = 'fr-FR'
15
         dicto['time_range'] = ''
16
         dicto['time_range'] = ''
16
         params = bing_news.request(query, dicto)
17
         params = bing_news.request(query, dicto)
17
         self.assertIn('url', params)
18
         self.assertIn('url', params)

+ 0
- 1
tests/unit/engines/test_bing_videos.py View File

9
 
9
 
10
     def test_request(self):
10
     def test_request(self):
11
         bing_videos.supported_languages = ['fr-FR', 'en-US']
11
         bing_videos.supported_languages = ['fr-FR', 'en-US']
12
-
13
         query = 'test_query'
12
         query = 'test_query'
14
         dicto = defaultdict(dict)
13
         dicto = defaultdict(dict)
15
         dicto['pageno'] = 1
14
         dicto['pageno'] = 1

+ 2
- 1
tests/unit/engines/test_dailymotion.py View File

8
 class TestDailymotionEngine(SearxTestCase):
8
 class TestDailymotionEngine(SearxTestCase):
9
 
9
 
10
     def test_request(self):
10
     def test_request(self):
11
+        dailymotion.supported_languages = ['en', 'fr']
11
         query = 'test_query'
12
         query = 'test_query'
12
         dicto = defaultdict(dict)
13
         dicto = defaultdict(dict)
13
         dicto['pageno'] = 0
14
         dicto['pageno'] = 0
14
-        dicto['language'] = 'fr_FR'
15
+        dicto['language'] = 'fr-FR'
15
         params = dailymotion.request(query, dicto)
16
         params = dailymotion.request(query, dicto)
16
         self.assertTrue('url' in params)
17
         self.assertTrue('url' in params)
17
         self.assertTrue(query in params['url'])
18
         self.assertTrue(query in params['url'])

+ 12
- 6
tests/unit/engines/test_duckduckgo.py View File

1
 # -*- coding: utf-8 -*-
1
 # -*- coding: utf-8 -*-
2
 from collections import defaultdict
2
 from collections import defaultdict
3
 import mock
3
 import mock
4
-from searx.engines import duckduckgo
4
+from searx.engines import load_engine, duckduckgo
5
 from searx.testing import SearxTestCase
5
 from searx.testing import SearxTestCase
6
 
6
 
7
 
7
 
8
 class TestDuckduckgoEngine(SearxTestCase):
8
 class TestDuckduckgoEngine(SearxTestCase):
9
 
9
 
10
     def test_request(self):
10
     def test_request(self):
11
+        duckduckgo = load_engine({'engine': 'duckduckgo', 'name': 'duckduckgo'})
12
+
11
         query = 'test_query'
13
         query = 'test_query'
12
         dicto = defaultdict(dict)
14
         dicto = defaultdict(dict)
13
         dicto['pageno'] = 1
15
         dicto['pageno'] = 1
14
-        dicto['language'] = 'de-CH'
15
         dicto['time_range'] = ''
16
         dicto['time_range'] = ''
17
+
18
+        dicto['language'] = 'de-CH'
16
         params = duckduckgo.request(query, dicto)
19
         params = duckduckgo.request(query, dicto)
17
         self.assertIn('url', params)
20
         self.assertIn('url', params)
18
         self.assertIn(query, params['url'])
21
         self.assertIn(query, params['url'])
20
         self.assertIn('ch-de', params['url'])
23
         self.assertIn('ch-de', params['url'])
21
         self.assertIn('s=0', params['url'])
24
         self.assertIn('s=0', params['url'])
22
 
25
 
23
-        # when ddg uses non standard code
26
+        # when ddg uses non standard codes
27
+        dicto['language'] = 'zh-HK'
28
+        params = duckduckgo.request(query, dicto)
29
+        self.assertIn('hk-tzh', params['url'])
30
+
24
         dicto['language'] = 'en-GB'
31
         dicto['language'] = 'en-GB'
25
         params = duckduckgo.request(query, dicto)
32
         params = duckduckgo.request(query, dicto)
26
         self.assertIn('uk-en', params['url'])
33
         self.assertIn('uk-en', params['url'])
27
 
34
 
28
         # no country given
35
         # no country given
29
-        duckduckgo.supported_languages = ['de-CH', 'en-US']
30
-        dicto['language'] = 'de'
36
+        dicto['language'] = 'en'
31
         params = duckduckgo.request(query, dicto)
37
         params = duckduckgo.request(query, dicto)
32
-        self.assertIn('ch-de', params['url'])
38
+        self.assertIn('us-en', params['url'])
33
 
39
 
34
     def test_no_url_in_request_year_time_range(self):
40
     def test_no_url_in_request_year_time_range(self):
35
         dicto = defaultdict(dict)
41
         dicto = defaultdict(dict)

+ 1
- 0
tests/unit/engines/test_duckduckgo_definitions.py View File

18
         self.assertEqual(result, 'Text in link')
18
         self.assertEqual(result, 'Text in link')
19
 
19
 
20
     def test_request(self):
20
     def test_request(self):
21
+        duckduckgo_definitions.supported_languages = ['en-US', 'es-ES']
21
         query = 'test_query'
22
         query = 'test_query'
22
         dicto = defaultdict(dict)
23
         dicto = defaultdict(dict)
23
         dicto['pageno'] = 1
24
         dicto['pageno'] = 1

+ 0
- 1
tests/unit/engines/test_duckduckgo_images.py View File

9
 
9
 
10
     def test_request(self):
10
     def test_request(self):
11
         duckduckgo_images.supported_languages = ['de-CH', 'en-US']
11
         duckduckgo_images.supported_languages = ['de-CH', 'en-US']
12
-
13
         query = 'test_query'
12
         query = 'test_query'
14
         dicto = defaultdict(dict)
13
         dicto = defaultdict(dict)
15
         dicto['is_test'] = True
14
         dicto['is_test'] = True

+ 7
- 0
tests/unit/engines/test_google.py View File

15
         return response
15
         return response
16
 
16
 
17
     def test_request(self):
17
     def test_request(self):
18
+        google.supported_languages = ['en', 'fr', 'zh-CN']
19
+
18
         query = 'test_query'
20
         query = 'test_query'
19
         dicto = defaultdict(dict)
21
         dicto = defaultdict(dict)
20
         dicto['pageno'] = 1
22
         dicto['pageno'] = 1
31
         self.assertIn('google.co', params['url'])
33
         self.assertIn('google.co', params['url'])
32
         self.assertIn('en', params['headers']['Accept-Language'])
34
         self.assertIn('en', params['headers']['Accept-Language'])
33
 
35
 
36
+        dicto['language'] = 'zh'
37
+        params = google.request(query, dicto)
38
+        self.assertIn('google.com', params['url'])
39
+        self.assertIn('zh-CN', params['headers']['Accept-Language'])
40
+
34
     def test_response(self):
41
     def test_response(self):
35
         self.assertRaises(AttributeError, google.response, None)
42
         self.assertRaises(AttributeError, google.response, None)
36
         self.assertRaises(AttributeError, google.response, [])
43
         self.assertRaises(AttributeError, google.response, [])

+ 1
- 0
tests/unit/engines/test_google_news.py View File

9
 class TestGoogleNewsEngine(SearxTestCase):
9
 class TestGoogleNewsEngine(SearxTestCase):
10
 
10
 
11
     def test_request(self):
11
     def test_request(self):
12
+        google_news.supported_languages = ['en-US', 'fr-FR']
12
         query = 'test_query'
13
         query = 'test_query'
13
         dicto = defaultdict(dict)
14
         dicto = defaultdict(dict)
14
         dicto['pageno'] = 1
15
         dicto['pageno'] = 1

+ 1
- 1
tests/unit/engines/test_qwant.py View File

7
 class TestQwantEngine(SearxTestCase):
7
 class TestQwantEngine(SearxTestCase):
8
 
8
 
9
     def test_request(self):
9
     def test_request(self):
10
+        qwant.supported_languages = ['en-US', 'fr-CA', 'fr-FR']
10
         query = 'test_query'
11
         query = 'test_query'
11
         dicto = defaultdict(dict)
12
         dicto = defaultdict(dict)
12
         dicto['pageno'] = 0
13
         dicto['pageno'] = 0
26
         self.assertIn('en_us', params['url'])
27
         self.assertIn('en_us', params['url'])
27
         self.assertIn('news', params['url'])
28
         self.assertIn('news', params['url'])
28
 
29
 
29
-        qwant.supported_languages = ['en', 'fr-FR', 'fr-CA']
30
         dicto['language'] = 'fr'
30
         dicto['language'] = 'fr'
31
         params = qwant.request(query, dicto)
31
         params = qwant.request(query, dicto)
32
         self.assertIn('fr_fr', params['url'])
32
         self.assertIn('fr_fr', params['url'])

+ 1
- 0
tests/unit/engines/test_swisscows.py View File

7
 class TestSwisscowsEngine(SearxTestCase):
7
 class TestSwisscowsEngine(SearxTestCase):
8
 
8
 
9
     def test_request(self):
9
     def test_request(self):
10
+        swisscows.supported_languages = ['de-AT', 'de-DE']
10
         query = 'test_query'
11
         query = 'test_query'
11
         dicto = defaultdict(dict)
12
         dicto = defaultdict(dict)
12
         dicto['pageno'] = 1
13
         dicto['pageno'] = 1

+ 1
- 0
tests/unit/engines/test_wikidata.py View File

9
 class TestWikidataEngine(SearxTestCase):
9
 class TestWikidataEngine(SearxTestCase):
10
 
10
 
11
     def test_request(self):
11
     def test_request(self):
12
+        wikidata.supported_languages = ['en', 'es']
12
         query = 'test_query'
13
         query = 'test_query'
13
         dicto = defaultdict(dict)
14
         dicto = defaultdict(dict)
14
         dicto['language'] = 'en-US'
15
         dicto['language'] = 'en-US'

+ 14
- 3
tests/unit/engines/test_yahoo.py View File

25
         self.assertEqual('https://this.is.the.url/', url)
25
         self.assertEqual('https://this.is.the.url/', url)
26
 
26
 
27
     def test_request(self):
27
     def test_request(self):
28
+        yahoo.supported_languages = ['en', 'fr', 'zh-CHT', 'zh-CHS']
28
         query = 'test_query'
29
         query = 'test_query'
29
         dicto = defaultdict(dict)
30
         dicto = defaultdict(dict)
30
         dicto['pageno'] = 1
31
         dicto['pageno'] = 1
31
         dicto['time_range'] = ''
32
         dicto['time_range'] = ''
32
-        dicto['language'] = 'fr_FR'
33
+        dicto['language'] = 'fr-FR'
33
         params = yahoo.request(query, dicto)
34
         params = yahoo.request(query, dicto)
34
         self.assertIn('url', params)
35
         self.assertIn('url', params)
35
         self.assertIn(query, params['url'])
36
         self.assertIn(query, params['url'])
39
         self.assertIn('sB', params['cookies'])
40
         self.assertIn('sB', params['cookies'])
40
         self.assertIn('fr', params['cookies']['sB'])
41
         self.assertIn('fr', params['cookies']['sB'])
41
 
42
 
43
+        dicto['language'] = 'zh'
44
+        params = yahoo.request(query, dicto)
45
+        self.assertIn('zh_chs', params['url'])
46
+        self.assertIn('zh_chs', params['cookies']['sB'])
47
+
48
+        dicto['language'] = 'zh-TW'
49
+        params = yahoo.request(query, dicto)
50
+        self.assertIn('zh_cht', params['url'])
51
+        self.assertIn('zh_cht', params['cookies']['sB'])
52
+
42
     def test_no_url_in_request_year_time_range(self):
53
     def test_no_url_in_request_year_time_range(self):
43
         dicto = defaultdict(dict)
54
         dicto = defaultdict(dict)
44
         query = 'test_query'
55
         query = 'test_query'
168
         self.assertEqual(type(languages), list)
179
         self.assertEqual(type(languages), list)
169
         self.assertEqual(len(languages), 3)
180
         self.assertEqual(len(languages), 3)
170
         self.assertIn('ar', languages)
181
         self.assertIn('ar', languages)
171
-        self.assertIn('zh-chs', languages)
172
-        self.assertIn('zh-cht', languages)
182
+        self.assertIn('zh-CHS', languages)
183
+        self.assertIn('zh-CHT', languages)

+ 2
- 1
tests/unit/engines/test_yahoo_news.py View File

9
 class TestYahooNewsEngine(SearxTestCase):
9
 class TestYahooNewsEngine(SearxTestCase):
10
 
10
 
11
     def test_request(self):
11
     def test_request(self):
12
+        yahoo_news.supported_languages = ['en', 'fr']
12
         query = 'test_query'
13
         query = 'test_query'
13
         dicto = defaultdict(dict)
14
         dicto = defaultdict(dict)
14
         dicto['pageno'] = 1
15
         dicto['pageno'] = 1
15
-        dicto['language'] = 'fr_FR'
16
+        dicto['language'] = 'fr-FR'
16
         params = yahoo_news.request(query, dicto)
17
         params = yahoo_news.request(query, dicto)
17
         self.assertIn('url', params)
18
         self.assertIn('url', params)
18
         self.assertIn(query, params['url'])
19
         self.assertIn(query, params['url'])

+ 25
- 0
tests/unit/test_utils.py View File

65
         for test_url, expected in data:
65
         for test_url, expected in data:
66
             self.assertEqual(utils.prettify_url(test_url, max_length=32), expected)
66
             self.assertEqual(utils.prettify_url(test_url, max_length=32), expected)
67
 
67
 
68
+    def test_match_language(self):
69
+        self.assertEqual(utils.match_language('es', ['es']), 'es')
70
+        self.assertEqual(utils.match_language('es', [], fallback='fallback'), 'fallback')
71
+        self.assertEqual(utils.match_language('ja', ['jp'], {'ja': 'jp'}), 'jp')
72
+
73
+        aliases = {'en-GB': 'en-UK', 'he': 'iw'}
74
+
75
+        # guess country
76
+        self.assertEqual(utils.match_language('de-DE', ['de']), 'de')
77
+        self.assertEqual(utils.match_language('de', ['de-DE']), 'de-DE')
78
+        self.assertEqual(utils.match_language('es-CO', ['es-AR', 'es-ES', 'es-MX']), 'es-ES')
79
+        self.assertEqual(utils.match_language('es-CO', ['es-MX']), 'es-MX')
80
+        self.assertEqual(utils.match_language('en-UK', ['en-AU', 'en-GB', 'en-US']), 'en-GB')
81
+        self.assertEqual(utils.match_language('en-GB', ['en-AU', 'en-UK', 'en-US'], aliases), 'en-UK')
82
+
83
+        # language aliases
84
+        self.assertEqual(utils.match_language('iw', ['he']), 'he')
85
+        self.assertEqual(utils.match_language('he', ['iw'], aliases), 'iw')
86
+        self.assertEqual(utils.match_language('iw-IL', ['he']), 'he')
87
+        self.assertEqual(utils.match_language('he-IL', ['iw'], aliases), 'iw')
88
+        self.assertEqual(utils.match_language('iw', ['he-IL']), 'he-IL')
89
+        self.assertEqual(utils.match_language('he', ['iw-IL'], aliases), 'iw-IL')
90
+        self.assertEqual(utils.match_language('iw-IL', ['he-IL']), 'he-IL')
91
+        self.assertEqual(utils.match_language('he-IL', ['iw-IL'], aliases), 'iw-IL')
92
+
68
 
93
 
69
 class TestHTMLTextExtractor(SearxTestCase):
94
 class TestHTMLTextExtractor(SearxTestCase):
70
 
95
 

+ 131
- 131
utils/fetch_languages.py View File

2
 
2
 
3
 # This script generates languages.py from intersecting each engine's supported languages.
3
 # This script generates languages.py from intersecting each engine's supported languages.
4
 #
4
 #
5
-# The country names are obtained from http://api.geonames.org which requires registering as a user.
6
-#
7
 # Output files (engines_languages.json and languages.py)
5
 # Output files (engines_languages.json and languages.py)
8
 # are written in current directory to avoid overwriting in case something goes wrong.
6
 # are written in current directory to avoid overwriting in case something goes wrong.
9
 
7
 
10
-from requests import get
11
-from lxml.html import fromstring
12
-from json import loads, dump
8
+from json import dump
13
 import io
9
 import io
14
 from sys import path
10
 from sys import path
11
+from babel import Locale, UnknownLocaleError
12
+from babel.languages import get_global
13
+
15
 path.append('../searx')  # noqa
14
 path.append('../searx')  # noqa
16
 from searx import settings
15
 from searx import settings
17
-from searx.url_utils import urlencode
18
 from searx.engines import initialize_engines, engines
16
 from searx.engines import initialize_engines, engines
19
 
17
 
20
-# Geonames API for country names.
21
-geonames_user = ''  # ADD USER NAME HERE
22
-country_names_url = 'http://api.geonames.org/countryInfoJSON?{parameters}'
23
-
24
 # Output files.
18
 # Output files.
25
 engines_languages_file = 'engines_languages.json'
19
 engines_languages_file = 'engines_languages.json'
26
 languages_file = 'languages.py'
20
 languages_file = 'languages.py'
27
 
21
 
28
-engines_languages = {}
29
-
30
-
31
-# To filter out invalid codes and dialects.
32
-def valid_code(lang_code):
33
-    # filter invalid codes
34
-    # sl-SL is technically not invalid, but still a mistake
35
-    invalid_codes = ['sl-SL', 'wt-WT', 'jw']
36
-    invalid_countries = ['UK', 'XA', 'XL']
37
-    if lang_code[:2] == 'xx'\
38
-       or lang_code in invalid_codes\
39
-       or lang_code[-2:] in invalid_countries\
40
-       or is_dialect(lang_code):
41
-        return False
42
-
43
-    return True
44
-
45
-
46
-# Language codes with any additional tags other than language and country.
47
-def is_dialect(lang_code):
48
-    lang_code = lang_code.split('-')
49
-    if len(lang_code) > 2 or len(lang_code[0]) > 3:
50
-        return True
51
-    if len(lang_code) == 2 and len(lang_code[1]) > 2:
52
-        return True
53
-
54
-    return False
55
-
56
-
57
-# Get country name in specified language.
58
-def get_country_name(locale):
59
-    if geonames_user is '':
60
-        return ''
61
-
62
-    locale = locale.split('-')
63
-    if len(locale) != 2:
64
-        return ''
65
-
66
-    url = country_names_url.format(parameters=urlencode({'lang': locale[0],
67
-                                                         'country': locale[1],
68
-                                                         'username': geonames_user}))
69
-    response = get(url)
70
-    json = loads(response.text)
71
-    content = json.get('geonames', None)
72
-    if content is None or len(content) != 1:
73
-        print("No country name found for " + locale[0] + "-" + locale[1])
74
-        return ''
75
-
76
-    return content[0].get('countryName', '')
77
-
78
 
22
 
79
 # Fetches supported languages for each engine and writes them to a JSON file.
23
 # Fetches supported languages for each engine and writes them to a JSON file.
80
 def fetch_supported_languages():
24
 def fetch_supported_languages():
81
-    initialize_engines(settings['engines'])
25
+    engines_languages = {}
82
     for engine_name in engines:
26
     for engine_name in engines:
83
         if hasattr(engines[engine_name], 'fetch_supported_languages'):
27
         if hasattr(engines[engine_name], 'fetch_supported_languages'):
84
             try:
28
             try:
90
     with io.open(engines_languages_file, "w", encoding="utf-8") as f:
34
     with io.open(engines_languages_file, "w", encoding="utf-8") as f:
91
         dump(engines_languages, f, ensure_ascii=False)
35
         dump(engines_languages, f, ensure_ascii=False)
92
 
36
 
37
+    return engines_languages
38
+
39
+
40
+# Get babel Locale object from lang_code if possible.
41
+def get_locale(lang_code):
42
+    try:
43
+        locale = Locale.parse(lang_code, sep='-')
44
+        return locale
45
+    except (UnknownLocaleError, ValueError):
46
+        return None
47
+
48
+
49
+# Append engine_name to list of engines that support locale.
50
+def add_engine_counter(lang_code, engine_name, languages):
51
+    if lang_code in languages:
52
+        if 'counter' not in languages[lang_code]:
53
+            languages[lang_code]['counter'] = [engine_name]
54
+        elif engine_name not in languages[lang_code]['counter']:
55
+            languages[lang_code]['counter'].append(engine_name)
93
 
56
 
94
-# Join all language lists.
95
-# Iterate all languages supported by each engine.
96
-def join_language_lists():
97
-    global languages
98
-    # include wikipedia first for more accurate language names
99
-    languages = {code: lang for code, lang
100
-                 in engines_languages['wikipedia'].items()
101
-                 if valid_code(code)}
102
 
57
 
58
+# Join all language lists.
59
+# TODO: Add language names from engine's language list if name not known by babel.
60
+def join_language_lists(engines_languages):
61
+    language_list = {}
103
     for engine_name in engines_languages:
62
     for engine_name in engines_languages:
104
-        for locale in engines_languages[engine_name]:
105
-            if valid_code(locale):
106
-                # if language is not on list or if it has no name yet
107
-                if locale not in languages or not languages[locale].get('name'):
108
-                    if isinstance(engines_languages[engine_name], dict):
109
-                        languages[locale] = engines_languages[engine_name][locale]
110
-                    else:
111
-                        languages[locale] = {}
112
-
113
-            # add to counter of engines that support given language
114
-            lang = locale.split('-')[0]
115
-            if lang in languages:
116
-                if 'counter' not in languages[lang]:
117
-                    languages[lang]['counter'] = [engine_name]
118
-                elif engine_name not in languages[lang]['counter']:
119
-                    languages[lang]['counter'].append(engine_name)
120
-
121
-    # filter list to include only languages supported by most engines
122
-    min_supported_engines = int(0.70 * len(engines_languages))
123
-    languages = {code: lang for code, lang
124
-                 in languages.items()
125
-                 if len(lang.get('counter', [])) >= min_supported_engines or
126
-                 len(languages.get(code.split('-')[0], {}).get('counter', [])) >= min_supported_engines}
127
-
128
-    # get locales that have no name or country yet
129
-    for locale in languages.keys():
130
-        # try to get language names
131
-        if not languages[locale].get('name'):
132
-            name = languages.get(locale.split('-')[0], {}).get('name', None)
133
-            if name:
134
-                languages[locale]['name'] = name
135
-            else:
136
-                # filter out locales with no name
137
-                del languages[locale]
138
-                continue
139
-
140
-        # try to get language name in english
141
-        if not languages[locale].get('english_name'):
142
-            languages[locale]['english_name'] = languages.get(locale.split('-')[0], {}).get('english_name', '')
143
-
144
-        # try to get country name
145
-        if locale.find('-') > 0 and not languages[locale].get('country'):
146
-            languages[locale]['country'] = get_country_name(locale) or ''
147
-
148
-
149
-# Remove countryless language if language is featured in only one country.
150
-def filter_single_country_languages():
151
-    prev_lang = None
152
-    prev_code = None
153
-    for code in sorted(languages):
154
-        lang = code.split('-')[0]
155
-        if lang == prev_lang:
63
+        for lang_code in engines_languages[engine_name]:
64
+
65
+            # apply custom fixes if necessary
66
+            if lang_code in getattr(engines[engine_name], 'language_aliases', {}).values():
67
+                lang_code = next(lc for lc, alias in engines[engine_name].language_aliases.items()
68
+                                 if lang_code == alias)
69
+
70
+            locale = get_locale(lang_code)
71
+
72
+            # ensure that lang_code uses standard language and country codes
73
+            if locale and locale.territory:
74
+                lang_code = locale.language + '-' + locale.territory
75
+
76
+            # add locale if it's not in list
77
+            if lang_code not in language_list:
78
+                if locale:
79
+                    language_list[lang_code] = {'name': locale.get_language_name().title(),
80
+                                                'english_name': locale.english_name,
81
+                                                'country': locale.get_territory_name() or ''}
82
+
83
+                    # also add language without country
84
+                    if locale.language not in language_list:
85
+                        language_list[locale.language] = {'name': locale.get_language_name().title(),
86
+                                                          'english_name': locale.english_name}
87
+                else:
88
+                    language_list[lang_code] = {}
89
+
90
+            # count engine for both language_country combination and language alone
91
+            add_engine_counter(lang_code, engine_name, language_list)
92
+            add_engine_counter(lang_code.split('-')[0], engine_name, language_list)
93
+
94
+    return language_list
95
+
96
+
97
+# Filter language list so it only includes the most supported languages and countries.
98
+def filter_language_list(all_languages):
99
+    min_supported_engines = 10
100
+    main_engines = [engine_name for engine_name in engines.keys()
101
+                    if 'general' in engines[engine_name].categories and
102
+                       engines[engine_name].supported_languages and
103
+                       not engines[engine_name].disabled]
104
+
105
+    # filter list to include only languages supported by most engines or all default general engines
106
+    filtered_languages = {code: lang for code, lang
107
+                          in all_languages.items()
108
+                          if (len(lang.get('counter', [])) >= min_supported_engines or
109
+                              all(main_engine in lang.get('counter', [])
110
+                                  for main_engine in main_engines))}
111
+
112
+    return filtered_languages
113
+
114
+
115
+# Add country codes to languages without one and filter out language codes.
116
+def assign_country_codes(filtered_languages, all_languages):
117
+    sorted_languages = sorted(all_languages,
118
+                              key=lambda lang: len(all_languages[lang].get('counter', [])),
119
+                              reverse=True)
120
+    previous_lang = None
121
+    previous_code = None
122
+    countries = 0
123
+    for current_code in sorted(filtered_languages):
124
+        current_lang = current_code.split('-')[0]
125
+
126
+        # count country codes per language
127
+        if current_lang == previous_lang:
156
             countries += 1
128
             countries += 1
129
+
157
         else:
130
         else:
158
-            if prev_lang is not None and countries == 1:
159
-                del languages[prev_lang]
160
-                languages[prev_code]['country'] = ''
131
+            if previous_lang is not None:
132
+                # if language has no country-specific code at all
133
+                if countries == 0:
134
+                    # try to get country code with most supported engines
135
+                    for l in sorted_languages:
136
+                        l_parts = l.split('-')
137
+                        if len(l_parts) == 2 and l_parts[0] == previous_lang:
138
+                            filtered_languages[l] = all_languages[l]
139
+                            filtered_languages[l]['country'] = ''
140
+                            countries = 1
141
+                            break
142
+
143
+                    if countries == 0:
144
+                        # get most likely country code from babel
145
+                        subtags = get_global('likely_subtags').get(previous_lang)
146
+                        if subtags:
147
+                            subtag_parts = subtags.split('_')
148
+                            new_code = subtag_parts[0] + '-' + subtag_parts[-1]
149
+                            filtered_languages[new_code] = all_languages[previous_lang]
150
+                            countries = 1
151
+
152
+                if countries == 1:
153
+                    # remove countryless version of language if there's only one country
154
+                    del filtered_languages[previous_lang]
155
+                    if previous_code in filtered_languages:
156
+                        filtered_languages[previous_code]['country'] = ''
157
+
161
             countries = 0
158
             countries = 0
162
-            prev_lang = lang
163
-        prev_code = code
159
+            previous_lang = current_lang
160
+
161
+        previous_code = current_code
164
 
162
 
165
 
163
 
166
 # Write languages.py.
164
 # Write languages.py.
167
-def write_languages_file():
165
+def write_languages_file(languages):
168
     new_file = open(languages_file, 'wb')
166
     new_file = open(languages_file, 'wb')
169
     file_content = '# -*- coding: utf-8 -*-\n'\
167
     file_content = '# -*- coding: utf-8 -*-\n'\
170
                    + '# list of language codes\n'\
168
                    + '# list of language codes\n'\
183
 
181
 
184
 
182
 
185
 if __name__ == "__main__":
183
 if __name__ == "__main__":
186
-    fetch_supported_languages()
187
-    join_language_lists()
188
-    filter_single_country_languages()
189
-    write_languages_file()
184
+    initialize_engines(settings['engines'])
185
+    engines_languages = fetch_supported_languages()
186
+    all_languages = join_language_lists(engines_languages)
187
+    filtered_languages = filter_language_list(all_languages)
188
+    assign_country_codes(filtered_languages, all_languages)
189
+    write_languages_file(filtered_languages)