
refactor engine's search language handling

Add a match_language function in utils to match any user-given
language code against the list of an engine's supported languages.

Also add a language_aliases dict on each engine to translate
standard language codes into the custom codes used by the engine.
Marc Abonce Seguin, 6 years ago
parent
commit 772c048d01
42 changed files with 274 additions and 170 deletions
 1. +1 -1    searx/data/engines_languages.json
 2. +20 -1   searx/engines/__init__.py
 3. +2 -2    searx/engines/archlinux.py
 4. +4 -2    searx/engines/bing.py
 5. +2 -21   searx/engines/bing_images.py
 6. +3 -3    searx/engines/bing_news.py
 7. +4 -3    searx/engines/bing_videos.py
 8. +2 -1    searx/engines/dailymotion.py
 9. +18 -29  searx/engines/duckduckgo.py
10. +4 -3    searx/engines/duckduckgo_definitions.py
11. +4 -1    searx/engines/duckduckgo_images.py
12. +14 -15  searx/engines/google.py
13. +4 -2    searx/engines/google_news.py
14. +3 -10   searx/engines/qwant.py
15. +3 -5    searx/engines/swisscows.py
16. +3 -2    searx/engines/wikidata.py
17. +2 -7    searx/engines/wikipedia.py
18. +12 -12  searx/engines/yahoo.py
19. +5 -2    searx/engines/yahoo_news.py
20. +0 -4    searx/preferences.py
21. +7 -3    searx/query.py
22. +2 -2    searx/templates/oscar/preferences.html
23. +61 -0   searx/utils.py
24. +15 -7   searx/webapp.py
25. +6 -1    tests/unit/engines/test_archlinux.py
26. +1 -0    tests/unit/engines/test_bing.py
27. +0 -1    tests/unit/engines/test_bing_images.py
28. +2 -1    tests/unit/engines/test_bing_news.py
29. +0 -1    tests/unit/engines/test_bing_videos.py
30. +2 -1    tests/unit/engines/test_dailymotion.py
31. +12 -6   tests/unit/engines/test_duckduckgo.py
32. +1 -0    tests/unit/engines/test_duckduckgo_definitions.py
33. +0 -1    tests/unit/engines/test_duckduckgo_images.py
34. +7 -0    tests/unit/engines/test_google.py
35. +1 -0    tests/unit/engines/test_google_news.py
36. +1 -1    tests/unit/engines/test_qwant.py
37. +1 -0    tests/unit/engines/test_swisscows.py
38. +1 -0    tests/unit/engines/test_wikidata.py
39. +14 -3   tests/unit/engines/test_yahoo.py
40. +2 -1    tests/unit/engines/test_yahoo_news.py
41. +25 -0   tests/unit/test_utils.py
42. +3 -15   utils/fetch_languages.py
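
How the new helper behaves can be sketched with a few calls. This is only an
illustration: the signature is the one added to searx/utils.py in this commit,
and the expected results are copied from the assertions added in
tests/unit/test_utils.py below.

    from searx.utils import match_language

    # exact match: the code is returned unchanged
    match_language('es', ['es'])                          # 'es'

    # nothing matches: the fallback is returned (default is 'en-US')
    match_language('es', [], fallback='fallback')         # 'fallback'

    # a custom alias maps a standard code to an engine-specific one
    match_language('ja', ['jp'], {'ja': 'jp'})            # 'jp'

    # a likely country is guessed when the exact locale is not supported
    match_language('de', ['de-DE'])                       # 'de-DE'
    match_language('es-CO', ['es-AR', 'es-ES', 'es-MX'])  # 'es-ES'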

+1 -1  searx/data/engines_languages.json
(diff not shown: too many changes)


+20 -1  searx/engines/__init__.py

@@ -20,13 +20,14 @@ import sys
 import threading
 from os.path import realpath, dirname
 from io import open
+from babel.localedata import locale_identifiers
 from flask_babel import gettext
 from operator import itemgetter
 from json import loads
 from requests import get
 from searx import settings
 from searx import logger
-from searx.utils import load_module
+from searx.utils import load_module, match_language


 logger = logger.getChild('engines')
@@ -38,6 +39,8 @@ engines = {}
 categories = {'general': []}

 languages = loads(open(engine_dir + '/../data/engines_languages.json', 'r', encoding='utf-8').read())
+babel_langs = [lang_parts[0] + '-' + lang_parts[-1] if len(lang_parts) > 1 else lang_parts[0]
+               for lang_parts in (lang_code.split('_') for lang_code in locale_identifiers())]

 engine_shortcuts = {}
 engine_default_args = {'paging': False,
@@ -97,6 +100,22 @@ def load_engine(engine_data):
     if engine_data['name'] in languages:
         setattr(engine, 'supported_languages', languages[engine_data['name']])

+    # find custom aliases for non standard language codes
+    if hasattr(engine, 'supported_languages'):
+        if hasattr(engine, 'language_aliases'):
+            language_aliases = getattr(engine, 'language_aliases')
+        else:
+            language_aliases = {}
+
+        for engine_lang in getattr(engine, 'supported_languages'):
+            iso_lang = match_language(engine_lang, babel_langs, fallback=None)
+            if iso_lang and iso_lang != engine_lang and not engine_lang.startswith(iso_lang) and \
+               iso_lang not in getattr(engine, 'supported_languages'):
+                language_aliases[iso_lang] = engine_lang
+
+        if language_aliases:
+            setattr(engine, 'language_aliases', language_aliases)
+
     # assign language fetching method if auxiliary method exists
     if hasattr(engine, '_fetch_supported_languages'):
         setattr(engine, 'fetch_supported_languages',

+2 -2  searx/engines/archlinux.py

@@ -99,13 +99,13 @@ supported_languages = dict(lang_urls, **main_langs)
99 99
 
100 100
 # do search-request
101 101
 def request(query, params):
102
-    # translate the locale (e.g. 'en_US') to language code ('en')
102
+    # translate the locale (e.g. 'en-US') to language code ('en')
103 103
     language = locale_to_lang_code(params['language'])
104 104
 
105 105
     # if our language is hosted on the main site, we need to add its name
106 106
     # to the query in order to narrow the results to that language
107 107
     if language in main_langs:
108
-        query += '(' + main_langs[language] + ')'
108
+        query += b' (' + main_langs[language] + b')'
109 109
 
110 110
     # prepare the request parameters
111 111
     query = urlencode({'search': query})

+4 -2  searx/engines/bing.py

@@ -16,12 +16,14 @@
16 16
 from lxml import html
17 17
 from searx.engines.xpath import extract_text
18 18
 from searx.url_utils import urlencode
19
+from searx.utils import match_language
19 20
 
20 21
 # engine dependent config
21 22
 categories = ['general']
22 23
 paging = True
23 24
 language_support = True
24 25
 supported_languages_url = 'https://www.bing.com/account/general'
26
+language_aliases = {'zh-CN': 'zh-CHS', 'zh-TW': 'zh-CHT', 'zh-HK': 'zh-CHT'}
25 27
 
26 28
 # search-url
27 29
 base_url = 'https://www.bing.com/'
@@ -32,9 +34,9 @@ search_string = 'search?{query}&first={offset}'
32 34
 def request(query, params):
33 35
     offset = (params['pageno'] - 1) * 10 + 1
34 36
 
35
-    lang = params['language'].split('-')[0].upper()
37
+    lang = match_language(params['language'], supported_languages, language_aliases)
36 38
 
37
-    query = u'language:{} {}'.format(lang, query.decode('utf-8')).encode('utf-8')
39
+    query = u'language:{} {}'.format(lang.split('-')[0].upper(), query.decode('utf-8')).encode('utf-8')
38 40
 
39 41
     search_path = search_string.format(
40 42
         query=urlencode({'q': query}),

+2 -21  searx/engines/bing_images.py

@@ -19,6 +19,7 @@ from lxml import html
19 19
 from json import loads
20 20
 import re
21 21
 from searx.url_utils import urlencode
22
+from searx.utils import match_language
22 23
 
23 24
 # engine dependent config
24 25
 categories = ['images']
@@ -46,26 +47,6 @@ safesearch_types = {2: 'STRICT',
46 47
 _quote_keys_regex = re.compile('({|,)([a-z][a-z0-9]*):(")', re.I | re.U)
47 48
 
48 49
 
49
-# get supported region code
50
-def get_region_code(lang, lang_list=None):
51
-    region = None
52
-    if lang in (lang_list or supported_languages):
53
-        region = lang
54
-    elif lang.startswith('no'):
55
-        region = 'nb-NO'
56
-    else:
57
-        # try to get a supported country code with language
58
-        lang = lang.split('-')[0]
59
-        for lc in (lang_list or supported_languages):
60
-            if lang == lc.split('-')[0]:
61
-                region = lc
62
-                break
63
-    if region:
64
-        return region.lower()
65
-    else:
66
-        return 'en-us'
67
-
68
-
69 50
 # do search-request
70 51
 def request(query, params):
71 52
     offset = (params['pageno'] - 1) * 10 + 1
@@ -74,7 +55,7 @@ def request(query, params):
74 55
         query=urlencode({'q': query}),
75 56
         offset=offset)
76 57
 
77
-    language = get_region_code(params['language'])
58
+    language = match_language(params['language'], supported_languages).lower()
78 59
 
79 60
     params['cookies']['SRCHHPGUSR'] = \
80 61
         'ADLT=' + safesearch_types.get(params['safesearch'], 'DEMOTE')

+3 -3  searx/engines/bing_news.py

@@ -14,8 +14,8 @@
14 14
 from datetime import datetime
15 15
 from dateutil import parser
16 16
 from lxml import etree
17
-from searx.utils import list_get
18
-from searx.engines.bing import _fetch_supported_languages, supported_languages_url
17
+from searx.utils import list_get, match_language
18
+from searx.engines.bing import _fetch_supported_languages, supported_languages_url, language_aliases
19 19
 from searx.url_utils import urlencode, urlparse, parse_qsl
20 20
 
21 21
 # engine dependent config
@@ -71,7 +71,7 @@ def request(query, params):
71 71
 
72 72
     offset = (params['pageno'] - 1) * 10 + 1
73 73
 
74
-    language = params['language']
74
+    language = match_language(params['language'], supported_languages, language_aliases)
75 75
 
76 76
     params['url'] = _get_url(query, language, offset, params['time_range'])
77 77
 

+4 -3  searx/engines/bing_videos.py

@@ -12,9 +12,10 @@
12 12
 
13 13
 from json import loads
14 14
 from lxml import html
15
-from searx.engines.bing_images import _fetch_supported_languages, supported_languages_url, get_region_code
15
+from searx.engines.bing_images import _fetch_supported_languages, supported_languages_url
16 16
 from searx.engines.xpath import extract_text
17 17
 from searx.url_utils import urlencode
18
+from searx.utils import match_language
18 19
 
19 20
 
20 21
 categories = ['videos']
@@ -47,8 +48,8 @@ def request(query, params):
47 48
         'ADLT=' + safesearch_types.get(params['safesearch'], 'DEMOTE')
48 49
 
49 50
     # language cookie
50
-    region = get_region_code(params['language'], lang_list=supported_languages)
51
-    params['cookies']['_EDGE_S'] = 'mkt=' + region + '&F=1'
51
+    language = match_language(params['language'], supported_languages).lower()
52
+    params['cookies']['_EDGE_S'] = 'mkt=' + language + '&F=1'
52 53
 
53 54
     # query and paging
54 55
     params['url'] = search_url.format(query=urlencode({'q': query}),

+2 -1  searx/engines/dailymotion.py

@@ -15,6 +15,7 @@
15 15
 from json import loads
16 16
 from datetime import datetime
17 17
 from searx.url_utils import urlencode
18
+from searx.utils import match_language
18 19
 
19 20
 # engine dependent config
20 21
 categories = ['videos']
@@ -32,7 +33,7 @@ supported_languages_url = 'https://api.dailymotion.com/languages'
32 33
 
33 34
 # do search-request
34 35
 def request(query, params):
35
-    locale = params['language']
36
+    locale = match_language(params['language'], supported_languages)
36 37
 
37 38
     params['url'] = search_url.format(
38 39
         query=urlencode({'search': query, 'localization': locale}),

+18 -29  searx/engines/duckduckgo.py

@@ -18,6 +18,7 @@ from json import loads
18 18
 from searx.engines.xpath import extract_text
19 19
 from searx.poolrequests import get
20 20
 from searx.url_utils import urlencode
21
+from searx.utils import match_language
21 22
 
22 23
 # engine dependent config
23 24
 categories = ['general']
@@ -26,6 +27,16 @@ language_support = True
26 27
 supported_languages_url = 'https://duckduckgo.com/util/u172.js'
27 28
 time_range_support = True
28 29
 
30
+language_aliases = {
31
+    'ar-SA': 'ar-XA',
32
+    'es-419': 'es-XL',
33
+    'ja': 'jp-JP',
34
+    'ko': 'kr-KR',
35
+    'sl-SI': 'sl-SL',
36
+    'zh-TW': 'tzh-TW',
37
+    'zh-HK': 'tzh-HK'
38
+}
39
+
29 40
 # search-url
30 41
 url = 'https://duckduckgo.com/html?{query}&s={offset}&dc={dc_param}'
31 42
 time_range_url = '&df={range}'
@@ -42,34 +53,12 @@ content_xpath = './/a[@class="result__snippet"]'
42 53
 
43 54
 
44 55
 # match query's language to a region code that duckduckgo will accept
45
-def get_region_code(lang, lang_list=None):
46
-    # custom fixes for languages
47
-    if lang[:2] == 'ja':
48
-        region_code = 'jp-jp'
49
-    elif lang[:2] == 'sl':
50
-        region_code = 'sl-sl'
51
-    elif lang == 'zh-TW':
52
-        region_code = 'tw-tzh'
53
-    elif lang == 'zh-HK':
54
-        region_code = 'hk-tzh'
55
-    elif lang[-2:] == 'SA':
56
-        region_code = 'xa-' + lang.split('-')[0]
57
-    elif lang[-2:] == 'GB':
58
-        region_code = 'uk-' + lang.split('-')[0]
59
-    else:
60
-        region_code = lang.split('-')
61
-        if len(region_code) == 2:
62
-            # country code goes first
63
-            region_code = region_code[1].lower() + '-' + region_code[0].lower()
64
-        else:
65
-            # tries to get a country code from language
66
-            region_code = region_code[0].lower()
67
-            for lc in (lang_list or supported_languages):
68
-                lc = lc.split('-')
69
-                if region_code == lc[0]:
70
-                    region_code = lc[1].lower() + '-' + lc[0].lower()
71
-                    break
72
-    return region_code
56
+def get_region_code(lang, lang_list=[]):
57
+    lang_code = match_language(lang, lang_list, language_aliases, 'wt-WT')
58
+    lang_parts = lang_code.split('-')
59
+
60
+    # country code goes first
61
+    return lang_parts[1].lower() + '-' + lang_parts[0].lower()
73 62
 
74 63
 
75 64
 # do search-request
@@ -79,7 +68,7 @@ def request(query, params):
79 68
 
80 69
     offset = (params['pageno'] - 1) * 30
81 70
 
82
-    region_code = get_region_code(params['language'])
71
+    region_code = get_region_code(params['language'], supported_languages)
83 72
     params['url'] = url.format(
84 73
         query=urlencode({'q': query, 'kl': region_code}), offset=offset, dc_param=offset)
85 74
 

+4 -3  searx/engines/duckduckgo_definitions.py

@@ -2,9 +2,9 @@ import json
2 2
 from lxml import html
3 3
 from re import compile
4 4
 from searx.engines.xpath import extract_text
5
-from searx.engines.duckduckgo import _fetch_supported_languages, supported_languages_url
5
+from searx.engines.duckduckgo import _fetch_supported_languages, supported_languages_url, language_aliases
6 6
 from searx.url_utils import urlencode
7
-from searx.utils import html_to_text
7
+from searx.utils import html_to_text, match_language
8 8
 
9 9
 url = 'https://api.duckduckgo.com/'\
10 10
     + '?{query}&format=json&pretty=0&no_redirect=1&d=1'
@@ -24,7 +24,8 @@ def result_to_text(url, text, htmlResult):
24 24
 
25 25
 def request(query, params):
26 26
     params['url'] = url.format(query=urlencode({'q': query}))
27
-    params['headers']['Accept-Language'] = params['language'].split('-')[0]
27
+    language = match_language(params['language'], supported_languages, language_aliases)
28
+    params['headers']['Accept-Language'] = language.split('-')[0]
28 29
     return params
29 30
 
30 31
 

+4 -1  searx/engines/duckduckgo_images.py

@@ -15,7 +15,10 @@
15 15
 
16 16
 from json import loads
17 17
 from searx.engines.xpath import extract_text
18
-from searx.engines.duckduckgo import _fetch_supported_languages, supported_languages_url, get_region_code
18
+from searx.engines.duckduckgo import (
19
+    _fetch_supported_languages, supported_languages_url,
20
+    get_region_code, language_aliases
21
+)
19 22
 from searx.poolrequests import get
20 23
 from searx.url_utils import urlencode
21 24
 

+14 -15  searx/engines/google.py

@@ -14,6 +14,7 @@ from lxml import html, etree
14 14
 from searx.engines.xpath import extract_text, extract_url
15 15
 from searx import logger
16 16
 from searx.url_utils import urlencode, urlparse, parse_qsl
17
+from searx.utils import match_language
17 18
 
18 19
 logger = logger.getChild('google engine')
19 20
 
@@ -165,22 +166,20 @@ def extract_text_from_dom(result, xpath):
165 166
 def request(query, params):
166 167
     offset = (params['pageno'] - 1) * 10
167 168
 
169
+    language = match_language(params['language'], supported_languages)
170
+    language_array = language.split('-')
171
+    if params['language'].find('-') > 0:
172
+        country = params['language'].split('-')[1]
173
+    elif len(language_array) == 2:
174
+        country = language_array[1]
175
+    else:
176
+        country = 'US'
177
+
168 178
     # temporary fix until a way of supporting en-US is found
169
-    if params['language'] == 'en-US':
170
-        params['language'] = 'en-GB'
179
+    if language == 'en-US':
180
+        country = 'GB'
171 181
 
172
-    if params['language'][:2] == 'jv':
173
-        language = 'jw'
174
-        country = 'ID'
175
-        url_lang = 'lang_jw'
176
-    else:
177
-        language_array = params['language'].lower().split('-')
178
-        if len(language_array) == 2:
179
-            country = language_array[1]
180
-        else:
181
-            country = 'US'
182
-        language = language_array[0] + ',' + language_array[0] + '-' + country
183
-        url_lang = 'lang_' + language_array[0]
182
+    url_lang = 'lang_' + language
184 183
 
185 184
     if use_locale_domain:
186 185
         google_hostname = country_to_hostname.get(country.upper(), default_hostname)
@@ -196,7 +195,7 @@ def request(query, params):
196 195
     if params['time_range'] in time_range_dict:
197 196
         params['url'] += time_range_search.format(range=time_range_dict[params['time_range']])
198 197
 
199
-    params['headers']['Accept-Language'] = language
198
+    params['headers']['Accept-Language'] = language + ',' + language + '-' + country
200 199
     params['headers']['Accept'] = 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'
201 200
 
202 201
     params['google_hostname'] = google_hostname

+4 -2  searx/engines/google_news.py

@@ -13,6 +13,7 @@
13 13
 from lxml import html
14 14
 from searx.engines.google import _fetch_supported_languages, supported_languages_url
15 15
 from searx.url_utils import urlencode
16
+from searx.utils import match_language
16 17
 
17 18
 # search-url
18 19
 categories = ['news']
@@ -50,8 +51,9 @@ def request(query, params):
50 51
     params['url'] = search_url.format(query=urlencode({'q': query}),
51 52
                                       search_options=urlencode(search_options))
52 53
 
53
-    language_array = params['language'].lower().split('-')
54
-    params['url'] += '&lr=lang_' + language_array[0]
54
+    language = match_language(params['language'], supported_languages).split('-')[0]
55
+    if language:
56
+        params['url'] += '&lr=lang_' + language
55 57
 
56 58
     return params
57 59
 

+3 -10  searx/engines/qwant.py

@@ -14,6 +14,7 @@ from datetime import datetime
14 14
 from json import loads
15 15
 from searx.utils import html_to_text
16 16
 from searx.url_utils import urlencode
17
+from searx.utils import match_language
17 18
 
18 19
 # engine dependent config
19 20
 categories = None
@@ -45,16 +46,8 @@ def request(query, params):
45 46
                                    offset=offset)
46 47
 
47 48
     # add language tag
48
-    if params['language'] == 'no' or params['language'].startswith('no-'):
49
-        params['language'] = params['language'].replace('no', 'nb', 1)
50
-    if params['language'].find('-') < 0:
51
-        # tries to get a country code from language
52
-        for lang in supported_languages:
53
-            lc = lang.split('-')
54
-            if params['language'] == lc[0]:
55
-                params['language'] = lang
56
-                break
57
-    params['url'] += '&locale=' + params['language'].replace('-', '_').lower()
49
+    language = match_language(params['language'], supported_languages)
50
+    params['url'] += '&locale=' + language.replace('-', '_').lower()
58 51
 
59 52
     return params
60 53
 

+3 -5  searx/engines/swisscows.py

@@ -14,6 +14,7 @@ from json import loads
14 14
 import re
15 15
 from lxml.html import fromstring
16 16
 from searx.url_utils import unquote, urlencode
17
+from searx.utils import match_language
17 18
 
18 19
 # engine dependent config
19 20
 categories = ['general', 'images']
@@ -35,11 +36,8 @@ regex_img_url_remove_start = re.compile(b'^https?://i\.swisscows\.ch/\?link=')
35 36
 
36 37
 # do search-request
37 38
 def request(query, params):
38
-    if params['language'].split('-')[0] == 'no':
39
-        region = 'nb-NO'
40
-    else:
41
-        region = params['language']
42
-        ui_language = params['language'].split('-')[0]
39
+    region = match_language(params['language'], supported_languages)
40
+    ui_language = region.split('-')[0]
43 41
 
44 42
     search_path = search_string.format(
45 43
         query=urlencode({'query': query, 'uiLanguage': ui_language, 'region': region}),

+3 -2  searx/engines/wikidata.py

@@ -16,6 +16,7 @@ from searx.poolrequests import get
16 16
 from searx.engines.xpath import extract_text
17 17
 from searx.engines.wikipedia import _fetch_supported_languages, supported_languages_url
18 18
 from searx.url_utils import urlencode
19
+from searx.utils import match_language
19 20
 
20 21
 from json import loads
21 22
 from lxml.html import fromstring
@@ -56,7 +57,7 @@ calendar_name_xpath = './/sup[contains(@class,"wb-calendar-name")]'
56 57
 
57 58
 
58 59
 def request(query, params):
59
-    language = params['language'].split('-')[0]
60
+    language = match_language(params['language'], supported_languages).split('-')[0]
60 61
 
61 62
     params['url'] = url_search.format(
62 63
         query=urlencode({'label': query, 'language': language}))
@@ -68,7 +69,7 @@ def response(resp):
68 69
     html = fromstring(resp.text)
69 70
     wikidata_ids = html.xpath(wikidata_ids_xpath)
70 71
 
71
-    language = resp.search_params['language'].split('-')[0]
72
+    language = match_language(resp.search_params['language'], supported_languages).split('-')[0]
72 73
 
73 74
     # TODO: make requests asynchronous to avoid timeout when result_count > 1
74 75
     for wikidata_id in wikidata_ids[:result_count]:

+2 -7  searx/engines/wikipedia.py

@@ -13,6 +13,7 @@
13 13
 from json import loads
14 14
 from lxml.html import fromstring
15 15
 from searx.url_utils import quote, urlencode
16
+from searx.utils import match_language
16 17
 
17 18
 # search-url
18 19
 base_url = u'https://{language}.wikipedia.org/'
@@ -30,13 +31,7 @@ supported_languages_url = 'https://meta.wikimedia.org/wiki/List_of_Wikipedias'
30 31
 
31 32
 # set language in base_url
32 33
 def url_lang(lang):
33
-    lang = lang.split('-')[0]
34
-    if lang not in supported_languages:
35
-        language = 'en'
36
-    else:
37
-        language = lang
38
-
39
-    return language
34
+    return match_language(lang, supported_languages).split('-')[0]
40 35
 
41 36
 
42 37
 # do search-request

+12 -12  searx/engines/yahoo.py

@@ -14,6 +14,7 @@
14 14
 from lxml import html
15 15
 from searx.engines.xpath import extract_text, extract_url
16 16
 from searx.url_utils import unquote, urlencode
17
+from searx.utils import match_language
17 18
 
18 19
 # engine dependent config
19 20
 categories = ['general']
@@ -39,6 +40,8 @@ time_range_dict = {'day': ['1d', 'd'],
39 40
                    'week': ['1w', 'w'],
40 41
                    'month': ['1m', 'm']}
41 42
 
43
+language_aliases = {'zh-CN': 'zh-CHS', 'zh-TW': 'zh-CHT', 'zh-HK': 'zh-CHT'}
44
+
42 45
 
43 46
 # remove yahoo-specific tracking-url
44 47
 def parse_url(url_string):
@@ -70,23 +73,16 @@ def _get_url(query, offset, language, time_range):
70 73
                                         lang=language)
71 74
 
72 75
 
73
-def _get_language(params):
74
-    if params['language'][:2] == 'zh':
75
-        if params['language'] == 'zh' or params['language'] == 'zh-CH':
76
-            return 'szh'
77
-        else:
78
-            return 'tzh'
79
-    else:
80
-        return params['language'].split('-')[0]
81
-
82
-
83 76
 # do search-request
84 77
 def request(query, params):
85 78
     if params['time_range'] and params['time_range'] not in time_range_dict:
86 79
         return params
87 80
 
88 81
     offset = (params['pageno'] - 1) * 10 + 1
89
-    language = _get_language(params)
82
+    language = match_language(params['language'], supported_languages, language_aliases)
83
+    if language not in language_aliases.values():
84
+        language = language.split('-')[0]
85
+    language = language.replace('-', '_').lower()
90 86
 
91 87
     params['url'] = _get_url(query, offset, language, params['time_range'])
92 88
 
@@ -145,7 +141,11 @@ def _fetch_supported_languages(resp):
145 141
     dom = html.fromstring(resp.text)
146 142
     options = dom.xpath('//div[@id="yschlang"]/span/label/input')
147 143
     for option in options:
148
-        code = option.xpath('./@value')[0][5:].replace('_', '-')
144
+        code_parts = option.xpath('./@value')[0][5:].split('_')
145
+        if len(code_parts) == 2:
146
+            code = code_parts[0] + '-' + code_parts[1].upper()
147
+        else:
148
+            code = code_parts[0]
149 149
         supported_languages.append(code)
150 150
 
151 151
     return supported_languages

+5 -2  searx/engines/yahoo_news.py

@@ -13,9 +13,12 @@ import re
13 13
 from datetime import datetime, timedelta
14 14
 from lxml import html
15 15
 from searx.engines.xpath import extract_text, extract_url
16
-from searx.engines.yahoo import parse_url, _fetch_supported_languages, supported_languages_url
16
+from searx.engines.yahoo import (
17
+    parse_url, _fetch_supported_languages, supported_languages_url, language_aliases
18
+)
17 19
 from dateutil import parser
18 20
 from searx.url_utils import urlencode
21
+from searx.utils import match_language
19 22
 
20 23
 # engine dependent config
21 24
 categories = ['news']
@@ -38,7 +41,7 @@ suggestion_xpath = '//div[contains(@class,"VerALSOTRY")]//a'
38 41
 def request(query, params):
39 42
     offset = (params['pageno'] - 1) * 10 + 1
40 43
 
41
-    language = params['language'].split('-')[0]
44
+    language = match_language(params['language'], supported_languages, language_aliases).split('-')[0]
42 45
 
43 46
     params['url'] = search_url.format(offset=offset,
44 47
                                       query=urlencode({'p': query}),

+0 -4  searx/preferences.py

@@ -115,10 +115,6 @@ class SearchLanguageSetting(EnumStringSetting):
115 115
                 pass
116 116
             elif lang in self.choices:
117 117
                 data = lang
118
-            elif data == 'nb-NO':
119
-                data = 'no-NO'
120
-            elif data == 'ar-XA':
121
-                data = 'ar-SA'
122 118
             else:
123 119
                 data = self.value
124 120
         self.value = data

+7 -3  searx/query.py

@@ -96,9 +96,13 @@ class RawTextQuery(object):
96 96
                                 break
97 97
 
98 98
                 # user may set a valid, yet not selectable language
99
-                if not self.languages and VALID_LANGUAGE_CODE.match(lang):
100
-                    self.languages.append(lang)
101
-                    parse_next = True
99
+                if VALID_LANGUAGE_CODE.match(lang):
100
+                    lang_parts = lang.split('-')
101
+                    if len(lang_parts) > 1:
102
+                        lang = lang_parts[0].lower() + '-' + lang_parts[1].upper()
103
+                    if lang not in self.languages:
104
+                        self.languages.append(lang)
105
+                        parse_next = True
102 106
 
103 107
             # this force a engine or category
104 108
             if query_part[0] == '!' or query_part[0] == '?':

+2 -2  searx/templates/oscar/preferences.html

@@ -187,7 +187,7 @@
187 187
                                     </td>
188 188
                                     <th>{{ search_engine.name }}</th>
189 189
 				    <td class="name">{{ shortcuts[search_engine.name] }}</td>
190
-					<td>{{ support_toggle(current_language in search_engine.supported_languages or current_language.split('-')[0] in search_engine.supported_languages) }}</td>
190
+					<td>{{ support_toggle(stats[search_engine.name].supports_selected_language) }}</td>
191 191
 					<td>{{ support_toggle(search_engine.safesearch==True) }}</td>
192 192
 					<td>{{ support_toggle(search_engine.time_range_support==True) }}</td>
193 193
 					<td class="{{ 'danger' if stats[search_engine.name]['warn_time'] else '' }}">{{ 'N/A' if stats[search_engine.name].time==None else stats[search_engine.name].time }}</td>
@@ -197,7 +197,7 @@
197 197
 					<td class="{{ 'danger' if stats[search_engine.name]['warn_time'] else '' }}">{{ 'N/A' if stats[search_engine.name].time==None else stats[search_engine.name].time }}</td>
198 198
 					<td>{{ support_toggle(search_engine.time_range_support==True) }}</td>
199 199
 					<td>{{ support_toggle(search_engine.safesearch==True) }}</td>
200
-					<td>{{ support_toggle(current_language in search_engine.supported_languages or current_language.split('-')[0] in search_engine.supported_languages) }}</td>
200
+					<td>{{ support_toggle(stats[search_engine.name].supports_selected_language) }}</td>
201 201
 					<td>{{ shortcuts[search_engine.name] }}</td>
202 202
                                     <th>{{ search_engine.name }}</th>
203 203
                                     <td class="onoff-checkbox">

+61 -0  searx/utils.py

@@ -4,6 +4,7 @@ import hmac
 import os
 import re

+from babel.core import get_global
 from babel.dates import format_date
 from codecs import getincrementalencoder
 from imp import load_source
@@ -12,6 +13,7 @@ from os.path import splitext, join
 from random import choice
 import sys

+from searx import settings
 from searx.version import VERSION_STRING
 from searx.languages import language_codes
 from searx import settings
@@ -322,6 +324,65 @@ def is_valid_lang(lang):
         return False


+# auxiliary function to match lang_code in lang_list
+def _match_language(lang_code, lang_list=[], custom_aliases={}):
+    # replace language code with a custom alias if necessary
+    if lang_code in custom_aliases:
+        lang_code = custom_aliases[lang_code]
+
+    if lang_code in lang_list:
+        return lang_code
+
+    # try to get the most likely country for this language
+    subtags = get_global('likely_subtags').get(lang_code)
+    if subtags:
+        subtag_parts = subtags.split('_')
+        new_code = subtag_parts[0] + '-' + subtag_parts[-1]
+        if new_code in custom_aliases:
+            new_code = custom_aliases[new_code]
+        if new_code in lang_list:
+            return new_code
+
+    # try to get the any supported country for this language
+    for lc in lang_list:
+        if lang_code == lc.split('-')[0]:
+            return lc
+
+    return None
+
+
+# get the language code from lang_list that best matches locale_code
+def match_language(locale_code, lang_list=[], custom_aliases={}, fallback='en-US'):
+    # try to get language from given locale_code
+    language = _match_language(locale_code, lang_list, custom_aliases)
+    if language:
+        return language
+
+    locale_parts = locale_code.split('-')
+    lang_code = locale_parts[0]
+
+    # try to get language using an equivalent country code
+    if len(locale_parts) > 1:
+        country_alias = get_global('territory_aliases').get(locale_parts[-1])
+        if country_alias:
+            language = _match_language(lang_code + '-' + country_alias[0], lang_list, custom_aliases)
+            if language:
+                return language
+
+    # try to get language using an equivalent language code
+    alias = get_global('language_aliases').get(lang_code)
+    if alias:
+        language = _match_language(alias, lang_list, custom_aliases)
+        if language:
+            return language
+
+    if lang_code != locale_code:
+        # try to get language from given language without giving the country
+        language = _match_language(lang_code, lang_list, custom_aliases)
+
+    return language or fallback
+
+
 def load_module(filename, module_dir):
     modname = splitext(filename)[0]
     if modname in sys.modules:
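
For illustration, a short sketch of how an engine-specific language_aliases
dict feeds into the helper above. The alias table and the supported codes are
the ones added for the bing engine in this commit (see searx/engines/bing.py
and tests/unit/engines/test_bing.py), so the expected results follow directly
from the implementation shown above.

    from searx.utils import match_language

    language_aliases = {'zh-CN': 'zh-CHS', 'zh-TW': 'zh-CHT', 'zh-HK': 'zh-CHT'}
    supported_languages = ['en', 'fr', 'zh-CHS', 'zh-CHT', 'pt-PT', 'pt-BR']

    # standard codes chosen by the user are rewritten to bing's custom codes
    match_language('zh-TW', supported_languages, language_aliases)  # 'zh-CHT'
    match_language('zh-CN', supported_languages, language_aliases)  # 'zh-CHS'

    # codes that need no alias are matched as usual
    match_language('pt-BR', supported_languages, language_aliases)  # 'pt-BR'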

+15 -7  searx/webapp.py

@@ -58,16 +58,16 @@ from searx.engines import (
58 58
 from searx.utils import (
59 59
     UnicodeWriter, highlight_content, html_to_text, get_resources_directory,
60 60
     get_static_files, get_result_templates, get_themes, gen_useragent,
61
-    dict_subset, prettify_url
61
+    dict_subset, prettify_url, match_language
62 62
 )
63 63
 from searx.version import VERSION_STRING
64
-from searx.languages import language_codes
64
+from searx.languages import language_codes as languages
65 65
 from searx.search import SearchWithPlugins, get_search_query_from_webapp
66 66
 from searx.query import RawTextQuery
67 67
 from searx.autocomplete import searx_bang, backends as autocomplete_backends
68 68
 from searx.plugins import plugins
69 69
 from searx.plugins.oa_doi_rewrite import get_doi_resolver
70
-from searx.preferences import Preferences, ValidationException
70
+from searx.preferences import Preferences, ValidationException, LANGUAGE_CODES
71 71
 from searx.answerers import answerers
72 72
 from searx.url_utils import urlencode, urlparse, urljoin
73 73
 from searx.utils import new_hmac
@@ -133,7 +133,7 @@ if not searx_debug \
133 133
 babel = Babel(app)
134 134
 
135 135
 rtl_locales = ['ar', 'arc', 'bcc', 'bqi', 'ckb', 'dv', 'fa', 'glk', 'he',
136
-               'ku', 'mzn', 'pnb'', ''ps', 'sd', 'ug', 'ur', 'yi']
136
+               'ku', 'mzn', 'pnb', 'ps', 'sd', 'ug', 'ur', 'yi']
137 137
 
138 138
 # used when translating category names
139 139
 _category_names = (gettext('files'),
@@ -352,9 +352,11 @@ def render(template_name, override_theme=None, **kwargs):
352 352
 
353 353
     kwargs['safesearch'] = str(request.preferences.get_value('safesearch'))
354 354
 
355
-    kwargs['language_codes'] = language_codes
355
+    kwargs['language_codes'] = languages
356 356
     if 'current_language' not in kwargs:
357
-        kwargs['current_language'] = request.preferences.get_value('language')
357
+        kwargs['current_language'] = match_language(request.preferences.get_value('language'),
358
+                                                    LANGUAGE_CODES,
359
+                                                    fallback=settings['search']['language'])
358 360
 
359 361
     # override url_for function in templates
360 362
     kwargs['url_for'] = url_for_theme
@@ -590,7 +592,9 @@ def index():
590 592
         infoboxes=result_container.infoboxes,
591 593
         paging=result_container.paging,
592 594
         unresponsive_engines=result_container.unresponsive_engines,
593
-        current_language=search_query.lang,
595
+        current_language=match_language(search_query.lang,
596
+                                        LANGUAGE_CODES,
597
+                                        fallback=settings['search']['language']),
594 598
         base_url=get_base_url(),
595 599
         theme=get_current_theme_name(),
596 600
         favicons=global_favicons[themes.index(get_current_theme_name())]
@@ -687,6 +691,10 @@ def preferences():
687 691
                              'warn_time': False}
688 692
             if e.timeout > settings['outgoing']['request_timeout']:
689 693
                 stats[e.name]['warn_timeout'] = True
694
+            if match_language(request.preferences.get_value('language'),
695
+                              getattr(e, 'supported_languages', []),
696
+                              getattr(e, 'language_aliases', {}), None):
697
+                stats[e.name]['supports_selected_language'] = True
690 698
 
691 699
     # get first element [0], the engine time,
692 700
     # and then the second element [1] : the time (the first one is the label)

+6 -1  tests/unit/engines/test_archlinux.py

@@ -19,12 +19,17 @@ class TestArchLinuxEngine(SearxTestCase):
19 19
         query = 'test_query'
20 20
         dic = defaultdict(dict)
21 21
         dic['pageno'] = 1
22
-        dic['language'] = 'en_US'
22
+        dic['language'] = 'en-US'
23 23
         params = archlinux.request(query, dic)
24 24
         self.assertTrue('url' in params)
25 25
         self.assertTrue(query in params['url'])
26 26
         self.assertTrue('wiki.archlinux.org' in params['url'])
27 27
 
28
+        for lang, name in archlinux.main_langs:
29
+            dic['language'] = lang
30
+            params = archlinux.request(query, dic)
31
+            self.assertTrue(name in params['url'])
32
+
28 33
         for lang, domain in domains.items():
29 34
             dic['language'] = lang
30 35
             params = archlinux.request(query, dic)

+1 -0  tests/unit/engines/test_bing.py

@@ -7,6 +7,7 @@ from searx.testing import SearxTestCase
7 7
 class TestBingEngine(SearxTestCase):
8 8
 
9 9
     def test_request(self):
10
+        bing.supported_languages = ['en', 'fr', 'zh-CHS', 'zh-CHT', 'pt-PT', 'pt-BR']
10 11
         query = u'test_query'
11 12
         dicto = defaultdict(dict)
12 13
         dicto['pageno'] = 0

+0 -1  tests/unit/engines/test_bing_images.py

@@ -9,7 +9,6 @@ class TestBingImagesEngine(SearxTestCase):
9 9
 
10 10
     def test_request(self):
11 11
         bing_images.supported_languages = ['fr-FR', 'en-US']
12
-
13 12
         query = 'test_query'
14 13
         dicto = defaultdict(dict)
15 14
         dicto['pageno'] = 1

+2 -1  tests/unit/engines/test_bing_news.py

@@ -8,10 +8,11 @@ import lxml
8 8
 class TestBingNewsEngine(SearxTestCase):
9 9
 
10 10
     def test_request(self):
11
+        bing_news.supported_languages = ['en', 'fr']
11 12
         query = 'test_query'
12 13
         dicto = defaultdict(dict)
13 14
         dicto['pageno'] = 1
14
-        dicto['language'] = 'fr_FR'
15
+        dicto['language'] = 'fr-FR'
15 16
         dicto['time_range'] = ''
16 17
         params = bing_news.request(query, dicto)
17 18
         self.assertIn('url', params)

+0 -1  tests/unit/engines/test_bing_videos.py

@@ -9,7 +9,6 @@ class TestBingVideosEngine(SearxTestCase):
9 9
 
10 10
     def test_request(self):
11 11
         bing_videos.supported_languages = ['fr-FR', 'en-US']
12
-
13 12
         query = 'test_query'
14 13
         dicto = defaultdict(dict)
15 14
         dicto['pageno'] = 1

+2 -1  tests/unit/engines/test_dailymotion.py

@@ -8,10 +8,11 @@ from searx.testing import SearxTestCase
8 8
 class TestDailymotionEngine(SearxTestCase):
9 9
 
10 10
     def test_request(self):
11
+        dailymotion.supported_languages = ['en', 'fr']
11 12
         query = 'test_query'
12 13
         dicto = defaultdict(dict)
13 14
         dicto['pageno'] = 0
14
-        dicto['language'] = 'fr_FR'
15
+        dicto['language'] = 'fr-FR'
15 16
         params = dailymotion.request(query, dicto)
16 17
         self.assertTrue('url' in params)
17 18
         self.assertTrue(query in params['url'])

+12 -6  tests/unit/engines/test_duckduckgo.py

@@ -1,18 +1,21 @@
1 1
 # -*- coding: utf-8 -*-
2 2
 from collections import defaultdict
3 3
 import mock
4
-from searx.engines import duckduckgo
4
+from searx.engines import load_engine, duckduckgo
5 5
 from searx.testing import SearxTestCase
6 6
 
7 7
 
8 8
 class TestDuckduckgoEngine(SearxTestCase):
9 9
 
10 10
     def test_request(self):
11
+        duckduckgo = load_engine({'engine': 'duckduckgo', 'name': 'duckduckgo'})
12
+
11 13
         query = 'test_query'
12 14
         dicto = defaultdict(dict)
13 15
         dicto['pageno'] = 1
14
-        dicto['language'] = 'de-CH'
15 16
         dicto['time_range'] = ''
17
+
18
+        dicto['language'] = 'de-CH'
16 19
         params = duckduckgo.request(query, dicto)
17 20
         self.assertIn('url', params)
18 21
         self.assertIn(query, params['url'])
@@ -20,16 +23,19 @@ class TestDuckduckgoEngine(SearxTestCase):
20 23
         self.assertIn('ch-de', params['url'])
21 24
         self.assertIn('s=0', params['url'])
22 25
 
23
-        # when ddg uses non standard code
26
+        # when ddg uses non standard codes
27
+        dicto['language'] = 'zh-HK'
28
+        params = duckduckgo.request(query, dicto)
29
+        self.assertIn('hk-tzh', params['url'])
30
+
24 31
         dicto['language'] = 'en-GB'
25 32
         params = duckduckgo.request(query, dicto)
26 33
         self.assertIn('uk-en', params['url'])
27 34
 
28 35
         # no country given
29
-        duckduckgo.supported_languages = ['de-CH', 'en-US']
30
-        dicto['language'] = 'de'
36
+        dicto['language'] = 'en'
31 37
         params = duckduckgo.request(query, dicto)
32
-        self.assertIn('ch-de', params['url'])
38
+        self.assertIn('us-en', params['url'])
33 39
 
34 40
     def test_no_url_in_request_year_time_range(self):
35 41
         dicto = defaultdict(dict)

+1 -0  tests/unit/engines/test_duckduckgo_definitions.py

@@ -18,6 +18,7 @@ class TestDDGDefinitionsEngine(SearxTestCase):
18 18
         self.assertEqual(result, 'Text in link')
19 19
 
20 20
     def test_request(self):
21
+        duckduckgo_definitions.supported_languages = ['en-US', 'es-ES']
21 22
         query = 'test_query'
22 23
         dicto = defaultdict(dict)
23 24
         dicto['pageno'] = 1

+0 -1  tests/unit/engines/test_duckduckgo_images.py

@@ -9,7 +9,6 @@ class TestDuckduckgoImagesEngine(SearxTestCase):
9 9
 
10 10
     def test_request(self):
11 11
         duckduckgo_images.supported_languages = ['de-CH', 'en-US']
12
-
13 12
         query = 'test_query'
14 13
         dicto = defaultdict(dict)
15 14
         dicto['is_test'] = True

+7 -0  tests/unit/engines/test_google.py

@@ -15,6 +15,8 @@ class TestGoogleEngine(SearxTestCase):
15 15
         return response
16 16
 
17 17
     def test_request(self):
18
+        google.supported_languages = ['en', 'fr', 'zh-CN']
19
+
18 20
         query = 'test_query'
19 21
         dicto = defaultdict(dict)
20 22
         dicto['pageno'] = 1
@@ -31,6 +33,11 @@ class TestGoogleEngine(SearxTestCase):
31 33
         self.assertIn('google.co', params['url'])
32 34
         self.assertIn('en', params['headers']['Accept-Language'])
33 35
 
36
+        dicto['language'] = 'zh'
37
+        params = google.request(query, dicto)
38
+        self.assertIn('google.com', params['url'])
39
+        self.assertIn('zh-CN', params['headers']['Accept-Language'])
40
+
34 41
     def test_response(self):
35 42
         self.assertRaises(AttributeError, google.response, None)
36 43
         self.assertRaises(AttributeError, google.response, [])

+1 -0  tests/unit/engines/test_google_news.py

@@ -9,6 +9,7 @@ from searx.testing import SearxTestCase
9 9
 class TestGoogleNewsEngine(SearxTestCase):
10 10
 
11 11
     def test_request(self):
12
+        google_news.supported_languages = ['en-US', 'fr-FR']
12 13
         query = 'test_query'
13 14
         dicto = defaultdict(dict)
14 15
         dicto['pageno'] = 1

+1 -1  tests/unit/engines/test_qwant.py

@@ -7,6 +7,7 @@ from searx.testing import SearxTestCase
7 7
 class TestQwantEngine(SearxTestCase):
8 8
 
9 9
     def test_request(self):
10
+        qwant.supported_languages = ['en-US', 'fr-CA', 'fr-FR']
10 11
         query = 'test_query'
11 12
         dicto = defaultdict(dict)
12 13
         dicto['pageno'] = 0
@@ -26,7 +27,6 @@ class TestQwantEngine(SearxTestCase):
26 27
         self.assertIn('en_us', params['url'])
27 28
         self.assertIn('news', params['url'])
28 29
 
29
-        qwant.supported_languages = ['en', 'fr-FR', 'fr-CA']
30 30
         dicto['language'] = 'fr'
31 31
         params = qwant.request(query, dicto)
32 32
         self.assertIn('fr_fr', params['url'])

+1 -0  tests/unit/engines/test_swisscows.py

@@ -7,6 +7,7 @@ from searx.testing import SearxTestCase
7 7
 class TestSwisscowsEngine(SearxTestCase):
8 8
 
9 9
     def test_request(self):
10
+        swisscows.supported_languages = ['de-AT', 'de-DE']
10 11
         query = 'test_query'
11 12
         dicto = defaultdict(dict)
12 13
         dicto['pageno'] = 1

+1 -0  tests/unit/engines/test_wikidata.py

@@ -9,6 +9,7 @@ from searx.testing import SearxTestCase
9 9
 class TestWikidataEngine(SearxTestCase):
10 10
 
11 11
     def test_request(self):
12
+        wikidata.supported_languages = ['en', 'es']
12 13
         query = 'test_query'
13 14
         dicto = defaultdict(dict)
14 15
         dicto['language'] = 'en-US'

+14 -3  tests/unit/engines/test_yahoo.py

@@ -25,11 +25,12 @@ class TestYahooEngine(SearxTestCase):
25 25
         self.assertEqual('https://this.is.the.url/', url)
26 26
 
27 27
     def test_request(self):
28
+        yahoo.supported_languages = ['en', 'fr', 'zh-CHT', 'zh-CHS']
28 29
         query = 'test_query'
29 30
         dicto = defaultdict(dict)
30 31
         dicto['pageno'] = 1
31 32
         dicto['time_range'] = ''
32
-        dicto['language'] = 'fr_FR'
33
+        dicto['language'] = 'fr-FR'
33 34
         params = yahoo.request(query, dicto)
34 35
         self.assertIn('url', params)
35 36
         self.assertIn(query, params['url'])
@@ -39,6 +40,16 @@ class TestYahooEngine(SearxTestCase):
39 40
         self.assertIn('sB', params['cookies'])
40 41
         self.assertIn('fr', params['cookies']['sB'])
41 42
 
43
+        dicto['language'] = 'zh'
44
+        params = yahoo.request(query, dicto)
45
+        self.assertIn('zh_chs', params['url'])
46
+        self.assertIn('zh_chs', params['cookies']['sB'])
47
+
48
+        dicto['language'] = 'zh-TW'
49
+        params = yahoo.request(query, dicto)
50
+        self.assertIn('zh_cht', params['url'])
51
+        self.assertIn('zh_cht', params['cookies']['sB'])
52
+
42 53
     def test_no_url_in_request_year_time_range(self):
43 54
         dicto = defaultdict(dict)
44 55
         query = 'test_query'
@@ -168,5 +179,5 @@ class TestYahooEngine(SearxTestCase):
168 179
         self.assertEqual(type(languages), list)
169 180
         self.assertEqual(len(languages), 3)
170 181
         self.assertIn('ar', languages)
171
-        self.assertIn('zh-chs', languages)
172
-        self.assertIn('zh-cht', languages)
182
+        self.assertIn('zh-CHS', languages)
183
+        self.assertIn('zh-CHT', languages)

+2 -1  tests/unit/engines/test_yahoo_news.py

@@ -9,10 +9,11 @@ from searx.testing import SearxTestCase
9 9
 class TestYahooNewsEngine(SearxTestCase):
10 10
 
11 11
     def test_request(self):
12
+        yahoo_news.supported_languages = ['en', 'fr']
12 13
         query = 'test_query'
13 14
         dicto = defaultdict(dict)
14 15
         dicto['pageno'] = 1
15
-        dicto['language'] = 'fr_FR'
16
+        dicto['language'] = 'fr-FR'
16 17
         params = yahoo_news.request(query, dicto)
17 18
         self.assertIn('url', params)
18 19
         self.assertIn(query, params['url'])

+25 -0  tests/unit/test_utils.py

@@ -65,6 +65,31 @@ class TestUtils(SearxTestCase):
65 65
         for test_url, expected in data:
66 66
             self.assertEqual(utils.prettify_url(test_url, max_length=32), expected)
67 67
 
68
+    def test_match_language(self):
69
+        self.assertEqual(utils.match_language('es', ['es']), 'es')
70
+        self.assertEqual(utils.match_language('es', [], fallback='fallback'), 'fallback')
71
+        self.assertEqual(utils.match_language('ja', ['jp'], {'ja': 'jp'}), 'jp')
72
+
73
+        aliases = {'en-GB': 'en-UK', 'he': 'iw'}
74
+
75
+        # guess country
76
+        self.assertEqual(utils.match_language('de-DE', ['de']), 'de')
77
+        self.assertEqual(utils.match_language('de', ['de-DE']), 'de-DE')
78
+        self.assertEqual(utils.match_language('es-CO', ['es-AR', 'es-ES', 'es-MX']), 'es-ES')
79
+        self.assertEqual(utils.match_language('es-CO', ['es-MX']), 'es-MX')
80
+        self.assertEqual(utils.match_language('en-UK', ['en-AU', 'en-GB', 'en-US']), 'en-GB')
81
+        self.assertEqual(utils.match_language('en-GB', ['en-AU', 'en-UK', 'en-US'], aliases), 'en-UK')
82
+
83
+        # language aliases
84
+        self.assertEqual(utils.match_language('iw', ['he']), 'he')
85
+        self.assertEqual(utils.match_language('he', ['iw'], aliases), 'iw')
86
+        self.assertEqual(utils.match_language('iw-IL', ['he']), 'he')
87
+        self.assertEqual(utils.match_language('he-IL', ['iw'], aliases), 'iw')
88
+        self.assertEqual(utils.match_language('iw', ['he-IL']), 'he-IL')
89
+        self.assertEqual(utils.match_language('he', ['iw-IL'], aliases), 'iw-IL')
90
+        self.assertEqual(utils.match_language('iw-IL', ['he-IL']), 'he-IL')
91
+        self.assertEqual(utils.match_language('he-IL', ['iw-IL'], aliases), 'iw-IL')
92
+
68 93
 
69 94
 class TestHTMLTextExtractor(SearxTestCase):
70 95
 

+3 -15  utils/fetch_languages.py

@@ -19,19 +19,6 @@ from searx.engines import initialize_engines, engines
19 19
 engines_languages_file = 'engines_languages.json'
20 20
 languages_file = 'languages.py'
21 21
 
22
-# custom fixes for non standard locale codes
23
-# sl-SL is technically not invalid, but still a mistake
24
-# TODO: move to respective engines
25
-locale_fixes = {
26
-    'sl-sl': 'sl-SI',
27
-    'ar-xa': 'ar-SA',
28
-    'es-xl': 'es-419',
29
-    'zh-chs': 'zh-Hans-CN',
30
-    'zh-cht': 'zh-Hant-TW',
31
-    'tzh-tw': 'zh-Hant-TW',
32
-    'tzh-hk': 'zh-Hant-HK'
33
-}
34
-
35 22
 
36 23
 # Fetchs supported languages for each engine and writes json file with those.
37 24
 def fetch_supported_languages():
@@ -76,8 +63,9 @@ def join_language_lists(engines_languages):
76 63
         for lang_code in engines_languages[engine_name]:
77 64
 
78 65
             # apply custom fixes if necessary
79
-            if lang_code.lower() in locale_fixes:
80
-                lang_code = locale_fixes[lang_code.lower()]
66
+            if lang_code in getattr(engines[engine_name], 'language_aliases', {}).values():
67
+                lang_code = next(lc for lc, alias in engines[engine_name].language_aliases.items()
68
+                                 if lang_code == alias)
81 69
 
82 70
             locale = get_locale(lang_code)
83 71