Browse Source

Merge branch 'recollengine' of github.com:Yetangitu/searx into recollengine

remote commit
frankdelange 6 years ago
parent
commit
c537a1921a
43 changed files with 420 additions and 312 deletions
  1. 1
    1
      searx/data/engines_languages.json
  2. 20
    1
      searx/engines/__init__.py
  3. 2
    2
      searx/engines/archlinux.py
  4. 4
    2
      searx/engines/bing.py
  5. 2
    21
      searx/engines/bing_images.py
  6. 3
    3
      searx/engines/bing_news.py
  7. 4
    3
      searx/engines/bing_videos.py
  8. 2
    1
      searx/engines/dailymotion.py
  9. 19
    30
      searx/engines/duckduckgo.py
  10. 4
    3
      searx/engines/duckduckgo_definitions.py
  11. 4
    1
      searx/engines/duckduckgo_images.py
  12. 15
    16
      searx/engines/google.py
  13. 4
    2
      searx/engines/google_news.py
  14. 3
    10
      searx/engines/qwant.py
  15. 3
    5
      searx/engines/swisscows.py
  16. 3
    2
      searx/engines/wikidata.py
  17. 2
    7
      searx/engines/wikipedia.py
  18. 12
    12
      searx/engines/yahoo.py
  19. 5
    2
      searx/engines/yahoo_news.py
  20. 16
    24
      searx/languages.py
  21. 0
    4
      searx/preferences.py
  22. 7
    3
      searx/query.py
  23. 2
    2
      searx/templates/oscar/preferences.html
  24. 61
    0
      searx/utils.py
  25. 15
    7
      searx/webapp.py
  26. 6
    1
      tests/unit/engines/test_archlinux.py
  27. 1
    0
      tests/unit/engines/test_bing.py
  28. 0
    1
      tests/unit/engines/test_bing_images.py
  29. 2
    1
      tests/unit/engines/test_bing_news.py
  30. 0
    1
      tests/unit/engines/test_bing_videos.py
  31. 2
    1
      tests/unit/engines/test_dailymotion.py
  32. 12
    6
      tests/unit/engines/test_duckduckgo.py
  33. 1
    0
      tests/unit/engines/test_duckduckgo_definitions.py
  34. 0
    1
      tests/unit/engines/test_duckduckgo_images.py
  35. 7
    0
      tests/unit/engines/test_google.py
  36. 1
    0
      tests/unit/engines/test_google_news.py
  37. 1
    1
      tests/unit/engines/test_qwant.py
  38. 1
    0
      tests/unit/engines/test_swisscows.py
  39. 1
    0
      tests/unit/engines/test_wikidata.py
  40. 14
    3
      tests/unit/engines/test_yahoo.py
  41. 2
    1
      tests/unit/engines/test_yahoo_news.py
  42. 25
    0
      tests/unit/test_utils.py
  43. 131
    131
      utils/fetch_languages.py

+ 1
- 1
searx/data/engines_languages.json
File diff suppressed because it is too large
View File


+ 20
- 1
searx/engines/__init__.py View File

@@ -20,13 +20,14 @@ import sys
20 20
 import threading
21 21
 from os.path import realpath, dirname
22 22
 from io import open
23
+from babel.localedata import locale_identifiers
23 24
 from flask_babel import gettext
24 25
 from operator import itemgetter
25 26
 from json import loads
26 27
 from requests import get
27 28
 from searx import settings
28 29
 from searx import logger
29
-from searx.utils import load_module
30
+from searx.utils import load_module, match_language
30 31
 
31 32
 
32 33
 logger = logger.getChild('engines')
@@ -38,6 +39,8 @@ engines = {}
38 39
 categories = {'general': []}
39 40
 
40 41
 languages = loads(open(engine_dir + '/../data/engines_languages.json', 'r', encoding='utf-8').read())
42
+babel_langs = [lang_parts[0] + '-' + lang_parts[-1] if len(lang_parts) > 1 else lang_parts[0]
43
+               for lang_parts in (lang_code.split('_') for lang_code in locale_identifiers())]
41 44
 
42 45
 engine_shortcuts = {}
43 46
 engine_default_args = {'paging': False,
@@ -97,6 +100,22 @@ def load_engine(engine_data):
97 100
     if engine_data['name'] in languages:
98 101
         setattr(engine, 'supported_languages', languages[engine_data['name']])
99 102
 
103
+    # find custom aliases for non standard language codes
104
+    if hasattr(engine, 'supported_languages'):
105
+        if hasattr(engine, 'language_aliases'):
106
+            language_aliases = getattr(engine, 'language_aliases')
107
+        else:
108
+            language_aliases = {}
109
+
110
+        for engine_lang in getattr(engine, 'supported_languages'):
111
+            iso_lang = match_language(engine_lang, babel_langs, fallback=None)
112
+            if iso_lang and iso_lang != engine_lang and not engine_lang.startswith(iso_lang) and \
113
+               iso_lang not in getattr(engine, 'supported_languages'):
114
+                language_aliases[iso_lang] = engine_lang
115
+
116
+        if language_aliases:
117
+            setattr(engine, 'language_aliases', language_aliases)
118
+
100 119
     # assign language fetching method if auxiliary method exists
101 120
     if hasattr(engine, '_fetch_supported_languages'):
102 121
         setattr(engine, 'fetch_supported_languages',

+ 2
- 2
searx/engines/archlinux.py View File

@@ -99,13 +99,13 @@ supported_languages = dict(lang_urls, **main_langs)
99 99
 
100 100
 # do search-request
101 101
 def request(query, params):
102
-    # translate the locale (e.g. 'en_US') to language code ('en')
102
+    # translate the locale (e.g. 'en-US') to language code ('en')
103 103
     language = locale_to_lang_code(params['language'])
104 104
 
105 105
     # if our language is hosted on the main site, we need to add its name
106 106
     # to the query in order to narrow the results to that language
107 107
     if language in main_langs:
108
-        query += '(' + main_langs[language] + ')'
108
+        query += b' (' + main_langs[language] + b')'
109 109
 
110 110
     # prepare the request parameters
111 111
     query = urlencode({'search': query})

+ 4
- 2
searx/engines/bing.py View File

@@ -16,12 +16,14 @@
16 16
 from lxml import html
17 17
 from searx.engines.xpath import extract_text
18 18
 from searx.url_utils import urlencode
19
+from searx.utils import match_language
19 20
 
20 21
 # engine dependent config
21 22
 categories = ['general']
22 23
 paging = True
23 24
 language_support = True
24 25
 supported_languages_url = 'https://www.bing.com/account/general'
26
+language_aliases = {'zh-CN': 'zh-CHS', 'zh-TW': 'zh-CHT', 'zh-HK': 'zh-CHT'}
25 27
 
26 28
 # search-url
27 29
 base_url = 'https://www.bing.com/'
@@ -32,9 +34,9 @@ search_string = 'search?{query}&first={offset}'
32 34
 def request(query, params):
33 35
     offset = (params['pageno'] - 1) * 10 + 1
34 36
 
35
-    lang = params['language'].split('-')[0].upper()
37
+    lang = match_language(params['language'], supported_languages, language_aliases)
36 38
 
37
-    query = u'language:{} {}'.format(lang, query.decode('utf-8')).encode('utf-8')
39
+    query = u'language:{} {}'.format(lang.split('-')[0].upper(), query.decode('utf-8')).encode('utf-8')
38 40
 
39 41
     search_path = search_string.format(
40 42
         query=urlencode({'q': query}),

+ 2
- 21
searx/engines/bing_images.py View File

@@ -19,6 +19,7 @@ from lxml import html
19 19
 from json import loads
20 20
 import re
21 21
 from searx.url_utils import urlencode
22
+from searx.utils import match_language
22 23
 
23 24
 # engine dependent config
24 25
 categories = ['images']
@@ -46,26 +47,6 @@ safesearch_types = {2: 'STRICT',
46 47
 _quote_keys_regex = re.compile('({|,)([a-z][a-z0-9]*):(")', re.I | re.U)
47 48
 
48 49
 
49
-# get supported region code
50
-def get_region_code(lang, lang_list=None):
51
-    region = None
52
-    if lang in (lang_list or supported_languages):
53
-        region = lang
54
-    elif lang.startswith('no'):
55
-        region = 'nb-NO'
56
-    else:
57
-        # try to get a supported country code with language
58
-        lang = lang.split('-')[0]
59
-        for lc in (lang_list or supported_languages):
60
-            if lang == lc.split('-')[0]:
61
-                region = lc
62
-                break
63
-    if region:
64
-        return region.lower()
65
-    else:
66
-        return 'en-us'
67
-
68
-
69 50
 # do search-request
70 51
 def request(query, params):
71 52
     offset = (params['pageno'] - 1) * 10 + 1
@@ -74,7 +55,7 @@ def request(query, params):
74 55
         query=urlencode({'q': query}),
75 56
         offset=offset)
76 57
 
77
-    language = get_region_code(params['language'])
58
+    language = match_language(params['language'], supported_languages).lower()
78 59
 
79 60
     params['cookies']['SRCHHPGUSR'] = \
80 61
         'ADLT=' + safesearch_types.get(params['safesearch'], 'DEMOTE')

+ 3
- 3
searx/engines/bing_news.py View File

@@ -14,8 +14,8 @@
14 14
 from datetime import datetime
15 15
 from dateutil import parser
16 16
 from lxml import etree
17
-from searx.utils import list_get
18
-from searx.engines.bing import _fetch_supported_languages, supported_languages_url
17
+from searx.utils import list_get, match_language
18
+from searx.engines.bing import _fetch_supported_languages, supported_languages_url, language_aliases
19 19
 from searx.url_utils import urlencode, urlparse, parse_qsl
20 20
 
21 21
 # engine dependent config
@@ -71,7 +71,7 @@ def request(query, params):
71 71
 
72 72
     offset = (params['pageno'] - 1) * 10 + 1
73 73
 
74
-    language = params['language']
74
+    language = match_language(params['language'], supported_languages, language_aliases)
75 75
 
76 76
     params['url'] = _get_url(query, language, offset, params['time_range'])
77 77
 

+ 4
- 3
searx/engines/bing_videos.py View File

@@ -12,9 +12,10 @@
12 12
 
13 13
 from json import loads
14 14
 from lxml import html
15
-from searx.engines.bing_images import _fetch_supported_languages, supported_languages_url, get_region_code
15
+from searx.engines.bing_images import _fetch_supported_languages, supported_languages_url
16 16
 from searx.engines.xpath import extract_text
17 17
 from searx.url_utils import urlencode
18
+from searx.utils import match_language
18 19
 
19 20
 
20 21
 categories = ['videos']
@@ -47,8 +48,8 @@ def request(query, params):
47 48
         'ADLT=' + safesearch_types.get(params['safesearch'], 'DEMOTE')
48 49
 
49 50
     # language cookie
50
-    region = get_region_code(params['language'], lang_list=supported_languages)
51
-    params['cookies']['_EDGE_S'] = 'mkt=' + region + '&F=1'
51
+    language = match_language(params['language'], supported_languages).lower()
52
+    params['cookies']['_EDGE_S'] = 'mkt=' + language + '&F=1'
52 53
 
53 54
     # query and paging
54 55
     params['url'] = search_url.format(query=urlencode({'q': query}),

+ 2
- 1
searx/engines/dailymotion.py View File

@@ -15,6 +15,7 @@
15 15
 from json import loads
16 16
 from datetime import datetime
17 17
 from searx.url_utils import urlencode
18
+from searx.utils import match_language
18 19
 
19 20
 # engine dependent config
20 21
 categories = ['videos']
@@ -32,7 +33,7 @@ supported_languages_url = 'https://api.dailymotion.com/languages'
32 33
 
33 34
 # do search-request
34 35
 def request(query, params):
35
-    locale = params['language']
36
+    locale = match_language(params['language'], supported_languages)
36 37
 
37 38
     params['url'] = search_url.format(
38 39
         query=urlencode({'search': query, 'localization': locale}),

+ 19
- 30
searx/engines/duckduckgo.py View File

@@ -18,14 +18,25 @@ from json import loads
18 18
 from searx.engines.xpath import extract_text
19 19
 from searx.poolrequests import get
20 20
 from searx.url_utils import urlencode
21
+from searx.utils import match_language
21 22
 
22 23
 # engine dependent config
23 24
 categories = ['general']
24 25
 paging = True
25 26
 language_support = True
26
-supported_languages_url = 'https://duckduckgo.com/d2030.js'
27
+supported_languages_url = 'https://duckduckgo.com/util/u172.js'
27 28
 time_range_support = True
28 29
 
30
+language_aliases = {
31
+    'ar-SA': 'ar-XA',
32
+    'es-419': 'es-XL',
33
+    'ja': 'jp-JP',
34
+    'ko': 'kr-KR',
35
+    'sl-SI': 'sl-SL',
36
+    'zh-TW': 'tzh-TW',
37
+    'zh-HK': 'tzh-HK'
38
+}
39
+
29 40
 # search-url
30 41
 url = 'https://duckduckgo.com/html?{query}&s={offset}&dc={dc_param}'
31 42
 time_range_url = '&df={range}'
@@ -42,34 +53,12 @@ content_xpath = './/a[@class="result__snippet"]'
42 53
 
43 54
 
44 55
 # match query's language to a region code that duckduckgo will accept
45
-def get_region_code(lang, lang_list=None):
46
-    # custom fixes for languages
47
-    if lang[:2] == 'ja':
48
-        region_code = 'jp-jp'
49
-    elif lang[:2] == 'sl':
50
-        region_code = 'sl-sl'
51
-    elif lang == 'zh-TW':
52
-        region_code = 'tw-tzh'
53
-    elif lang == 'zh-HK':
54
-        region_code = 'hk-tzh'
55
-    elif lang[-2:] == 'SA':
56
-        region_code = 'xa-' + lang.split('-')[0]
57
-    elif lang[-2:] == 'GB':
58
-        region_code = 'uk-' + lang.split('-')[0]
59
-    else:
60
-        region_code = lang.split('-')
61
-        if len(region_code) == 2:
62
-            # country code goes first
63
-            region_code = region_code[1].lower() + '-' + region_code[0].lower()
64
-        else:
65
-            # tries to get a country code from language
66
-            region_code = region_code[0].lower()
67
-            for lc in (lang_list or supported_languages):
68
-                lc = lc.split('-')
69
-                if region_code == lc[0]:
70
-                    region_code = lc[1].lower() + '-' + lc[0].lower()
71
-                    break
72
-    return region_code
56
+def get_region_code(lang, lang_list=[]):
57
+    lang_code = match_language(lang, lang_list, language_aliases, 'wt-WT')
58
+    lang_parts = lang_code.split('-')
59
+
60
+    # country code goes first
61
+    return lang_parts[1].lower() + '-' + lang_parts[0].lower()
73 62
 
74 63
 
75 64
 # do search-request
@@ -79,7 +68,7 @@ def request(query, params):
79 68
 
80 69
     offset = (params['pageno'] - 1) * 30
81 70
 
82
-    region_code = get_region_code(params['language'])
71
+    region_code = get_region_code(params['language'], supported_languages)
83 72
     params['url'] = url.format(
84 73
         query=urlencode({'q': query, 'kl': region_code}), offset=offset, dc_param=offset)
85 74
 

+ 4
- 3
searx/engines/duckduckgo_definitions.py View File

@@ -2,9 +2,9 @@ import json
2 2
 from lxml import html
3 3
 from re import compile
4 4
 from searx.engines.xpath import extract_text
5
-from searx.engines.duckduckgo import _fetch_supported_languages, supported_languages_url
5
+from searx.engines.duckduckgo import _fetch_supported_languages, supported_languages_url, language_aliases
6 6
 from searx.url_utils import urlencode
7
-from searx.utils import html_to_text
7
+from searx.utils import html_to_text, match_language
8 8
 
9 9
 url = 'https://api.duckduckgo.com/'\
10 10
     + '?{query}&format=json&pretty=0&no_redirect=1&d=1'
@@ -24,7 +24,8 @@ def result_to_text(url, text, htmlResult):
24 24
 
25 25
 def request(query, params):
26 26
     params['url'] = url.format(query=urlencode({'q': query}))
27
-    params['headers']['Accept-Language'] = params['language'].split('-')[0]
27
+    language = match_language(params['language'], supported_languages, language_aliases)
28
+    params['headers']['Accept-Language'] = language.split('-')[0]
28 29
     return params
29 30
 
30 31
 

+ 4
- 1
searx/engines/duckduckgo_images.py View File

@@ -15,7 +15,10 @@
15 15
 
16 16
 from json import loads
17 17
 from searx.engines.xpath import extract_text
18
-from searx.engines.duckduckgo import _fetch_supported_languages, supported_languages_url, get_region_code
18
+from searx.engines.duckduckgo import (
19
+    _fetch_supported_languages, supported_languages_url,
20
+    get_region_code, language_aliases
21
+)
19 22
 from searx.poolrequests import get
20 23
 from searx.url_utils import urlencode
21 24
 

+ 15
- 16
searx/engines/google.py View File

@@ -14,6 +14,7 @@ from lxml import html, etree
14 14
 from searx.engines.xpath import extract_text, extract_url
15 15
 from searx import logger
16 16
 from searx.url_utils import urlencode, urlparse, parse_qsl
17
+from searx.utils import match_language
17 18
 
18 19
 logger = logger.getChild('google engine')
19 20
 
@@ -72,7 +73,7 @@ country_to_hostname = {
72 73
     'RO': 'www.google.ro',  # Romania
73 74
     'RU': 'www.google.ru',  # Russia
74 75
     'SK': 'www.google.sk',  # Slovakia
75
-    'SL': 'www.google.si',  # Slovenia (SL -> si)
76
+    'SI': 'www.google.si',  # Slovenia
76 77
     'SE': 'www.google.se',  # Sweden
77 78
     'TH': 'www.google.co.th',  # Thailand
78 79
     'TR': 'www.google.com.tr',  # Turkey
@@ -165,22 +166,20 @@ def extract_text_from_dom(result, xpath):
165 166
 def request(query, params):
166 167
     offset = (params['pageno'] - 1) * 10
167 168
 
169
+    language = match_language(params['language'], supported_languages)
170
+    language_array = language.split('-')
171
+    if params['language'].find('-') > 0:
172
+        country = params['language'].split('-')[1]
173
+    elif len(language_array) == 2:
174
+        country = language_array[1]
175
+    else:
176
+        country = 'US'
177
+
168 178
     # temporary fix until a way of supporting en-US is found
169
-    if params['language'] == 'en-US':
170
-        params['language'] = 'en-GB'
179
+    if language == 'en-US':
180
+        country = 'GB'
171 181
 
172
-    if params['language'][:2] == 'jv':
173
-        language = 'jw'
174
-        country = 'ID'
175
-        url_lang = 'lang_jw'
176
-    else:
177
-        language_array = params['language'].lower().split('-')
178
-        if len(language_array) == 2:
179
-            country = language_array[1]
180
-        else:
181
-            country = 'US'
182
-        language = language_array[0] + ',' + language_array[0] + '-' + country
183
-        url_lang = 'lang_' + language_array[0]
182
+    url_lang = 'lang_' + language
184 183
 
185 184
     if use_locale_domain:
186 185
         google_hostname = country_to_hostname.get(country.upper(), default_hostname)
@@ -196,7 +195,7 @@ def request(query, params):
196 195
     if params['time_range'] in time_range_dict:
197 196
         params['url'] += time_range_search.format(range=time_range_dict[params['time_range']])
198 197
 
199
-    params['headers']['Accept-Language'] = language
198
+    params['headers']['Accept-Language'] = language + ',' + language + '-' + country
200 199
     params['headers']['Accept'] = 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'
201 200
 
202 201
     params['google_hostname'] = google_hostname

+ 4
- 2
searx/engines/google_news.py View File

@@ -13,6 +13,7 @@
13 13
 from lxml import html
14 14
 from searx.engines.google import _fetch_supported_languages, supported_languages_url
15 15
 from searx.url_utils import urlencode
16
+from searx.utils import match_language
16 17
 
17 18
 # search-url
18 19
 categories = ['news']
@@ -50,8 +51,9 @@ def request(query, params):
50 51
     params['url'] = search_url.format(query=urlencode({'q': query}),
51 52
                                       search_options=urlencode(search_options))
52 53
 
53
-    language_array = params['language'].lower().split('-')
54
-    params['url'] += '&lr=lang_' + language_array[0]
54
+    language = match_language(params['language'], supported_languages).split('-')[0]
55
+    if language:
56
+        params['url'] += '&lr=lang_' + language
55 57
 
56 58
     return params
57 59
 

+ 3
- 10
searx/engines/qwant.py View File

@@ -14,6 +14,7 @@ from datetime import datetime
14 14
 from json import loads
15 15
 from searx.utils import html_to_text
16 16
 from searx.url_utils import urlencode
17
+from searx.utils import match_language
17 18
 
18 19
 # engine dependent config
19 20
 categories = None
@@ -45,16 +46,8 @@ def request(query, params):
45 46
                                    offset=offset)
46 47
 
47 48
     # add language tag
48
-    if params['language'] == 'no' or params['language'].startswith('no-'):
49
-        params['language'] = params['language'].replace('no', 'nb', 1)
50
-    if params['language'].find('-') < 0:
51
-        # tries to get a country code from language
52
-        for lang in supported_languages:
53
-            lc = lang.split('-')
54
-            if params['language'] == lc[0]:
55
-                params['language'] = lang
56
-                break
57
-    params['url'] += '&locale=' + params['language'].replace('-', '_').lower()
49
+    language = match_language(params['language'], supported_languages)
50
+    params['url'] += '&locale=' + language.replace('-', '_').lower()
58 51
 
59 52
     return params
60 53
 

+ 3
- 5
searx/engines/swisscows.py View File

@@ -14,6 +14,7 @@ from json import loads
14 14
 import re
15 15
 from lxml.html import fromstring
16 16
 from searx.url_utils import unquote, urlencode
17
+from searx.utils import match_language
17 18
 
18 19
 # engine dependent config
19 20
 categories = ['general', 'images']
@@ -35,11 +36,8 @@ regex_img_url_remove_start = re.compile(b'^https?://i\.swisscows\.ch/\?link=')
35 36
 
36 37
 # do search-request
37 38
 def request(query, params):
38
-    if params['language'].split('-')[0] == 'no':
39
-        region = 'nb-NO'
40
-    else:
41
-        region = params['language']
42
-        ui_language = params['language'].split('-')[0]
39
+    region = match_language(params['language'], supported_languages)
40
+    ui_language = region.split('-')[0]
43 41
 
44 42
     search_path = search_string.format(
45 43
         query=urlencode({'query': query, 'uiLanguage': ui_language, 'region': region}),

+ 3
- 2
searx/engines/wikidata.py View File

@@ -16,6 +16,7 @@ from searx.poolrequests import get
16 16
 from searx.engines.xpath import extract_text
17 17
 from searx.engines.wikipedia import _fetch_supported_languages, supported_languages_url
18 18
 from searx.url_utils import urlencode
19
+from searx.utils import match_language
19 20
 
20 21
 from json import loads
21 22
 from lxml.html import fromstring
@@ -56,7 +57,7 @@ calendar_name_xpath = './/sup[contains(@class,"wb-calendar-name")]'
56 57
 
57 58
 
58 59
 def request(query, params):
59
-    language = params['language'].split('-')[0]
60
+    language = match_language(params['language'], supported_languages).split('-')[0]
60 61
 
61 62
     params['url'] = url_search.format(
62 63
         query=urlencode({'label': query, 'language': language}))
@@ -68,7 +69,7 @@ def response(resp):
68 69
     html = fromstring(resp.text)
69 70
     wikidata_ids = html.xpath(wikidata_ids_xpath)
70 71
 
71
-    language = resp.search_params['language'].split('-')[0]
72
+    language = match_language(resp.search_params['language'], supported_languages).split('-')[0]
72 73
 
73 74
     # TODO: make requests asynchronous to avoid timeout when result_count > 1
74 75
     for wikidata_id in wikidata_ids[:result_count]:

+ 2
- 7
searx/engines/wikipedia.py View File

@@ -13,6 +13,7 @@
13 13
 from json import loads
14 14
 from lxml.html import fromstring
15 15
 from searx.url_utils import quote, urlencode
16
+from searx.utils import match_language
16 17
 
17 18
 # search-url
18 19
 base_url = u'https://{language}.wikipedia.org/'
@@ -30,13 +31,7 @@ supported_languages_url = 'https://meta.wikimedia.org/wiki/List_of_Wikipedias'
30 31
 
31 32
 # set language in base_url
32 33
 def url_lang(lang):
33
-    lang = lang.split('-')[0]
34
-    if lang not in supported_languages:
35
-        language = 'en'
36
-    else:
37
-        language = lang
38
-
39
-    return language
34
+    return match_language(lang, supported_languages).split('-')[0]
40 35
 
41 36
 
42 37
 # do search-request

+ 12
- 12
searx/engines/yahoo.py View File

@@ -14,6 +14,7 @@
14 14
 from lxml import html
15 15
 from searx.engines.xpath import extract_text, extract_url
16 16
 from searx.url_utils import unquote, urlencode
17
+from searx.utils import match_language
17 18
 
18 19
 # engine dependent config
19 20
 categories = ['general']
@@ -39,6 +40,8 @@ time_range_dict = {'day': ['1d', 'd'],
39 40
                    'week': ['1w', 'w'],
40 41
                    'month': ['1m', 'm']}
41 42
 
43
+language_aliases = {'zh-CN': 'zh-CHS', 'zh-TW': 'zh-CHT', 'zh-HK': 'zh-CHT'}
44
+
42 45
 
43 46
 # remove yahoo-specific tracking-url
44 47
 def parse_url(url_string):
@@ -70,23 +73,16 @@ def _get_url(query, offset, language, time_range):
70 73
                                         lang=language)
71 74
 
72 75
 
73
-def _get_language(params):
74
-    if params['language'][:2] == 'zh':
75
-        if params['language'] == 'zh' or params['language'] == 'zh-CH':
76
-            return 'szh'
77
-        else:
78
-            return 'tzh'
79
-    else:
80
-        return params['language'].split('-')[0]
81
-
82
-
83 76
 # do search-request
84 77
 def request(query, params):
85 78
     if params['time_range'] and params['time_range'] not in time_range_dict:
86 79
         return params
87 80
 
88 81
     offset = (params['pageno'] - 1) * 10 + 1
89
-    language = _get_language(params)
82
+    language = match_language(params['language'], supported_languages, language_aliases)
83
+    if language not in language_aliases.values():
84
+        language = language.split('-')[0]
85
+    language = language.replace('-', '_').lower()
90 86
 
91 87
     params['url'] = _get_url(query, offset, language, params['time_range'])
92 88
 
@@ -145,7 +141,11 @@ def _fetch_supported_languages(resp):
145 141
     dom = html.fromstring(resp.text)
146 142
     options = dom.xpath('//div[@id="yschlang"]/span/label/input')
147 143
     for option in options:
148
-        code = option.xpath('./@value')[0][5:].replace('_', '-')
144
+        code_parts = option.xpath('./@value')[0][5:].split('_')
145
+        if len(code_parts) == 2:
146
+            code = code_parts[0] + '-' + code_parts[1].upper()
147
+        else:
148
+            code = code_parts[0]
149 149
         supported_languages.append(code)
150 150
 
151 151
     return supported_languages

+ 5
- 2
searx/engines/yahoo_news.py View File

@@ -13,9 +13,12 @@ import re
13 13
 from datetime import datetime, timedelta
14 14
 from lxml import html
15 15
 from searx.engines.xpath import extract_text, extract_url
16
-from searx.engines.yahoo import parse_url, _fetch_supported_languages, supported_languages_url
16
+from searx.engines.yahoo import (
17
+    parse_url, _fetch_supported_languages, supported_languages_url, language_aliases
18
+)
17 19
 from dateutil import parser
18 20
 from searx.url_utils import urlencode
21
+from searx.utils import match_language
19 22
 
20 23
 # engine dependent config
21 24
 categories = ['news']
@@ -38,7 +41,7 @@ suggestion_xpath = '//div[contains(@class,"VerALSOTRY")]//a'
38 41
 def request(query, params):
39 42
     offset = (params['pageno'] - 1) * 10 + 1
40 43
 
41
-    language = params['language'].split('-')[0]
44
+    language = match_language(params['language'], supported_languages, language_aliases).split('-')[0]
42 45
 
43 46
     params['url'] = search_url.format(offset=offset,
44 47
                                       query=urlencode({'p': query}),

+ 16
- 24
searx/languages.py View File

@@ -5,11 +5,7 @@
5 5
 language_codes = (
6 6
     (u"ar-SA", u"العربية", u"", u"Arabic"),
7 7
     (u"bg-BG", u"Български", u"", u"Bulgarian"),
8
-    (u"ca", u"Català", u"", u"Catalan"),
9
-    (u"ca-AD", u"Català", u"Andorra", u"Catalan"),
10
-    (u"ca-CT", u"Català", u"", u"Catalan"),
11
-    (u"ca-ES", u"Català", u"Espanya", u"Catalan"),
12
-    (u"ca-FR", u"Català", u"França", u"Catalan"),
8
+    (u"ca-ES", u"Català", u"", u"Catalan"),
13 9
     (u"cs-CZ", u"Čeština", u"", u"Czech"),
14 10
     (u"da-DK", u"Dansk", u"", u"Danish"),
15 11
     (u"de", u"Deutsch", u"", u"German"),
@@ -21,55 +17,51 @@ language_codes = (
21 17
     (u"en-AU", u"English", u"Australia", u"English"),
22 18
     (u"en-CA", u"English", u"Canada", u"English"),
23 19
     (u"en-GB", u"English", u"United Kingdom", u"English"),
24
-    (u"en-ID", u"English", u"Indonesia", u"English"),
25
-    (u"en-IE", u"English", u"Ireland", u"English"),
26 20
     (u"en-IN", u"English", u"India", u"English"),
27 21
     (u"en-MY", u"English", u"Malaysia", u"English"),
28
-    (u"en-NZ", u"English", u"New Zealand", u"English"),
29
-    (u"en-PH", u"English", u"Philippines", u"English"),
30
-    (u"en-SG", u"English", u"Singapore", u"English"),
31 22
     (u"en-US", u"English", u"United States", u"English"),
32
-    (u"en-ZA", u"English", u"South Africa", u"English"),
33 23
     (u"es", u"Español", u"", u"Spanish"),
34
-    (u"es-AD", u"Español", u"Andorra", u"Spanish"),
35 24
     (u"es-AR", u"Español", u"Argentina", u"Spanish"),
36
-    (u"es-CL", u"Español", u"Chile", u"Spanish"),
37
-    (u"es-CO", u"Español", u"Colombia", u"Spanish"),
38 25
     (u"es-ES", u"Español", u"España", u"Spanish"),
39 26
     (u"es-MX", u"Español", u"México", u"Spanish"),
40
-    (u"es-PE", u"Español", u"Perú", u"Spanish"),
41
-    (u"es-US", u"Español", u"Estados Unidos", u"Spanish"),
42 27
     (u"et-EE", u"Eesti", u"", u"Estonian"),
28
+    (u"fa-IR", u"فارسی", u"", u"Persian"),
43 29
     (u"fi-FI", u"Suomi", u"", u"Finnish"),
44 30
     (u"fr", u"Français", u"", u"French"),
45
-    (u"fr-AD", u"Français", u"Andorre", u"French"),
46 31
     (u"fr-BE", u"Français", u"Belgique", u"French"),
47 32
     (u"fr-CA", u"Français", u"Canada", u"French"),
48 33
     (u"fr-CH", u"Français", u"Suisse", u"French"),
49 34
     (u"fr-FR", u"Français", u"France", u"French"),
50 35
     (u"he-IL", u"עברית", u"", u"Hebrew"),
36
+    (u"hr-HR", u"Hrvatski", u"", u"Croatian"),
51 37
     (u"hu-HU", u"Magyar", u"", u"Hungarian"),
52
-    (u"it", u"Italiano", u"", u"Italian"),
53
-    (u"it-CH", u"Italiano", u"Svizzera", u"Italian"),
54
-    (u"it-IT", u"Italiano", u"Italia", u"Italian"),
38
+    (u"id-ID", u"Indonesia", u"", u"Indonesian"),
39
+    (u"is-IS", u"Íslenska", u"", u"Icelandic"),
40
+    (u"it-IT", u"Italiano", u"", u"Italian"),
55 41
     (u"ja-JP", u"日本語", u"", u"Japanese"),
56 42
     (u"ko-KR", u"한국어", u"", u"Korean"),
43
+    (u"lt-LT", u"Lietuvių", u"", u"Lithuanian"),
44
+    (u"lv-LV", u"Latviešu", u"", u"Latvian"),
45
+    (u"ms-MY", u"Bahasa Melayu", u"", u"Malay"),
46
+    (u"nb-NO", u"Norsk Bokmål", u"", u"Norwegian Bokmål"),
57 47
     (u"nl", u"Nederlands", u"", u"Dutch"),
58 48
     (u"nl-BE", u"Nederlands", u"België", u"Dutch"),
59 49
     (u"nl-NL", u"Nederlands", u"Nederland", u"Dutch"),
60
-    (u"no-NO", u"Norsk", u"", u"Norwegian"),
61 50
     (u"pl-PL", u"Polski", u"", u"Polish"),
62 51
     (u"pt", u"Português", u"", u"Portuguese"),
63
-    (u"pt-AD", u"Português", u"Andorra", u"Portuguese"),
64 52
     (u"pt-BR", u"Português", u"Brasil", u"Portuguese"),
65 53
     (u"pt-PT", u"Português", u"Portugal", u"Portuguese"),
66 54
     (u"ro-RO", u"Română", u"", u"Romanian"),
67 55
     (u"ru-RU", u"Русский", u"", u"Russian"),
56
+    (u"sk-SK", u"Slovenčina", u"", u"Slovak"),
57
+    (u"sl-SI", u"Slovenščina", u"", u"Slovenian"),
58
+    (u"sr-RS", u"Српски", u"", u"Serbian"),
68 59
     (u"sv-SE", u"Svenska", u"", u"Swedish"),
69 60
     (u"th-TH", u"ไทย", u"", u"Thai"),
70 61
     (u"tr-TR", u"Türkçe", u"", u"Turkish"),
62
+    (u"uk-UA", u"Українська", u"", u"Ukrainian"),
63
+    (u"vi-VN", u"Tiếng Việt", u"", u"Vietnamese"),
71 64
     (u"zh", u"中文", u"", u"Chinese"),
72 65
     (u"zh-CN", u"中文", u"中国", u"Chinese"),
73
-    (u"zh-HK", u"中文", u"香港", u"Chinese"),
74
-    (u"zh-TW", u"中文", u"台湾", u"Chinese")
66
+    (u"zh-TW", u"中文", u"台灣", u"Chinese")
75 67
 )

+ 0
- 4
searx/preferences.py View File

@@ -115,10 +115,6 @@ class SearchLanguageSetting(EnumStringSetting):
115 115
                 pass
116 116
             elif lang in self.choices:
117 117
                 data = lang
118
-            elif data == 'nb-NO':
119
-                data = 'no-NO'
120
-            elif data == 'ar-XA':
121
-                data = 'ar-SA'
122 118
             else:
123 119
                 data = self.value
124 120
         self.value = data

+ 7
- 3
searx/query.py View File

@@ -96,9 +96,13 @@ class RawTextQuery(object):
96 96
                                 break
97 97
 
98 98
                 # user may set a valid, yet not selectable language
99
-                if not self.languages and VALID_LANGUAGE_CODE.match(lang):
100
-                    self.languages.append(lang)
101
-                    parse_next = True
99
+                if VALID_LANGUAGE_CODE.match(lang):
100
+                    lang_parts = lang.split('-')
101
+                    if len(lang_parts) > 1:
102
+                        lang = lang_parts[0].lower() + '-' + lang_parts[1].upper()
103
+                    if lang not in self.languages:
104
+                        self.languages.append(lang)
105
+                        parse_next = True
102 106
 
103 107
             # this forces an engine or category
104 108
             if query_part[0] == '!' or query_part[0] == '?':

+ 2
- 2
searx/templates/oscar/preferences.html View File

@@ -187,7 +187,7 @@
187 187
                                     </td>
188 188
                                     <th>{{ search_engine.name }}</th>
189 189
 				    <td class="name">{{ shortcuts[search_engine.name] }}</td>
190
-					<td>{{ support_toggle(current_language in search_engine.supported_languages or current_language.split('-')[0] in search_engine.supported_languages) }}</td>
190
+					<td>{{ support_toggle(stats[search_engine.name].supports_selected_language) }}</td>
191 191
 					<td>{{ support_toggle(search_engine.safesearch==True) }}</td>
192 192
 					<td>{{ support_toggle(search_engine.time_range_support==True) }}</td>
193 193
 					<td class="{{ 'danger' if stats[search_engine.name]['warn_time'] else '' }}">{{ 'N/A' if stats[search_engine.name].time==None else stats[search_engine.name].time }}</td>
@@ -197,7 +197,7 @@
197 197
 					<td class="{{ 'danger' if stats[search_engine.name]['warn_time'] else '' }}">{{ 'N/A' if stats[search_engine.name].time==None else stats[search_engine.name].time }}</td>
198 198
 					<td>{{ support_toggle(search_engine.time_range_support==True) }}</td>
199 199
 					<td>{{ support_toggle(search_engine.safesearch==True) }}</td>
200
-					<td>{{ support_toggle(current_language in search_engine.supported_languages or current_language.split('-')[0] in search_engine.supported_languages) }}</td>
200
+					<td>{{ support_toggle(stats[search_engine.name].supports_selected_language) }}</td>
201 201
 					<td>{{ shortcuts[search_engine.name] }}</td>
202 202
                                     <th>{{ search_engine.name }}</th>
203 203
                                     <td class="onoff-checkbox">

+ 61
- 0
searx/utils.py View File

@@ -4,6 +4,7 @@ import hmac
4 4
 import os
5 5
 import re
6 6
 
7
+from babel.core import get_global
7 8
 from babel.dates import format_date
8 9
 from codecs import getincrementalencoder
9 10
 from imp import load_source
@@ -12,6 +13,7 @@ from os.path import splitext, join
12 13
 from random import choice
13 14
 import sys
14 15
 
16
+from searx import settings
15 17
 from searx.version import VERSION_STRING
16 18
 from searx.languages import language_codes
17 19
 from searx import settings
@@ -322,6 +324,65 @@ def is_valid_lang(lang):
322 324
         return False
323 325
 
324 326
 
327
+# auxiliary function to match lang_code in lang_list
328
+def _match_language(lang_code, lang_list=[], custom_aliases={}):
329
+    # replace language code with a custom alias if necessary
330
+    if lang_code in custom_aliases:
331
+        lang_code = custom_aliases[lang_code]
332
+
333
+    if lang_code in lang_list:
334
+        return lang_code
335
+
336
+    # try to get the most likely country for this language
337
+    subtags = get_global('likely_subtags').get(lang_code)
338
+    if subtags:
339
+        subtag_parts = subtags.split('_')
340
+        new_code = subtag_parts[0] + '-' + subtag_parts[-1]
341
+        if new_code in custom_aliases:
342
+            new_code = custom_aliases[new_code]
343
+        if new_code in lang_list:
344
+            return new_code
345
+
346
+    # try to get any supported country for this language
347
+    for lc in lang_list:
348
+        if lang_code == lc.split('-')[0]:
349
+            return lc
350
+
351
+    return None
352
+
353
+
354
+# get the language code from lang_list that best matches locale_code
355
+def match_language(locale_code, lang_list=[], custom_aliases={}, fallback='en-US'):
356
+    # try to get language from given locale_code
357
+    language = _match_language(locale_code, lang_list, custom_aliases)
358
+    if language:
359
+        return language
360
+
361
+    locale_parts = locale_code.split('-')
362
+    lang_code = locale_parts[0]
363
+
364
+    # try to get language using an equivalent country code
365
+    if len(locale_parts) > 1:
366
+        country_alias = get_global('territory_aliases').get(locale_parts[-1])
367
+        if country_alias:
368
+            language = _match_language(lang_code + '-' + country_alias[0], lang_list, custom_aliases)
369
+            if language:
370
+                return language
371
+
372
+    # try to get language using an equivalent language code
373
+    alias = get_global('language_aliases').get(lang_code)
374
+    if alias:
375
+        language = _match_language(alias, lang_list, custom_aliases)
376
+        if language:
377
+            return language
378
+
379
+    if lang_code != locale_code:
380
+        # try to get language from given language without giving the country
381
+        language = _match_language(lang_code, lang_list, custom_aliases)
382
+
383
+    return language or fallback
384
+
385
+
325 386
 def load_module(filename, module_dir):
326 387
     modname = splitext(filename)[0]
327 388
     if modname in sys.modules:

+ 15
- 7
searx/webapp.py View File

@@ -58,16 +58,16 @@ from searx.engines import (
58 58
 from searx.utils import (
59 59
     UnicodeWriter, highlight_content, html_to_text, get_resources_directory,
60 60
     get_static_files, get_result_templates, get_themes, gen_useragent,
61
-    dict_subset, prettify_url
61
+    dict_subset, prettify_url, match_language
62 62
 )
63 63
 from searx.version import VERSION_STRING
64
-from searx.languages import language_codes
64
+from searx.languages import language_codes as languages
65 65
 from searx.search import SearchWithPlugins, get_search_query_from_webapp
66 66
 from searx.query import RawTextQuery
67 67
 from searx.autocomplete import searx_bang, backends as autocomplete_backends
68 68
 from searx.plugins import plugins
69 69
 from searx.plugins.oa_doi_rewrite import get_doi_resolver
70
-from searx.preferences import Preferences, ValidationException
70
+from searx.preferences import Preferences, ValidationException, LANGUAGE_CODES
71 71
 from searx.answerers import answerers
72 72
 from searx.url_utils import urlencode, urlparse, urljoin
73 73
 from searx.utils import new_hmac
@@ -133,7 +133,7 @@ if not searx_debug \
133 133
 babel = Babel(app)
134 134
 
135 135
 rtl_locales = ['ar', 'arc', 'bcc', 'bqi', 'ckb', 'dv', 'fa', 'glk', 'he',
136
-               'ku', 'mzn', 'pnb'', ''ps', 'sd', 'ug', 'ur', 'yi']
136
+               'ku', 'mzn', 'pnb', 'ps', 'sd', 'ug', 'ur', 'yi']
137 137
 
138 138
 # used when translating category names
139 139
 _category_names = (gettext('files'),
@@ -352,9 +352,11 @@ def render(template_name, override_theme=None, **kwargs):
352 352
 
353 353
     kwargs['safesearch'] = str(request.preferences.get_value('safesearch'))
354 354
 
355
-    kwargs['language_codes'] = language_codes
355
+    kwargs['language_codes'] = languages
356 356
     if 'current_language' not in kwargs:
357
-        kwargs['current_language'] = request.preferences.get_value('language')
357
+        kwargs['current_language'] = match_language(request.preferences.get_value('language'),
358
+                                                    LANGUAGE_CODES,
359
+                                                    fallback=settings['search']['language'])
358 360
 
359 361
     # override url_for function in templates
360 362
     kwargs['url_for'] = url_for_theme
@@ -590,7 +592,9 @@ def index():
590 592
         infoboxes=result_container.infoboxes,
591 593
         paging=result_container.paging,
592 594
         unresponsive_engines=result_container.unresponsive_engines,
593
-        current_language=search_query.lang,
595
+        current_language=match_language(search_query.lang,
596
+                                        LANGUAGE_CODES,
597
+                                        fallback=settings['search']['language']),
594 598
         base_url=get_base_url(),
595 599
         theme=get_current_theme_name(),
596 600
         favicons=global_favicons[themes.index(get_current_theme_name())]
@@ -687,6 +691,10 @@ def preferences():
687 691
                              'warn_time': False}
688 692
             if e.timeout > settings['outgoing']['request_timeout']:
689 693
                 stats[e.name]['warn_timeout'] = True
694
+            if match_language(request.preferences.get_value('language'),
695
+                              getattr(e, 'supported_languages', []),
696
+                              getattr(e, 'language_aliases', {}), None):
697
+                stats[e.name]['supports_selected_language'] = True
690 698
 
691 699
     # get first element [0], the engine time,
692 700
     # and then the second element [1] : the time (the first one is the label)

+ 6
- 1
tests/unit/engines/test_archlinux.py View File

@@ -19,12 +19,17 @@ class TestArchLinuxEngine(SearxTestCase):
19 19
         query = 'test_query'
20 20
         dic = defaultdict(dict)
21 21
         dic['pageno'] = 1
22
-        dic['language'] = 'en_US'
22
+        dic['language'] = 'en-US'
23 23
         params = archlinux.request(query, dic)
24 24
         self.assertTrue('url' in params)
25 25
         self.assertTrue(query in params['url'])
26 26
         self.assertTrue('wiki.archlinux.org' in params['url'])
27 27
 
28
+        for lang, name in archlinux.main_langs:
29
+            dic['language'] = lang
30
+            params = archlinux.request(query, dic)
31
+            self.assertTrue(name in params['url'])
32
+
28 33
         for lang, domain in domains.items():
29 34
             dic['language'] = lang
30 35
             params = archlinux.request(query, dic)

+ 1
- 0
tests/unit/engines/test_bing.py View File

@@ -7,6 +7,7 @@ from searx.testing import SearxTestCase
7 7
 class TestBingEngine(SearxTestCase):
8 8
 
9 9
     def test_request(self):
10
+        bing.supported_languages = ['en', 'fr', 'zh-CHS', 'zh-CHT', 'pt-PT', 'pt-BR']
10 11
         query = u'test_query'
11 12
         dicto = defaultdict(dict)
12 13
         dicto['pageno'] = 0

+ 0
- 1
tests/unit/engines/test_bing_images.py View File

@@ -9,7 +9,6 @@ class TestBingImagesEngine(SearxTestCase):
9 9
 
10 10
     def test_request(self):
11 11
         bing_images.supported_languages = ['fr-FR', 'en-US']
12
-
13 12
         query = 'test_query'
14 13
         dicto = defaultdict(dict)
15 14
         dicto['pageno'] = 1

+ 2
- 1
tests/unit/engines/test_bing_news.py View File

@@ -8,10 +8,11 @@ import lxml
8 8
 class TestBingNewsEngine(SearxTestCase):
9 9
 
10 10
     def test_request(self):
11
+        bing_news.supported_languages = ['en', 'fr']
11 12
         query = 'test_query'
12 13
         dicto = defaultdict(dict)
13 14
         dicto['pageno'] = 1
14
-        dicto['language'] = 'fr_FR'
15
+        dicto['language'] = 'fr-FR'
15 16
         dicto['time_range'] = ''
16 17
         params = bing_news.request(query, dicto)
17 18
         self.assertIn('url', params)

+ 0
- 1
tests/unit/engines/test_bing_videos.py View File

@@ -9,7 +9,6 @@ class TestBingVideosEngine(SearxTestCase):
9 9
 
10 10
     def test_request(self):
11 11
         bing_videos.supported_languages = ['fr-FR', 'en-US']
12
-
13 12
         query = 'test_query'
14 13
         dicto = defaultdict(dict)
15 14
         dicto['pageno'] = 1

+ 2
- 1
tests/unit/engines/test_dailymotion.py View File

@@ -8,10 +8,11 @@ from searx.testing import SearxTestCase
8 8
 class TestDailymotionEngine(SearxTestCase):
9 9
 
10 10
     def test_request(self):
11
+        dailymotion.supported_languages = ['en', 'fr']
11 12
         query = 'test_query'
12 13
         dicto = defaultdict(dict)
13 14
         dicto['pageno'] = 0
14
-        dicto['language'] = 'fr_FR'
15
+        dicto['language'] = 'fr-FR'
15 16
         params = dailymotion.request(query, dicto)
16 17
         self.assertTrue('url' in params)
17 18
         self.assertTrue(query in params['url'])

+ 12
- 6
tests/unit/engines/test_duckduckgo.py View File

@@ -1,18 +1,21 @@
1 1
 # -*- coding: utf-8 -*-
2 2
 from collections import defaultdict
3 3
 import mock
4
-from searx.engines import duckduckgo
4
+from searx.engines import load_engine, duckduckgo
5 5
 from searx.testing import SearxTestCase
6 6
 
7 7
 
8 8
 class TestDuckduckgoEngine(SearxTestCase):
9 9
 
10 10
     def test_request(self):
11
+        duckduckgo = load_engine({'engine': 'duckduckgo', 'name': 'duckduckgo'})
12
+
11 13
         query = 'test_query'
12 14
         dicto = defaultdict(dict)
13 15
         dicto['pageno'] = 1
14
-        dicto['language'] = 'de-CH'
15 16
         dicto['time_range'] = ''
17
+
18
+        dicto['language'] = 'de-CH'
16 19
         params = duckduckgo.request(query, dicto)
17 20
         self.assertIn('url', params)
18 21
         self.assertIn(query, params['url'])
@@ -20,16 +23,19 @@ class TestDuckduckgoEngine(SearxTestCase):
20 23
         self.assertIn('ch-de', params['url'])
21 24
         self.assertIn('s=0', params['url'])
22 25
 
23
-        # when ddg uses non standard code
26
+        # when ddg uses non standard codes
27
+        dicto['language'] = 'zh-HK'
28
+        params = duckduckgo.request(query, dicto)
29
+        self.assertIn('hk-tzh', params['url'])
30
+
24 31
         dicto['language'] = 'en-GB'
25 32
         params = duckduckgo.request(query, dicto)
26 33
         self.assertIn('uk-en', params['url'])
27 34
 
28 35
         # no country given
29
-        duckduckgo.supported_languages = ['de-CH', 'en-US']
30
-        dicto['language'] = 'de'
36
+        dicto['language'] = 'en'
31 37
         params = duckduckgo.request(query, dicto)
32
-        self.assertIn('ch-de', params['url'])
38
+        self.assertIn('us-en', params['url'])
33 39
 
34 40
     def test_no_url_in_request_year_time_range(self):
35 41
         dicto = defaultdict(dict)

+ 1
- 0
tests/unit/engines/test_duckduckgo_definitions.py View File

@@ -18,6 +18,7 @@ class TestDDGDefinitionsEngine(SearxTestCase):
18 18
         self.assertEqual(result, 'Text in link')
19 19
 
20 20
     def test_request(self):
21
+        duckduckgo_definitions.supported_languages = ['en-US', 'es-ES']
21 22
         query = 'test_query'
22 23
         dicto = defaultdict(dict)
23 24
         dicto['pageno'] = 1

+ 0
- 1
tests/unit/engines/test_duckduckgo_images.py View File

@@ -9,7 +9,6 @@ class TestDuckduckgoImagesEngine(SearxTestCase):
9 9
 
10 10
     def test_request(self):
11 11
         duckduckgo_images.supported_languages = ['de-CH', 'en-US']
12
-
13 12
         query = 'test_query'
14 13
         dicto = defaultdict(dict)
15 14
         dicto['is_test'] = True

+ 7
- 0
tests/unit/engines/test_google.py View File

@@ -15,6 +15,8 @@ class TestGoogleEngine(SearxTestCase):
15 15
         return response
16 16
 
17 17
     def test_request(self):
18
+        google.supported_languages = ['en', 'fr', 'zh-CN']
19
+
18 20
         query = 'test_query'
19 21
         dicto = defaultdict(dict)
20 22
         dicto['pageno'] = 1
@@ -31,6 +33,11 @@ class TestGoogleEngine(SearxTestCase):
31 33
         self.assertIn('google.co', params['url'])
32 34
         self.assertIn('en', params['headers']['Accept-Language'])
33 35
 
36
+        dicto['language'] = 'zh'
37
+        params = google.request(query, dicto)
38
+        self.assertIn('google.com', params['url'])
39
+        self.assertIn('zh-CN', params['headers']['Accept-Language'])
40
+
34 41
     def test_response(self):
35 42
         self.assertRaises(AttributeError, google.response, None)
36 43
         self.assertRaises(AttributeError, google.response, [])

+ 1
- 0
tests/unit/engines/test_google_news.py View File

@@ -9,6 +9,7 @@ from searx.testing import SearxTestCase
9 9
 class TestGoogleNewsEngine(SearxTestCase):
10 10
 
11 11
     def test_request(self):
12
+        google_news.supported_languages = ['en-US', 'fr-FR']
12 13
         query = 'test_query'
13 14
         dicto = defaultdict(dict)
14 15
         dicto['pageno'] = 1

+ 1
- 1
tests/unit/engines/test_qwant.py View File

@@ -7,6 +7,7 @@ from searx.testing import SearxTestCase
7 7
 class TestQwantEngine(SearxTestCase):
8 8
 
9 9
     def test_request(self):
10
+        qwant.supported_languages = ['en-US', 'fr-CA', 'fr-FR']
10 11
         query = 'test_query'
11 12
         dicto = defaultdict(dict)
12 13
         dicto['pageno'] = 0
@@ -26,7 +27,6 @@ class TestQwantEngine(SearxTestCase):
26 27
         self.assertIn('en_us', params['url'])
27 28
         self.assertIn('news', params['url'])
28 29
 
29
-        qwant.supported_languages = ['en', 'fr-FR', 'fr-CA']
30 30
         dicto['language'] = 'fr'
31 31
         params = qwant.request(query, dicto)
32 32
         self.assertIn('fr_fr', params['url'])

+ 1
- 0
tests/unit/engines/test_swisscows.py View File

@@ -7,6 +7,7 @@ from searx.testing import SearxTestCase
7 7
 class TestSwisscowsEngine(SearxTestCase):
8 8
 
9 9
     def test_request(self):
10
+        swisscows.supported_languages = ['de-AT', 'de-DE']
10 11
         query = 'test_query'
11 12
         dicto = defaultdict(dict)
12 13
         dicto['pageno'] = 1

+ 1
- 0
tests/unit/engines/test_wikidata.py View File

@@ -9,6 +9,7 @@ from searx.testing import SearxTestCase
9 9
 class TestWikidataEngine(SearxTestCase):
10 10
 
11 11
     def test_request(self):
12
+        wikidata.supported_languages = ['en', 'es']
12 13
         query = 'test_query'
13 14
         dicto = defaultdict(dict)
14 15
         dicto['language'] = 'en-US'

+ 14
- 3
tests/unit/engines/test_yahoo.py View File

@@ -25,11 +25,12 @@ class TestYahooEngine(SearxTestCase):
25 25
         self.assertEqual('https://this.is.the.url/', url)
26 26
 
27 27
     def test_request(self):
28
+        yahoo.supported_languages = ['en', 'fr', 'zh-CHT', 'zh-CHS']
28 29
         query = 'test_query'
29 30
         dicto = defaultdict(dict)
30 31
         dicto['pageno'] = 1
31 32
         dicto['time_range'] = ''
32
-        dicto['language'] = 'fr_FR'
33
+        dicto['language'] = 'fr-FR'
33 34
         params = yahoo.request(query, dicto)
34 35
         self.assertIn('url', params)
35 36
         self.assertIn(query, params['url'])
@@ -39,6 +40,16 @@ class TestYahooEngine(SearxTestCase):
39 40
         self.assertIn('sB', params['cookies'])
40 41
         self.assertIn('fr', params['cookies']['sB'])
41 42
 
43
+        dicto['language'] = 'zh'
44
+        params = yahoo.request(query, dicto)
45
+        self.assertIn('zh_chs', params['url'])
46
+        self.assertIn('zh_chs', params['cookies']['sB'])
47
+
48
+        dicto['language'] = 'zh-TW'
49
+        params = yahoo.request(query, dicto)
50
+        self.assertIn('zh_cht', params['url'])
51
+        self.assertIn('zh_cht', params['cookies']['sB'])
52
+
42 53
     def test_no_url_in_request_year_time_range(self):
43 54
         dicto = defaultdict(dict)
44 55
         query = 'test_query'
@@ -168,5 +179,5 @@ class TestYahooEngine(SearxTestCase):
168 179
         self.assertEqual(type(languages), list)
169 180
         self.assertEqual(len(languages), 3)
170 181
         self.assertIn('ar', languages)
171
-        self.assertIn('zh-chs', languages)
172
-        self.assertIn('zh-cht', languages)
182
+        self.assertIn('zh-CHS', languages)
183
+        self.assertIn('zh-CHT', languages)

+ 2
- 1
tests/unit/engines/test_yahoo_news.py View File

@@ -9,10 +9,11 @@ from searx.testing import SearxTestCase
9 9
 class TestYahooNewsEngine(SearxTestCase):
10 10
 
11 11
     def test_request(self):
12
+        yahoo_news.supported_languages = ['en', 'fr']
12 13
         query = 'test_query'
13 14
         dicto = defaultdict(dict)
14 15
         dicto['pageno'] = 1
15
-        dicto['language'] = 'fr_FR'
16
+        dicto['language'] = 'fr-FR'
16 17
         params = yahoo_news.request(query, dicto)
17 18
         self.assertIn('url', params)
18 19
         self.assertIn(query, params['url'])

+ 25
- 0
tests/unit/test_utils.py View File

@@ -65,6 +65,31 @@ class TestUtils(SearxTestCase):
65 65
         for test_url, expected in data:
66 66
             self.assertEqual(utils.prettify_url(test_url, max_length=32), expected)
67 67
 
68
+    def test_match_language(self):
69
+        self.assertEqual(utils.match_language('es', ['es']), 'es')
70
+        self.assertEqual(utils.match_language('es', [], fallback='fallback'), 'fallback')
71
+        self.assertEqual(utils.match_language('ja', ['jp'], {'ja': 'jp'}), 'jp')
72
+
73
+        aliases = {'en-GB': 'en-UK', 'he': 'iw'}
74
+
75
+        # guess country
76
+        self.assertEqual(utils.match_language('de-DE', ['de']), 'de')
77
+        self.assertEqual(utils.match_language('de', ['de-DE']), 'de-DE')
78
+        self.assertEqual(utils.match_language('es-CO', ['es-AR', 'es-ES', 'es-MX']), 'es-ES')
79
+        self.assertEqual(utils.match_language('es-CO', ['es-MX']), 'es-MX')
80
+        self.assertEqual(utils.match_language('en-UK', ['en-AU', 'en-GB', 'en-US']), 'en-GB')
81
+        self.assertEqual(utils.match_language('en-GB', ['en-AU', 'en-UK', 'en-US'], aliases), 'en-UK')
82
+
83
+        # language aliases
84
+        self.assertEqual(utils.match_language('iw', ['he']), 'he')
85
+        self.assertEqual(utils.match_language('he', ['iw'], aliases), 'iw')
86
+        self.assertEqual(utils.match_language('iw-IL', ['he']), 'he')
87
+        self.assertEqual(utils.match_language('he-IL', ['iw'], aliases), 'iw')
88
+        self.assertEqual(utils.match_language('iw', ['he-IL']), 'he-IL')
89
+        self.assertEqual(utils.match_language('he', ['iw-IL'], aliases), 'iw-IL')
90
+        self.assertEqual(utils.match_language('iw-IL', ['he-IL']), 'he-IL')
91
+        self.assertEqual(utils.match_language('he-IL', ['iw-IL'], aliases), 'iw-IL')
92
+
68 93
 
69 94
 class TestHTMLTextExtractor(SearxTestCase):
70 95
 

+ 131
- 131
utils/fetch_languages.py View File

@@ -2,83 +2,27 @@
2 2
 
3 3
 # This script generates languages.py from intersecting each engine's supported languages.
4 4
 #
5
-# The country names are obtained from http://api.geonames.org which requires registering as a user.
6
-#
7 5
 # Output files (engines_languages.json and languages.py)
8 6
 # are written in current directory to avoid overwriting in case something goes wrong.
9 7
 
10
-from requests import get
11
-from lxml.html import fromstring
12
-from json import loads, dump
8
+from json import dump
13 9
 import io
14 10
 from sys import path
11
+from babel import Locale, UnknownLocaleError
12
+from babel.languages import get_global
13
+
15 14
 path.append('../searx')  # noqa
16 15
 from searx import settings
17
-from searx.url_utils import urlencode
18 16
 from searx.engines import initialize_engines, engines
19 17
 
20
-# Geonames API for country names.
21
-geonames_user = ''  # ADD USER NAME HERE
22
-country_names_url = 'http://api.geonames.org/countryInfoJSON?{parameters}'
23
-
24 18
 # Output files.
25 19
 engines_languages_file = 'engines_languages.json'
26 20
 languages_file = 'languages.py'
27 21
 
28
-engines_languages = {}
29
-
30
-
31
-# To filter out invalid codes and dialects.
32
-def valid_code(lang_code):
33
-    # filter invalid codes
34
-    # sl-SL is technically not invalid, but still a mistake
35
-    invalid_codes = ['sl-SL', 'wt-WT', 'jw']
36
-    invalid_countries = ['UK', 'XA', 'XL']
37
-    if lang_code[:2] == 'xx'\
38
-       or lang_code in invalid_codes\
39
-       or lang_code[-2:] in invalid_countries\
40
-       or is_dialect(lang_code):
41
-        return False
42
-
43
-    return True
44
-
45
-
46
-# Language codes with any additional tags other than language and country.
47
-def is_dialect(lang_code):
48
-    lang_code = lang_code.split('-')
49
-    if len(lang_code) > 2 or len(lang_code[0]) > 3:
50
-        return True
51
-    if len(lang_code) == 2 and len(lang_code[1]) > 2:
52
-        return True
53
-
54
-    return False
55
-
56
-
57
-# Get country name in specified language.
58
-def get_country_name(locale):
59
-    if geonames_user is '':
60
-        return ''
61
-
62
-    locale = locale.split('-')
63
-    if len(locale) != 2:
64
-        return ''
65
-
66
-    url = country_names_url.format(parameters=urlencode({'lang': locale[0],
67
-                                                         'country': locale[1],
68
-                                                         'username': geonames_user}))
69
-    response = get(url)
70
-    json = loads(response.text)
71
-    content = json.get('geonames', None)
72
-    if content is None or len(content) != 1:
73
-        print("No country name found for " + locale[0] + "-" + locale[1])
74
-        return ''
75
-
76
-    return content[0].get('countryName', '')
77
-
78 22
 
79 23
 # Fetches the supported languages of each engine and writes them to a JSON file.
80 24
 def fetch_supported_languages():
81
-    initialize_engines(settings['engines'])
25
+    engines_languages = {}
82 26
     for engine_name in engines:
83 27
         if hasattr(engines[engine_name], 'fetch_supported_languages'):
84 28
             try:
@@ -90,81 +34,135 @@ def fetch_supported_languages():
90 34
     with io.open(engines_languages_file, "w", encoding="utf-8") as f:
91 35
         dump(engines_languages, f, ensure_ascii=False)
92 36
 
37
+    return engines_languages
38
+
39
+
40
+# Get babel Locale object from lang_code if possible.
41
+def get_locale(lang_code):
42
+    try:
43
+        locale = Locale.parse(lang_code, sep='-')
44
+        return locale
45
+    except (UnknownLocaleError, ValueError):
46
+        return None
47
+
48
+
49
+# Append engine_name to list of engines that support locale.
50
+def add_engine_counter(lang_code, engine_name, languages):
51
+    if lang_code in languages:
52
+        if 'counter' not in languages[lang_code]:
53
+            languages[lang_code]['counter'] = [engine_name]
54
+        elif engine_name not in languages[lang_code]['counter']:
55
+            languages[lang_code]['counter'].append(engine_name)
93 56
 
94
-# Join all language lists.
95
-# Iterate all languages supported by each engine.
96
-def join_language_lists():
97
-    global languages
98
-    # include wikipedia first for more accurate language names
99
-    languages = {code: lang for code, lang
100
-                 in engines_languages['wikipedia'].items()
101
-                 if valid_code(code)}
102 57
 
58
+# Join all language lists.
59
+# TODO: Add language names from engine's language list if name not known by babel.
60
+def join_language_lists(engines_languages):
61
+    language_list = {}
103 62
     for engine_name in engines_languages:
104
-        for locale in engines_languages[engine_name]:
105
-            if valid_code(locale):
106
-                # if language is not on list or if it has no name yet
107
-                if locale not in languages or not languages[locale].get('name'):
108
-                    if isinstance(engines_languages[engine_name], dict):
109
-                        languages[locale] = engines_languages[engine_name][locale]
110
-                    else:
111
-                        languages[locale] = {}
112
-
113
-            # add to counter of engines that support given language
114
-            lang = locale.split('-')[0]
115
-            if lang in languages:
116
-                if 'counter' not in languages[lang]:
117
-                    languages[lang]['counter'] = [engine_name]
118
-                elif engine_name not in languages[lang]['counter']:
119
-                    languages[lang]['counter'].append(engine_name)
120
-
121
-    # filter list to include only languages supported by most engines
122
-    min_supported_engines = int(0.70 * len(engines_languages))
123
-    languages = {code: lang for code, lang
124
-                 in languages.items()
125
-                 if len(lang.get('counter', [])) >= min_supported_engines or
126
-                 len(languages.get(code.split('-')[0], {}).get('counter', [])) >= min_supported_engines}
127
-
128
-    # get locales that have no name or country yet
129
-    for locale in languages.keys():
130
-        # try to get language names
131
-        if not languages[locale].get('name'):
132
-            name = languages.get(locale.split('-')[0], {}).get('name', None)
133
-            if name:
134
-                languages[locale]['name'] = name
135
-            else:
136
-                # filter out locales with no name
137
-                del languages[locale]
138
-                continue
139
-
140
-        # try to get language name in english
141
-        if not languages[locale].get('english_name'):
142
-            languages[locale]['english_name'] = languages.get(locale.split('-')[0], {}).get('english_name', '')
143
-
144
-        # try to get country name
145
-        if locale.find('-') > 0 and not languages[locale].get('country'):
146
-            languages[locale]['country'] = get_country_name(locale) or ''
147
-
148
-
149
-# Remove countryless language if language is featured in only one country.
150
-def filter_single_country_languages():
151
-    prev_lang = None
152
-    prev_code = None
153
-    for code in sorted(languages):
154
-        lang = code.split('-')[0]
155
-        if lang == prev_lang:
63
+        for lang_code in engines_languages[engine_name]:
64
+
65
+            # apply custom fixes if necessary
66
+            if lang_code in getattr(engines[engine_name], 'language_aliases', {}).values():
67
+                lang_code = next(lc for lc, alias in engines[engine_name].language_aliases.items()
68
+                                 if lang_code == alias)
69
+
70
+            locale = get_locale(lang_code)
71
+
72
+            # ensure that lang_code uses standard language and country codes
73
+            if locale and locale.territory:
74
+                lang_code = locale.language + '-' + locale.territory
75
+
76
+            # add locale if it's not in list
77
+            if lang_code not in language_list:
78
+                if locale:
79
+                    language_list[lang_code] = {'name': locale.get_language_name().title(),
80
+                                                'english_name': locale.english_name,
81
+                                                'country': locale.get_territory_name() or ''}
82
+
83
+                    # also add language without country
84
+                    if locale.language not in language_list:
85
+                        language_list[locale.language] = {'name': locale.get_language_name().title(),
86
+                                                          'english_name': locale.english_name}
87
+                else:
88
+                    language_list[lang_code] = {}
89
+
90
+            # count engine for both language_country combination and language alone
91
+            add_engine_counter(lang_code, engine_name, language_list)
92
+            add_engine_counter(lang_code.split('-')[0], engine_name, language_list)
93
+
94
+    return language_list
95
+
96
+
97
+# Filter language list so it only includes the most supported languages and countries.
98
+def filter_language_list(all_languages):
99
+    min_supported_engines = 10
100
+    main_engines = [engine_name for engine_name in engines.keys()
101
+                    if 'general' in engines[engine_name].categories and
102
+                       engines[engine_name].supported_languages and
103
+                       not engines[engine_name].disabled]
104
+
105
+    # filter list to include only languages supported by at least min_supported_engines engines or by all enabled general engines
106
+    filtered_languages = {code: lang for code, lang
107
+                          in all_languages.items()
108
+                          if (len(lang.get('counter', [])) >= min_supported_engines or
109
+                              all(main_engine in lang.get('counter', [])
110
+                                  for main_engine in main_engines))}
111
+
112
+    return filtered_languages
113
+
114
+
115
+# Add country codes to languages without one and filter out language codes.
116
+def assign_country_codes(filtered_languages, all_languages):
117
+    sorted_languages = sorted(all_languages,
118
+                              key=lambda lang: len(all_languages[lang].get('counter', [])),
119
+                              reverse=True)
120
+    previous_lang = None
121
+    previous_code = None
122
+    countries = 0
123
+    for current_code in sorted(filtered_languages):
124
+        current_lang = current_code.split('-')[0]
125
+
126
+        # count country codes per language
127
+        if current_lang == previous_lang:
156 128
             countries += 1
129
+
157 130
         else:
158
-            if prev_lang is not None and countries == 1:
159
-                del languages[prev_lang]
160
-                languages[prev_code]['country'] = ''
131
+            if previous_lang is not None:
132
+                # if language has no country code at all
133
+                if countries == 0:
134
+                    # try to get country code with most supported engines
135
+                    for l in sorted_languages:
136
+                        l_parts = l.split('-')
137
+                        if len(l_parts) == 2 and l_parts[0] == previous_lang:
138
+                            filtered_languages[l] = all_languages[l]
139
+                            filtered_languages[l]['country'] = ''
140
+                            countries = 1
141
+                            break
142
+
143
+                    if countries == 0:
144
+                        # get most likely country code from babel
145
+                        subtags = get_global('likely_subtags').get(previous_lang)
146
+                        if subtags:
147
+                            subtag_parts = subtags.split('_')
148
+                            new_code = subtag_parts[0] + '-' + subtag_parts[-1]
149
+                            filtered_languages[new_code] = all_languages[previous_lang]
150
+                            countries = 1
151
+
152
+                if countries == 1:
153
+                    # remove countryless version of language if there's only one country
154
+                    del filtered_languages[previous_lang]
155
+                    if previous_code in filtered_languages:
156
+                        filtered_languages[previous_code]['country'] = ''
157
+
161 158
             countries = 0
162
-            prev_lang = lang
163
-        prev_code = code
159
+            previous_lang = current_lang
160
+
161
+        previous_code = current_code
164 162
 
165 163
 
166 164
 # Write languages.py.
167
-def write_languages_file():
165
+def write_languages_file(languages):
168 166
     new_file = open(languages_file, 'wb')
169 167
     file_content = '# -*- coding: utf-8 -*-\n'\
170 168
                    + '# list of language codes\n'\
@@ -183,7 +181,9 @@ def write_languages_file():
183 181
 
184 182
 
185 183
 if __name__ == "__main__":
186
-    fetch_supported_languages()
187
-    join_language_lists()
188
-    filter_single_country_languages()
189
-    write_languages_file()
184
+    initialize_engines(settings['engines'])
185
+    engines_languages = fetch_supported_languages()
186
+    all_languages = join_language_lists(engines_languages)
187
+    filtered_languages = filter_language_list(all_languages)
188
+    assign_country_codes(filtered_languages, all_languages)
189
+    write_languages_file(filtered_languages)