瀏覽代碼

add duckduckgo images engine

marc 7 年之前
父節點
當前提交
c65a409f0d
共有 4 個檔案被更改,包括 204 行新增30 行删除
  1. 35
    30
      searx/engines/duckduckgo.py
  2. 91
    0
      searx/engines/duckduckgo_images.py
  3. 6
    0
      searx/settings.yml
  4. 72
    0
      tests/unit/engines/test_duckduckgo_images.py

+ 35
- 30
searx/engines/duckduckgo.py 查看文件

@@ -41,46 +41,51 @@ title_xpath = './/a[@class="result__a"]'
41 41
 content_xpath = './/a[@class="result__snippet"]'
42 42
 
43 43
 
44
-# do search-request
45
-def request(query, params):
46
-    if params['time_range'] and params['time_range'] not in time_range_dict:
47
-        return params
48
-
49
-    offset = 30 + (params['pageno'] - 1) * 50
50
-    dc_param = offset + 1
51
-
44
# match query's language to a region code that duckduckgo will accept
def get_region_code(lang, lang_list=None):
    """Translate a language code into a DuckDuckGo region code.

    Region codes are built as ``<country>-<language>`` in lower case
    (``'de-CH'`` becomes ``'ch-de'``), with a few hard-coded exceptions.

    :param lang: language code such as ``'all'``, ``'en'`` or ``'en-US'``
    :param lang_list: optional iterable of ``<language>-<COUNTRY>`` codes used
        to pick a country when ``lang`` carries none.  When omitted, the
        module-level ``supported_languages`` is used if it exists
        (presumably injected by the engine loader -- it is not defined in the
        visible part of this module), otherwise no lookup is done.
    :returns: the region code, ``None`` for ``'all'``, or the bare lower-cased
        language when no country could be determined
    """
    # custom fixes for languages
    if lang == 'all':
        return None
    if lang[:2] == 'ja':
        return 'jp-jp'
    if lang[:2] == 'sl':
        return 'sl-sl'
    if lang == 'zh-TW':
        return 'tw-tzh'
    if lang == 'zh-HK':
        return 'hk-tzh'

    parts = lang.split('-')
    if lang[-2:] == 'SA':
        return 'xa-' + parts[0]
    if lang[-2:] == 'GB':
        return 'uk-' + parts[0]

    if len(parts) == 2:
        # country code goes first
        return parts[1].lower() + '-' + parts[0].lower()

    # tries to get a country code from language
    language = parts[0].lower()
    if lang_list is None:
        # avoid a NameError when nothing has set ``supported_languages``
        lang_list = globals().get('supported_languages') or []
    for candidate in lang_list:
        pieces = candidate.split('-')
        # entries without a region part cannot supply a country code
        if len(pieces) > 1 and pieces[0] == language:
            return pieces[1].lower() + '-' + pieces[0].lower()
    return language
75
+
76
+
77
+# do search-request
78
+def request(query, params):
79
+    if params['time_range'] and params['time_range'] not in time_range_dict:
80
+        return params
81
+
82
+    offset = 30 + (params['pageno'] - 1) * 50
83
+    dc_param = offset + 1
80 84
 
81
-    if locale:
85
+    region_code = get_region_code(params['language'])
86
+    if region_code:
82 87
         params['url'] = url.format(
83
-            query=urlencode({'q': query, 'kl': locale}), offset=offset, dc_param=dc_param)
88
+            query=urlencode({'q': query, 'kl': region_code}), offset=offset, dc_param=dc_param)
84 89
     else:
85 90
         params['url'] = url.format(
86 91
             query=urlencode({'q': query}), offset=offset, dc_param=dc_param)

+ 91
- 0
searx/engines/duckduckgo_images.py 查看文件

@@ -0,0 +1,91 @@
1
+"""
2
+ DuckDuckGo (Images)
3
+
4
+ @website     https://duckduckgo.com/
5
+ @provide-api yes (https://duckduckgo.com/api),
6
+              but images are not supported
7
+
8
+ @using-api   no
9
+ @results     JSON (site requires js to get images)
10
+ @stable      no (JSON can change)
11
+ @parse       url, title, img_src
12
+
13
+ @todo        avoid extra request
14
+"""
15
+
16
+from requests import get
17
+from json import loads
18
+from searx.engines.xpath import extract_text
19
+from searx.engines.duckduckgo import _fetch_supported_languages, supported_languages_url, get_region_code
20
+from searx.url_utils import urlencode
21
+
22
# engine dependent config
categories = ['images']
paging = True
language_support = True
safesearch = True

# search-url
# JSON endpoint for image results: {query} is an urlencoded query string,
# s is the result offset, p the safesearch level, vqd the per-query token
images_url = 'https://duckduckgo.com/i.js?{query}&s={offset}&p={safesearch}&o=json&vqd={vqd}'
# regular image search page, requested only to read the vqd token from it
site_url = 'https://duckduckgo.com/?{query}&iar=images&iax=1&ia=images'
31
+
32
+
33
# run query in site to get vqd number needed for requesting images
# TODO: find a way to get this number without an extra request (is it a hash of the query?)
def get_vqd(query):
    """Fetch the ``vqd`` token needed to request images for ``query``.

    Requests the DuckDuckGo image search page and reads the token from the
    first ``vqd='...'`` occurrence in the returned text.

    :param query: search terms
    :returns: the token as a string, or ``''`` when the page holds no
        complete ``vqd='...'`` marker
    """
    # NOTE(review): no timeout is passed to this HTTP call -- confirm whether
    # one should be set
    res = get(site_url.format(query=urlencode({'q': query})))
    return _extract_vqd(res.text)


def _extract_vqd(content):
    """Return the text between ``vqd='`` and the next ``'``, or ``''`` if absent."""
    _, marker, tail = content.partition('vqd=\'')
    if not marker:
        # marker missing: do not return an arbitrary slice of the page
        return ''
    token, closing_quote, _ = tail.partition('\'')
    return token if closing_quote else ''
41
+
42
+
43
# do search-request
def request(query, params):
    """Fill ``params['url']`` with the image search URL for ``query``.

    Reads ``pageno``, ``safesearch`` and ``language`` from ``params`` and
    returns the same ``params`` object with ``url`` set.
    """
    # to avoid running actual external requests when testing
    vqd = '12345' if 'is_test' in params else get_vqd(query)

    # 50 results per page
    offset = (params['pageno'] - 1) * 50

    # shift the safesearch setting down by one for the `p` url parameter
    safesearch = params['safesearch'] - 1

    # the region is only sent when the language maps to one
    query_args = {'q': query}
    region_code = get_region_code(params['language'])
    if region_code:
        query_args['l'] = region_code

    params['url'] = images_url.format(
        query=urlencode(query_args), offset=offset, safesearch=safesearch, vqd=vqd)

    return params
64
+
65
+
66
# get response from search-request
def response(resp):
    """Parse the JSON image results into a list of result dicts.

    :param resp: response object exposing the body as ``resp.text``
    :returns: list of dicts for the ``images.html`` template; empty when the
        body is not JSON, is not a JSON object, or has no ``results`` list
    :raises AttributeError: if ``resp`` has no ``text`` attribute
    """
    content = resp.text
    try:
        res_json = loads(content)
    except (ValueError, TypeError):
        # not JSON (e.g. a plain-text error page) or not a string at all
        return []

    if not isinstance(res_json, dict):
        return []
    items = res_json.get('results')
    if not isinstance(items, list):
        return []

    # parse results
    results = []
    for result in items:
        try:
            entry = {'template': 'images.html',
                     'title': result['title'],
                     'content': '',
                     'thumbnail_src': result['thumbnail'],
                     'img_src': result['image'],
                     'url': result['url']}
        except (KeyError, TypeError):
            # skip a malformed entry instead of losing the whole result page
            continue

        # append result
        results.append(entry)

    return results

+ 6
- 0
searx/settings.yml 查看文件

@@ -167,6 +167,12 @@ engines:
167 167
     shortcut : ddg
168 168
     disabled : True
169 169
 
170
+  - name : duckduckgo images
171
+    engine : duckduckgo_images
172
+    shortcut : ddi
173
+    timeout: 3.0
174
+    disabled : True
175
+
170 176
   - name : etymonline
171 177
     engine : xpath
172 178
     paging : True

+ 72
- 0
tests/unit/engines/test_duckduckgo_images.py 查看文件

@@ -0,0 +1,72 @@
1
+# -*- coding: utf-8 -*-
2
+from collections import defaultdict
3
+import mock
4
+from searx.engines import duckduckgo_images
5
+from searx.testing import SearxTestCase
6
+
7
+
8
class TestDuckduckgoImagesEngine(SearxTestCase):
    """Unit tests for request building and response parsing of the images engine."""

    def test_request(self):
        query = 'test_query'
        dicto = defaultdict(dict)
        # 'is_test' makes the engine use the fixed vqd '12345' instead of fetching one
        dicto['is_test'] = True
        dicto['pageno'] = 1
        dicto['safesearch'] = 0
        dicto['language'] = 'all'
        params = duckduckgo_images.request(query, dicto)
        self.assertIn('url', params)
        for fragment in (query, 'duckduckgo.com', 's=0', 'p=-1', 'vqd=12345'):
            self.assertIn(fragment, params['url'])

        # test paging and safe search
        dicto['pageno'] = 2
        dicto['safesearch'] = 2
        params = duckduckgo_images.request(query, dicto)
        self.assertIn('url', params)
        for fragment in (query, 's=50', 'p=1'):
            self.assertIn(fragment, params['url'])

    def test_response(self):
        # inputs that are not response objects are expected to raise AttributeError
        for not_a_response in (None, [], '', '[]'):
            self.assertRaises(AttributeError, duckduckgo_images.response, not_a_response)

        # a non-JSON body gives no results
        response = mock.Mock(text='If this error persists, please let us know: ops@duckduckgo.com')
        self.assertEqual(duckduckgo_images.response(response), [])

        payload = u"""
        {
            "query": "test_query",
            "results": [
                {
                    "title": "Result 1",
                    "url": "https://site1.url",
                    "thumbnail": "https://thumb1.nail",
                    "image": "https://image1"
                },
                {
                    "title": "Result 2",
                    "url": "https://site2.url",
                    "thumbnail": "https://thumb2.nail",
                    "image": "https://image2"
                }
            ]
        }
        """
        response = mock.Mock(text=payload)
        results = duckduckgo_images.response(response)
        self.assertEqual(len(results), 2)

        # (title, url, thumbnail_src, img_src) expected for each result, in order
        expected = (
            ('Result 1', 'https://site1.url', 'https://thumb1.nail', 'https://image1'),
            ('Result 2', 'https://site2.url', 'https://thumb2.nail', 'https://image2'),
        )
        for index, (title, url, thumbnail, image) in enumerate(expected):
            self.assertEqual(results[index]['title'], title)
            self.assertEqual(results[index]['url'], url)
            self.assertEqual(results[index]['thumbnail_src'], thumbnail)
            self.assertEqual(results[index]['img_src'], image)