Przeglądaj źródła

Merge pull request #1136 from kvch/add-findx-general

Add findx engine
Adam Tauber 6 lat temu
rodzic
commit
0cb55ddfde
Brak konta powiązanego z e-mailem autora
4 zmienionych plików z 151 dodań i 6 usunięć
  1. 115
    0
      searx/engines/findx.py
  2. 11
    4
      searx/query.py
  3. 7
    2
      searx/search.py
  4. 18
    0
      searx/settings.yml

+ 115
- 0
searx/engines/findx.py Wyświetl plik

@@ -0,0 +1,115 @@
1
+"""
2
+FindX (General, Images, Videos)
3
+
4
+@website     https://www.findx.com
5
+@provide-api no
6
+@using-api   no
7
+@results     HTML
8
+@stable      no
9
+@parse       url, title, content, embedded, img_src, thumbnail_src
10
+"""
11
+
12
+from dateutil import parser
13
+from json import loads
14
+import re
15
+
16
+from lxml import html
17
+
18
+from searx import logger
19
+from searx.engines.xpath import extract_text
20
+from searx.engines.youtube_noapi import base_youtube_url, embedded_url
21
+from searx.url_utils import urlencode
22
+
23
+
24
+paging = True
25
+results_xpath = '//script[@id="initial-state"]'
26
+search_url = 'https://www.findx.com/{category}?{q}'
27
+type_map = {
28
+    'none': 'web',
29
+    'general': 'web',
30
+    'images': 'images',
31
+    'videos': 'videos',
32
+}
33
+
34
+
35
def request(query, params):
    """Fill in the outgoing request URL for a FindX search.

    ``params['category']`` is translated through ``type_map`` into the URL
    path segment; the query text and ``params['pageno']`` are URL-encoded
    into the query string.

    Returns the same ``params`` dict with ``params['url']`` set.
    Raises ``KeyError`` when the category has no entry in ``type_map``.
    """
    findx_category = type_map[params['category']]
    query_string = urlencode({'q': query, 'page': params['pageno']})
    params['url'] = search_url.format(category=findx_category, q=query_string)
    return params
44
+
45
+
46
def response(resp):
    """Parse a FindX result page into a list of searx result dicts.

    The page data is read as JSON from the element selected by
    ``results_xpath``. The first non-empty section among ``web``,
    ``images`` and ``video`` (checked in that order) decides which
    converter builds the results.

    Returns an empty list when the embedded state element is absent or when
    none of the sections holds any results.
    """
    dom = html.fromstring(resp.text)
    state_nodes = dom.xpath(results_xpath)
    if not state_nodes:
        # No embedded state on the page, so there is nothing to decode.
        return []

    results_json = loads(extract_text(state_nodes))

    # A section may be missing from the payload entirely; that is treated
    # the same as a section with an empty result list.
    section_parsers = (
        ('web', _general_results),
        ('images', _images_results),
        ('video', _videos_results),
    )
    for section, parse_section in section_parsers:
        section_results = (results_json.get(section) or {}).get('results')
        if section_results:
            return parse_section(section_results)

    return []
61
+
62
+
63
def _general_results(general_results):
    """Convert FindX web hits into searx general results.

    Each hit must provide ``url``, ``title`` and ``sum``; ``sum`` is exposed
    as the result's ``content``. Any other keys of a hit are ignored.
    """
    return [
        {
            'url': hit['url'],
            'title': hit['title'],
            'content': hit['sum'],
        }
        for hit in general_results
    ]
72
+
73
+
74
def _images_results(image_results):
    """Convert FindX image hits into searx image results.

    ``sourceURL`` becomes the result URL and ``source`` its content. The
    thumbnail and full-size image URLs are taken from the hit's ``assets``
    (``thumb`` and ``file``) and passed through ``_extract_url``.
    """
    results = []
    for hit in image_results:
        assets = hit['assets']
        results.append({
            'url': hit['sourceURL'],
            'title': hit['title'],
            'content': hit['source'],
            'thumbnail_src': _extract_url(assets['thumb']['url']),
            'img_src': _extract_url(assets['file']['url']),
            'template': 'images.html',
        })
    return results
86
+
87
+
88
def _videos_results(video_results):
    """Convert FindX video hits into searx video results.

    Only hits whose ``kind`` starts with ``youtube`` are converted; any other
    kind is logged as a warning and skipped. Descriptions longer than 300
    characters are cut to their first 300 characters followed by ``...``.
    """
    results = []
    for result in video_results:
        kind = result['kind']
        if not kind.startswith('youtube'):
            # Logger.warn is a deprecated alias of Logger.warning; the
            # %-style arguments produce the same message text as before.
            logger.warning('Unknown video kind in findx: %s', kind)
            continue

        snippet = result['snippet']
        description = snippet['description']
        if len(description) > 300:
            description = description[:300] + '...'

        results.append({
            'url': base_youtube_url + result['id'],
            'title': snippet['title'],
            'content': description,
            'thumbnail': _extract_url(snippet['thumbnails']['default']['url']),
            'publishedDate': parser.parse(snippet['publishedAt']),
            'embedded': embedded_url.format(videoid=result['id']),
            'template': 'videos.html',
        })
    return results
109
+
110
+
111
def _extract_url(url):
    """Return the absolute http(s) URL embedded in *url* after a slash.

    The match starts at the first ``/http://`` or ``/https://`` and runs up
    to, but not including, the next ``)`` or the end of the string. The
    leading slash is not part of the returned value.

    Returns an empty string when *url* is empty, ``None`` or contains no
    such embedded URL.
    """
    if not url:
        return ''
    # The capture group excludes the leading slash, so no slicing is needed.
    matching = re.search(r'/(https?://[^)]+)', url)
    if matching:
        return matching.group(1)
    return ''

+ 11
- 4
searx/query.py Wyświetl plik

@@ -107,14 +107,21 @@ class RawTextQuery(object):
107 107
                 # check if prefix is equal with engine shortcut
108 108
                 if prefix in engine_shortcuts:
109 109
                     parse_next = True
110
-                    self.engines.append({'category': 'none',
111
-                                         'name': engine_shortcuts[prefix]})
110
+                    engine_name = engine_shortcuts[prefix]
111
+                    if engine_name in engines:
112
+                        for engine_category in engines[engine_name].categories:
113
+                            self.engines.append({'category': engine_category,
114
+                                                 'name': engine_name,
115
+                                                 'from_bang': True})
112 116
 
113 117
                 # check if prefix is equal with engine name
114 118
                 elif prefix in engines:
115 119
                     parse_next = True
116
-                    self.engines.append({'category': 'none',
117
-                                         'name': prefix})
120
+                    if prefix in engines:
121
+                        for engine_category in engines[engine_name].categories:
122
+                            self.engines.append({'category': engine_category,
123
+                                                 'name': engine_name,
124
+                                                 'from_bang': True})
118 125
 
119 126
                 # check if prefix is equal with categorie name
120 127
                 elif prefix in categories:

+ 7
- 2
searx/search.py Wyświetl plik

@@ -258,8 +258,13 @@ def get_search_query_from_webapp(preferences, form):
258 258
     # if engines are calculated from query,
259 259
     # set categories by using that informations
260 260
     if query_engines and raw_text_query.specific:
261
-        query_categories = list(set(engine['category']
262
-                                    for engine in query_engines))
261
+        additional_categories = set()
262
+        for engine in query_engines:
263
+            if 'from_bang' in engine and engine['from_bang']:
264
+                additional_categories.add('none')
265
+            else:
266
+                additional_categories.add(engine['category'])
267
+        query_categories = list(additional_categories)
263 268
 
264 269
     # otherwise, using defined categories to
265 270
     # calculate which engines should be used

+ 18
- 0
searx/settings.yml Wyświetl plik

@@ -218,6 +218,24 @@ engines:
218 218
     shortcut : fd
219 219
     disabled : True
220 220
 
221
+  - name : findx
222
+    engine : findx
223
+    shortcut : fx
224
+    categories : general
225
+    disabled : True
226
+
227
+  - name : findx images
228
+    engine : findx
229
+    shortcut : fxi
230
+    categories : images
231
+    disabled : True
232
+
233
+  - name : findx videos
234
+    engine : findx
235
+    shortcut : fxv
236
+    categories : videos
237
+    disabled : True
238
+
221 239
   - name : flickr
222 240
     categories : images
223 241
     shortcut : fl