Browse Source

Merge pull request #615 from mmuman/scanr

Add ScanR structures search engine
Adam Tauber 8 years ago
parent
commit
f7a3e9771d
3 changed files with 258 additions and 0 deletions
  1. 78
    0
      searx/engines/scanr_structures.py
  2. 5
    0
      searx/settings.yml
  3. 175
    0
      tests/unit/engines/test_scanr_structures.py

+ 78
- 0
searx/engines/scanr_structures.py View File

@@ -0,0 +1,78 @@
1
+"""
2
+ ScanR Structures (Science)
3
+
4
+ @website     https://scanr.enseignementsup-recherche.gouv.fr
5
+ @provide-api yes (https://scanr.enseignementsup-recherche.gouv.fr/api/swagger-ui.html)
6
+
7
+ @using-api   yes
8
+ @results     JSON
9
+ @stable      yes
10
+ @parse       url, title, content, img_src
11
+"""
12
+
13
+from urllib import urlencode
14
+from json import loads, dumps
15
+from dateutil import parser
16
+from searx.utils import html_to_text
17
+
18
# engine dependent config
categories = ['science']
paging = True
# number of hits requested per page; sent as "pageSize" in the POST body
page_size = 20

# search-url
# base address (note the trailing slash); also used as the prefix for
# result links and relative logo paths
url = 'https://scanr.enseignementsup-recherche.gouv.fr/'
search_url = url + 'api/structures/search'
26
+
27
+
28
+# do search-request
29
def request(query, params):
    """Prepare the outgoing ScanR structures search request.

    Fills ``params`` in place with the endpoint URL, the POST method, a
    JSON content-type header and the JSON-encoded search body, then
    returns the same ``params`` object.

    :param query: search terms, placed under the "query" key of the body
    :param params: request parameters; ``params['pageno']`` is read and
                   ``params['headers']`` must be a mutable mapping
    :returns: the updated ``params``
    """
    params['url'] = search_url
    params['method'] = 'POST'
    params['headers']['Content-type'] = "application/json"

    # body of the POST request; the page number comes from the caller,
    # the page size from the module-level setting
    payload = {"query": query,
               "searchField": "ALL",
               "sortDirection": "ASC",
               "sortOrder": "RELEVANCY",
               "page": params['pageno'],
               "pageSize": page_size}
    params['data'] = dumps(payload)

    return params
42
+
43
+
44
+# get response from search-request
45
def response(resp):
    """Turn a ScanR structures search reply into a list of result dicts.

    The reply body (``resp.text``) is parsed as JSON.  Every entry of its
    ``results`` array that carries an ``id`` becomes one dict with the keys
    ``url``, ``title``, ``img_src`` and ``content``.

    :param resp: response object exposing the JSON reply as ``resp.text``
    :returns: list of result dicts; empty when the reply reports no hits
    :raises AttributeError: if ``resp`` has no ``text`` attribute
    """
    results = []

    search_res = loads(resp.text)

    # return empty array if there are no results; 'total' may be missing
    # altogether, so default to 0 instead of comparing None with an int
    # (which raises TypeError on Python 3)
    if search_res.get('total', 0) < 1:
        return []

    # parse results
    for result in search_res.get('results', []):
        if 'id' not in result:
            continue

        # a missing, null or empty logo yields no image instead of a crash
        thumbnail = result.get('logo') or None
        if thumbnail and thumbnail.startswith('/'):
            # relative logo path: prefix it with the site address.  ``url``
            # already ends with '/', so the joined address contains '//';
            # this is kept as-is because the accompanying unit test pins
            # that exact form.
            thumbnail = url + thumbnail

        # use the first highlight as the snippet, if there is any
        highlights = result.get('highlights')
        content = highlights[0]['value'] if highlights else None

        # append result
        results.append({'url': url + 'structure/' + result['id'],
                        'title': result['label'],
                        'img_src': thumbnail,
                        # NOTE(review): html_to_text is assumed to expect a
                        # string, so it is skipped when there is no snippet
                        # -- confirm against searx.utils
                        'content': html_to_text(content) if content else ''})

    # return results
    return results

+ 5
- 0
searx/settings.yml View File

@@ -314,6 +314,11 @@ engines:
314 314
     engine : kickass
315 315
     shortcut : ka
316 316
 
317
+  - name : scanr_structures
318
+    shortcut: scs
319
+    engine : scanr_structures
320
+    disabled : True
321
+
317 322
   - name : soundcloud
318 323
     engine : soundcloud
319 324
     shortcut : sc

+ 175
- 0
tests/unit/engines/test_scanr_structures.py View File

@@ -0,0 +1,175 @@
1
+from collections import defaultdict
2
+import mock
3
+from searx.engines import scanr_structures
4
+from searx.testing import SearxTestCase
5
+
6
+
7
class TestScanrStructuresEngine(SearxTestCase):
    """Unit tests for the request/response hooks of the scanr_structures engine."""

    def test_request(self):
        # the prepared request must target the ScanR host and carry the query
        query = 'test_query'
        request_params = defaultdict(dict)
        request_params['pageno'] = 1
        params = scanr_structures.request(query, request_params)
        self.assertIn('url', params)
        self.assertIn(query, params['data'])
        self.assertIn('scanr.enseignementsup-recherche.gouv.fr', params['url'])

    def test_response(self):
        # none of these objects has a ``text`` attribute
        for bad_response in (None, [], '', '[]'):
            self.assertRaises(AttributeError, scanr_structures.response, bad_response)

        # well-formed JSON without any hits gives an empty result list
        for empty_reply in ('{}', '{"data": []}'):
            response = mock.Mock(text=empty_reply)
            self.assertEqual(scanr_structures.response(response), [])

        # reply with two structures, each carrying a logo and highlights
        payload = u"""
        {
          "request":
            {
              "query":"test_query",
              "page":1,
              "pageSize":20,
              "sortOrder":"RELEVANCY",
              "sortDirection":"ASC",
              "searchField":"ALL",
              "from":0
            },
          "total":2471,
          "results":[
            {
              "id":"200711886U",
              "label":"Laboratoire d'Informatique de Grenoble",
              "kind":"RNSR",
              "publicEntity":true,
              "address":{"city":"Grenoble","departement":"38"},
              "logo":"/static/logos/200711886U.png",
              "acronym":"LIG",
              "type":{"code":"UR","label":"Unit\xe9 de recherche"},
              "level":2,
              "institutions":[
                {
                  "id":"193819125",
                  "label":"Grenoble INP",
                  "acronym":"IPG",
                  "code":"UMR 5217"
                },
                {
                  "id":"130021397",
                  "label":"Universit\xe9 de Grenoble Alpes",
                  "acronym":"UGA",
                  "code":"UMR 5217"
                },
                {
                  "id":"180089013",
                  "label":"Centre national de la recherche scientifique",
                  "acronym":"CNRS",
                  "code":"UMR 5217"
                },
                {
                  "id":"180089047",
                  "label":"Institut national de recherche en informatique et en automatique",
                  "acronym":"Inria",
                  "code":"UMR 5217"
                }
              ],
              "highlights":[
                {
                  "type":"projects",
                  "value":"linguicielles d\xe9velopp\xe9s jusqu'ici par le GETALP\
 du <strong>LIG</strong> en tant que prototypes op\xe9rationnels.\
\\r\\nDans le contexte"
                },
                {
                  "type":"acronym",
                  "value":"<strong>LIG</strong>"
                },
                {
                  "type":"websiteContents",
                  "value":"S\xe9lection\\nListe structures\\nD\xe9tail\\n\
                    Accueil\\n200711886U : <strong>LIG</strong>\
                    Laboratoire d'Informatique de Grenoble Unit\xe9 de recherche"},
                {
                  "type":"publications",
                  "value":"de noms. Nous avons d'abord d\xe9velopp\xe9 LOOV \
                    (pour <strong>Lig</strong> Overlaid OCR in Vid\xe9o), \
                    un outil d'extraction des"
                }
              ]
            },
            {
              "id":"199511665F",
              "label":"Laboratoire Bordelais de Recherche en Informatique",
              "kind":"RNSR",
              "publicEntity":true,
              "address":{"city":"Talence","departement":"33"},
              "logo":"/static/logos/199511665F.png",
              "acronym":"LaBRI",
              "type":{"code":"UR","label":"Unit\xe9 de recherche"},
              "level":2,
              "institutions":[
                {
                  "id":"130006356",
                  "label":"Institut polytechnique de Bordeaux",
                  "acronym":"IPB",
                  "code":"UMR 5800"
                },
                {
                  "id":"130018351",
                  "label":"Universit\xe9 de Bordeaux",
                  "acronym":null,
                  "code":"UMR 5800"
                },
                {
                  "id":"180089013",
                  "label":"Centre national de la recherche scientifique",
                  "acronym":"CNRS",
                  "code":"UMR 5800"
                },
                {
                  "id":"180089047",
                  "label":"Institut national de recherche en informatique et en automatique",
                  "acronym":"Inria",
                  "code":"UMR 5800"
                }
              ],
              "highlights":[
                {
                  "type":"websiteContents",
                  "value":"Samia Kerdjoudj\\n2016-07-05\\nDouble-exponential\
 and <strong>triple</strong>-exponential bounds for\
 choosability problems parameterized"
                },
                {
                  "type":"publications",
                  "value":"de cam\xe9ras install\xe9es dans les lieux publiques \
 a <strong>tripl\xe9</strong> en 2009, passant de 20 000 \
 \xe0 60 000. Malgr\xe9 le"
                }
              ]
            }
          ]
        }
        """
        response = mock.Mock(text=payload)
        results = scanr_structures.response(response)
        self.assertEqual(type(results), list)
        self.assertEqual(len(results), 2)

        site = 'https://scanr.enseignementsup-recherche.gouv.fr/'
        first, second = results

        # first structure: title, link and markup-free snippet
        self.assertEqual(first['title'], u"Laboratoire d'Informatique de Grenoble")
        self.assertEqual(first['url'], site + 'structure/200711886U')
        self.assertEqual(first['content'],
                         u"linguicielles d\xe9velopp\xe9s jusqu'ici par le GETALP du LIG"
                         u" en tant que prototypes op\xe9rationnels. Dans le contexte")

        # second structure: logo address, snippet, link and title
        self.assertEqual(second['img_src'], site + '/static/logos/199511665F.png')
        self.assertEqual(second['content'],
                         "Samia Kerdjoudj 2016-07-05 Double-exponential and "
                         "triple-exponential bounds for choosability problems parameterized")
        self.assertEqual(second['url'], site + 'structure/199511665F')
        self.assertEqual(second['title'], u"Laboratoire Bordelais de Recherche en Informatique")