
Merge pull request #588 from a01200356/wikidata

[enh] More data from Wikidata
Adam Tauber, 8 years ago
commit f1262ffa9e

.gitignore (+1, -0)

@@ -1,4 +1,5 @@
 .coverage
+coverage/
 .installed.cfg
 engines.cfg
 env

searx/engines/wikidata.py (+395, -229)

@@ -1,56 +1,86 @@
-import json
+# -*- coding: utf-8 -*-
+"""
+ Wikidata
+
+ @website     https://wikidata.org
+ @provide-api yes (https://wikidata.org/w/api.php)
+
+ @using-api   partially (most things require scraping)
+ @results     JSON, HTML
+ @stable      no (html can change)
+ @parse       url, infobox
+"""

 from searx import logger
 from searx.poolrequests import get
-from searx.utils import format_date_by_locale
+from searx.engines.xpath import extract_text

-from datetime import datetime
-from dateutil.parser import parse as dateutil_parse
+from json import loads
+from lxml.html import fromstring
 from urllib import urlencode

-
 logger = logger.getChild('wikidata')
 result_count = 1
+
+# urls
 wikidata_host = 'https://www.wikidata.org'
+url_search = wikidata_host \
+    + '/wiki/Special:ItemDisambiguation?{query}'
+
 wikidata_api = wikidata_host + '/w/api.php'
-url_search = wikidata_api \
-    + '?action=query&list=search&format=json'\
-    + '&srnamespace=0&srprop=sectiontitle&{query}'
 url_detail = wikidata_api\
-    + '?action=wbgetentities&format=json'\
-    + '&props=labels%7Cinfo%7Csitelinks'\
-    + '%7Csitelinks%2Furls%7Cdescriptions%7Cclaims'\
-    + '&{query}'
+    + '?action=parse&format=json&{query}'\
+    + '&redirects=1&prop=text%7Cdisplaytitle%7Clanglinks%7Crevid'\
+    + '&disableeditsection=1&disabletidy=1&preview=1&sectionpreview=1&disabletoc=1&utf8=1&formatversion=2'
+
 url_map = 'https://www.openstreetmap.org/'\
     + '?lat={latitude}&lon={longitude}&zoom={zoom}&layers=M'
+url_image = 'https://commons.wikimedia.org/wiki/Special:FilePath/{filename}?width=500&height=400'
+
+# xpaths
+wikidata_ids_xpath = '//div/ul[@class="wikibase-disambiguation"]/li/a/@title'
+title_xpath = '//*[contains(@class,"wikibase-title-label")]'
+description_xpath = '//div[contains(@class,"wikibase-entitytermsview-heading-description")]'
+property_xpath = '//div[@id="{propertyid}"]'
+label_xpath = './/div[contains(@class,"wikibase-statementgroupview-property-label")]/a'
+url_xpath = './/a[contains(@class,"external free") or contains(@class, "wb-external-id")]'
+wikilink_xpath = './/ul[contains(@class,"wikibase-sitelinklistview-listview")]'\
+    + '/li[contains(@data-wb-siteid,"{wikiid}")]//a/@href'
+property_row_xpath = './/div[contains(@class,"wikibase-statementview")]'
+preferred_rank_xpath = './/span[contains(@class,"wikibase-rankselector-preferred")]'
+value_xpath = './/div[contains(@class,"wikibase-statementview-mainsnak")]'\
+    + '/*/div[contains(@class,"wikibase-snakview-value")]'
+language_fallback_xpath = '//sup[contains(@class,"wb-language-fallback-indicator")]'
+calendar_name_xpath = './/sup[contains(@class,"wb-calendar-name")]'


 def request(query, params):
+    language = params['language'].split('_')[0]
+    if language == 'all':
+        language = 'en'
+
     params['url'] = url_search.format(
-        query=urlencode({'srsearch': query,
-                        'srlimit': result_count}))
+        query=urlencode({'label': query,
+                        'language': language}))
     return params


 def response(resp):
     results = []
-    search_res = json.loads(resp.text)
-
-    wikidata_ids = set()
-    for r in search_res.get('query', {}).get('search', {}):
-        wikidata_ids.add(r.get('title', ''))
+    html = fromstring(resp.content)
+    wikidata_ids = html.xpath(wikidata_ids_xpath)

     language = resp.search_params['language'].split('_')[0]
     if language == 'all':
         language = 'en'

-    url = url_detail.format(query=urlencode({'ids': '|'.join(wikidata_ids),
-                                            'languages': language + '|en'}))
-
-    htmlresponse = get(url)
-    jsonresponse = json.loads(htmlresponse.content)
-    for wikidata_id in wikidata_ids:
-        results = results + getDetail(jsonresponse, wikidata_id, language, resp.search_params['language'])
+    # TODO: make requests asynchronous to avoid timeout when result_count > 1
+    for wikidata_id in wikidata_ids[:result_count]:
+        url = url_detail.format(query=urlencode({'page': wikidata_id,
+                                                'uselang': language}))
+        htmlresponse = get(url)
+        jsonresponse = loads(htmlresponse.content)
+        results += getDetail(jsonresponse, wikidata_id, language, resp.search_params['language'])

     return results

@@ -60,124 +90,206 @@ def getDetail(jsonresponse, wikidata_id, language, locale):
     urls = []
     attributes = []

-    result = jsonresponse.get('entities', {}).get(wikidata_id, {})
+    title = jsonresponse.get('parse', {}).get('displaytitle', {})
+    result = jsonresponse.get('parse', {}).get('text', {})

-    title = result.get('labels', {}).get(language, {}).get('value', None)
-    if title is None:
-        title = result.get('labels', {}).get('en', {}).get('value', None)
-    if title is None:
+    if not title or not result:
         return results

-    description = result\
-        .get('descriptions', {})\
-        .get(language, {})\
-        .get('value', None)
+    title = fromstring(title)
+    for elem in title.xpath(language_fallback_xpath):
+        elem.getparent().remove(elem)
+    title = extract_text(title.xpath(title_xpath))

-    if description is None:
-        description = result\
-            .get('descriptions', {})\
-            .get('en', {})\
-            .get('value', '')
+    result = fromstring(result)
+    for elem in result.xpath(language_fallback_xpath):
+        elem.getparent().remove(elem)

-    claims = result.get('claims', {})
-    official_website = get_string(claims, 'P856', None)
-    if official_website is not None:
-        urls.append({'title': 'Official site', 'url': official_website})
-        results.append({'title': title, 'url': official_website})
+    description = extract_text(result.xpath(description_xpath))

-    wikipedia_link_count = 0
-    wikipedia_link = get_wikilink(result, language + 'wiki')
-    wikipedia_link_count += add_url(urls,
-                                    'Wikipedia (' + language + ')',
-                                    wikipedia_link)
-    if language != 'en':
-        wikipedia_en_link = get_wikilink(result, 'enwiki')
-        wikipedia_link_count += add_url(urls,
-                                        'Wikipedia (en)',
-                                        wikipedia_en_link)
-    if wikipedia_link_count == 0:
-        misc_language = get_wiki_firstlanguage(result, 'wiki')
-        if misc_language is not None:
-            add_url(urls,
-                    'Wikipedia (' + misc_language + ')',
-                    get_wikilink(result, misc_language + 'wiki'))
+    # URLS

-    if language != 'en':
-        add_url(urls,
-                'Wiki voyage (' + language + ')',
-                get_wikilink(result, language + 'wikivoyage'))
+    # official website
+    add_url(urls, result, 'P856', results=results)

-    add_url(urls,
-            'Wiki voyage (en)',
-            get_wikilink(result, 'enwikivoyage'))
+    # wikipedia
+    wikipedia_link_count = 0
+    wikipedia_link = get_wikilink(result, language + 'wiki')
+    if wikipedia_link:
+        wikipedia_link_count += 1
+        urls.append({'title': 'Wikipedia (' + language + ')',
+                     'url': wikipedia_link})

     if language != 'en':
-        add_url(urls,
-                'Wikiquote (' + language + ')',
-                get_wikilink(result, language + 'wikiquote'))
-
-    add_url(urls,
-            'Wikiquote (en)',
-            get_wikilink(result, 'enwikiquote'))
-
-    add_url(urls,
-            'Commons wiki',
-            get_wikilink(result, 'commonswiki'))
-
-    add_url(urls,
-            'Location',
-            get_geolink(claims, 'P625', None))
-
-    add_url(urls,
-            'Wikidata',
-            'https://www.wikidata.org/wiki/'
-            + wikidata_id + '?uselang=' + language)
-
-    musicbrainz_work_id = get_string(claims, 'P435')
-    if musicbrainz_work_id is not None:
-        add_url(urls,
-                'MusicBrainz',
-                'http://musicbrainz.org/work/'
-                + musicbrainz_work_id)
-
-    musicbrainz_artist_id = get_string(claims, 'P434')
-    if musicbrainz_artist_id is not None:
-        add_url(urls,
-                'MusicBrainz',
-                'http://musicbrainz.org/artist/'
-                + musicbrainz_artist_id)
-
-    musicbrainz_release_group_id = get_string(claims, 'P436')
-    if musicbrainz_release_group_id is not None:
-        add_url(urls,
-                'MusicBrainz',
-                'http://musicbrainz.org/release-group/'
-                + musicbrainz_release_group_id)
-
-    musicbrainz_label_id = get_string(claims, 'P966')
-    if musicbrainz_label_id is not None:
-        add_url(urls,
-                'MusicBrainz',
-                'http://musicbrainz.org/label/'
-                + musicbrainz_label_id)
-
-    # musicbrainz_area_id = get_string(claims, 'P982')
-    # P1407 MusicBrainz series ID
-    # P1004 MusicBrainz place ID
-    # P1330 MusicBrainz instrument ID
-    # P1407 MusicBrainz series ID
-
-    postal_code = get_string(claims, 'P281', None)
-    if postal_code is not None:
-        attributes.append({'label': 'Postal code(s)', 'value': postal_code})
-
-    date_of_birth = get_time(claims, 'P569', locale, None)
-    if date_of_birth is not None:
-        attributes.append({'label': 'Date of birth', 'value': date_of_birth})
-
-    date_of_death = get_time(claims, 'P570', locale, None)
-    if date_of_death is not None:
-        attributes.append({'label': 'Date of death', 'value': date_of_death})
+        wikipedia_en_link = get_wikilink(result, 'enwiki')
+        if wikipedia_en_link:
+            wikipedia_link_count += 1
+            urls.append({'title': 'Wikipedia (en)',
+                         'url': wikipedia_en_link})
+
+    # TODO: get_wiki_firstlanguage
+    # if wikipedia_link_count == 0:
+
+    # more wikis
+    add_url(urls, result, default_label='Wikivoyage (' + language + ')', link_type=language + 'wikivoyage')
+    add_url(urls, result, default_label='Wikiquote (' + language + ')', link_type=language + 'wikiquote')
+    add_url(urls, result, default_label='Wikimedia Commons', link_type='commonswiki')
+
+    add_url(urls, result, 'P625', 'OpenStreetMap', link_type='geo')
+
+    # musicbrainz
+    add_url(urls, result, 'P434', 'MusicBrainz', 'http://musicbrainz.org/artist/')
+    add_url(urls, result, 'P435', 'MusicBrainz', 'http://musicbrainz.org/work/')
+    add_url(urls, result, 'P436', 'MusicBrainz', 'http://musicbrainz.org/release-group/')
+    add_url(urls, result, 'P966', 'MusicBrainz', 'http://musicbrainz.org/label/')
+
+    # IMDb
+    add_url(urls, result, 'P345', 'IMDb', 'https://www.imdb.com/', link_type='imdb')
+    # source code repository
+    add_url(urls, result, 'P1324')
+    # blog
+    add_url(urls, result, 'P1581')
+    # social media links
+    add_url(urls, result, 'P2397', 'YouTube', 'https://www.youtube.com/channel/')
+    add_url(urls, result, 'P1651', 'YouTube', 'https://www.youtube.com/watch?v=')
+    add_url(urls, result, 'P2002', 'Twitter', 'https://twitter.com/')
+    add_url(urls, result, 'P2013', 'Facebook', 'https://facebook.com/')
+    add_url(urls, result, 'P2003', 'Instagram', 'https://instagram.com/')
+
+    urls.append({'title': 'Wikidata',
+                 'url': 'https://www.wikidata.org/wiki/'
+                 + wikidata_id + '?uselang=' + language})
+
+    # INFOBOX ATTRIBUTES (ROWS)
+
+    # DATES
+    # inception date
+    add_attribute(attributes, result, 'P571', date=True)
+    # dissolution date
+    add_attribute(attributes, result, 'P576', date=True)
+    # start date
+    add_attribute(attributes, result, 'P580', date=True)
+    # end date
+    add_attribute(attributes, result, 'P582', date=True)
+    # date of birth
+    add_attribute(attributes, result, 'P569', date=True)
+    # date of death
+    add_attribute(attributes, result, 'P570', date=True)
+    # date of spacecraft launch
+    add_attribute(attributes, result, 'P619', date=True)
+    # date of spacecraft landing
+    add_attribute(attributes, result, 'P620', date=True)
+
+    # nationality
+    add_attribute(attributes, result, 'P27')
+    # country of origin
+    add_attribute(attributes, result, 'P495')
+    # country
+    add_attribute(attributes, result, 'P17')
+    # headquarters
+    add_attribute(attributes, result, 'Q180')
+
+    # PLACES
+    # capital
+    add_attribute(attributes, result, 'P36', trim=True)
+    # head of state
+    add_attribute(attributes, result, 'P35', trim=True)
+    # head of government
+    add_attribute(attributes, result, 'P6', trim=True)
+    # type of government
+    add_attribute(attributes, result, 'P122')
+    # official language
+    add_attribute(attributes, result, 'P37')
+    # population
+    add_attribute(attributes, result, 'P1082', trim=True)
+    # area
+    add_attribute(attributes, result, 'P2046')
+    # currency
+    add_attribute(attributes, result, 'P38', trim=True)
+    # heigth (building)
+    add_attribute(attributes, result, 'P2048')
+
+    # MEDIA
+    # platform (videogames)
+    add_attribute(attributes, result, 'P400')
+    # author
+    add_attribute(attributes, result, 'P50')
+    # creator
+    add_attribute(attributes, result, 'P170')
+    # director
+    add_attribute(attributes, result, 'P57')
+    # performer
+    add_attribute(attributes, result, 'P175')
+    # developer
+    add_attribute(attributes, result, 'P178')
+    # producer
+    add_attribute(attributes, result, 'P162')
+    # manufacturer
+    add_attribute(attributes, result, 'P176')
+    # screenwriter
+    add_attribute(attributes, result, 'P58')
+    # production company
+    add_attribute(attributes, result, 'P272')
+    # record label
+    add_attribute(attributes, result, 'P264')
+    # publisher
+    add_attribute(attributes, result, 'P123')
+    # original network
+    add_attribute(attributes, result, 'P449')
+    # distributor
+    add_attribute(attributes, result, 'P750')
+    # composer
+    add_attribute(attributes, result, 'P86')
+    # publication date
+    add_attribute(attributes, result, 'P577', date=True)
+    # genre
+    add_attribute(attributes, result, 'P136')
+    # original language
+    add_attribute(attributes, result, 'P364')
+    # isbn
+    add_attribute(attributes, result, 'Q33057')
+    # software license
+    add_attribute(attributes, result, 'P275')
+    # programming language
+    add_attribute(attributes, result, 'P277')
+    # version
+    add_attribute(attributes, result, 'P348', trim=True)
+    # narrative location
+    add_attribute(attributes, result, 'P840')
+
+    # LANGUAGES
+    # number of speakers
+    add_attribute(attributes, result, 'P1098')
+    # writing system
+    add_attribute(attributes, result, 'P282')
+    # regulatory body
+    add_attribute(attributes, result, 'P1018')
+    # language code
+    add_attribute(attributes, result, 'P218')
+
+    # OTHER
+    # ceo
+    add_attribute(attributes, result, 'P169', trim=True)
+    # founder
+    add_attribute(attributes, result, 'P112')
+    # legal form (company/organization)
+    add_attribute(attributes, result, 'P1454')
+    # operator
+    add_attribute(attributes, result, 'P137')
+    # crew members (tripulation)
+    add_attribute(attributes, result, 'P1029')
+    # taxon
+    add_attribute(attributes, result, 'P225')
+    # chemical formula
+    add_attribute(attributes, result, 'P274')
+    # winner (sports/contests)
+    add_attribute(attributes, result, 'P1346')
+    # number of deaths
+    add_attribute(attributes, result, 'P1120')
+    # currency code
+    add_attribute(attributes, result, 'P498')
+
+    image = add_image(result)

     if len(attributes) == 0 and len(urls) == 2 and len(description) == 0:
         results.append({
@@ -190,6 +302,7 @@ def getDetail(jsonresponse, wikidata_id, language, locale):
                        'infobox': title,
                        'id': wikipedia_link,
                        'content': description,
+                       'img_src': image,
                        'attributes': attributes,
                        'urls': urls
                        })
@@ -197,92 +310,151 @@ def getDetail(jsonresponse, wikidata_id, language, locale):
     return results


-def add_url(urls, title, url):
-    if url is not None:
-        urls.append({'title': title, 'url': url})
-        return 1
+# only returns first match
+def add_image(result):
+    # P15: route map, P242: locator map, P154: logo, P18: image, P242: map, P41: flag, P2716: collage, P2910: icon
+    property_ids = ['P15', 'P242', 'P154', 'P18', 'P242', 'P41', 'P2716', 'P2910']
+
+    for property_id in property_ids:
+        image = result.xpath(property_xpath.replace('{propertyid}', property_id))
+        if image:
+            image_name = image[0].xpath(value_xpath)
+            image_src = url_image.replace('{filename}', extract_text(image_name[0]))
+            return image_src
+
+
+# setting trim will only returned high ranked rows OR the first row
+def add_attribute(attributes, result, property_id, default_label=None, date=False, trim=False):
+    attribute = result.xpath(property_xpath.replace('{propertyid}', property_id))
+    if attribute:
+
+        if default_label:
+            label = default_label
+        else:
+            label = extract_text(attribute[0].xpath(label_xpath))
+            label = label[0].upper() + label[1:]
+
+        if date:
+            trim = True
+            # remove calendar name
+            calendar_name = attribute[0].xpath(calendar_name_xpath)
+            for calendar in calendar_name:
+                calendar.getparent().remove(calendar)
+
+        concat_values = ""
+        values = []
+        first_value = None
+        for row in attribute[0].xpath(property_row_xpath):
+            if not first_value or not trim or row.xpath(preferred_rank_xpath):
+
+                value = row.xpath(value_xpath)
+                if not value:
+                    continue
+                value = extract_text(value)
+
+                # save first value in case no ranked row is found
+                if trim and not first_value:
+                    first_value = value
+                else:
+                    # to avoid duplicate values
+                    if value not in values:
+                        concat_values += value + ", "
+                        values.append(value)
+
+        if trim and not values:
+            attributes.append({'label': label,
+                               'value': first_value})
+        else:
+            attributes.append({'label': label,
+                               'value': concat_values[:-2]})
+
+
+# requires property_id unless it's a wiki link (defined in link_type)
+def add_url(urls, result, property_id=None, default_label=None, url_prefix=None, results=None, link_type=None):
+    links = []
+
+    # wiki links don't have property in wikidata page
+    if link_type and 'wiki' in link_type:
+            links.append(get_wikilink(result, link_type))
     else:
-        return 0
+        dom_element = result.xpath(property_xpath.replace('{propertyid}', property_id))
+        if dom_element:
+            dom_element = dom_element[0]
+            if not default_label:
+                label = extract_text(dom_element.xpath(label_xpath))
+                label = label[0].upper() + label[1:]
+
+            if link_type == 'geo':
+                links.append(get_geolink(dom_element))
+
+            elif link_type == 'imdb':
+                links.append(get_imdblink(dom_element, url_prefix))
+
+            else:
+                url_results = dom_element.xpath(url_xpath)
+                for link in url_results:
+                    if link is not None:
+                        if url_prefix:
+                            link = url_prefix + extract_text(link)
+                        else:
+                            link = extract_text(link)
+                        links.append(link)
+
+    # append urls
+    for url in links:
+        if url is not None:
+            urls.append({'title': default_label or label,
+                         'url': url})
+            if results is not None:
+                results.append({'title': default_label or label,
+                                'url': url})
+
+
+def get_imdblink(result, url_prefix):
+    imdb_id = result.xpath(value_xpath)
+    if imdb_id:
+        imdb_id = extract_text(imdb_id)
+        id_prefix = imdb_id[:2]
+        if id_prefix == 'tt':
+            url = url_prefix + 'title/' + imdb_id
+        elif id_prefix == 'nm':
+            url = url_prefix + 'name/' + imdb_id
+        elif id_prefix == 'ch':
+            url = url_prefix + 'character/' + imdb_id
+        elif id_prefix == 'co':
+            url = url_prefix + 'company/' + imdb_id
+        elif id_prefix == 'ev':
+            url = url_prefix + 'event/' + imdb_id
+        else:
+            url = None
+        return url


-def get_mainsnak(claims, propertyName):
-    propValue = claims.get(propertyName, {})
-    if len(propValue) == 0:
+def get_geolink(result):
+    coordinates = result.xpath(value_xpath)
+    if not coordinates:
         return None
-
-    propValue = propValue[0].get('mainsnak', None)
-    return propValue
-
-
-def get_string(claims, propertyName, defaultValue=None):
-    propValue = claims.get(propertyName, {})
-    if len(propValue) == 0:
-        return defaultValue
-
-    result = []
-    for e in propValue:
-        mainsnak = e.get('mainsnak', {})
-
-        datavalue = mainsnak.get('datavalue', {})
-        if datavalue is not None:
-            result.append(datavalue.get('value', ''))
-
-    if len(result) == 0:
-        return defaultValue
-    else:
-        # TODO handle multiple urls
-        return result[0]
-
-
-def get_time(claims, propertyName, locale, defaultValue=None):
-    propValue = claims.get(propertyName, {})
-    if len(propValue) == 0:
-        return defaultValue
-
-    result = []
-    for e in propValue:
-        mainsnak = e.get('mainsnak', {})
-
-        datavalue = mainsnak.get('datavalue', {})
-        if datavalue is not None:
-            value = datavalue.get('value', '')
-            result.append(value.get('time', ''))
-
-    if len(result) == 0:
-        date_string = defaultValue
-    else:
-        date_string = ', '.join(result)
-
-    try:
-        parsed_date = datetime.strptime(date_string, "+%Y-%m-%dT%H:%M:%SZ")
-    except:
-        if date_string.startswith('-'):
-            return date_string.split('T')[0]
-        try:
-            parsed_date = dateutil_parse(date_string, fuzzy=False, default=False)
-        except:
-            logger.debug('could not parse date %s', date_string)
-            return date_string.split('T')[0]
-
-    return format_date_by_locale(parsed_date, locale)
-
-
-def get_geolink(claims, propertyName, defaultValue=''):
-    mainsnak = get_mainsnak(claims, propertyName)
-
-    if mainsnak is None:
-        return defaultValue
-
-    datatype = mainsnak.get('datatype', '')
-    datavalue = mainsnak.get('datavalue', {})
-
-    if datatype != 'globe-coordinate':
-        return defaultValue
-
-    value = datavalue.get('value', {})
-
-    precision = value.get('precision', 0.0002)
-
+    coordinates = extract_text(coordinates[0])
+    latitude, longitude = coordinates.split(',')
+
+    # convert to decimal
+    lat = int(latitude[:latitude.find(u'°')])
+    if latitude.find('\'') >= 0:
+        lat += int(latitude[latitude.find(u'°') + 1:latitude.find('\'')] or 0) / 60.0
+    if latitude.find('"') >= 0:
+        lat += float(latitude[latitude.find('\'') + 1:latitude.find('"')] or 0) / 3600.0
+    if latitude.find('S') >= 0:
+        lat *= -1
+    lon = int(longitude[:longitude.find(u'°')])
+    if longitude.find('\'') >= 0:
+        lon += int(longitude[longitude.find(u'°') + 1:longitude.find('\'')] or 0) / 60.0
+    if longitude.find('"') >= 0:
+        lon += float(longitude[longitude.find('\'') + 1:longitude.find('"')] or 0) / 3600.0
+    if longitude.find('W') >= 0:
+        lon *= -1
+
+    # TODO: get precision
+    precision = 0.0002
     # there is no zoom information, deduce from precision (error prone)
     # samples :
     # 13 --> 5
@@ -298,26 +470,20 @@ def get_geolink(claims, propertyName, defaultValue=''):
         zoom = int(15 - precision * 8.8322 + precision * precision * 0.625447)

     url = url_map\
-        .replace('{latitude}', str(value.get('latitude', 0)))\
-        .replace('{longitude}', str(value.get('longitude', 0)))\
+        .replace('{latitude}', str(lat))\
+        .replace('{longitude}', str(lon))\
         .replace('{zoom}', str(zoom))

     return url


 def get_wikilink(result, wikiid):
-    url = result.get('sitelinks', {}).get(wikiid, {}).get('url', None)
-    if url is None:
-        return url
-    elif url.startswith('http://'):
+    url = result.xpath(wikilink_xpath.replace('{wikiid}', wikiid))
+    if not url:
+        return None
+    url = url[0]
+    if url.startswith('http://'):
         url = url.replace('http://', 'https://')
     elif url.startswith('//'):
         url = 'https:' + url
     return url
-
-
-def get_wiki_firstlanguage(result, wikipatternid):
-    for k in result.get('sitelinks', {}).keys():
-        if k.endswith(wikipatternid) and len(k) == (2 + len(wikipatternid)):
-            return k[0:2]
-    return None
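Note: the rewritten get_geolink above no longer receives numeric coordinates from the API; it parses the degree/minute/second string rendered on the Wikidata page and converts it to decimal degrees. A minimal standalone sketch of that conversion (dms_to_decimal is an illustrative helper, not part of this commit):

# -*- coding: utf-8 -*-
# Illustrative sketch of the DMS -> decimal conversion done in get_geolink.

def dms_to_decimal(dms):
    value = float(int(dms[:dms.find(u'°')]))        # whole degrees
    if "'" in dms:                                   # minutes, if present
        value += int(dms[dms.find(u'°') + 1:dms.find("'")] or 0) / 60.0
    if '"' in dms:                                   # seconds, if present
        value += float(dms[dms.find("'") + 1:dms.find('"')] or 0) / 3600.0
    if 'S' in dms or 'W' in dms:                     # south / west are negative
        value *= -1
    return value

print(dms_to_decimal(u'60°N'))         # 60.0
print(dms_to_decimal(u'34°35\'59"S'))  # about -34.5997, as in the new unit test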

searx/engines/wikipedia.py (+1, -2)

@@ -99,9 +99,8 @@ def response(resp):
         return []

     # link to wikipedia article
-    # parenthesis are not quoted to make infobox mergeable with wikidata's
     wikipedia_link = url_lang(resp.search_params['language']) \
-        + 'wiki/' + quote(title.replace(' ', '_').encode('utf8')).replace('%28', '(').replace('%29', ')')
+        + 'wiki/' + quote(title.replace(' ', '_').encode('utf8'))

     results.append({'url': wikipedia_link, 'title': title})


searx/results.py (+41, -20)

@@ -18,7 +18,17 @@ def result_content_len(content):


 def compare_urls(url_a, url_b):
-    if url_a.netloc != url_b.netloc or url_a.query != url_b.query:
+    # ignore www. in comparison
+    if url_a.netloc.startswith('www.'):
+        host_a = url_a.netloc.replace('www.', '', 1)
+    else:
+        host_a = url_a.netloc
+    if url_b.netloc.startswith('www.'):
+        host_b = url_b.netloc.replace('www.', '', 1)
+    else:
+        host_b = url_b.netloc
+
+    if host_a != host_b or url_a.query != url_b.query:
         return False

     # remove / from the end of the url if required
@@ -33,25 +43,42 @@ def compare_urls(url_a, url_b):


 def merge_two_infoboxes(infobox1, infobox2):
+    # get engines weights
+    if hasattr(engines[infobox1['engine']], 'weight'):
+        weight1 = engines[infobox1['engine']].weight
+    else:
+        weight1 = 1
+    if hasattr(engines[infobox2['engine']], 'weight'):
+        weight2 = engines[infobox2['engine']].weight
+    else:
+        weight2 = 1
+
+    if weight2 > weight1:
+        infobox1['engine'] = infobox2['engine']
+
     if 'urls' in infobox2:
         urls1 = infobox1.get('urls', None)
         if urls1 is None:
             urls1 = []
-            infobox1['urls'] = urls1

-        urlSet = set()
-        for url in infobox1.get('urls', []):
-            urlSet.add(url.get('url', None))
+        for url2 in infobox2.get('urls', []):
+            unique_url = True
+            for url1 in infobox1.get('urls', []):
+                if compare_urls(urlparse(url1.get('url', '')), urlparse(url2.get('url', ''))):
+                    unique_url = False
+                    break
+            if unique_url:
+                urls1.append(url2)

-        for url in infobox2.get('urls', []):
-            if url.get('url', None) not in urlSet:
-                urls1.append(url)
+        infobox1['urls'] = urls1

     if 'img_src' in infobox2:
         img1 = infobox1.get('img_src', None)
         img2 = infobox2.get('img_src')
         if img1 is None:
             infobox1['img_src'] = img2
+        elif weight2 > weight1:
+            infobox1['img_src'] = img2

     if 'attributes' in infobox2:
         attributes1 = infobox1.get('attributes', None)
@@ -65,7 +92,8 @@ def merge_two_infoboxes(infobox1, infobox2):
                 attributeSet.add(attribute.get('label', None))

         for attribute in infobox2.get('attributes', []):
-            attributes1.append(attribute)
+            if attribute.get('label', None) not in attributeSet:
+                attributes1.append(attribute)

     if 'content' in infobox2:
         content1 = infobox1.get('content', None)
@@ -97,7 +125,6 @@ class ResultContainer(object):
         self.results = defaultdict(list)
         self._merged_results = []
         self.infoboxes = []
-        self._infobox_ids = {}
         self.suggestions = set()
         self.answers = set()
         self._number_of_results = []
@@ -138,14 +165,13 @@ class ResultContainer(object):
         add_infobox = True
         infobox_id = infobox.get('id', None)
         if infobox_id is not None:
-            existingIndex = self._infobox_ids.get(infobox_id, None)
-            if existingIndex is not None:
-                merge_two_infoboxes(self.infoboxes[existingIndex], infobox)
-                add_infobox = False
+            for existingIndex in self.infoboxes:
+                if compare_urls(urlparse(existingIndex.get('id', '')), urlparse(infobox_id)):
+                    merge_two_infoboxes(existingIndex, infobox)
+                    add_infobox = False

         if add_infobox:
             self.infoboxes.append(infobox)
-            self._infobox_ids[infobox_id] = len(self.infoboxes) - 1

     def _merge_result(self, result, position):
         result['parsed_url'] = urlparse(result['url'])
@@ -155,11 +181,6 @@ class ResultContainer(object):
             result['parsed_url'] = result['parsed_url']._replace(scheme="http")
             result['url'] = result['parsed_url'].geturl()

-        result['host'] = result['parsed_url'].netloc
-
-        if result['host'].startswith('www.'):
-            result['host'] = result['host'].replace('www.', '', 1)
-
         result['engines'] = [result['engine']]

         # strip multiple spaces and cariage returns from content
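Note: a simplified, self-contained sketch of the URL comparison introduced above (a leading "www." is ignored, query strings must match, a trailing slash is ignored); it mirrors the new compare_urls / infobox-id matching without importing searx, so the full path handling of compare_urls is only approximated:

from urllib.parse import urlparse   # urlparse.urlparse on Python 2

def strip_www(netloc):
    return netloc[4:] if netloc.startswith('www.') else netloc

def urls_match(a, b):
    a, b = urlparse(a), urlparse(b)
    return (strip_www(a.netloc) == strip_www(b.netloc)
            and a.query == b.query
            and a.path.rstrip('/') == b.path.rstrip('/'))

print(urls_match('https://www.wikidata.org/wiki/Q42',
                 'https://wikidata.org/wiki/Q42/'))            # True -> infoboxes merge
print(urls_match('https://wikidata.org/wiki/Q42',
                 'https://wikidata.org/wiki/Q42?uselang=en'))  # False -> kept separate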

searx/settings.yml (+2, -0)

@@ -105,6 +105,7 @@ engines:
   - name : ddg definitions
     engine : duckduckgo_definitions
     shortcut : ddd
+    weight : 2
     disabled : True

   - name : digg
@@ -127,6 +128,7 @@ engines:
   - name : wikidata
     engine : wikidata
     shortcut : wd
+    weight : 2

   - name : duckduckgo
     engine : duckduckgo
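Note: engines without a weight key keep an implicit weight of 1; merge_two_infoboxes (searx/results.py above) reads the attribute with hasattr, so these two entries simply let the ddg definitions and wikidata infoboxes win over lower-weight engines. A tiny sketch of the lookup, using hypothetical stand-ins for loaded engine modules:

class _Engine(object):            # stand-in for a loaded engine module
    pass

wikidata_engine = _Engine()
wikidata_engine.weight = 2        # from "weight : 2" above
duckduckgo_engine = _Engine()     # no weight key -> default of 1

engines = {'wikidata': wikidata_engine, 'duckduckgo': duckduckgo_engine}

def engine_weight(name):
    return engines[name].weight if hasattr(engines[name], 'weight') else 1

print(engine_weight('wikidata'))    # 2
print(engine_weight('duckduckgo'))  # 1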

searx/templates/default/infobox.html (+7, -7)

@@ -1,18 +1,18 @@
 <div class="infobox">
-    <h2>{{ infobox.infobox }}</h2>
+<h2><bdi>{{ infobox.infobox }}</bdi></h2>
     {% if infobox.img_src %}<img src="{{ image_proxify(infobox.img_src) }}" title="{{ infobox.infobox|striptags }}" alt="{{ infobox.infobox|striptags }}" />{% endif %}
-    <p>{{ infobox.entity }}</p>
-    <p>{{ infobox.content | safe }}</p>
+    <p><bdi>{{ infobox.entity }}</bdi></p>
+    <p><bdi>{{ infobox.content | safe }}</bdi></p>
     {% if infobox.attributes %}
     <div class="attributes">
         <table>
             {% for attribute in infobox.attributes %}
             <tr>
-                <td>{{ attribute.label }}</td>
+                <td><bdi>{{ attribute.label }}</bdi></td>
                 {% if attribute.image %}
                 <td><img src="{{ image_proxify(attribute.image.src) }}" alt="{{ attribute.image.alt }}" /></td>
                 {% else %}
-                <td>{{ attribute.value }}</td>
+                <td><bdi>{{ attribute.value }}</bdi></td>
                 {% endif %}
             </tr>
             {% endfor %}
@@ -24,7 +24,7 @@
     <div class="urls">
         <ul>
             {% for url in infobox.urls %}
-            <li class="url"><a href="{{ url.url }}" rel="noreferrer">{{ url.title }}</a></li>
+            <li class="url"><bdi><a href="{{ url.url }}" rel="noreferrer">{{ url.title }}</a></bdi></li>
             {% endfor %}
         </ul>
     </div>
@@ -34,7 +34,7 @@
     <div class="relatedTopics">
         {% for topic in infobox.relatedTopics %}
         <div>
-            <h3>{{ topic.name }}</h3>
+            <h3><bdi>{{ topic.name }}</bdi></h3>
             {% for suggestion in topic.suggestions %}
             <form method="{{ method or 'POST' }}" action="{{ url_for('index') }}">
                 <input type="hidden" name="q" value="{{ suggestion }}">

searx/templates/oscar/infobox.html (+6, -6)

@@ -1,21 +1,20 @@
 <div class="panel panel-default infobox">
     <div class="panel-heading">
-        <bdi><h4 class="panel-title infobox_part">{{ infobox.infobox }}</h4></bdi>
+        <h4 class="panel-title infobox_part"><bdi>{{ infobox.infobox }}</bdi></h4>
     </div>
     <div class="panel-body">
-        <bdi>
         {% if infobox.img_src %}<img class="img-responsive center-block infobox_part" src="{{ image_proxify(infobox.img_src) }}" alt="{{ infobox.infobox }}" />{% endif %}
-        {% if infobox.content %}<p class="infobox_part">{{ infobox.content }}</p>{% endif %}
+        {% if infobox.content %}<bdi><p class="infobox_part">{{ infobox.content }}</bdi></p>{% endif %}

         {% if infobox.attributes %}
         <table class="table table-striped infobox_part">
             {% for attribute in infobox.attributes %}
             <tr>
-                <td>{{ attribute.label }}</td>
+                <td><bdi>{{ attribute.label }}</bdi></td>
                 {% if attribute.image %}
                 <td><img class="img-responsive" src="{{ image_proxify(attribute.image.src) }}" alt="{{ attribute.image.alt }}" /></td>
                 {% else %}
-                <td>{{ attribute.value }}</td>
+                <td><bdi>{{ attribute.value }}</bdi></td>
                 {% endif %}
             </tr>
             {% endfor %}
@@ -24,11 +23,12 @@

         {% if infobox.urls %}
         <div class="infobox_part">
+            <bdi>
             {% for url in infobox.urls %}
             <p class="btn btn-default btn-xs"><a href="{{ url.url }}" rel="noreferrer">{{ url.title }}</a></p>
             {% endfor %}
+            </bdi>
         </div>
         {% endif %}
-        </bdi>
     </div>
 </div>

searx/utils.py (+7, -1)

@@ -206,7 +206,13 @@ def format_date_by_locale(date, locale_string):
     if locale_string == 'all':
         locale_string = settings['ui']['default_locale'] or 'en_US'

-    return format_date(date, locale=locale_string)
+    # to avoid crashing if locale is not supported by babel
+    try:
+        formatted_date = format_date(date, locale=locale_string)
+    except:
+        formatted_date = format_date(date, "YYYY-MM-dd")
+
+    return formatted_date


 def dict_subset(d, properties):
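Note: the bare except above guards against locale strings babel cannot parse (babel raises UnknownLocaleError for codes it does not know). A short sketch of the behaviour, assuming babel is installed and that 'xx_XX' is not a locale it recognises:

from datetime import date
from babel.dates import format_date

d = date(2016, 1, 27)
print(format_date(d, locale='en_US'))          # Jan 27, 2016

try:
    formatted = format_date(d, locale='xx_XX')
except Exception:
    formatted = format_date(d, "YYYY-MM-dd")   # same fallback pattern as above
print(formatted)                               # 2016-01-27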

tests/unit/engines/test_wikidata.py (+504, -0)

@@ -0,0 +1,504 @@
+# -*- coding: utf-8 -*-
+from json import loads
+from lxml.html import fromstring
+from collections import defaultdict
+import mock
+from searx.engines import wikidata
+from searx.testing import SearxTestCase
+
+
+class TestWikidataEngine(SearxTestCase):
+
+    def test_request(self):
+        query = 'test_query'
+        dicto = defaultdict(dict)
+        dicto['language'] = 'all'
+        params = wikidata.request(query, dicto)
+        self.assertIn('url', params)
+        self.assertIn(query, params['url'])
+        self.assertIn('wikidata.org', params['url'])
+        self.assertIn('en', params['url'])
+
+        dicto['language'] = 'es_ES'
+        params = wikidata.request(query, dicto)
+        self.assertIn(query, params['url'])
+        self.assertIn('es', params['url'])
+
+    # successful cases are not tested here to avoid sending additional requests
+    def test_response(self):
+        self.assertRaises(AttributeError, wikidata.response, None)
+        self.assertRaises(AttributeError, wikidata.response, [])
+        self.assertRaises(AttributeError, wikidata.response, '')
+        self.assertRaises(AttributeError, wikidata.response, '[]')
+
+        response = mock.Mock(content='<html></html>', search_params={"language": "all"})
+        self.assertEqual(wikidata.response(response), [])
+
+    def test_getDetail(self):
+        response = {}
+        results = wikidata.getDetail(response, "Q123", "en", "en-US")
+        self.assertEqual(results, [])
+
+        title_html = '<div><div class="wikibase-title-label">Test</div></div>'
+        html = """
+        <div>
+            <div class="wikibase-entitytermsview-heading-description">
+            </div>
+            <div>
+                <ul class="wikibase-sitelinklistview-listview">
+                    <li data-wb-siteid="enwiki"><a href="http://en.wikipedia.org/wiki/Test">Test</a></li>
+                </ul>
+            </div>
+        </div>
+        """
+        response = {"parse": {"displaytitle": title_html, "text": html}}
+
+        results = wikidata.getDetail(response, "Q123", "en", "en-US")
+        self.assertEqual(len(results), 1)
+        self.assertEqual(results[0]['url'], 'https://en.wikipedia.org/wiki/Test')
+
+        title_html = """
+        <div>
+            <div class="wikibase-title-label">
+                <span lang="en">Test</span>
+                <sup class="wb-language-fallback-indicator">English</sup>
+            </div>
+        </div>
+        """
+        html = """
+        <div>
+            <div class="wikibase-entitytermsview-heading-description">
+                <span lang="en">Description</span>
+                <sup class="wb-language-fallback-indicator">English</sup>
+            </div>
+            <div id="P856">
+                <div class="wikibase-statementgroupview-property-label">
+                    <a href="/wiki/Property:P856">
+                        <span lang="en">official website</span>
+                        <sup class="wb-language-fallback-indicator">English</sup>
+                    </a>
+                </div>
+                <div class="wikibase-statementview-mainsnak">
+                    <a class="external free" href="https://officialsite.com">
+                        https://officialsite.com
+                    </a>
+                </div>
+            </div>
+            <div>
+                <ul class="wikibase-sitelinklistview-listview">
+                    <li data-wb-siteid="enwiki"><a href="http://en.wikipedia.org/wiki/Test">Test</a></li>
+                </ul>
+            </div>
+        </div>
+        """
+        response = {"parse": {"displaytitle": title_html, "text": html}}
+
+        results = wikidata.getDetail(response, "Q123", "yua", "yua_MX")
+        self.assertEqual(len(results), 2)
+        self.assertEqual(results[0]['title'], 'Official website')
+        self.assertEqual(results[0]['url'], 'https://officialsite.com')
+
+        self.assertEqual(results[1]['infobox'], 'Test')
+        self.assertEqual(results[1]['id'], None)
+        self.assertEqual(results[1]['content'], 'Description')
+        self.assertEqual(results[1]['attributes'], [])
+        self.assertEqual(results[1]['urls'][0]['title'], 'Official website')
+        self.assertEqual(results[1]['urls'][0]['url'], 'https://officialsite.com')
+        self.assertEqual(results[1]['urls'][1]['title'], 'Wikipedia (en)')
+        self.assertEqual(results[1]['urls'][1]['url'], 'https://en.wikipedia.org/wiki/Test')
+
+    def test_add_image(self):
+        image_src = wikidata.add_image(fromstring("<div></div>"))
+        self.assertEqual(image_src, None)
+
+        html = u"""
+        <div>
+            <div id="P18">
+                <div class="wikibase-statementgroupview-property-label">
+                    <a href="/wiki/Property:P18">
+                        image
+                    </a>
+                </div>
+                <div class="wikibase-statementlistview">
+                    <div class="wikibase-statementview listview-item">
+                        <div class="wikibase-statementview-rankselector">
+                            <span class="wikibase-rankselector-normal"></span>
+                        </div>
+                        <div class="wikibase-statementview-mainsnak">
+                            <div>
+                                <div class="wikibase-snakview-value">
+                                    <a href="https://commons.wikimedia.org/wiki/File:image.png">
+                                        image.png
+                                    </a>
+                                </div>
+                            </div>
+                        </div>
+                    </div>
+                </div>
+            </div>
+        </div>
+        """
+        html_etree = fromstring(html)
+
+        image_src = wikidata.add_image(html_etree)
+        self.assertEqual(image_src,
+                         "https://commons.wikimedia.org/wiki/Special:FilePath/image.png?width=500&height=400")
+
+        html = u"""
+        <div>
+            <div id="P2910">
+                <div class="wikibase-statementgroupview-property-label">
+                    <a href="/wiki/Property:P2910">
+                        icon
+                    </a>
+                </div>
+                <div class="wikibase-statementlistview">
+                    <div class="wikibase-statementview listview-item">
+                        <div class="wikibase-statementview-rankselector">
+                            <span class="wikibase-rankselector-normal"></span>
+                        </div>
+                        <div class="wikibase-statementview-mainsnak">
+                            <div>
+                                <div class="wikibase-snakview-value">
+                                    <a href="https://commons.wikimedia.org/wiki/File:icon.png">
+                                        icon.png
+                                    </a>
+                                </div>
+                            </div>
+                        </div>
+                    </div>
+                </div>
+            </div>
+            <div id="P154">
+                <div class="wikibase-statementgroupview-property-label">
+                    <a href="/wiki/Property:P154">
+                        logo
+                    </a>
+                </div>
+                <div class="wikibase-statementlistview">
+                    <div class="wikibase-statementview listview-item">
+                        <div class="wikibase-statementview-rankselector">
+                            <span class="wikibase-rankselector-normal"></span>
+                        </div>
+                        <div class="wikibase-statementview-mainsnak">
+                            <div>
+                                <div class="wikibase-snakview-value">
+                                    <a href="https://commons.wikimedia.org/wiki/File:logo.png">
+                                        logo.png
+                                    </a>
+                                </div>
+                            </div>
+                        </div>
+                    </div>
+                </div>
+            </div>
+        </div>
+        """
+        html_etree = fromstring(html)
+
+        image_src = wikidata.add_image(html_etree)
+        self.assertEqual(image_src,
+                         "https://commons.wikimedia.org/wiki/Special:FilePath/logo.png?width=500&height=400")
+
+    def test_add_attribute(self):
+        html = u"""
+        <div>
+            <div id="P27">
+                <div class="wikibase-statementgroupview-property-label">
+                    <a href="/wiki/Property:P27">
+                        country of citizenship
+                    </a>
+                </div>
+                <div class="wikibase-statementlistview">
+                    <div class="wikibase-statementview listview-item">
+                        <div class="wikibase-statementview-rankselector">
+                            <span class="wikibase-rankselector-normal"></span>
+                        </div>
+                        <div class="wikibase-statementview-mainsnak">
+                            <div>
+                                <div class="wikibase-snakview-value">
+                                    <a href="/wiki/Q145">
+                                        United Kingdom
+                                    </a>
+                                </div>
+                            </div>
+                        </div>
+                    </div>
+                </div>
+            </div>
+        </div>
+        """
+        attributes = []
+        html_etree = fromstring(html)
+
+        wikidata.add_attribute(attributes, html_etree, "Fail")
+        self.assertEqual(attributes, [])
+
+        wikidata.add_attribute(attributes, html_etree, "P27")
+        self.assertEqual(len(attributes), 1)
+        self.assertEqual(attributes[0]["label"], "Country of citizenship")
+        self.assertEqual(attributes[0]["value"], "United Kingdom")
+
+        html = u"""
+        <div>
+            <div id="P569">
+                <div class="wikibase-statementgroupview-property-label">
+                    <a href="/wiki/Property:P569">
+                        date of birth
+                    </a>
+                </div>
+                <div class="wikibase-statementlistview">
+                    <div class="wikibase-statementview listview-item">
+                        <div class="wikibase-statementview-rankselector">
+                            <span class="wikibase-rankselector-normal"></span>
+                        </div>
+                        <div class="wikibase-statementview-mainsnak">
+                            <div>
+                                <div class="wikibase-snakview-value">
+                                    27 January 1832
+                                    <sup class="wb-calendar-name">
+                                        Gregorian
+                                    </sup>
+                                </div>
+                            </div>
+                        </div>
+                    </div>
+                </div>
+            </div>
+        </div>
+        """
+        attributes = []
+        html_etree = fromstring(html)
+        wikidata.add_attribute(attributes, html_etree, "P569", date=True)
+        self.assertEqual(len(attributes), 1)
+        self.assertEqual(attributes[0]["label"], "Date of birth")
+        self.assertEqual(attributes[0]["value"], "27 January 1832")
+
+        html = u"""
+        <div>
+            <div id="P6">
+                <div class="wikibase-statementgroupview-property-label">
+                    <a href="/wiki/Property:P27">
+                        head of government
+                    </a>
+                </div>
+                <div class="wikibase-statementlistview">
+                    <div class="wikibase-statementview listview-item">
+                        <div class="wikibase-statementview-rankselector">
+                            <span class="wikibase-rankselector-normal"></span>
+                        </div>
+                        <div class="wikibase-statementview-mainsnak">
+                            <div>
+                                <div class="wikibase-snakview-value">
+                                    <a href="/wiki/Q206">
+                                        Old Prime Minister
+                                    </a>
+                                </div>
+                            </div>
+                        </div>
+                    </div>
+                    <div class="wikibase-statementview listview-item">
+                        <div class="wikibase-statementview-rankselector">
+                            <span class="wikibase-rankselector-preferred"></span>
+                        </div>
+                        <div class="wikibase-statementview-mainsnak">
+                            <div>
+                                <div class="wikibase-snakview-value">
+                                    <a href="/wiki/Q3099714">
+                                        Actual Prime Minister
+                                    </a>
+                                </div>
+                            </div>
+                        </div>
+                    </div>
+                </div>
+            </div>
+        </div>
+        """
+        attributes = []
+        html_etree = fromstring(html)
+        wikidata.add_attribute(attributes, html_etree, "P6")
+        self.assertEqual(len(attributes), 1)
+        self.assertEqual(attributes[0]["label"], "Head of government")
+        self.assertEqual(attributes[0]["value"], "Old Prime Minister, Actual Prime Minister")
+
+        attributes = []
+        html_etree = fromstring(html)
+        wikidata.add_attribute(attributes, html_etree, "P6", trim=True)
+        self.assertEqual(len(attributes), 1)
+        self.assertEqual(attributes[0]["value"], "Actual Prime Minister")
+
+    def test_add_url(self):
+        html = u"""
+        <div>
+            <div id="P856">
+                <div class="wikibase-statementgroupview-property-label">
+                    <a href="/wiki/Property:P856">
+                        official website
+                    </a>
+                </div>
+                <div class="wikibase-statementlistview">
+                    <div class="wikibase-statementview listview-item">
+                        <div class="wikibase-statementview-mainsnak">
+                            <div>
+                                <div class="wikibase-snakview-value">
+                                    <a class="external free" href="https://searx.me">
+                                        https://searx.me/
+                                    </a>
+                                </div>
+                            </div>
+                        </div>
+                    </div>
+                </div>
+            </div>
+        </div>
+        """
+        urls = []
+        html_etree = fromstring(html)
+        wikidata.add_url(urls, html_etree, 'P856')
+        self.assertEquals(len(urls), 1)
+        self.assertIn({'title': 'Official website', 'url': 'https://searx.me/'}, urls)
+        urls = []
+        results = []
+        wikidata.add_url(urls, html_etree, 'P856', 'custom label', results=results)
+        self.assertEquals(len(urls), 1)
+        self.assertEquals(len(results), 1)
+        self.assertIn({'title': 'custom label', 'url': 'https://searx.me/'}, urls)
+        self.assertIn({'title': 'custom label', 'url': 'https://searx.me/'}, results)
+
+        html = u"""
+        <div>
+            <div id="P856">
+                <div class="wikibase-statementgroupview-property-label">
+                    <a href="/wiki/Property:P856">
+                        official website
+                    </a>
+                </div>
+                <div class="wikibase-statementlistview">
+                    <div class="wikibase-statementview listview-item">
+                        <div class="wikibase-statementview-mainsnak">
+                            <div>
+                                <div class="wikibase-snakview-value">
+                                    <a class="external free" href="http://www.worldofwarcraft.com">
+                                        http://www.worldofwarcraft.com
+                                    </a>
+                                </div>
+                            </div>
+                        </div>
+                    </div>
+                    <div class="wikibase-statementview listview-item">
+                        <div class="wikibase-statementview-mainsnak">
+                            <div>
+                                <div class="wikibase-snakview-value">
+                                    <a class="external free" href="http://eu.battle.net/wow/en/">
+                                        http://eu.battle.net/wow/en/
+                                    </a>
+                                </div>
+                            </div>
+                        </div>
+                    </div>
+                </div>
+            </div>
+        </div>
+        """
+        urls = []
+        html_etree = fromstring(html)
+        wikidata.add_url(urls, html_etree, 'P856')
+        self.assertEquals(len(urls), 2)
+        self.assertIn({'title': 'Official website', 'url': 'http://www.worldofwarcraft.com'}, urls)
+        self.assertIn({'title': 'Official website', 'url': 'http://eu.battle.net/wow/en/'}, urls)
+
+    def test_get_imdblink(self):
+        html = u"""
+        <div>
+            <div class="wikibase-statementview-mainsnak">
+                <div>
+                    <div class="wikibase-snakview-value">
+                        <a class="wb-external-id" href="http://www.imdb.com/tt0433664">
+                            tt0433664
+                        </a>
+                    </div>
+                </div>
+            </div>
+        </div>
+        """
+        html_etree = fromstring(html)
+        imdblink = wikidata.get_imdblink(html_etree, 'https://www.imdb.com/')
+
+        html = u"""
+        <div>
+            <div class="wikibase-statementview-mainsnak">
+                <div>
+                    <div class="wikibase-snakview-value">
+                        <a class="wb-external-id"
+                           href="href="http://tools.wmflabs.org/...http://www.imdb.com/&id=nm4915994"">
+                            nm4915994
+                        </a>
+                    </div>
+                </div>
+            </div>
+        </div>
+        """
+        html_etree = fromstring(html)
+        imdblink = wikidata.get_imdblink(html_etree, 'https://www.imdb.com/')
+        self.assertIn('https://www.imdb.com/name/nm4915994', imdblink)
+
+    def test_get_geolink(self):
+        html = u"""
+        <div>
+            <div class="wikibase-statementview-mainsnak">
+                <div>
+                    <div class="wikibase-snakview-value">
+                        60°N, 40°E
+                    </div>
+                </div>
+            </div>
+        </div>
+        """
+        html_etree = fromstring(html)
+        geolink = wikidata.get_geolink(html_etree)
+        self.assertIn('https://www.openstreetmap.org/', geolink)
+        self.assertIn('lat=60&lon=40', geolink)
+
+        html = u"""
+        <div>
+            <div class="wikibase-statementview-mainsnak">
+                <div>
+                    <div class="wikibase-snakview-value">
+                        34°35'59"S, 58°22'55"W
+                    </div>
+                </div>
+            </div>
+        </div>
+        """
+        html_etree = fromstring(html)
+        geolink = wikidata.get_geolink(html_etree)
+        self.assertIn('https://www.openstreetmap.org/', geolink)
+        self.assertIn('lat=-34.59', geolink)
+        self.assertIn('lon=-58.38', geolink)
+
+    def test_get_wikilink(self):
+        html = """
+        <div>
+            <div>
+                <ul class="wikibase-sitelinklistview-listview">
+                    <li data-wb-siteid="arwiki"><a href="http://ar.wikipedia.org/wiki/Test">Test</a></li>
+                    <li data-wb-siteid="enwiki"><a href="http://en.wikipedia.org/wiki/Test">Test</a></li>
+                </ul>
+            </div>
+            <div>
+                <ul class="wikibase-sitelinklistview-listview">
+                    <li data-wb-siteid="enwikiquote"><a href="https://en.wikiquote.org/wiki/Test">Test</a></li>
+                </ul>
+            </div>
+        </div>
+        """
+        html_etree = fromstring(html)
+        wikilink = wikidata.get_wikilink(html_etree, 'nowiki')
+        self.assertEqual(wikilink, None)
+        wikilink = wikidata.get_wikilink(html_etree, 'enwiki')
+        self.assertEqual(wikilink, 'https://en.wikipedia.org/wiki/Test')
+        wikilink = wikidata.get_wikilink(html_etree, 'arwiki')
+        self.assertEqual(wikilink, 'https://ar.wikipedia.org/wiki/Test')
+        wikilink = wikidata.get_wikilink(html_etree, 'enwikiquote')
+        self.assertEqual(wikilink, 'https://en.wikiquote.org/wiki/Test')