base.py 3.3KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121
  1. #!/usr/bin/env python
  2. """
  3. BASE (Scholar publications)
  4. @website https://base-search.net
  5. @provide-api yes with authorization (https://api.base-search.net/)
  6. @using-api yes
  7. @results XML
  8. @stable ?
  9. @parse url, title, publishedDate, content
  10. More info on api: http://base-search.net/about/download/base_interface.pdf
  11. """
  12. from lxml import etree
  13. from datetime import datetime
  14. import re
  15. from searx.url_utils import urlencode
  16. from searx.utils import searx_useragent
# searx categories this engine appears under
categories = ['science']

# BASE HTTP search API endpoint; query/hits/offset are filled in by request()
# (parameter details: http://base-search.net/about/download/base_interface.pdf)
base_url = 'https://api.base-search.net/cgi-bin/BaseHttpSearchInterface.fcgi'\
    + '?func=PerformSearch&{query}&boost=oa&hits={hits}&offset={offset}'

# engine dependent config
paging = True
number_of_results = 10  # hits requested per page

# shortcuts for advanced search
# Maps user-friendly query keywords to the API's Dublin-Core field prefixes.
# NOTE(review): the name keeps a historical typo ("shorcut"); it is referenced
# by request() below, so it is not renamed here.
shorcut_dict = {
    # user-friendly keywords
    'format:': 'dcformat:',
    'author:': 'dccreator:',
    'collection:': 'dccollection:',
    'hdate:': 'dchdate:',
    'contributor:': 'dccontributor:',
    'coverage:': 'dccoverage:',
    'date:': 'dcdate:',
    'abstract:': 'dcdescription:',
    'urls:': 'dcidentifier:',
    'language:': 'dclanguage:',
    'publisher:': 'dcpublisher:',
    'relation:': 'dcrelation:',
    'rights:': 'dcrights:',
    'source:': 'dcsource:',
    'subject:': 'dcsubject:',
    'title:': 'dctitle:',
    'type:': 'dcdctype:'
}
  44. def request(query, params):
  45. # replace shortcuts with API advanced search keywords
  46. for key in shorcut_dict.keys():
  47. query = re.sub(str(key), str(shorcut_dict[key]), query)
  48. # basic search
  49. offset = (params['pageno'] - 1) * number_of_results
  50. string_args = dict(query=urlencode({'query': query}),
  51. offset=offset,
  52. hits=number_of_results)
  53. params['url'] = base_url.format(**string_args)
  54. params['headers']['User-Agent'] = searx_useragent()
  55. return params
  56. def response(resp):
  57. results = []
  58. search_results = etree.XML(resp.content)
  59. for entry in search_results.xpath('./result/doc'):
  60. content = "No description available"
  61. date = datetime.now() # needed in case no dcdate is available for an item
  62. for item in entry:
  63. if item.attrib["name"] == "dchdate":
  64. harvestDate = item.text
  65. elif item.attrib["name"] == "dcdate":
  66. date = item.text
  67. elif item.attrib["name"] == "dctitle":
  68. title = item.text
  69. elif item.attrib["name"] == "dclink":
  70. url = item.text
  71. elif item.attrib["name"] == "dcdescription":
  72. content = item.text[:300]
  73. if len(item.text) > 300:
  74. content += "..."
  75. # dates returned by the BASE API are not several formats
  76. publishedDate = None
  77. for date_format in ['%Y-%m-%dT%H:%M:%SZ', '%Y-%m-%d', '%Y-%m', '%Y']:
  78. try:
  79. publishedDate = datetime.strptime(date, date_format)
  80. break
  81. except:
  82. pass
  83. if publishedDate is not None:
  84. res_dict = {'url': url,
  85. 'title': title,
  86. 'publishedDate': publishedDate,
  87. 'content': content}
  88. else:
  89. res_dict = {'url': url,
  90. 'title': title,
  91. 'content': content}
  92. results.append(res_dict)
  93. return results