123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260
  1. # import htmlentitydefs
  2. import locale
  3. import dateutil.parser
  4. import cStringIO
  5. import csv
  6. import os
  7. import re
  8. from codecs import getincrementalencoder
  9. from HTMLParser import HTMLParser
  10. from random import choice
  11. from searx.version import VERSION_STRING
  12. from searx import settings
  13. from searx import logger
logger = logger.getChild('utils')

# Firefox versions used to build a randomised User-Agent string
ua_versions = ('33.0',
               '34.0',
               '35.0',
               '36.0',
               '37.0')

# operating-system fragments for the User-Agent string
ua_os = ('Windows NT 6.3; WOW64',
         'X11; Linux x86_64',
         'X11; Linux x86')

# template filled in by gen_useragent()
ua = "Mozilla/5.0 ({os}; rv:{version}) Gecko/20100101 Firefox/{version}"

# text inside these tags is dropped by HTMLTextExtractor
blocked_tags = ('script',
                'style')
  26. def gen_useragent():
  27. # TODO
  28. return ua.format(os=choice(ua_os), version=choice(ua_versions))
  29. def searx_useragent():
  30. return 'searx/{searx_version} {suffix}'.format(
  31. searx_version=VERSION_STRING,
  32. suffix=settings['outgoing'].get('useragent_suffix', ''))
  33. def highlight_content(content, query):
  34. if not content:
  35. return None
  36. # ignoring html contents
  37. # TODO better html content detection
  38. if content.find('<') != -1:
  39. return content
  40. query = query.decode('utf-8')
  41. if content.lower().find(query.lower()) > -1:
  42. query_regex = u'({0})'.format(re.escape(query))
  43. content = re.sub(query_regex, '<span class="highlight">\\1</span>',
  44. content, flags=re.I | re.U)
  45. else:
  46. regex_parts = []
  47. for chunk in query.split():
  48. if len(chunk) == 1:
  49. regex_parts.append(u'\W+{0}\W+'.format(re.escape(chunk)))
  50. else:
  51. regex_parts.append(u'{0}'.format(re.escape(chunk)))
  52. query_regex = u'({0})'.format('|'.join(regex_parts))
  53. content = re.sub(query_regex, '<span class="highlight">\\1</span>',
  54. content, flags=re.I | re.U)
  55. return content
class HTMLTextExtractor(HTMLParser):
    """Event-driven parser that collects the plain-text content of an
    HTML fragment, skipping anything inside the tags listed in
    ``blocked_tags`` (script/style)."""

    def __init__(self):
        HTMLParser.__init__(self)
        self.result = []  # collected text fragments, joined by get_text()
        self.tags = []    # stack of currently open tag names

    def handle_starttag(self, tag, attrs):
        self.tags.append(tag)

    def handle_endtag(self, tag):
        if not self.tags:
            return
        # a closing tag must match the innermost open tag
        if tag != self.tags[-1]:
            raise Exception("invalid html")
        self.tags.pop()

    def is_valid_tag(self):
        # text is kept unless the innermost open tag is blocked
        return not self.tags or self.tags[-1] not in blocked_tags

    def handle_data(self, d):
        if not self.is_valid_tag():
            return
        self.result.append(d)

    def handle_charref(self, number):
        # numeric character reference, e.g. &#65; or the hex form &#x41;
        if not self.is_valid_tag():
            return
        if number[0] in (u'x', u'X'):
            codepoint = int(number[1:], 16)
        else:
            codepoint = int(number)
        self.result.append(unichr(codepoint))

    def handle_entityref(self, name):
        # named entity, e.g. &amp; -- appended verbatim, NOT resolved
        if not self.is_valid_tag():
            return
        # codepoint = htmlentitydefs.name2codepoint[name]
        # self.result.append(unichr(codepoint))
        self.result.append(name)

    def get_text(self):
        """Return the accumulated text with surrounding whitespace stripped."""
        return u''.join(self.result).strip()
  91. def html_to_text(html):
  92. html = html.replace('\n', ' ')
  93. html = ' '.join(html.split())
  94. s = HTMLTextExtractor()
  95. s.feed(html)
  96. return s.get_text()
class UnicodeWriter:
    """
    A CSV writer which will write rows to CSV file "f",
    which is encoded in the given encoding.
    """

    def __init__(self, f, dialect=csv.excel, encoding="utf-8", **kwds):
        # Redirect output to a queue
        self.queue = cStringIO.StringIO()
        self.writer = csv.writer(self.queue, dialect=dialect, **kwds)
        self.stream = f  # the real target stream
        # incremental encoder producing the caller-requested encoding
        self.encoder = getincrementalencoder(encoding)()

    def writerow(self, row):
        """Write one row; text cells are UTF-8 encoded and stripped first."""
        unicode_row = []
        for col in row:
            if type(col) == str or type(col) == unicode:
                # NOTE(review): .encode('utf-8') on a non-ascii byte str
                # triggers an implicit ascii decode in Python 2 -- assumes
                # str cells are ascii; verify against callers
                unicode_row.append(col.encode('utf-8').strip())
            else:
                unicode_row.append(col)
        self.writer.writerow(unicode_row)
        # Fetch UTF-8 output from the queue ...
        data = self.queue.getvalue()
        data = data.decode("utf-8")
        # ... and reencode it into the target encoding
        data = self.encoder.encode(data)
        # write to the target stream
        self.stream.write(data)
        # empty queue
        self.queue.truncate(0)

    def writerows(self, rows):
        """Write a sequence of rows via writerow()."""
        for row in rows:
            self.writerow(row)
  128. def get_themes(root):
  129. """Returns available themes list."""
  130. static_path = os.path.join(root, 'static')
  131. templates_path = os.path.join(root, 'templates')
  132. themes = os.listdir(os.path.join(static_path, 'themes'))
  133. return static_path, templates_path, themes
  134. def get_static_files(base_path):
  135. base_path = os.path.join(base_path, 'static')
  136. static_files = set()
  137. base_path_length = len(base_path) + 1
  138. for directory, _, files in os.walk(base_path):
  139. for filename in files:
  140. f = os.path.join(directory[base_path_length:], filename)
  141. static_files.add(f)
  142. return static_files
  143. def get_result_templates(base_path):
  144. base_path = os.path.join(base_path, 'templates')
  145. result_templates = set()
  146. base_path_length = len(base_path) + 1
  147. for directory, _, files in os.walk(base_path):
  148. if directory.endswith('result_templates'):
  149. for filename in files:
  150. f = os.path.join(directory[base_path_length:], filename)
  151. result_templates.add(f)
  152. return result_templates
  153. def format_date_by_locale(date_string, locale_string):
  154. # strftime works only on dates after 1900
  155. parsed_date = dateutil.parser.parse(date_string)
  156. if parsed_date.year <= 1900:
  157. return parsed_date.isoformat().split('T')[0]
  158. orig_locale = locale.getlocale()[0]
  159. try:
  160. locale.setlocale(locale.LC_ALL, locale_string)
  161. except:
  162. logger.warning('cannot set locale: {0}'.format(locale_string))
  163. formatted_date = parsed_date.strftime(locale.nl_langinfo(locale.D_FMT))
  164. try:
  165. locale.setlocale(locale.LC_ALL, orig_locale)
  166. except:
  167. logger.warning('cannot set original locale: {0}'.format(orig_locale))
  168. return formatted_date
  169. def dict_subset(d, properties):
  170. result = {}
  171. for k in properties:
  172. if k in d:
  173. result[k] = d[k]
  174. return result
  175. def prettify_url(url):
  176. if len(url) > 74:
  177. return u'{0}[...]{1}'.format(url[:35], url[-35:])
  178. else:
  179. return url
  180. # get element in list or default value
  181. def list_get(a_list, index, default=None):
  182. if len(a_list) > index:
  183. return a_list[index]
  184. else:
  185. return default
  186. def get_blocked_engines(engines, cookies):
  187. if 'blocked_engines' not in cookies:
  188. return [(engine_name, category) for engine_name in engines
  189. for category in engines[engine_name].categories if engines[engine_name].disabled]
  190. blocked_engine_strings = cookies.get('blocked_engines', '').split(',')
  191. blocked_engines = []
  192. if not blocked_engine_strings:
  193. return blocked_engines
  194. for engine_string in blocked_engine_strings:
  195. if engine_string.find('__') > -1:
  196. engine, category = engine_string.split('__', 1)
  197. if engine in engines and category in engines[engine].categories:
  198. blocked_engines.append((engine, category))
  199. elif engine_string in engines:
  200. for category in engines[engine_string].categories:
  201. blocked_engines.append((engine_string, category))
  202. return blocked_engines