from __future__ import unicode_literals
from itertools import chain
import re

import six
from six.moves.urllib.parse import urlparse
from xml.sax.saxutils import unescape

from bleach import html5lib_shim
from bleach.utils import alphabetize_attributes, force_unicode

#: List of allowed tags
ALLOWED_TAGS = [
    'a',
    'abbr',
    'acronym',
    'b',
    'blockquote',
    'code',
    'em',
    'i',
    'li',
    'ol',
    'strong',
    'ul',
]

#: Map of allowed attributes by tag
ALLOWED_ATTRIBUTES = {
    'a': ['href', 'title'],
    'abbr': ['title'],
    'acronym': ['title'],
}

#: List of allowed styles
ALLOWED_STYLES = []

#: List of allowed protocols
ALLOWED_PROTOCOLS = ['http', 'https', 'mailto']

#: Invisible characters--0 to and including 31 except 9 (tab), 10 (lf), and 13 (cr)
INVISIBLE_CHARACTERS = ''.join(
    [chr(c) for c in chain(range(0, 9), range(11, 13), range(14, 32))]
)

#: Regexp for characters that are invisible
INVISIBLE_CHARACTERS_RE = re.compile(
    '[' + INVISIBLE_CHARACTERS + ']',
    re.UNICODE
)

#: String to replace invisible characters with. This can be a character, a
#: string, or even a function that takes a Python re matchobj
INVISIBLE_REPLACEMENT_CHAR = '?'
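
# Because re.sub accepts a callable replacement, INVISIBLE_REPLACEMENT_CHAR
# can be swapped for a function. A minimal sketch (illustrative, not part of
# the library; ``show_codepoint`` is a hypothetical name)::
#
#     import bleach.sanitizer
#
#     def show_codepoint(match):
#         # Render each invisible character as an escaped codepoint.
#         return '\\x{:02x}'.format(ord(match.group(0)))
#
#     bleach.sanitizer.INVISIBLE_REPLACEMENT_CHAR = show_codepoint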


class Cleaner(object):
    """Cleaner for cleaning HTML fragments of malicious content

    This cleaner is a security-focused function whose sole purpose is to remove
    malicious content from a string such that it can be displayed as content in
    a web page.

    To use::

        from bleach.sanitizer import Cleaner

        cleaner = Cleaner()

        for text in all_the_yucky_things:
            sanitized = cleaner.clean(text)

    .. Note::

       This cleaner is not designed for transforming content to be used in
       non-web-page contexts.

    .. Warning::

       This cleaner is not thread-safe--the html parser has internal state.
       Create a separate cleaner per thread!

    """
    def __init__(self, tags=ALLOWED_TAGS, attributes=ALLOWED_ATTRIBUTES,
                 styles=ALLOWED_STYLES, protocols=ALLOWED_PROTOCOLS, strip=False,
                 strip_comments=True, filters=None):
        """Initializes a Cleaner

        :arg list tags: allowed list of tags; defaults to
            ``bleach.sanitizer.ALLOWED_TAGS``

        :arg dict attributes: allowed attributes; can be a callable, list or dict;
            defaults to ``bleach.sanitizer.ALLOWED_ATTRIBUTES``

        :arg list styles: allowed list of css styles; defaults to
            ``bleach.sanitizer.ALLOWED_STYLES``

        :arg list protocols: allowed list of protocols for links; defaults
            to ``bleach.sanitizer.ALLOWED_PROTOCOLS``

        :arg bool strip: whether or not to strip disallowed elements

        :arg bool strip_comments: whether or not to strip HTML comments

        :arg list filters: list of html5lib Filter classes to pass streamed content through

            .. seealso:: http://html5lib.readthedocs.io/en/latest/movingparts.html#filters

            .. Warning::

               Using filters changes the output of ``bleach.Cleaner.clean``.
               Make sure the way the filters change the output is secure.

        """
        self.tags = tags
        self.attributes = attributes
        self.styles = styles
        self.protocols = protocols
        self.strip = strip
        self.strip_comments = strip_comments
        self.filters = filters or []

        self.parser = html5lib_shim.BleachHTMLParser(
            tags=self.tags,
            strip=self.strip,
            consume_entities=False,
            namespaceHTMLElements=False
        )
        self.walker = html5lib_shim.getTreeWalker('etree')
        self.serializer = html5lib_shim.BleachHTMLSerializer(
            quote_attr_values='always',
            omit_optional_tags=False,
            escape_lt_in_attrs=True,

            # We want to leave entities as they are without escaping or
            # resolving or expanding
            resolve_entities=False,

            # Bleach has its own sanitizer, so don't use the html5lib one
            sanitize=False,

            # Bleach sanitizer alphabetizes already, so don't use the html5lib one
            alphabetical_attributes=False,
        )
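
    # A sketch of customizing the Cleaner (illustrative; ``ShoutFilter`` is a
    # hypothetical filter class, not part of bleach)::
    #
    #     from html5lib.filters.base import Filter
    #
    #     class ShoutFilter(Filter):
    #         def __iter__(self):
    #             for token in Filter.__iter__(self):
    #                 if token['type'] == 'Characters':
    #                     token['data'] = token['data'].upper()
    #                 yield token
    #
    #     cleaner = Cleaner(tags=['b', 'i'], filters=[ShoutFilter])
    #     cleaner.clean('<b>hi</b> there')
    #     # -> '<b>HI</b> THERE' -- tag names are untouched; only text
    #     # tokens pass through the filter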

    def clean(self, text):
        """Cleans text and returns sanitized result as unicode

        :arg str text: text to be cleaned

        :returns: sanitized text as unicode

        :raises TypeError: if ``text`` is not a text type

        """
        if not isinstance(text, six.string_types):
            message = "argument cannot be of '{name}' type, must be of text type".format(
                name=text.__class__.__name__)
            raise TypeError(message)

        if not text:
            return u''

        text = force_unicode(text)

        dom = self.parser.parseFragment(text)
        filtered = BleachSanitizerFilter(
            source=self.walker(dom),

            # Bleach-sanitizer-specific things
            attributes=self.attributes,
            strip_disallowed_elements=self.strip,
            strip_html_comments=self.strip_comments,

            # html5lib-sanitizer things
            allowed_elements=self.tags,
            allowed_css_properties=self.styles,
            allowed_protocols=self.protocols,
            allowed_svg_properties=[],
        )

        # Apply any filters after the BleachSanitizerFilter
        for filter_class in self.filters:
            filtered = filter_class(source=filtered)

        return self.serializer.render(filtered)
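
    # With the defaults, disallowed tags are escaped rather than stripped
    # (``strip`` is False), so::
    #
    #     Cleaner().clean('an <script>evil()</script> example')
    #     # -> 'an &lt;script&gt;evil()&lt;/script&gt; example'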


def attribute_filter_factory(attributes):
    """Generates attribute filter function for the given attributes value

    The attributes value can take one of several shapes. This returns a filter
    function appropriate to the attributes value. One nice thing about this is
    that there are fewer if/then shenanigans in the ``allow_token`` method.

    """
    if callable(attributes):
        return attributes

    if isinstance(attributes, dict):
        def _attr_filter(tag, attr, value):
            if tag in attributes:
                attr_val = attributes[tag]
                if callable(attr_val):
                    return attr_val(tag, attr, value)

                if attr in attr_val:
                    return True

            if '*' in attributes:
                attr_val = attributes['*']
                if callable(attr_val):
                    return attr_val(tag, attr, value)

                return attr in attr_val

            return False

        return _attr_filter

    if isinstance(attributes, list):
        def _attr_filter(tag, attr, value):
            return attr in attributes

        return _attr_filter

    raise ValueError('attributes needs to be a callable, a list or a dict')
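
# The three accepted shapes, illustrated (the values here are hypothetical)::
#
#     # list: these attribute names are allowed on every tag
#     attribute_filter_factory(['title', 'href'])
#
#     # dict: per-tag lists or callables; '*' applies to all tags
#     attribute_filter_factory({'a': ['href', 'title'], '*': ['class']})
#
#     # callable: full control; return True to keep the attribute
#     attribute_filter_factory(lambda tag, name, value: name == 'title')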


class BleachSanitizerFilter(html5lib_shim.SanitizerFilter):
    """html5lib Filter that sanitizes text

    This filter can be used anywhere html5lib filters can be used.

    """
    def __init__(self, source, attributes=ALLOWED_ATTRIBUTES,
                 strip_disallowed_elements=False, strip_html_comments=True,
                 **kwargs):
        """Creates a BleachSanitizerFilter instance

        :arg Treewalker source: stream

        :arg list tags: allowed list of tags; defaults to
            ``bleach.sanitizer.ALLOWED_TAGS``

        :arg dict attributes: allowed attributes; can be a callable, list or dict;
            defaults to ``bleach.sanitizer.ALLOWED_ATTRIBUTES``

        :arg list styles: allowed list of css styles; defaults to
            ``bleach.sanitizer.ALLOWED_STYLES``

        :arg list protocols: allowed list of protocols for links; defaults
            to ``bleach.sanitizer.ALLOWED_PROTOCOLS``

        :arg bool strip_disallowed_elements: whether or not to strip disallowed
            elements

        :arg bool strip_html_comments: whether or not to strip HTML comments

        """
        self.attr_filter = attribute_filter_factory(attributes)

        self.strip_disallowed_elements = strip_disallowed_elements
        self.strip_html_comments = strip_html_comments

        return super(BleachSanitizerFilter, self).__init__(source, **kwargs)
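
    # Standalone use mirrors what Cleaner.clean does (a sketch using only
    # names defined in this module)::
    #
    #     parser = html5lib_shim.BleachHTMLParser(
    #         tags=ALLOWED_TAGS, strip=False, consume_entities=False,
    #         namespaceHTMLElements=False)
    #     walker = html5lib_shim.getTreeWalker('etree')
    #     dom = parser.parseFragment('<a href="javascript:x">hi</a>')
    #     filtered = BleachSanitizerFilter(
    #         source=walker(dom),
    #         allowed_elements=ALLOWED_TAGS,
    #         allowed_protocols=ALLOWED_PROTOCOLS,
    #         allowed_css_properties=[],
    #         allowed_svg_properties=[],
    #     )
    #     # iterate ``filtered`` or hand it to an html5lib serializer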

    def sanitize_stream(self, token_iterator):
        for token in token_iterator:
            ret = self.sanitize_token(token)

            if not ret:
                continue

            if isinstance(ret, list):
                for subtoken in ret:
                    yield subtoken
            else:
                yield ret

    def merge_characters(self, token_iterator):
        """Merge consecutive Characters tokens in a stream"""
        characters_buffer = []

        for token in token_iterator:
            if characters_buffer:
                if token['type'] == 'Characters':
                    characters_buffer.append(token)
                    continue
                else:
                    # Merge all the characters tokens together into one and then
                    # operate on it.
                    new_token = {
                        'data': ''.join([char_token['data'] for char_token in characters_buffer]),
                        'type': 'Characters'
                    }
                    characters_buffer = []
                    yield new_token

            elif token['type'] == 'Characters':
                characters_buffer.append(token)
                continue

            yield token

        new_token = {
            'data': ''.join([char_token['data'] for char_token in characters_buffer]),
            'type': 'Characters'
        }
        yield new_token
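
    # For example (an illustrative token stream), adjacent Characters tokens
    # collapse into one::
    #
    #     [{'type': 'Characters', 'data': 'AT'},
    #      {'type': 'Characters', 'data': '&'},
    #      {'type': 'Characters', 'data': 'T'}]
    #     # -> [{'type': 'Characters', 'data': 'AT&T'}]
    #
    # The final yield emits an empty Characters token when the buffer is
    # empty; the serializer writes nothing for empty data, so that's harmless.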

    def __iter__(self):
        return self.merge_characters(self.sanitize_stream(html5lib_shim.Filter.__iter__(self)))

    def sanitize_token(self, token):
        """Sanitize a token either by HTML-encoding or dropping.

        Unlike sanitizer.Filter, allowed_attributes can be a dict of {'tag':
        ['attribute', 'pairs'], 'tag': callable}.

        Here callable is a function that takes the tag, the attribute name,
        and the attribute value. It should return True or False.

        Also gives the option to strip tags instead of encoding.

        :arg dict token: token to sanitize

        :returns: token or list of tokens

        """
        token_type = token['type']
        if token_type in ['StartTag', 'EndTag', 'EmptyTag']:
            if token['name'] in self.allowed_elements:
                return self.allow_token(token)

            elif self.strip_disallowed_elements:
                return None

            else:
                if 'data' in token:
                    # Alphabetize the attributes before calling .disallowed_token()
                    # so that the resulting string is stable
                    token['data'] = alphabetize_attributes(token['data'])
                return self.disallowed_token(token)

        elif token_type == 'Comment':
            if not self.strip_html_comments:
                return token
            else:
                return None

        elif token_type == 'Characters':
            return self.sanitize_characters(token)

        else:
            return token

    def sanitize_characters(self, token):
        """Handles Characters tokens

        Our overridden tokenizer doesn't do anything with entities. However,
        that means that the serializer will convert all ``&`` in Characters
        tokens to ``&amp;``.

        Since we don't want that, we extract entities here and convert them to
        Entity tokens so the serializer will let them be.

        :arg token: the Characters token to work on

        :returns: a list of tokens

        """
        data = token.get('data', '')

        if not data:
            return token

        data = INVISIBLE_CHARACTERS_RE.sub(INVISIBLE_REPLACEMENT_CHAR, data)
        token['data'] = data

        # If there isn't a & in the data, we can return now
        if '&' not in data:
            return token

        new_tokens = []

        # For each possible entity that starts with a "&", we try to extract an
        # actual entity and re-tokenize accordingly
        for part in html5lib_shim.next_possible_entity(data):
            if not part:
                continue

            if part.startswith('&'):
                entity = html5lib_shim.match_entity(part)
                if entity is not None:
                    new_tokens.append({'type': 'Entity', 'name': entity})
                    # Length of the entity plus 2--one for & at the beginning
                    # and one for ; at the end
                    remainder = part[len(entity) + 2:]
                    if remainder:
                        new_tokens.append({'type': 'Characters', 'data': remainder})
                    continue

            new_tokens.append({'type': 'Characters', 'data': part})

        return new_tokens
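
    # For example (illustrative), 'AT&amp;T rocks' arrives as one Characters
    # token and leaves as::
    #
    #     [{'type': 'Characters', 'data': 'AT'},
    #      {'type': 'Entity', 'name': 'amp'},
    #      {'type': 'Characters', 'data': 'T rocks'}]
    #
    # so the serializer emits '&amp;' verbatim instead of double-escaping it.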

    def sanitize_uri_value(self, value, allowed_protocols):
        """Checks a uri value to see if it's allowed

        :arg value: the uri value to sanitize
        :arg allowed_protocols: list of allowed protocols

        :returns: allowed value or None

        """
        # NOTE(willkg): This transforms the value into one that's easier to
        # match and verify, but shouldn't get returned since it's vastly
        # different than the original value.

        # Convert all character entities in the value
        new_value = html5lib_shim.convert_entities(value)

        # Nix backtick, space characters, and control characters
        new_value = re.sub(
            r"[`\000-\040\177-\240\s]+",
            '',
            new_value
        )

        # Remove REPLACEMENT characters
        new_value = new_value.replace('\ufffd', '')

        # Lowercase it--this breaks the value, but makes it easier to match
        # against
        new_value = new_value.lower()

        try:
            # Drop attributes with uri values that have protocols that aren't
            # allowed
            parsed = urlparse(new_value)
        except ValueError:
            # URI is impossible to parse, therefore it's not allowed
            return None

        if parsed.scheme:
            # If urlparse found a scheme, check that
            if parsed.scheme in allowed_protocols:
                return value

        else:
            # Allow uris that are just an anchor
            if new_value.startswith('#'):
                return value

            # Handle protocols that urlparse doesn't recognize like "myprotocol"
            if ':' in new_value and new_value.split(':')[0] in allowed_protocols:
                return value

            # If there's no protocol/scheme specified, then assume it's "http"
            # and see if that's allowed
            if 'http' in allowed_protocols:
                return value

        return None
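
    # Illustrative outcomes with the default ALLOWED_PROTOCOLS
    # (['http', 'https', 'mailto'])::
    #
    #     sanitize_uri_value('https://example.com', ALLOWED_PROTOCOLS)
    #     # -> 'https://example.com'  (scheme is allowed)
    #
    #     sanitize_uri_value('jav&#x09;ascript:alert(1)', ALLOWED_PROTOCOLS)
    #     # -> None  (entity converted, tab nixed, 'javascript' scheme blocked)
    #
    #     sanitize_uri_value('#fragment', ALLOWED_PROTOCOLS)
    #     # -> '#fragment'  (bare anchors are allowed)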

    def allow_token(self, token):
        """Handles the case where we're allowing the tag"""
        if 'data' in token:
            # Loop through all the attributes and drop the ones that are not
            # allowed, are unsafe or break other rules. Additionally, fix
            # attribute values that need fixing.
            #
            # At the end of this loop, we have the final set of attributes
            # we're keeping.
            attrs = {}
            for namespaced_name, val in token['data'].items():
                namespace, name = namespaced_name

                # Drop attributes that are not explicitly allowed
                #
                # NOTE(willkg): We pass in the attribute name--not a namespaced
                # name.
                if not self.attr_filter(token['name'], name, val):
                    continue

                # Drop attributes with uri values that use a disallowed protocol
                # Sanitize attributes with uri values
                if namespaced_name in self.attr_val_is_uri:
                    new_value = self.sanitize_uri_value(val, self.allowed_protocols)
                    if new_value is None:
                        continue
                    val = new_value

                # Drop values in svg attrs with non-local IRIs
                if namespaced_name in self.svg_attr_val_allows_ref:
                    new_val = re.sub(r'url\s*\(\s*[^#\s][^)]+?\)',
                                     ' ',
                                     unescape(val))
                    new_val = new_val.strip()
                    if not new_val:
                        continue

                    else:
                        # Replace the val with the unescaped version because
                        # it's an IRI
                        val = new_val

                # Drop href and xlink:href attr for svg elements with non-local IRIs
                if (None, token['name']) in self.svg_allow_local_href:
                    if namespaced_name in [
                            (None, 'href'), (html5lib_shim.namespaces['xlink'], 'href')
                    ]:
                        if re.search(r'^\s*[^#\s]', val):
                            continue

                # If it's a style attribute, sanitize it
                if namespaced_name == (None, u'style'):
                    val = self.sanitize_css(val)

                # At this point, we want to keep the attribute, so add it in
                attrs[namespaced_name] = val

            token['data'] = alphabetize_attributes(attrs)

        return token
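
    # For example (illustrative), an allowed <a> start tag token::
    #
    #     {'type': 'StartTag', 'name': 'a',
    #      'data': {(None, 'href'): 'javascript:alert(1)',
    #               (None, 'title'): 'hi'}}
    #
    # comes back with only the title kept: sanitize_uri_value returns None
    # for the disallowed scheme, so the href is dropped.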

    def disallowed_token(self, token):
        token_type = token["type"]
        if token_type == "EndTag":
            token["data"] = "</%s>" % token["name"]

        elif token["data"]:
            assert token_type in ("StartTag", "EmptyTag")
            attrs = []
            for (ns, name), v in token["data"].items():
                # If we end up with a namespace, but no name, switch them so we
                # have a valid name to use.
                if ns and not name:
                    ns, name = name, ns

                # Figure out namespaced name if the namespace is appropriate
                # and exists; if the ns isn't in prefixes, then drop it.
                if ns is None or ns not in html5lib_shim.prefixes:
                    namespaced_name = name
                else:
                    namespaced_name = '%s:%s' % (html5lib_shim.prefixes[ns], name)

                attrs.append(' %s="%s"' % (
                    namespaced_name,
                    # NOTE(willkg): HTMLSerializer escapes attribute values
                    # already, so if we do it here (like HTMLSerializer does),
                    # then we end up double-escaping.
                    v)
                )
            token["data"] = "<%s%s>" % (token["name"], ''.join(attrs))

        else:
            token["data"] = "<%s>" % token["name"]

        if token.get("selfClosing"):
            token["data"] = token["data"][:-1] + "/>"

        token["type"] = "Characters"

        del token["name"]
        return token
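
    # For example (illustrative), with the default strip=False a disallowed
    # start tag is turned into text::
    #
    #     {'type': 'StartTag', 'name': 'script', 'data': {}}
    #     # -> {'type': 'Characters', 'data': '<script>'}
    #
    # and the serializer escapes that to '&lt;script&gt;' on output.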

    def sanitize_css(self, style):
        """Sanitizes css in style tags"""
        # Convert entities in the style so that it can be parsed as CSS
        style = html5lib_shim.convert_entities(style)

        # Drop any url values before we do anything else
        style = re.compile(r'url\s*\(\s*[^\s)]+?\s*\)\s*').sub(' ', style)

        # The gauntlet of sanitization

        # Validate the css in the style tag and if it's not valid, then drop
        # the whole thing.
        parts = style.split(';')
        gauntlet = re.compile(
            r"""^([-/:,#%.'"\sa-zA-Z0-9!]|\w-\w|'[\s\w]+'\s*|"[\s\w]+"|\([\d,%\.\s]+\))*$"""
        )

        for part in parts:
            if not gauntlet.match(part):
                return ''

        if not re.match(r"^\s*([-\w]+\s*:[^:;]*(;\s*|$))*$", style):
            return ''

        clean = []
        for prop, value in re.findall(r'([-\w]+)\s*:\s*([^:;]*)', style):
            if not value:
                continue

            if prop.lower() in self.allowed_css_properties:
                clean.append(prop + ': ' + value + ';')

            elif prop.lower() in self.allowed_svg_properties:
                clean.append(prop + ': ' + value + ';')

        return ' '.join(clean)
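
    # For example (illustrative), with allowed_css_properties=['color']::
    #
    #     self.sanitize_css('color: red; position: absolute')
    #     # -> 'color: red;'  (position is not allowed and gets dropped)
    #
    #     self.sanitize_css('color: expression(alert(1))')
    #     # -> ''  (fails the gauntlet, so the whole value is dropped)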