from __future__ import unicode_literals

import re
from xml.sax.saxutils import escape, unescape

from html5lib.constants import tokenTypes
from html5lib.sanitizer import HTMLSanitizerMixin
from html5lib.tokenizer import HTMLTokenizer


PROTOS = HTMLSanitizerMixin.acceptable_protocols
PROTOS.remove('feed')
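# Note: acceptable_protocols is a class-level list on HTMLSanitizerMixin, so
# remove() mutates it in place; 'feed' is dropped for every sanitizer built
# on that mixin in this process, not just for BleachSanitizer.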


class BleachSanitizerMixin(HTMLSanitizerMixin):
    """Mixin to replace sanitize_token() and sanitize_css()."""

    allowed_svg_properties = []

    def sanitize_token(self, token):
        """Sanitize a token either by HTML-encoding or dropping.

        Unlike HTMLSanitizerMixin.sanitize_token, allowed_attributes can be
        a dict of {'tag': ['attribute', 'pairs'], 'tag': callable}.

        Here the callable is a function taking two arguments, the attribute
        name and its value, and it should return True or False.

        Also gives the option to strip tags instead of encoding them.
        """
        if (getattr(self, 'wildcard_attributes', None) is None and
                isinstance(self.allowed_attributes, dict)):
            self.wildcard_attributes = self.allowed_attributes.get('*', [])

        if token['type'] in (tokenTypes['StartTag'], tokenTypes['EndTag'],
                             tokenTypes['EmptyTag']):
            if token['name'] in self.allowed_elements:
                if 'data' in token:
                    if isinstance(self.allowed_attributes, dict):
                        allowed_attributes = self.allowed_attributes.get(
                            token['name'], [])
                        if not callable(allowed_attributes):
                            allowed_attributes += self.wildcard_attributes
                    else:
                        allowed_attributes = self.allowed_attributes
                    attrs = dict([(name, val) for name, val in
                                  token['data'][::-1]
                                  if (allowed_attributes(name, val)
                                      if callable(allowed_attributes)
                                      else name in allowed_attributes)])
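                    # token['data'] is reversed before building the dict so
                    # that for duplicate attribute names the first occurrence
                    # in the tag wins (dict() keeps the last value it sees).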
                    for attr in self.attr_val_is_uri:
                        if attr not in attrs:
                            continue
                        val_unescaped = re.sub(r"[`\000-\040\177-\240\s]+",
                                               '',
                                               unescape(attrs[attr])).lower()
                        # Remove replacement characters from unescaped
                        # characters.
                        val_unescaped = val_unescaped.replace("\ufffd", "")
                        if (re.match(r'^[a-z0-9][-+.a-z0-9]*:', val_unescaped)
                                and (val_unescaped.split(':')[0] not in
                                     self.allowed_protocols)):
                            del attrs[attr]
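                    # The substitution above strips backticks, control
                    # characters and whitespace before the scheme is checked,
                    # so whitespace-obfuscated values such as
                    # "jav\tascript:..." still fail the allowed_protocols
                    # test.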
                    for attr in self.svg_attr_val_allows_ref:
                        if attr in attrs:
                            attrs[attr] = re.sub(r'url\s*\(\s*[^#\s][^)]+?\)',
                                                 ' ',
                                                 unescape(attrs[attr]))
                    if (token['name'] in self.svg_allow_local_href and
                            'xlink:href' in attrs and
                            re.search(r'^\s*[^#\s].*', attrs['xlink:href'])):
                        del attrs['xlink:href']
                    if 'style' in attrs:
                        attrs['style'] = self.sanitize_css(attrs['style'])
                    token['data'] = [(name, val) for name, val in
                                     attrs.items()]
                return token
            elif self.strip_disallowed_elements:
                pass
            else:
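                # Disallowed tag and not stripping: re-serialize the tag and
                # emit it as escaped character data, e.g. a disallowed
                # "<script>" start tag comes back as the literal text
                # '<script>'.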
                if token['type'] == tokenTypes['EndTag']:
                    token['data'] = '</{0!s}>'.format(token['name'])
                elif token['data']:
                    attr = ' {0!s}="{1!s}"'
                    attrs = ''.join([attr.format(k, escape(v)) for k, v in
                                     token['data']])
                    token['data'] = '<{0!s}{1!s}>'.format(token['name'],
                                                          attrs)
                else:
                    token['data'] = '<{0!s}>'.format(token['name'])
                if token['selfClosing']:
                    token['data'] = token['data'][:-1] + '/>'
                token['type'] = tokenTypes['Characters']
                del token['name']
                return token
        elif token['type'] == tokenTypes['Comment']:
            if not self.strip_html_comments:
                return token
        else:
            return token
    def sanitize_css(self, style):
        """HTMLSanitizerMixin.sanitize_css replacement.

        HTMLSanitizerMixin.sanitize_css always whitelists background-*,
        border-*, margin-*, and padding-*. We only whitelist properties
        explicitly listed in allowed_css_properties and
        allowed_svg_properties.
        """
        # disallow urls
        style = re.compile(r'url\s*\(\s*[^\s)]+?\s*\)\s*').sub(' ', style)

        # gauntlet
        # TODO: Make sure this does what it's meant to - I *think* it wants
        # to validate style attribute contents.
        parts = style.split(';')
        gauntlet = re.compile(r"""^([-/:,#%.'"\sa-zA-Z0-9!]|\w-\w|'[\s\w]+'"""
                              r"""\s*|"[\s\w]+"|\([\d,%\.\s]+\))*$""")
        for part in parts:
            if not gauntlet.match(part):
                return ''

        if not re.match(r"^\s*([-\w]+\s*:[^:;]*(;\s*|$))*$", style):
            return ''

        clean = []
        for prop, value in re.findall(r'([-\w]+)\s*:\s*([^:;]*)', style):
            if not value:
                continue
            if prop.lower() in self.allowed_css_properties:
                clean.append(prop + ': ' + value + ';')
            elif prop.lower() in self.allowed_svg_properties:
                clean.append(prop + ': ' + value + ';')

        return ' '.join(clean)


class BleachSanitizer(HTMLTokenizer, BleachSanitizerMixin):
    def __init__(self, stream, encoding=None, parseMeta=True, useChardet=True,
                 lowercaseElementName=True, lowercaseAttrName=True, **kwargs):
        HTMLTokenizer.__init__(self, stream, encoding, parseMeta, useChardet,
                               lowercaseElementName, lowercaseAttrName,
                               **kwargs)

    def __iter__(self):
        for token in HTMLTokenizer.__iter__(self):
            token = self.sanitize_token(token)
            if token:
                yield token
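

# A minimal usage sketch (commented out; the subclass name and whitelist
# values below are illustrative assumptions, not part of this module). The
# sanitizer is meant to be passed to html5lib's HTMLParser as its tokenizer
# class, with the whitelists set as class attributes on a subclass:
#
#   import html5lib
#
#   class MySanitizer(BleachSanitizer):
#       allowed_elements = ['a', 'em', 'strong']
#       allowed_attributes = {'a': ['href'], '*': []}
#       allowed_css_properties = []
#       strip_disallowed_elements = False
#       strip_html_comments = True
#
#   parser = html5lib.HTMLParser(tokenizer=MySanitizer)
#   fragment = parser.parseFragment('<a href="javascript:alert(1)">hi</a>')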