from __future__ import unicode_literals import re from xml.sax.saxutils import escape, unescape from html5lib.constants import tokenTypes from html5lib.sanitizer import HTMLSanitizerMixin from html5lib.tokenizer import HTMLTokenizer PROTOS = HTMLSanitizerMixin.acceptable_protocols PROTOS.remove('feed') class BleachSanitizerMixin(HTMLSanitizerMixin): """Mixin to replace sanitize_token() and sanitize_css().""" allowed_svg_properties = [] def sanitize_token(self, token): """Sanitize a token either by HTML-encoding or dropping. Unlike HTMLSanitizerMixin.sanitize_token, allowed_attributes can be a dict of {'tag': ['attribute', 'pairs'], 'tag': callable}. Here callable is a function with two arguments of attribute name and value. It should return true of false. Also gives the option to strip tags instead of encoding. """ if (getattr(self, 'wildcard_attributes', None) is None and isinstance(self.allowed_attributes, dict)): self.wildcard_attributes = self.allowed_attributes.get('*', []) if token['type'] in (tokenTypes['StartTag'], tokenTypes['EndTag'], tokenTypes['EmptyTag']): if token['name'] in self.allowed_elements: if 'data' in token: if isinstance(self.allowed_attributes, dict): allowed_attributes = self.allowed_attributes.get( token['name'], []) if not callable(allowed_attributes): allowed_attributes += self.wildcard_attributes else: allowed_attributes = self.allowed_attributes attrs = dict([(name, val) for name, val in token['data'][::-1] if (allowed_attributes(name, val) if callable(allowed_attributes) else name in allowed_attributes)]) for attr in self.attr_val_is_uri: if attr not in attrs: continue val_unescaped = re.sub("[`\000-\040\177-\240\s]+", '', unescape(attrs[attr])).lower() # Remove replacement characters from unescaped # characters. val_unescaped = val_unescaped.replace("\ufffd", "") if (re.match(r'^[a-z0-9][-+.a-z0-9]*:', val_unescaped) and (val_unescaped.split(':')[0] not in self.allowed_protocols)): del attrs[attr] for attr in self.svg_attr_val_allows_ref: if attr in attrs: attrs[attr] = re.sub(r'url\s*\(\s*[^#\s][^)]+?\)', ' ', unescape(attrs[attr])) if (token['name'] in self.svg_allow_local_href and 'xlink:href' in attrs and re.search(r'^\s*[^#\s].*', attrs['xlink:href'])): del attrs['xlink:href'] if 'style' in attrs: attrs['style'] = self.sanitize_css(attrs['style']) token['data'] = [(name, val) for name, val in attrs.items()] return token elif self.strip_disallowed_elements: pass else: if token['type'] == tokenTypes['EndTag']: token['data'] = ''.format(token['name']) elif token['data']: attr = ' {0!s}="{1!s}"' attrs = ''.join([attr.format(k, escape(v)) for k, v in token['data']]) token['data'] = '<{0!s}{1!s}>'.format(token['name'], attrs) else: token['data'] = '<{0!s}>'.format(token['name']) if token['selfClosing']: token['data'] = token['data'][:-1] + '/>' token['type'] = tokenTypes['Characters'] del token["name"] return token elif token['type'] == tokenTypes['Comment']: if not self.strip_html_comments: return token else: return token def sanitize_css(self, style): """HTMLSanitizerMixin.sanitize_css replacement. HTMLSanitizerMixin.sanitize_css always whitelists background-*, border-*, margin-*, and padding-*. We only whitelist what's in the whitelist. """ # disallow urls style = re.compile('url\s*\(\s*[^\s)]+?\s*\)\s*').sub(' ', style) # gauntlet # TODO: Make sure this does what it's meant to - I *think* it wants to # validate style attribute contents. parts = style.split(';') gauntlet = re.compile("""^([-/:,#%.'"\sa-zA-Z0-9!]|\w-\w|'[\s\w]+'""" """\s*|"[\s\w]+"|\([\d,%\.\s]+\))*$""") for part in parts: if not gauntlet.match(part): return '' if not re.match("^\s*([-\w]+\s*:[^:;]*(;\s*|$))*$", style): return '' clean = [] for prop, value in re.findall('([-\w]+)\s*:\s*([^:;]*)', style): if not value: continue if prop.lower() in self.allowed_css_properties: clean.append(prop + ': ' + value + ';') elif prop.lower() in self.allowed_svg_properties: clean.append(prop + ': ' + value + ';') return ' '.join(clean) class BleachSanitizer(HTMLTokenizer, BleachSanitizerMixin): def __init__(self, stream, encoding=None, parseMeta=True, useChardet=True, lowercaseElementName=True, lowercaseAttrName=True, **kwargs): HTMLTokenizer.__init__(self, stream, encoding, parseMeta, useChardet, lowercaseElementName, lowercaseAttrName, **kwargs) def __iter__(self): for token in HTMLTokenizer.__iter__(self): token = self.sanitize_token(token) if token: yield token