from __future__ import unicode_literals
import re
from xml.sax.saxutils import escape, unescape

from html5lib.constants import tokenTypes
from html5lib.sanitizer import HTMLSanitizerMixin
from html5lib.tokenizer import HTMLTokenizer


PROTOS = HTMLSanitizerMixin.acceptable_protocols
PROTOS.remove('feed')
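# Note: acceptable_protocols is mutated in place here; html5lib's
# HTMLSanitizerMixin exposes the same list object as allowed_protocols
# (a shared alias, which is what makes the in-place remove effective),
# so sanitize_token() below will also drop URI attributes that use the
# feed: scheme, e.g. href="feed:https://example.com/rss".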


class BleachSanitizerMixin(HTMLSanitizerMixin):
    """Mixin to replace sanitize_token() and sanitize_css()."""

    allowed_svg_properties = []

    def sanitize_token(self, token):
        """Sanitize a token either by HTML-encoding or dropping it.

        Unlike HTMLSanitizerMixin.sanitize_token, allowed_attributes can
        be a dict of {'tag': ['allowed', 'attributes'], 'other-tag': callable}.

        Here, callable is a function taking two arguments, the attribute
        name and its value; it should return True to keep the attribute
        or False to drop it. (See the usage sketch after this method.)

        Also gives the option to strip disallowed tags instead of
        encoding them.

        """
        if (getattr(self, 'wildcard_attributes', None) is None and
                isinstance(self.allowed_attributes, dict)):
            self.wildcard_attributes = self.allowed_attributes.get('*', [])

        if token['type'] in (tokenTypes['StartTag'], tokenTypes['EndTag'],
                             tokenTypes['EmptyTag']):
            if token['name'] in self.allowed_elements:
                if 'data' in token:
                    if isinstance(self.allowed_attributes, dict):
                        allowed_attributes = self.allowed_attributes.get(
                            token['name'], [])
                        if not callable(allowed_attributes):
                            # Concatenate rather than += so the list
                            # stored in the allowed_attributes dict is not
                            # mutated in place on every token.
                            allowed_attributes = (allowed_attributes +
                                                  self.wildcard_attributes)
                    else:
                        allowed_attributes = self.allowed_attributes
                    # Walk the attributes in reverse so that the first
                    # occurrence of a duplicated attribute wins.
                    attrs = dict([(name, val) for name, val in
                                  token['data'][::-1]
                                  if (allowed_attributes(name, val)
                                      if callable(allowed_attributes)
                                      else name in allowed_attributes)])
                    for attr in self.attr_val_is_uri:
                        if attr not in attrs:
                            continue
                        val_unescaped = re.sub(r"[`\000-\040\177-\240\s]+",
                                               '',
                                               unescape(attrs[attr])).lower()
                        # Remove replacement characters from the
                        # unescaped value.
                        val_unescaped = val_unescaped.replace("\ufffd", "")
                        if (re.match(r'^[a-z0-9][-+.a-z0-9]*:', val_unescaped)
                                and (val_unescaped.split(':')[0] not in
                                     self.allowed_protocols)):
                            del attrs[attr]
                    for attr in self.svg_attr_val_allows_ref:
                        if attr in attrs:
                            # Blank out url() references that are not
                            # local fragments (do not start with '#').
                            attrs[attr] = re.sub(r'url\s*\(\s*[^#\s][^)]+?\)',
                                                 ' ',
                                                 unescape(attrs[attr]))
                    if (token['name'] in self.svg_allow_local_href and
                            'xlink:href' in attrs and
                            re.search(r'^\s*[^#\s].*', attrs['xlink:href'])):
                        del attrs['xlink:href']
                    if 'style' in attrs:
                        attrs['style'] = self.sanitize_css(attrs['style'])
                    token['data'] = [(name, val) for name, val in
                                     attrs.items()]
                return token
            elif self.strip_disallowed_elements:
                pass
            else:
                if token['type'] == tokenTypes['EndTag']:
                    token['data'] = '</{0!s}>'.format(token['name'])
                elif token['data']:
                    attr = ' {0!s}="{1!s}"'
                    attrs = ''.join([attr.format(k, escape(v)) for k, v in
                                     token['data']])
                    token['data'] = '<{0!s}{1!s}>'.format(token['name'],
                                                          attrs)
                else:
                    token['data'] = '<{0!s}>'.format(token['name'])
                if token['selfClosing']:
                    token['data'] = token['data'][:-1] + '/>'
                token['type'] = tokenTypes['Characters']
                del token['name']
                return token
        elif token['type'] == tokenTypes['Comment']:
            if not self.strip_html_comments:
                return token
        else:
            return token
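
    # Usage sketch (illustrative; the names below are hypothetical, not
    # part of this module). With a dict-based whitelist, a per-tag entry
    # may be a list of attribute names or a callable, and the '*' entry
    # is merged into every list entry:
    #
    #     def href_filter(name, value):
    #         return name == 'href' and value.startswith('https:')
    #
    #     allowed_attributes = {
    #         'a': href_filter,         # callable: decides per attribute
    #         'img': ['src', 'alt'],    # list: '*' attributes merged in
    #         '*': ['title'],
    #     }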

    def sanitize_css(self, style):
        """HTMLSanitizerMixin.sanitize_css replacement.

        HTMLSanitizerMixin.sanitize_css always whitelists background-*,
        border-*, margin-*, and padding-*. This replacement allows only
        the properties listed in allowed_css_properties and
        allowed_svg_properties. (See the examples after this method.)

        """
        # Disallow url() values outright.
        style = re.compile(r'url\s*\(\s*[^\s)]+?\s*\)\s*').sub(' ', style)

        # Gauntlet: reject the entire style attribute unless every
        # ';'-separated declaration is built solely from a conservative
        # set of characters, quoted words, and parenthesized numbers.
        parts = style.split(';')
        gauntlet = re.compile(r"""^([-/:,#%.'"\sa-zA-Z0-9!]|\w-\w|'[\s\w]+'"""
                              r"""\s*|"[\s\w]+"|\([\d,%\.\s]+\))*$""")
        for part in parts:
            if not gauntlet.match(part):
                return ''

        if not re.match(r"^\s*([-\w]+\s*:[^:;]*(;\s*|$))*$", style):
            return ''

        clean = []
        for prop, value in re.findall(r'([-\w]+)\s*:\s*([^:;]*)', style):
            if not value:
                continue
            if prop.lower() in self.allowed_css_properties:
                clean.append(prop + ': ' + value + ';')
            elif prop.lower() in self.allowed_svg_properties:
                clean.append(prop + ': ' + value + ';')

        return ' '.join(clean)
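
    # Illustrative behavior (assuming allowed_css_properties contains
    # 'color' but not 'position'):
    #
    #     sanitize_css('color: red; position: fixed')  # -> 'color: red;'
    #     sanitize_css('background: url(evil.test/x)')  # -> ''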


class BleachSanitizer(HTMLTokenizer, BleachSanitizerMixin):
    def __init__(self, stream, encoding=None, parseMeta=True, useChardet=True,
                 lowercaseElementName=True, lowercaseAttrName=True, **kwargs):
        HTMLTokenizer.__init__(self, stream, encoding, parseMeta, useChardet,
                               lowercaseElementName, lowercaseAttrName,
                               **kwargs)

    def __iter__(self):
        for token in HTMLTokenizer.__iter__(self):
            token = self.sanitize_token(token)
            # sanitize_token() returns None for tokens that should be
            # dropped entirely (stripped tags and comments), so skip
            # falsy results.
            if token:
                yield token
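

if __name__ == '__main__':
    # Sketch of how this tokenizer gets wired up, assuming the legacy
    # html5lib (pre-1.0) API this module is written against. The
    # DemoSanitizer class and its whitelists are hypothetical stand-ins
    # for what bleach.clean() configures, not part of this module.
    import html5lib

    class DemoSanitizer(BleachSanitizer):
        allowed_elements = ['a', 'em', 'strong']
        allowed_attributes = {'a': ['href'], '*': ['title']}
        allowed_css_properties = []
        strip_disallowed_elements = False
        strip_html_comments = True

    parser = html5lib.HTMLParser(tokenizer=DemoSanitizer)
    # Disallowed tags are escaped to text and the javascript: href is
    # dropped during tokenization.
    parser.parseFragment('<script>x</script><a href="javascript:x">hi</a>')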