- from __future__ import unicode_literals
-
- from itertools import chain
- import re
-
- import six
- from six.moves.urllib.parse import urlparse
- from xml.sax.saxutils import unescape
-
- from bleach import html5lib_shim
- from bleach.utils import alphabetize_attributes, force_unicode
-
-
- #: List of allowed tags
- ALLOWED_TAGS = [
- 'a',
- 'abbr',
- 'acronym',
- 'b',
- 'blockquote',
- 'code',
- 'em',
- 'i',
- 'li',
- 'ol',
- 'strong',
- 'ul',
- ]
-
-
- #: Map of allowed attributes by tag
- ALLOWED_ATTRIBUTES = {
- 'a': ['href', 'title'],
- 'abbr': ['title'],
- 'acronym': ['title'],
- }
-
- #: List of allowed styles
- ALLOWED_STYLES = []
-
- #: List of allowed protocols
- ALLOWED_PROTOCOLS = ['http', 'https', 'mailto']
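-
- # The defaults above can be replaced or extended when constructing a
- # Cleaner, for example (illustrative):
- #
- #     Cleaner(tags=ALLOWED_TAGS + ['p'], protocols=ALLOWED_PROTOCOLS + ['ftp'])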
-
- #: Invisible characters--0 through 31 inclusive, except 9 (tab), 10 (lf), and 13 (cr)
- INVISIBLE_CHARACTERS = ''.join([chr(c) for c in chain(range(0, 9), range(11, 13), range(14, 32))])
-
- #: Regexp for characters that are invisible
- INVISIBLE_CHARACTERS_RE = re.compile(
- '[' + INVISIBLE_CHARACTERS + ']',
- re.UNICODE
- )
-
- #: String to replace invisible characters with. This can be a character, a
- #: string, or even a function that takes a Python re matchobj
- INVISIBLE_REPLACEMENT_CHAR = '?'
-
-
- class Cleaner(object):
- """Cleaner for cleaning HTML fragments of malicious content
-
- This cleaner is security-focused; its sole purpose is to remove
- malicious content from a string so that it can be displayed as content in
- a web page.
-
- To use::
-
- from bleach.sanitizer import Cleaner
-
- cleaner = Cleaner()
-
- for text in all_the_yucky_things:
- sanitized = cleaner.clean(text)
-
- .. Note::
-
- This cleaner is not designed to transform content for use in
- non-web-page contexts.
-
- .. Warning::
-
- This cleaner is not thread-safe--the html parser has internal state.
- Create a separate cleaner per thread!
-
-
- """
-
- def __init__(self, tags=ALLOWED_TAGS, attributes=ALLOWED_ATTRIBUTES,
- styles=ALLOWED_STYLES, protocols=ALLOWED_PROTOCOLS, strip=False,
- strip_comments=True, filters=None):
- """Initializes a Cleaner
-
- :arg list tags: allowed list of tags; defaults to
- ``bleach.sanitizer.ALLOWED_TAGS``
-
- :arg dict attributes: allowed attributes; can be a callable, list or dict;
- defaults to ``bleach.sanitizer.ALLOWED_ATTRIBUTES``
-
- :arg list styles: allowed list of css styles; defaults to
- ``bleach.sanitizer.ALLOWED_STYLES``
-
- :arg list protocols: allowed list of protocols for links; defaults
- to ``bleach.sanitizer.ALLOWED_PROTOCOLS``
-
- :arg bool strip: whether or not to strip disallowed elements
-
- :arg bool strip_comments: whether or not to strip HTML comments
-
- :arg list filters: list of html5lib Filter classes to pass streamed content through
-
- .. seealso:: http://html5lib.readthedocs.io/en/latest/movingparts.html#filters
-
- .. Warning::
-
- Using filters changes the output of ``bleach.Cleaner.clean``.
- Make sure the way the filters change the output is secure.
-
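- For example, a Cleaner with a custom configuration (illustrative
- values)::
-
- cleaner = Cleaner(tags=['b', 'i'], attributes={'*': ['class']}, strip=True)
-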
- """
- self.tags = tags
- self.attributes = attributes
- self.styles = styles
- self.protocols = protocols
- self.strip = strip
- self.strip_comments = strip_comments
- self.filters = filters or []
-
- self.parser = html5lib_shim.BleachHTMLParser(
- tags=self.tags,
- strip=self.strip,
- consume_entities=False,
- namespaceHTMLElements=False
- )
- self.walker = html5lib_shim.getTreeWalker('etree')
- self.serializer = html5lib_shim.BleachHTMLSerializer(
- quote_attr_values='always',
- omit_optional_tags=False,
- escape_lt_in_attrs=True,
-
- # We want to leave entities as they are without escaping or
- # resolving or expanding
- resolve_entities=False,
-
- # Bleach has its own sanitizer, so don't use the html5lib one
- sanitize=False,
-
- # Bleach sanitizer alphabetizes already, so don't use the html5lib one
- alphabetical_attributes=False,
- )
-
- def clean(self, text):
- """Cleans text and returns sanitized result as unicode
-
- :arg str text: text to be cleaned
-
- :returns: sanitized text as unicode
-
- :raises TypeError: if ``text`` is not a text type
-
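- For example, with the default configuration a disallowed tag gets
- escaped rather than removed (output shown is approximate)::
-
- cleaner = Cleaner()
- cleaner.clean(u'<span>is not allowed</span>')
- # u'&lt;span&gt;is not allowed&lt;/span&gt;'
-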
- """
- if not isinstance(text, six.string_types):
- message = "argument cannot be of '{name}' type, must be of text type".format(
- name=text.__class__.__name__)
- raise TypeError(message)
-
- if not text:
- return u''
-
- text = force_unicode(text)
-
- dom = self.parser.parseFragment(text)
- filtered = BleachSanitizerFilter(
- source=self.walker(dom),
-
- # Bleach-sanitizer-specific things
- attributes=self.attributes,
- strip_disallowed_elements=self.strip,
- strip_html_comments=self.strip_comments,
-
- # html5lib-sanitizer things
- allowed_elements=self.tags,
- allowed_css_properties=self.styles,
- allowed_protocols=self.protocols,
- allowed_svg_properties=[],
- )
-
- # Apply any filters after the BleachSanitizerFilter
- for filter_class in self.filters:
- filtered = filter_class(source=filtered)
-
- return self.serializer.render(filtered)
-
-
- def attribute_filter_factory(attributes):
- """Generates attribute filter function for the given attributes value
-
- The attributes value can take one of several shapes. This returns a filter
- function appropriate to the attributes value. One nice thing about this is
- that there are fewer if/then shenanigans in the ``allow_token`` method.
-
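- For example, a dict-shaped attributes value whose per-tag values are a
- list and a callable (illustrative)::
-
- attributes = {'a': ['href', 'title'], 'img': lambda tag, name, value: name == 'alt'}
- attr_filter = attribute_filter_factory(attributes)
- attr_filter('a', 'href', 'http://example.com')  # True
- attr_filter('img', 'src', 'x.png')  # False
-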
- """
- if callable(attributes):
- return attributes
-
- if isinstance(attributes, dict):
- def _attr_filter(tag, attr, value):
- if tag in attributes:
- attr_val = attributes[tag]
- if callable(attr_val):
- return attr_val(tag, attr, value)
-
- if attr in attr_val:
- return True
-
- if '*' in attributes:
- attr_val = attributes['*']
- if callable(attr_val):
- return attr_val(tag, attr, value)
-
- return attr in attr_val
-
- return False
-
- return _attr_filter
-
- if isinstance(attributes, list):
- def _attr_filter(tag, attr, value):
- return attr in attributes
-
- return _attr_filter
-
- raise ValueError('attributes needs to be a callable, a list or a dict')
-
-
- class BleachSanitizerFilter(html5lib_shim.SanitizerFilter):
- """html5lib Filter that sanitizes text
-
- This filter can be used anywhere html5lib filters can be used.
-
- """
- def __init__(self, source, attributes=ALLOWED_ATTRIBUTES,
- strip_disallowed_elements=False, strip_html_comments=True,
- **kwargs):
- """Creates a BleachSanitizerFilter instance
-
- :arg Treewalker source: stream
-
- :arg list tags: allowed list of tags; defaults to
- ``bleach.sanitizer.ALLOWED_TAGS``
-
- :arg dict attributes: allowed attributes; can be a callable, list or dict;
- defaults to ``bleach.sanitizer.ALLOWED_ATTRIBUTES``
-
- :arg list styles: allowed list of css styles; defaults to
- ``bleach.sanitizer.ALLOWED_STYLES``
-
- :arg list protocols: allowed list of protocols for links; defaults
- to ``bleach.sanitizer.ALLOWED_PROTOCOLS``
-
- :arg bool strip_disallowed_elements: whether or not to strip disallowed
- elements
-
- :arg bool strip_html_comments: whether or not to strip HTML comments
-
- """
- self.attr_filter = attribute_filter_factory(attributes)
- self.strip_disallowed_elements = strip_disallowed_elements
- self.strip_html_comments = strip_html_comments
-
- super(BleachSanitizerFilter, self).__init__(source, **kwargs)
-
- def sanitize_stream(self, token_iterator):
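- """Sanitizes each token in the stream, dropping or expanding it as needed"""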
- for token in token_iterator:
- ret = self.sanitize_token(token)
-
- if not ret:
- continue
-
- if isinstance(ret, list):
- for subtoken in ret:
- yield subtoken
- else:
- yield ret
-
- def merge_characters(self, token_iterator):
- """Merge consecutive Characters tokens in a stream"""
- characters_buffer = []
-
- for token in token_iterator:
- if characters_buffer:
- if token['type'] == 'Characters':
- characters_buffer.append(token)
- continue
- else:
- # Merge all the characters tokens together into one and then
- # operate on it.
- new_token = {
- 'data': ''.join([char_token['data'] for char_token in characters_buffer]),
- 'type': 'Characters'
- }
- characters_buffer = []
- yield new_token
-
- elif token['type'] == 'Characters':
- characters_buffer.append(token)
- continue
-
- yield token
-
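- # Flush whatever is left in the buffer as a final (possibly empty)
- # Characters token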
- new_token = {
- 'data': ''.join([char_token['data'] for char_token in characters_buffer]),
- 'type': 'Characters'
- }
- yield new_token
-
- def __iter__(self):
- return self.merge_characters(self.sanitize_stream(html5lib_shim.Filter.__iter__(self)))
-
- def sanitize_token(self, token):
- """Sanitize a token either by HTML-encoding or dropping.
-
- Unlike sanitizer.Filter, allowed_attributes can be a dict of {'tag':
- ['attribute', 'pairs'], 'other_tag': callable}.
-
- Here callable is a function that takes the tag, the attribute name, and
- the attribute value. It should return True or False.
-
- Also gives the option to strip tags instead of encoding.
-
- :arg dict token: token to sanitize
-
- :returns: token or list of tokens
-
- """
- token_type = token['type']
- if token_type in ['StartTag', 'EndTag', 'EmptyTag']:
- if token['name'] in self.allowed_elements:
- return self.allow_token(token)
-
- elif self.strip_disallowed_elements:
- return None
-
- else:
- if 'data' in token:
- # Alphabetize the attributes before calling .disallowed_token()
- # so that the resulting string is stable
- token['data'] = alphabetize_attributes(token['data'])
- return self.disallowed_token(token)
-
- elif token_type == 'Comment':
- if not self.strip_html_comments:
- return token
- else:
- return None
-
- elif token_type == 'Characters':
- return self.sanitize_characters(token)
-
- else:
- return token
-
- def sanitize_characters(self, token):
- """Handles Characters tokens
-
- Our overridden tokenizer doesn't do anything with entities. However,
- that means that the serializer will convert all ``&`` in Characters
- tokens to ``&amp;``.
-
- Since we don't want that, we extract entities here and convert them to
- Entity tokens so the serializer will let them be.
-
- :arg token: the Characters token to work on
-
- :returns: a list of tokens
-
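- For example, a Characters token whose data is ``'&amp; more'`` comes
- back (roughly) as an Entity token followed by a Characters token::
-
- [{'type': 'Entity', 'name': 'amp'}, {'type': 'Characters', 'data': ' more'}]
-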
- """
- data = token.get('data', '')
-
- if not data:
- return token
-
- data = INVISIBLE_CHARACTERS_RE.sub(INVISIBLE_REPLACEMENT_CHAR, data)
- token['data'] = data
-
- # If there isn't a & in the data, we can return now
- if '&' not in data:
- return token
-
- new_tokens = []
-
- # For each possible entity that starts with a "&", we try to extract an
- # actual entity and re-tokenize accordingly
- for part in html5lib_shim.next_possible_entity(data):
- if not part:
- continue
-
- if part.startswith('&'):
- entity = html5lib_shim.match_entity(part)
- if entity is not None:
- new_tokens.append({'type': 'Entity', 'name': entity})
- # Length of the entity plus 2--one for & at the beginning
- # and one for ; at the end
- remainder = part[len(entity) + 2:]
- if remainder:
- new_tokens.append({'type': 'Characters', 'data': remainder})
- continue
-
- new_tokens.append({'type': 'Characters', 'data': part})
-
- return new_tokens
-
- def sanitize_uri_value(self, value, allowed_protocols):
- """Checks a uri value to see if it's allowed
-
- :arg value: the uri value to sanitize
- :arg allowed_protocols: list of allowed protocols
-
- :returns: allowed value or None
-
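- For example (illustrative values)::
-
- sanitize_uri_value('http://example.com', ['http'])  # 'http://example.com'
- sanitize_uri_value('javascript:alert(1)', ['http'])  # None
-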
- """
- # NOTE(willkg): This transforms the value into one that's easier to
- # match and verify, but shouldn't get returned since it's vastly
- # different than the original value.
-
- # Convert all character entities in the value
- new_value = html5lib_shim.convert_entities(value)
-
- # Nix backtick, space characters, and control characters
- new_value = re.sub(
- "[`\000-\040\177-\240\s]+",
- '',
- new_value
- )
-
- # Remove REPLACEMENT characters
- new_value = new_value.replace('\ufffd', '')
-
- # Lowercase it--this breaks the value, but makes it easier to match
- # against
- new_value = new_value.lower()
-
- try:
- # Drop attributes with uri values that have protocols that aren't
- # allowed
- parsed = urlparse(new_value)
- except ValueError:
- # URI is impossible to parse, therefore it's not allowed
- return None
-
- if parsed.scheme:
- # If urlparse found a scheme, check that
- if parsed.scheme in allowed_protocols:
- return value
-
- else:
- # Allow uris that are just an anchor
- if new_value.startswith('#'):
- return value
-
- # Handle protocols that urlparse doesn't recognize like "myprotocol"
- if ':' in new_value and new_value.split(':')[0] in allowed_protocols:
- return value
-
- # If there's no protocol/scheme specified, then assume it's "http"
- # and see if that's allowed
- if 'http' in allowed_protocols:
- return value
-
- return None
-
- def allow_token(self, token):
- """Handles the case where we're allowing the tag"""
- if 'data' in token:
- # Loop through all the attributes and drop the ones that are not
- # allowed, are unsafe or break other rules. Additionally, fix
- # attribute values that need fixing.
- #
- # At the end of this loop, we have the final set of attributes
- # we're keeping.
- attrs = {}
- for namespaced_name, val in token['data'].items():
- namespace, name = namespaced_name
-
- # Drop attributes that are not explicitly allowed
- #
- # NOTE(willkg): We pass in the attribute name--not a namespaced
- # name.
- if not self.attr_filter(token['name'], name, val):
- continue
-
- # Drop attributes with uri values that use a disallowed protocol
- # Sanitize attributes with uri values
- if namespaced_name in self.attr_val_is_uri:
- new_value = self.sanitize_uri_value(val, self.allowed_protocols)
- if new_value is None:
- continue
- val = new_value
-
- # Drop values in svg attrs with non-local IRIs
- if namespaced_name in self.svg_attr_val_allows_ref:
- new_val = re.sub(r'url\s*\(\s*[^#\s][^)]+?\)',
- ' ',
- unescape(val))
- new_val = new_val.strip()
- if not new_val:
- continue
-
- else:
- # Replace the val with the unescaped version because
- # it's an IRI
- val = new_val
-
- # Drop href and xlink:href attr for svg elements with non-local IRIs
- if (None, token['name']) in self.svg_allow_local_href:
- if namespaced_name in [
- (None, 'href'), (html5lib_shim.namespaces['xlink'], 'href')
- ]:
- if re.search(r'^\s*[^#\s]', val):
- continue
-
- # If it's a style attribute, sanitize it
- if namespaced_name == (None, u'style'):
- val = self.sanitize_css(val)
-
- # At this point, we want to keep the attribute, so add it in
- attrs[namespaced_name] = val
-
- token['data'] = alphabetize_attributes(attrs)
-
- return token
-
- def disallowed_token(self, token):
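- """Turns a disallowed token back into its text form as a Characters token"""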
- token_type = token["type"]
- if token_type == "EndTag":
- token["data"] = "</%s>" % token["name"]
-
- elif token["data"]:
- assert token_type in ("StartTag", "EmptyTag")
- attrs = []
- for (ns, name), v in token["data"].items():
- # If we end up with a namespace, but no name, switch them so we
- # have a valid name to use.
- if ns and not name:
- ns, name = name, ns
-
- # Figure out namespaced name if the namespace is appropriate
- # and exists; if the ns isn't in prefixes, then drop it.
- if ns is None or ns not in html5lib_shim.prefixes:
- namespaced_name = name
- else:
- namespaced_name = '%s:%s' % (html5lib_shim.prefixes[ns], name)
-
- attrs.append(' %s="%s"' % (
- namespaced_name,
- # NOTE(willkg): HTMLSerializer escapes attribute values
- # already, so if we do it here (like HTMLSerializer does),
- # then we end up double-escaping.
- v)
- )
- token["data"] = "<%s%s>" % (token["name"], ''.join(attrs))
-
- else:
- token["data"] = "<%s>" % token["name"]
-
- if token.get("selfClosing"):
- token["data"] = token["data"][:-1] + "/>"
-
- token["type"] = "Characters"
-
- del token["name"]
- return token
-
- def sanitize_css(self, style):
- """Sanitizes css in style tags"""
- # Convert entities in the style so that it can be parsed as CSS
- style = html5lib_shim.convert_entities(style)
-
- # Drop any url values before we do anything else
- style = re.compile(r'url\s*\(\s*[^\s)]+?\s*\)\s*').sub(' ', style)
-
- # The gauntlet of sanitization
-
- # Validate the css in the style value and if it's not valid, then drop
- # the whole thing.
- parts = style.split(';')
- gauntlet = re.compile(
- r"""^([-/:,#%.'"\sa-zA-Z0-9!]|\w-\w|'[\s\w]+'\s*|"[\s\w]+"|\([\d,%\.\s]+\))*$"""
- )
-
- for part in parts:
- if not gauntlet.match(part):
- return ''
-
- if not re.match(r"^\s*([-\w]+\s*:[^:;]*(;\s*|$))*$", style):
- return ''
-
- clean = []
- for prop, value in re.findall(r'([-\w]+)\s*:\s*([^:;]*)', style):
- if not value:
- continue
-
- if prop.lower() in self.allowed_css_properties:
- clean.append(prop + ': ' + value + ';')
-
- elif prop.lower() in self.allowed_svg_properties:
- clean.append(prop + ': ' + value + ';')
-
- return ' '.join(clean)