from __future__ import unicode_literals
from itertools import chain
import re

import six
from six.moves.urllib.parse import urlparse
from xml.sax.saxutils import unescape

from bleach import html5lib_shim
from bleach.utils import alphabetize_attributes, force_unicode

#: List of allowed tags
ALLOWED_TAGS = [
    'a',
    'abbr',
    'acronym',
    'b',
    'blockquote',
    'code',
    'em',
    'i',
    'li',
    'ol',
    'strong',
    'ul',
]

#: Map of allowed attributes by tag
ALLOWED_ATTRIBUTES = {
    'a': ['href', 'title'],
    'abbr': ['title'],
    'acronym': ['title'],
}

#: List of allowed styles
ALLOWED_STYLES = []

#: List of allowed protocols
ALLOWED_PROTOCOLS = ['http', 'https', 'mailto']

#: Invisible characters--0 to and including 31 except 9 (tab), 10 (lf), and 13 (cr)
INVISIBLE_CHARACTERS = ''.join(
    [chr(c) for c in chain(range(0, 9), range(11, 13), range(14, 32))]
)

#: Regexp for characters that are invisible
INVISIBLE_CHARACTERS_RE = re.compile(
    '[' + INVISIBLE_CHARACTERS + ']',
    re.UNICODE
)

#: String to replace invisible characters with. This can be a character, a
#: string, or even a function that takes a Python re matchobj
INVISIBLE_REPLACEMENT_CHAR = '?'
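
# Because re.sub accepts a callable replacement, INVISIBLE_REPLACEMENT_CHAR
# can be swapped for a function. A minimal sketch (illustrative, not part of
# the library; ``show_codepoint`` is a hypothetical name)::
#
#     import bleach.sanitizer
#
#     def show_codepoint(match):
#         # Render each invisible character as an escaped codepoint.
#         return '\\x{:02x}'.format(ord(match.group(0)))
#
#     bleach.sanitizer.INVISIBLE_REPLACEMENT_CHAR = show_codepoint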


class Cleaner(object):
    """Cleaner for cleaning HTML fragments of malicious content

    This cleaner is a security-focused function whose sole purpose is to remove
    malicious content from a string such that it can be displayed as content in
    a web page.

    To use::

        from bleach.sanitizer import Cleaner

        cleaner = Cleaner()

        for text in all_the_yucky_things:
            sanitized = cleaner.clean(text)

    .. Note::

       This cleaner is not designed for transforming content to be used in
       non-web-page contexts.

    .. Warning::

       This cleaner is not thread-safe--the html parser has internal state.
       Create a separate cleaner per thread!

    """
    def __init__(self, tags=ALLOWED_TAGS, attributes=ALLOWED_ATTRIBUTES,
                 styles=ALLOWED_STYLES, protocols=ALLOWED_PROTOCOLS, strip=False,
                 strip_comments=True, filters=None):
        """Initializes a Cleaner

        :arg list tags: allowed list of tags; defaults to
            ``bleach.sanitizer.ALLOWED_TAGS``

        :arg dict attributes: allowed attributes; can be a callable, list or dict;
            defaults to ``bleach.sanitizer.ALLOWED_ATTRIBUTES``

        :arg list styles: allowed list of css styles; defaults to
            ``bleach.sanitizer.ALLOWED_STYLES``

        :arg list protocols: allowed list of protocols for links; defaults
            to ``bleach.sanitizer.ALLOWED_PROTOCOLS``

        :arg bool strip: whether or not to strip disallowed elements

        :arg bool strip_comments: whether or not to strip HTML comments

        :arg list filters: list of html5lib Filter classes to pass streamed content through

            .. seealso:: http://html5lib.readthedocs.io/en/latest/movingparts.html#filters

            .. Warning::

               Using filters changes the output of ``bleach.Cleaner.clean``.
               Make sure the way the filters change the output is secure.

        """
        self.tags = tags
        self.attributes = attributes
        self.styles = styles
        self.protocols = protocols
        self.strip = strip
        self.strip_comments = strip_comments
        self.filters = filters or []

        self.parser = html5lib_shim.BleachHTMLParser(
            tags=self.tags,
            strip=self.strip,
            consume_entities=False,
            namespaceHTMLElements=False
        )
        self.walker = html5lib_shim.getTreeWalker('etree')
        self.serializer = html5lib_shim.BleachHTMLSerializer(
            quote_attr_values='always',
            omit_optional_tags=False,
            escape_lt_in_attrs=True,

            # We want to leave entities as they are without escaping or
            # resolving or expanding
            resolve_entities=False,

            # Bleach has its own sanitizer, so don't use the html5lib one
            sanitize=False,

            # Bleach sanitizer alphabetizes already, so don't use the html5lib one
            alphabetical_attributes=False,
        )
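
    # A sketch of customizing the Cleaner (illustrative; ``ShoutFilter`` is a
    # hypothetical filter class, not part of bleach)::
    #
    #     from html5lib.filters.base import Filter
    #
    #     class ShoutFilter(Filter):
    #         def __iter__(self):
    #             for token in Filter.__iter__(self):
    #                 if token['type'] == 'Characters':
    #                     token['data'] = token['data'].upper()
    #                 yield token
    #
    #     cleaner = Cleaner(tags=['b', 'i'], filters=[ShoutFilter])
    #     cleaner.clean('<b>hi</b> there')
    #     # -> '<b>HI</b> THERE' -- tag names are untouched; only text
    #     # tokens pass through the filter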

    def clean(self, text):
        """Cleans text and returns sanitized result as unicode

        :arg str text: text to be cleaned

        :returns: sanitized text as unicode

        :raises TypeError: if ``text`` is not a text type

        """
        if not isinstance(text, six.string_types):
            message = "argument cannot be of '{name}' type, must be of text type".format(
                name=text.__class__.__name__)
            raise TypeError(message)

        if not text:
            return u''

        text = force_unicode(text)

        dom = self.parser.parseFragment(text)
        filtered = BleachSanitizerFilter(
            source=self.walker(dom),

            # Bleach-sanitizer-specific things
            attributes=self.attributes,
            strip_disallowed_elements=self.strip,
            strip_html_comments=self.strip_comments,

            # html5lib-sanitizer things
            allowed_elements=self.tags,
            allowed_css_properties=self.styles,
            allowed_protocols=self.protocols,
            allowed_svg_properties=[],
        )

        # Apply any filters after the BleachSanitizerFilter
        for filter_class in self.filters:
            filtered = filter_class(source=filtered)

        return self.serializer.render(filtered)
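
    # With the defaults, disallowed tags are escaped rather than stripped
    # (``strip`` is False), so::
    #
    #     Cleaner().clean('an <script>evil()</script> example')
    #     # -> 'an &lt;script&gt;evil()&lt;/script&gt; example'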


def attribute_filter_factory(attributes):
    """Generates attribute filter function for the given attributes value

    The attributes value can take one of several shapes. This returns a filter
    function appropriate to the attributes value. One nice thing about this is
    that there are fewer if/then shenanigans in the ``allow_token`` method.

    """
    if callable(attributes):
        return attributes

    if isinstance(attributes, dict):
        def _attr_filter(tag, attr, value):
            if tag in attributes:
                attr_val = attributes[tag]
                if callable(attr_val):
                    return attr_val(tag, attr, value)

                if attr in attr_val:
                    return True

            if '*' in attributes:
                attr_val = attributes['*']
                if callable(attr_val):
                    return attr_val(tag, attr, value)

                return attr in attr_val

            return False

        return _attr_filter

    if isinstance(attributes, list):
        def _attr_filter(tag, attr, value):
            return attr in attributes

        return _attr_filter

    raise ValueError('attributes needs to be a callable, a list or a dict')
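
# The three accepted shapes, illustrated (the values here are hypothetical)::
#
#     # list: these attribute names are allowed on every tag
#     attribute_filter_factory(['title', 'href'])
#
#     # dict: per-tag lists or callables; '*' applies to all tags
#     attribute_filter_factory({'a': ['href', 'title'], '*': ['class']})
#
#     # callable: full control; return True to keep the attribute
#     attribute_filter_factory(lambda tag, name, value: name == 'title')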


class BleachSanitizerFilter(html5lib_shim.SanitizerFilter):
    """html5lib Filter that sanitizes text

    This filter can be used anywhere html5lib filters can be used.

    """
    def __init__(self, source, attributes=ALLOWED_ATTRIBUTES,
                 strip_disallowed_elements=False, strip_html_comments=True,
                 **kwargs):
        """Creates a BleachSanitizerFilter instance

        :arg Treewalker source: stream

        :arg list tags: allowed list of tags; defaults to
            ``bleach.sanitizer.ALLOWED_TAGS``

        :arg dict attributes: allowed attributes; can be a callable, list or dict;
            defaults to ``bleach.sanitizer.ALLOWED_ATTRIBUTES``

        :arg list styles: allowed list of css styles; defaults to
            ``bleach.sanitizer.ALLOWED_STYLES``

        :arg list protocols: allowed list of protocols for links; defaults
            to ``bleach.sanitizer.ALLOWED_PROTOCOLS``

        :arg bool strip_disallowed_elements: whether or not to strip disallowed
            elements

        :arg bool strip_html_comments: whether or not to strip HTML comments

        """
        self.attr_filter = attribute_filter_factory(attributes)

        self.strip_disallowed_elements = strip_disallowed_elements
        self.strip_html_comments = strip_html_comments

        return super(BleachSanitizerFilter, self).__init__(source, **kwargs)
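
    # Standalone use mirrors what Cleaner.clean does (a sketch using only
    # names defined in this module)::
    #
    #     parser = html5lib_shim.BleachHTMLParser(
    #         tags=ALLOWED_TAGS, strip=False, consume_entities=False,
    #         namespaceHTMLElements=False)
    #     walker = html5lib_shim.getTreeWalker('etree')
    #     dom = parser.parseFragment('<a href="javascript:x">hi</a>')
    #     filtered = BleachSanitizerFilter(
    #         source=walker(dom),
    #         allowed_elements=ALLOWED_TAGS,
    #         allowed_protocols=ALLOWED_PROTOCOLS,
    #         allowed_css_properties=[],
    #         allowed_svg_properties=[],
    #     )
    #     # iterate ``filtered`` or hand it to an html5lib serializer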

    def sanitize_stream(self, token_iterator):
        for token in token_iterator:
            ret = self.sanitize_token(token)

            if not ret:
                continue

            if isinstance(ret, list):
                for subtoken in ret:
                    yield subtoken
            else:
                yield ret

    def merge_characters(self, token_iterator):
        """Merge consecutive Characters tokens in a stream"""
        characters_buffer = []

        for token in token_iterator:
            if characters_buffer:
                if token['type'] == 'Characters':
                    characters_buffer.append(token)
                    continue
                else:
                    # Merge all the characters tokens together into one and then
                    # operate on it.
                    new_token = {
                        'data': ''.join([char_token['data'] for char_token in characters_buffer]),
                        'type': 'Characters'
                    }
                    characters_buffer = []
                    yield new_token

            elif token['type'] == 'Characters':
                characters_buffer.append(token)
                continue

            yield token

        new_token = {
            'data': ''.join([char_token['data'] for char_token in characters_buffer]),
            'type': 'Characters'
        }
        yield new_token
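
    # For example (an illustrative token stream), adjacent Characters tokens
    # collapse into one::
    #
    #     [{'type': 'Characters', 'data': 'AT'},
    #      {'type': 'Characters', 'data': '&'},
    #      {'type': 'Characters', 'data': 'T'}]
    #     # -> [{'type': 'Characters', 'data': 'AT&T'}]
    #
    # The final yield emits an empty Characters token when the buffer is
    # empty; the serializer writes nothing for empty data, so that's harmless.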

    def __iter__(self):
        return self.merge_characters(self.sanitize_stream(html5lib_shim.Filter.__iter__(self)))

    def sanitize_token(self, token):
        """Sanitize a token either by HTML-encoding or dropping.

        Unlike sanitizer.Filter, allowed_attributes can be a dict of {'tag':
        ['attribute', 'pairs'], 'tag': callable}.

        Here callable is a function that takes the tag, the attribute name,
        and the attribute value. It should return True or False.

        Also gives the option to strip tags instead of encoding.

        :arg dict token: token to sanitize

        :returns: token or list of tokens

        """
        token_type = token['type']
        if token_type in ['StartTag', 'EndTag', 'EmptyTag']:
            if token['name'] in self.allowed_elements:
                return self.allow_token(token)

            elif self.strip_disallowed_elements:
                return None

            else:
                if 'data' in token:
                    # Alphabetize the attributes before calling .disallowed_token()
                    # so that the resulting string is stable
                    token['data'] = alphabetize_attributes(token['data'])
                return self.disallowed_token(token)

        elif token_type == 'Comment':
            if not self.strip_html_comments:
                return token
            else:
                return None

        elif token_type == 'Characters':
            return self.sanitize_characters(token)

        else:
            return token

    def sanitize_characters(self, token):
        """Handles Characters tokens

        Our overridden tokenizer doesn't do anything with entities. However,
        that means that the serializer will convert all ``&`` in Characters
        tokens to ``&amp;``.

        Since we don't want that, we extract entities here and convert them to
        Entity tokens so the serializer will let them be.

        :arg token: the Characters token to work on

        :returns: a list of tokens

        """
        data = token.get('data', '')

        if not data:
            return token

        data = INVISIBLE_CHARACTERS_RE.sub(INVISIBLE_REPLACEMENT_CHAR, data)
        token['data'] = data

        # If there isn't a & in the data, we can return now
        if '&' not in data:
            return token

        new_tokens = []

        # For each possible entity that starts with a "&", we try to extract an
        # actual entity and re-tokenize accordingly
        for part in html5lib_shim.next_possible_entity(data):
            if not part:
                continue

            if part.startswith('&'):
                entity = html5lib_shim.match_entity(part)
                if entity is not None:
                    new_tokens.append({'type': 'Entity', 'name': entity})
                    # Length of the entity plus 2--one for & at the beginning
                    # and one for ; at the end
                    remainder = part[len(entity) + 2:]
                    if remainder:
                        new_tokens.append({'type': 'Characters', 'data': remainder})
                    continue

            new_tokens.append({'type': 'Characters', 'data': part})

        return new_tokens
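
    # For example (illustrative), 'AT&amp;T rocks' arrives as one Characters
    # token and leaves as::
    #
    #     [{'type': 'Characters', 'data': 'AT'},
    #      {'type': 'Entity', 'name': 'amp'},
    #      {'type': 'Characters', 'data': 'T rocks'}]
    #
    # so the serializer emits '&amp;' verbatim instead of double-escaping it.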

    def sanitize_uri_value(self, value, allowed_protocols):
        """Checks a uri value to see if it's allowed

        :arg value: the uri value to sanitize
        :arg allowed_protocols: list of allowed protocols

        :returns: allowed value or None

        """
        # NOTE(willkg): This transforms the value into one that's easier to
        # match and verify, but shouldn't get returned since it's vastly
        # different than the original value.

        # Convert all character entities in the value
        new_value = html5lib_shim.convert_entities(value)

        # Nix backtick, space characters, and control characters
        new_value = re.sub(
            r"[`\000-\040\177-\240\s]+",
            '',
            new_value
        )

        # Remove REPLACEMENT characters
        new_value = new_value.replace('\ufffd', '')

        # Lowercase it--this breaks the value, but makes it easier to match
        # against
        new_value = new_value.lower()

        try:
            # Drop attributes with uri values that have protocols that aren't
            # allowed
            parsed = urlparse(new_value)
        except ValueError:
            # URI is impossible to parse, therefore it's not allowed
            return None

        if parsed.scheme:
            # If urlparse found a scheme, check that
            if parsed.scheme in allowed_protocols:
                return value

        else:
            # Allow uris that are just an anchor
            if new_value.startswith('#'):
                return value

            # Handle protocols that urlparse doesn't recognize like "myprotocol"
            if ':' in new_value and new_value.split(':')[0] in allowed_protocols:
                return value

            # If there's no protocol/scheme specified, then assume it's "http"
            # and see if that's allowed
            if 'http' in allowed_protocols:
                return value

        return None
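
    # Illustrative outcomes with the default ALLOWED_PROTOCOLS
    # (['http', 'https', 'mailto'])::
    #
    #     sanitize_uri_value('https://example.com', ALLOWED_PROTOCOLS)
    #     # -> 'https://example.com'  (scheme is allowed)
    #
    #     sanitize_uri_value('jav&#x09;ascript:alert(1)', ALLOWED_PROTOCOLS)
    #     # -> None  (entity converted, tab nixed, 'javascript' scheme blocked)
    #
    #     sanitize_uri_value('#fragment', ALLOWED_PROTOCOLS)
    #     # -> '#fragment'  (bare anchors are allowed)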

    def allow_token(self, token):
        """Handles the case where we're allowing the tag"""
        if 'data' in token:
            # Loop through all the attributes and drop the ones that are not
            # allowed, are unsafe or break other rules. Additionally, fix
            # attribute values that need fixing.
            #
            # At the end of this loop, we have the final set of attributes
            # we're keeping.
            attrs = {}
            for namespaced_name, val in token['data'].items():
                namespace, name = namespaced_name

                # Drop attributes that are not explicitly allowed
                #
                # NOTE(willkg): We pass in the attribute name--not a namespaced
                # name.
                if not self.attr_filter(token['name'], name, val):
                    continue

                # Drop attributes with uri values that use a disallowed protocol
                # Sanitize attributes with uri values
                if namespaced_name in self.attr_val_is_uri:
                    new_value = self.sanitize_uri_value(val, self.allowed_protocols)
                    if new_value is None:
                        continue
                    val = new_value

                # Drop values in svg attrs with non-local IRIs
                if namespaced_name in self.svg_attr_val_allows_ref:
                    new_val = re.sub(r'url\s*\(\s*[^#\s][^)]+?\)',
                                     ' ',
                                     unescape(val))
                    new_val = new_val.strip()
                    if not new_val:
                        continue

                    else:
                        # Replace the val with the unescaped version because
                        # it's an IRI
                        val = new_val

                # Drop href and xlink:href attr for svg elements with non-local IRIs
                if (None, token['name']) in self.svg_allow_local_href:
                    if namespaced_name in [
                            (None, 'href'), (html5lib_shim.namespaces['xlink'], 'href')
                    ]:
                        if re.search(r'^\s*[^#\s]', val):
                            continue

                # If it's a style attribute, sanitize it
                if namespaced_name == (None, u'style'):
                    val = self.sanitize_css(val)

                # At this point, we want to keep the attribute, so add it in
                attrs[namespaced_name] = val

            token['data'] = alphabetize_attributes(attrs)

        return token
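
    # For example (illustrative), an allowed <a> start tag token::
    #
    #     {'type': 'StartTag', 'name': 'a',
    #      'data': {(None, 'href'): 'javascript:alert(1)',
    #               (None, 'title'): 'hi'}}
    #
    # comes back with only the title kept: sanitize_uri_value returns None
    # for the disallowed scheme, so the href is dropped.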

    def disallowed_token(self, token):
        token_type = token["type"]
        if token_type == "EndTag":
            token["data"] = "</%s>" % token["name"]

        elif token["data"]:
            assert token_type in ("StartTag", "EmptyTag")
            attrs = []
            for (ns, name), v in token["data"].items():
                # If we end up with a namespace, but no name, switch them so we
                # have a valid name to use.
                if ns and not name:
                    ns, name = name, ns

                # Figure out namespaced name if the namespace is appropriate
                # and exists; if the ns isn't in prefixes, then drop it.
                if ns is None or ns not in html5lib_shim.prefixes:
                    namespaced_name = name
                else:
                    namespaced_name = '%s:%s' % (html5lib_shim.prefixes[ns], name)

                attrs.append(' %s="%s"' % (
                    namespaced_name,
                    # NOTE(willkg): HTMLSerializer escapes attribute values
                    # already, so if we do it here (like HTMLSerializer does),
                    # then we end up double-escaping.
                    v)
                )
            token["data"] = "<%s%s>" % (token["name"], ''.join(attrs))

        else:
            token["data"] = "<%s>" % token["name"]

        if token.get("selfClosing"):
            token["data"] = token["data"][:-1] + "/>"

        token["type"] = "Characters"

        del token["name"]
        return token
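
    # For example (illustrative), with the default strip=False a disallowed
    # start tag is turned into text::
    #
    #     {'type': 'StartTag', 'name': 'script', 'data': {}}
    #     # -> {'type': 'Characters', 'data': '<script>'}
    #
    # and the serializer escapes that to '&lt;script&gt;' on output.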

    def sanitize_css(self, style):
        """Sanitizes css in style tags"""
        # Convert entities in the style so that it can be parsed as CSS
        style = html5lib_shim.convert_entities(style)

        # Drop any url values before we do anything else
        style = re.compile(r'url\s*\(\s*[^\s)]+?\s*\)\s*').sub(' ', style)

        # The gauntlet of sanitization

        # Validate the css in the style tag and if it's not valid, then drop
        # the whole thing.
        parts = style.split(';')
        gauntlet = re.compile(
            r"""^([-/:,#%.'"\sa-zA-Z0-9!]|\w-\w|'[\s\w]+'\s*|"[\s\w]+"|\([\d,%\.\s]+\))*$"""
        )

        for part in parts:
            if not gauntlet.match(part):
                return ''

        if not re.match(r"^\s*([-\w]+\s*:[^:;]*(;\s*|$))*$", style):
            return ''

        clean = []
        for prop, value in re.findall(r'([-\w]+)\s*:\s*([^:;]*)', style):
            if not value:
                continue

            if prop.lower() in self.allowed_css_properties:
                clean.append(prop + ': ' + value + ';')

            elif prop.lower() in self.allowed_svg_properties:
                clean.append(prop + ': ' + value + ';')

        return ' '.join(clean)
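
    # For example (illustrative), with allowed_css_properties=['color']::
    #
    #     self.sanitize_css('color: red; position: absolute')
    #     # -> 'color: red;'  (position is not allowed and gets dropped)
    #
    #     self.sanitize_css('color: expression(alert(1))')
    #     # -> ''  (fails the gauntlet, so the whole value is dropped)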