alpcentaur
/
basabuuka_prototyp


								from __future__ import unicode_literals


								from itertools import chain

								import re


								import six

								from six.moves.urllib.parse import urlparse

								from xml.sax.saxutils import unescape


								from bleach import html5lib_shim

								from bleach.utils import alphabetize_attributes, force_unicode


								#: Default list of tags that are allowed through
								ALLOWED_TAGS = [
								    'a', 'abbr', 'acronym',
								    'b', 'blockquote',
								    'code',
								    'em',
								    'i',
								    'li',
								    'ol',
								    'strong',
								    'ul',
								]

								#: Default map of tag name -> attribute names allowed on that tag
								ALLOWED_ATTRIBUTES = {
								    'a': ['href', 'title'],
								    'abbr': ['title'],
								    'acronym': ['title'],
								}

								#: Default list of allowed css styles (empty: no style is allowed)
								ALLOWED_STYLES = []

								#: Default list of allowed protocols
								ALLOWED_PROTOCOLS = ['http', 'https', 'mailto']

								#: Invisible characters: code points 0 through 31, leaving out
								#: 9 (tab), 10 (lf) and 13 (cr)
								INVISIBLE_CHARACTERS = ''.join(
								    map(chr, chain(range(0, 9), range(11, 13), range(14, 32)))
								)

								#: Regexp that matches any single invisible character
								INVISIBLE_CHARACTERS_RE = re.compile('[%s]' % INVISIBLE_CHARACTERS, re.UNICODE)

								#: What invisible characters get replaced with. Because it is handed to
								#: ``re.sub`` it may be a character, a string, or a function that takes a
								#: Python re matchobj
								INVISIBLE_REPLACEMENT_CHAR = '?'


								class Cleaner(object):
								    """Reusable sanitizer for HTML fragments

								    A ``Cleaner`` is security-focused: its sole job is to remove malicious
								    content from a string so that the result can be displayed as content in a
								    web page.

								    Usage::

								        from bleach.sanitizer import Cleaner

								        cleaner = Cleaner()

								        for text in all_the_yucky_things:
								            sanitized = cleaner.clean(text)

								    .. Note::

								       The output is not meant to be used in non-web-page contexts.

								    .. Warning::

								       A cleaner is not thread-safe--the html parser has internal state.
								       Create a separate cleaner per thread!

								    """

								    def __init__(self, tags=ALLOWED_TAGS, attributes=ALLOWED_ATTRIBUTES,
								                 styles=ALLOWED_STYLES, protocols=ALLOWED_PROTOCOLS, strip=False,
								                 strip_comments=True, filters=None):
								        """Stores the configuration and builds parser, walker and serializer

								        :arg list tags: allowed list of tags; defaults to
								            ``bleach.sanitizer.ALLOWED_TAGS``

								        :arg dict attributes: allowed attributes; can be a callable, list or dict;
								            defaults to ``bleach.sanitizer.ALLOWED_ATTRIBUTES``

								        :arg list styles: allowed list of css styles; defaults to
								            ``bleach.sanitizer.ALLOWED_STYLES``

								        :arg list protocols: allowed list of protocols for links; defaults
								            to ``bleach.sanitizer.ALLOWED_PROTOCOLS``

								        :arg bool strip: whether or not to strip disallowed elements

								        :arg bool strip_comments: whether or not to strip HTML comments

								        :arg list filters: list of html5lib Filter classes; ``clean`` wraps the
								            sanitized stream in each of them, in order

								            .. seealso:: http://html5lib.readthedocs.io/en/latest/movingparts.html#filters

								            .. Warning::

								               Using filters changes the output of ``bleach.Cleaner.clean``.
								               Make sure the way the filters change the output are secure.

								        """
								        # Plain configuration, kept on the instance for use in clean()
								        self.tags = tags
								        self.attributes = attributes
								        self.styles = styles
								        self.protocols = protocols
								        self.strip = strip
								        self.strip_comments = strip_comments
								        self.filters = filters if filters else []

								        self.parser = html5lib_shim.BleachHTMLParser(
								            tags=self.tags,
								            strip=self.strip,
								            consume_entities=False,
								            namespaceHTMLElements=False
								        )
								        self.walker = html5lib_shim.getTreeWalker('etree')

								        serializer_options = dict(
								            quote_attr_values='always',
								            omit_optional_tags=False,
								            escape_lt_in_attrs=True,
								            # Entities are left exactly as they are: no escaping, resolving
								            # or expanding
								            resolve_entities=False,
								            # Sanitizing is done by Bleach's own filter, not html5lib's
								            sanitize=False,
								            # Attributes are already alphabetized by the Bleach sanitizer
								            alphabetical_attributes=False,
								        )
								        self.serializer = html5lib_shim.BleachHTMLSerializer(**serializer_options)

								    def clean(self, text):
								        """Sanitizes ``text`` and returns the result as unicode

								        :arg str text: text to be cleaned

								        :returns: sanitized text as unicode; ``u''`` for empty input

								        :raises TypeError: if ``text`` is not a text type

								        """
								        if not isinstance(text, six.string_types):
								            raise TypeError(
								                "argument cannot be of '{name}' type, must be of text type".format(
								                    name=text.__class__.__name__))

								        # Nothing to parse for an empty string
								        if not text:
								            return u''

								        fragment = self.parser.parseFragment(force_unicode(text))

								        stream = BleachSanitizerFilter(
								            source=self.walker(fragment),

								            # Options specific to the Bleach sanitizer
								            attributes=self.attributes,
								            strip_disallowed_elements=self.strip,
								            strip_html_comments=self.strip_comments,

								            # Options understood by the html5lib sanitizer
								            allowed_elements=self.tags,
								            allowed_css_properties=self.styles,
								            allowed_protocols=self.protocols,
								            allowed_svg_properties=[],
								        )

								        # User-supplied filters run after the BleachSanitizerFilter
								        for extra_filter in self.filters:
								            stream = extra_filter(source=stream)

								        return self.serializer.render(stream)


								def attribute_filter_factory(attributes):
								    """Builds the attribute filter function matching ``attributes``

								    ``attributes`` may be a callable, a dict or a list. Whatever its shape,
								    the result is a function called as ``fn(tag, attr, value)`` that returns
								    a truthy value when the attribute should be kept, so callers never have
								    to branch on the shape themselves.

								    * callable: returned unchanged
								    * dict: maps a tag name (or ``'*'`` for any tag) to either a callable
								      with the same ``(tag, attr, value)`` signature or a container of
								      allowed attribute names
								    * list: attribute names allowed on every tag

								    :raises ValueError: if ``attributes`` is none of the above

								    """
								    if callable(attributes):
								        return attributes

								    if isinstance(attributes, dict):
								        def _filter_from_dict(tag, attr, value):
								            # The tag-specific rule is consulted first
								            if tag in attributes:
								                tag_rule = attributes[tag]
								                if callable(tag_rule):
								                    return tag_rule(tag, attr, value)

								                if attr in tag_rule:
								                    return True

								            # No tag-specific acceptance: fall back to the wildcard rule
								            if '*' not in attributes:
								                return False

								            wildcard_rule = attributes['*']
								            if callable(wildcard_rule):
								                return wildcard_rule(tag, attr, value)

								            return attr in wildcard_rule

								        return _filter_from_dict

								    if isinstance(attributes, list):
								        def _filter_from_list(tag, attr, value):
								            return attr in attributes

								        return _filter_from_list

								    raise ValueError('attributes needs to be a callable, a list or a dict')


								class BleachSanitizerFilter(html5lib_shim.SanitizerFilter):

								    """html5lib Filter that sanitizes text


								    This filter can be used anywhere html5lib filters can be used.


								    """

								    def __init__(self, source, attributes=ALLOWED_ATTRIBUTES,
								                 strip_disallowed_elements=False, strip_html_comments=True,
								                 **kwargs):
								        """Creates a BleachSanitizerFilter instance

								        :arg Treewalker source: stream

								        :arg dict attributes: allowed attributes; can be a callable, list or dict;
								            defaults to ``bleach.sanitizer.ALLOWED_ATTRIBUTES``

								        :arg bool strip_disallowed_elements: whether or not to strip disallowed
								            elements

								        :arg bool strip_html_comments: whether or not to strip HTML comments

								        :arg kwargs: any remaining keyword arguments are forwarded unchanged
								            to ``html5lib_shim.SanitizerFilter.__init__``; ``Cleaner.clean``
								            uses this to pass ``allowed_elements``,
								            ``allowed_css_properties``, ``allowed_protocols`` and
								            ``allowed_svg_properties``

								        :raises ValueError: if ``attributes`` is not a callable, a list or a
								            dict

								        """
								        # Normalize ``attributes`` into a single ``fn(tag, attr, value)``
								        # predicate once, so ``allow_token`` does not branch on its shape.
								        self.attr_filter = attribute_filter_factory(attributes)
								        self.strip_disallowed_elements = strip_disallowed_elements
								        self.strip_html_comments = strip_html_comments

								        # __init__ must not return a value, so the parent initializer is
								        # simply called rather than returned.
								        super(BleachSanitizerFilter, self).__init__(source, **kwargs)


								    def sanitize_stream(self, token_iterator):
								        """Run every token through ``sanitize_token`` and flatten the results

								        Falsy results (dropped tokens) are skipped, a list result is yielded
								        item by item, and any other result is yielded as a single token.

								        :arg token_iterator: iterable of tokens

								        :returns: generator of sanitized tokens

								        """
								        for raw_token in token_iterator:
								            outcome = self.sanitize_token(raw_token)

								            if outcome:
								                if isinstance(outcome, list):
								                    for piece in outcome:
								                        yield piece
								                else:
								                    yield outcome


								    def merge_characters(self, token_iterator):
								        """Merge consecutive Characters tokens in a stream

								        Every run of adjacent ``Characters`` tokens is replaced by one new
								        ``Characters`` token whose ``data`` is the concatenation of the run;
								        all other tokens pass through untouched and in order.

								        No token is emitted for an empty run, so a stream that is empty or
								        that does not end with character data no longer gets a trailing
								        ``{'data': '', 'type': 'Characters'}`` token appended.

								        :arg token_iterator: iterable of token dicts, each with a ``'type'``
								            key (and a ``'data'`` key for ``Characters`` tokens)

								        :returns: generator of tokens

								        """
								        characters_buffer = []

								        for token in token_iterator:
								            if token['type'] == 'Characters':
								                # Keep collecting until a non-Characters token shows up
								                characters_buffer.append(token)
								                continue

								            if characters_buffer:
								                # Flush the pending run as one token before the current one
								                yield {
								                    'data': ''.join([char_token['data'] for char_token in characters_buffer]),
								                    'type': 'Characters'
								                }
								                characters_buffer = []

								            yield token

								        # Flush whatever is left, but only if there is something to flush
								        if characters_buffer:
								            yield {
								                'data': ''.join([char_token['data'] for char_token in characters_buffer]),
								                'type': 'Characters'
								            }


								    def __iter__(self):
								        """Iterate the source tokens, sanitized and with Characters runs merged"""
								        source_tokens = html5lib_shim.Filter.__iter__(self)
								        sanitized_tokens = self.sanitize_stream(source_tokens)
								        return self.merge_characters(sanitized_tokens)


								    def sanitize_token(self, token):
								        """Sanitize a token either by HTML-encoding or dropping.

								        What happens depends on the token type:

								        * ``StartTag``/``EndTag``/``EmptyTag``: a tag named in
								          ``self.allowed_elements`` goes through ``allow_token``; any other
								          tag is dropped when ``self.strip_disallowed_elements`` is set and
								          is otherwise turned into text by ``disallowed_token`` (with its
								          attributes alphabetized first so the text is stable)
								        * ``Comment``: dropped when ``self.strip_html_comments`` is set,
								          kept otherwise
								        * ``Characters``: handed to ``sanitize_characters``
								        * anything else: returned untouched

								        :arg dict token: token to sanitize

								        :returns: token, list of tokens, or ``None`` if the token is dropped

								        """
								        kind = token['type']

								        if kind == 'Characters':
								            return self.sanitize_characters(token)

								        if kind == 'Comment':
								            return None if self.strip_html_comments else token

								        if kind not in ('StartTag', 'EndTag', 'EmptyTag'):
								            return token

								        # From here on the token is a tag
								        if token['name'] in self.allowed_elements:
								            return self.allow_token(token)

								        if self.strip_disallowed_elements:
								            return None

								        if 'data' in token:
								            # Alphabetize the attributes before calling .disallowed_token()
								            # so that the resulting string is stable
								            token['data'] = alphabetize_attributes(token['data'])

								        return self.disallowed_token(token)


								    def sanitize_characters(self, token):
								        """Handles Characters tokens

								        Invisible characters are replaced first. Then, because our overridden
								        tokenizer leaves entities alone and the serializer would turn every
								        ``&`` of a Characters token into ``&amp;``, entities are pulled out
								        of the text and emitted as separate Entity tokens so the serializer
								        will let them be.

								        :arg token: the Characters token to work on

								        :returns: the (updated) token itself when its data is empty or has
								            no ``&`` in it; otherwise a list of Characters and Entity tokens

								        """
								        text = token.get('data', '')
								        if not text:
								            return token

								        text = INVISIBLE_CHARACTERS_RE.sub(INVISIBLE_REPLACEMENT_CHAR, text)
								        token['data'] = text

								        # Without a & there cannot be an entity, so we are done
								        if '&' not in text:
								            return token

								        pieces = []

								        # Each part that starts with a "&" is a possible entity; try to
								        # extract an actual entity from it and re-tokenize accordingly
								        for chunk in html5lib_shim.next_possible_entity(text):
								            if not chunk:
								                continue

								            entity = None
								            if chunk.startswith('&'):
								                entity = html5lib_shim.match_entity(chunk)

								            if entity is None:
								                # Plain text, or a "&" that does not start a known entity
								                pieces.append({'type': 'Characters', 'data': chunk})
								                continue

								            pieces.append({'type': 'Entity', 'name': entity})

								            # Skip the entity plus 2 characters--one for the & at the
								            # beginning and one for the ; at the end
								            tail = chunk[len(entity) + 2:]
								            if tail:
								                pieces.append({'type': 'Characters', 'data': tail})

								        return pieces


								    def sanitize_uri_value(self, value, allowed_protocols):
								        """Checks a uri value to see if it's allowed

								        The check is made against a normalized copy of ``value`` (entities
								        converted, backticks/whitespace/control characters and REPLACEMENT
								        characters removed, lowercased). The normalized copy is never
								        returned: the result is always either the original ``value`` or
								        ``None``.

								        :arg value: the uri value to sanitize

								        :arg allowed_protocols: list of allowed protocols

								        :returns: allowed value or None

								        """
								        # NOTE(willkg): This transforms the value into one that's easier to
								        # match and verify, but shouldn't get returned since it's vastly
								        # different than the original value.

								        # Convert all character entities in the value
								        new_value = html5lib_shim.convert_entities(value)

								        # Nix backtick, space characters, and control characters.
								        #
								        # This must be a raw string: ``\s`` is not a valid Python string
								        # escape. The octal escapes are interpreted by the ``re`` module and
								        # denote the same characters as before.
								        new_value = re.sub(
								            r"[`\000-\040\177-\240\s]+",
								            '',
								            new_value
								        )

								        # Remove REPLACEMENT characters
								        new_value = new_value.replace('\ufffd', '')

								        # Lowercase it--this breaks the value, but makes it easier to match
								        # against
								        new_value = new_value.lower()

								        try:
								            # Drop attributes with uri values that have protocols that aren't
								            # allowed
								            parsed = urlparse(new_value)
								        except ValueError:
								            # URI is impossible to parse, therefore it's not allowed
								            return None

								        if parsed.scheme:
								            # If urlparse found a scheme, check that
								            if parsed.scheme in allowed_protocols:
								                return value

								        else:
								            # Allow uris that are just an anchor
								            if new_value.startswith('#'):
								                return value

								            # Handle protocols that urlparse doesn't recognize like "myprotocol"
								            if ':' in new_value and new_value.split(':')[0] in allowed_protocols:
								                return value

								            # If there's no protocol/scheme specified, then assume it's "http"
								            # and see if that's allowed
								            if 'http' in allowed_protocols:
								                return value

								        return None


								    def allow_token(self, token):
								        """Handles the case where we're allowing the tag

								        If the token carries attributes (a ``'data'`` entry), they are
								        filtered and fixed up, and ``token['data']`` is replaced by the
								        alphabetized set of attributes that are kept.

								        :arg dict token: tag token whose name is allowed

								        :returns: the same token, updated in place

								        """
								        if 'data' not in token:
								            return token

								        tag_name = token['name']
								        href_names = (
								            (None, 'href'), (html5lib_shim.namespaces['xlink'], 'href')
								        )

								        # Walk all the attributes and skip the ones that are not allowed,
								        # are unsafe or break other rules; values that need fixing are
								        # fixed along the way. ``kept`` ends up as the final attribute set.
								        kept = {}
								        for namespaced_name, val in token['data'].items():
								            _, name = namespaced_name

								            # Drop attributes that are not explicitly allowed
								            #
								            # NOTE(willkg): We pass in the attribute name--not a namespaced
								            # name.
								            if not self.attr_filter(tag_name, name, val):
								                continue

								            # Attributes with uri values: drop them when the protocol is
								            # disallowed, otherwise use the sanitized value
								            if namespaced_name in self.attr_val_is_uri:
								                checked = self.sanitize_uri_value(val, self.allowed_protocols)
								                if checked is None:
								                    continue
								                val = checked

								            # Drop values in svg attrs with non-local IRIs
								            if namespaced_name in self.svg_attr_val_allows_ref:
								                local_only = re.sub(r'url\s*\(\s*[^#\s][^)]+?\)',
								                                    ' ',
								                                    unescape(val)).strip()
								                if not local_only:
								                    continue

								                # Keep the unescaped version because it's a iri
								                val = local_only

								            # Drop href and xlink:href attr for svg elements with non-local IRIs
								            if ((None, tag_name) in self.svg_allow_local_href and
								                    namespaced_name in href_names and
								                    re.search(r'^\s*[^#\s]', val)):
								                continue

								            # If it's a style attribute, sanitize it
								            if namespaced_name == (None, u'style'):
								                val = self.sanitize_css(val)

								            # The attribute survived every check, so keep it
								            kept[namespaced_name] = val

								        token['data'] = alphabetize_attributes(kept)
								        return token


								    def disallowed_token(self, token):
								        """Turns a disallowed tag token into a Characters token

								        The tag's markup (``<name attr="value">``, ``</name>`` or a
								        self-closing ``<name/>``) is rebuilt as text and stored in
								        ``token['data']``; the token's type becomes ``Characters`` and its
								        ``'name'`` entry is removed.

								        :arg dict token: StartTag, EndTag or EmptyTag token

								        :returns: the same token, converted in place

								        """
								        kind = token["type"]
								        tag_name = token["name"]

								        if kind == "EndTag":
								            markup = "</%s>" % tag_name

								        elif token["data"]:
								            assert kind in ("StartTag", "EmptyTag")
								            rendered = []
								            for (ns, name), value in token["data"].items():
								                # A namespace without a name: swap them so there is a valid
								                # name to use.
								                if ns and not name:
								                    ns, name = name, ns

								                # Prefix the name only when the namespace exists and is a
								                # known one; a namespace missing from prefixes is dropped.
								                if ns is None or ns not in html5lib_shim.prefixes:
								                    qualified = name
								                else:
								                    qualified = '%s:%s' % (html5lib_shim.prefixes[ns], name)

								                # NOTE(willkg): HTMLSerializer escapes attribute values
								                # already, so if we do it here (like HTMLSerializer does),
								                # then we end up double-escaping.
								                rendered.append(' %s="%s"' % (qualified, value))

								            markup = "<%s%s>" % (tag_name, ''.join(rendered))

								        else:
								            markup = "<%s>" % tag_name

								        if token.get("selfClosing"):
								            # Swap the closing ">" for "/>"
								            markup = markup[:-1] + "/>"

								        token["data"] = markup
								        token["type"] = "Characters"
								        del token["name"]

								        return token


								    def sanitize_css(self, style):
								        """Sanitizes css in style tags

								        :arg style: the value of a ``style`` attribute

								        :returns: the declarations whose property name (lowercased) is in
								            ``self.allowed_css_properties`` or
								            ``self.allowed_svg_properties``, each formatted as
								            ``prop: value;`` and joined with single spaces; ``''`` if the
								            style does not pass validation

								        """
								        # NOTE: every pattern below is a raw string. As plain strings they
								        # contained invalid escape sequences such as ``\s`` and ``\(``, which
								        # Python warns about; the patterns themselves are unchanged.

								        # Convert entities in the style so that it can be parsed as CSS
								        style = html5lib_shim.convert_entities(style)

								        # Drop any url values before we do anything else
								        style = re.sub(r'url\s*\(\s*[^\s)]+?\s*\)\s*', ' ', style)

								        # The gauntlet of sanitization

								        # Validate the css in the style tag and if it's not valid, then drop
								        # the whole thing.
								        gauntlet = re.compile(
								            r"""^([-/:,#%.'"\sa-zA-Z0-9!]|\w-\w|'[\s\w]+'\s*|"[\s\w]+"|\([\d,%\.\s]+\))*$"""
								        )

								        for part in style.split(';'):
								            if not gauntlet.match(part):
								                return ''

								        if not re.match(r"^\s*([-\w]+\s*:[^:;]*(;\s*|$))*$", style):
								            return ''

								        clean = []
								        for prop, value in re.findall(r'([-\w]+)\s*:\s*([^:;]*)', style):
								            if not value:
								                continue

								            # Membership is checked on the lowercased name; the original
								            # spelling of the property is what gets emitted.
								            prop_name = prop.lower()
								            if (prop_name in self.allowed_css_properties or
								                    prop_name in self.allowed_svg_properties):
								                clean.append(prop + ': ' + value + ';')

								        return ' '.join(clean)