from __future__ import unicode_literals
import re

import six

from bleach import callbacks as linkify_callbacks
from bleach import html5lib_shim
from bleach.utils import alphabetize_attributes, force_unicode


#: List of default callbacks
DEFAULT_CALLBACKS = [linkify_callbacks.nofollow]
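
# A callback receives the link's attrs dict (mapping ``(namespace, name)`` ->
# value, plus the ``_text`` pseudo-attribute) and an ``is_new`` flag, and
# returns an adjusted attrs dict, or ``None`` to drop the link. An illustrative
# sketch (not part of bleach itself):
#
#     def target_blank(attrs, new=False):
#         attrs[(None, u'target')] = u'_blank'
#         return attrs
#
#     linker = Linker(callbacks=DEFAULT_CALLBACKS + [target_blank])
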
TLDS = """ac ad ae aero af ag ai al am an ao aq ar arpa as asia at au aw ax az
       ba bb bd be bf bg bh bi biz bj bm bn bo br bs bt bv bw by bz ca cat
       cc cd cf cg ch ci ck cl cm cn co com coop cr cu cv cx cy cz de dj dk
       dm do dz ec edu ee eg er es et eu fi fj fk fm fo fr ga gb gd ge gf gg
       gh gi gl gm gn gov gp gq gr gs gt gu gw gy hk hm hn hr ht hu id ie il
       im in info int io iq ir is it je jm jo jobs jp ke kg kh ki km kn kp
       kr kw ky kz la lb lc li lk lr ls lt lu lv ly ma mc md me mg mh mil mk
       ml mm mn mo mobi mp mq mr ms mt mu museum mv mw mx my mz na name nc ne
       net nf ng ni nl no np nr nu nz om org pa pe pf pg ph pk pl pm pn post
       pr pro ps pt pw py qa re ro rs ru rw sa sb sc sd se sg sh si sj sk sl
       sm sn so sr ss st su sv sx sy sz tc td tel tf tg th tj tk tl tm tn to
       tp tr travel tt tv tw tz ua ug uk us uy uz va vc ve vg vi vn vu wf ws
       xn xxx ye yt yu za zm zw""".split()

# Make sure that .com doesn't get matched by .co first
TLDS.reverse()


def build_url_re(tlds=TLDS, protocols=html5lib_shim.allowed_protocols):
    """Builds the url regex used by linkifier

    If you want a different set of tlds or allowed protocols, pass those in
    and stomp on the existing ``url_re``::

        from bleach import linkifier

        my_url_re = linkifier.build_url_re(my_tlds_list, my_protocols)

        linker = LinkifyFilter(url_re=my_url_re)

    """
    return re.compile(
        r"""\(*  # Match any opening parentheses.
        \b(?<![@.])(?:(?:{0}):/{{0,3}}(?:(?:\w+:)?\w+@)?)?  # http://
        ([\w-]+\.)+(?:{1})(?:\:[0-9]+)?(?!\.\w)\b   # xx.yy.tld(:##)?
        (?:[/?][^\s\{{\}}\|\\\^\[\]`<>"]*)?
            # /path/zz (excluding "unsafe" chars from RFC 1738,
            # except for # and ~, which happen in practice)
        """.format('|'.join(protocols), '|'.join(tlds)),
        re.IGNORECASE | re.VERBOSE | re.UNICODE)


URL_RE = build_url_re()

PROTO_RE = re.compile(r'^[\w-]+:/{0,3}', re.IGNORECASE)

EMAIL_RE = re.compile(
    r"""(?<!//)
    (([-!#$%&'*+/=?^_`{}|~0-9A-Z]+
        (\.[-!#$%&'*+/=?^_`{}|~0-9A-Z]+)*  # dot-atom
    |^"([\001-\010\013\014\016-\037!#-\[\]-\177]
        |\\[\001-\011\013\014\016-\177])*"  # quoted-string
    )@(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+[A-Z]{2,6})  # domain
    """,
    re.IGNORECASE | re.MULTILINE | re.VERBOSE)


class Linker(object):
    """Convert URL-like strings in an HTML fragment to links

    This converts strings that look like URLs, domain names and email
    addresses in text that may be an HTML fragment to links, while preserving:

    1. links already in the string
    2. urls found in attributes
    3. email addresses

    linkify does a best-effort approach and tries to recover from bad
    situations due to crazy text.

    """
    def __init__(self, callbacks=DEFAULT_CALLBACKS, skip_tags=None, parse_email=False,
                 url_re=URL_RE, email_re=EMAIL_RE):
        """Creates a Linker instance

        :arg list callbacks: list of callbacks to run when adjusting tag attributes;
            defaults to ``bleach.linkifier.DEFAULT_CALLBACKS``

        :arg list skip_tags: list of tags that you don't want to linkify the
            contents of; for example, you could set this to ``['pre']`` to skip
            linkifying contents of ``pre`` tags

        :arg bool parse_email: whether or not to linkify email addresses

        :arg re url_re: url matching regex

        :arg re email_re: email matching regex

        :returns: linkified text as unicode

        """
        self.callbacks = callbacks
        self.skip_tags = skip_tags
        self.parse_email = parse_email
        self.url_re = url_re
        self.email_re = email_re

        # Create a parser/tokenizer that allows all HTML tags and escapes
        # anything not in that list.
        self.parser = html5lib_shim.BleachHTMLParser(
            tags=html5lib_shim.HTML_TAGS,
            strip=False,
            consume_entities=True,
            namespaceHTMLElements=False,
        )
        self.walker = html5lib_shim.getTreeWalker('etree')
        self.serializer = html5lib_shim.BleachHTMLSerializer(
            quote_attr_values='always',
            omit_optional_tags=False,

            # linkify does not sanitize
            sanitize=False,

            # linkify alphabetizes
            alphabetical_attributes=False,
        )

    def linkify(self, text):
        """Linkify specified text

        :arg str text: the text to add links to

        :returns: linkified text as unicode

        :raises TypeError: if ``text`` is not a text type

        """
        if not isinstance(text, six.string_types):
            raise TypeError('argument must be of text type')

        text = force_unicode(text)

        if not text:
            return u''

        dom = self.parser.parseFragment(text)
        filtered = LinkifyFilter(
            source=self.walker(dom),
            callbacks=self.callbacks,
            skip_tags=self.skip_tags,
            parse_email=self.parse_email,
            url_re=self.url_re,
            email_re=self.email_re,
        )
        return self.serializer.render(filtered)
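
# Typical usage, a sketch assuming the default ``nofollow`` callback
# (``bleach.linkify()`` wraps this class with the same defaults); the output
# shown is approximate:
#
#     linker = Linker()
#     linker.linkify(u'visit example.com')
#     # -> u'visit <a href="http://example.com" rel="nofollow">example.com</a>'
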

class LinkifyFilter(html5lib_shim.Filter):
    """html5lib filter that linkifies text

    This will do the following:

    * convert email addresses into links
    * convert urls into links
    * edit existing links by running them through callbacks--the default is to
      add a ``rel="nofollow"``

    This filter can be used anywhere html5lib filters can be used.

    """
    def __init__(self, source, callbacks=None, skip_tags=None, parse_email=False,
                 url_re=URL_RE, email_re=EMAIL_RE):
        """Creates a LinkifyFilter instance

        :arg TreeWalker source: stream

        :arg list callbacks: list of callbacks to run when adjusting tag attributes;
            defaults to ``bleach.linkifier.DEFAULT_CALLBACKS``

        :arg list skip_tags: list of tags that you don't want to linkify the
            contents of; for example, you could set this to ``['pre']`` to skip
            linkifying contents of ``pre`` tags

        :arg bool parse_email: whether or not to linkify email addresses

        :arg re url_re: url matching regex

        :arg re email_re: email matching regex

        """
        super(LinkifyFilter, self).__init__(source)

        self.callbacks = callbacks or []
        self.skip_tags = skip_tags or []
        self.parse_email = parse_email

        self.url_re = url_re
        self.email_re = email_re

    def apply_callbacks(self, attrs, is_new):
        """Given an attrs dict and an is_new bool, runs through callbacks

        Callbacks can return an adjusted attrs dict or ``None``. In the case of
        ``None``, we stop going through callbacks and return that and the link
        gets dropped.

        :arg dict attrs: map of ``(namespace, name)`` -> ``value``

        :arg bool is_new: whether or not this link was added by linkify

        :returns: adjusted attrs dict or ``None``

        """
        for cb in self.callbacks:
            attrs = cb(attrs, is_new)
            if attrs is None:
                return None
        return attrs

    def extract_character_data(self, token_list):
        """Extracts and squashes character sequences in a token stream"""
        # FIXME(willkg): This is a terrible idea. What it does is drop all the
        # tags from the token list and merge the Characters and SpaceCharacters
        # tokens into a single text.
        #
        # So something like this::
        #
        #     "<span>" "<b>" "some text" "</b>" "</span>"
        #
        # gets converted to "some text".
        #
        # This gets used to figure out the ``_text`` fauxttribute value for
        # linkify callables.
        #
        # I'm not really sure how else to support that ``_text`` fauxttribute and
        # maintain some modicum of backwards compatibility with previous versions
        # of Bleach.

        out = []
        for token in token_list:
            token_type = token['type']
            if token_type in ['Characters', 'SpaceCharacters']:
                out.append(token['data'])

        return u''.join(out)

    def handle_email_addresses(self, src_iter):
        """Handle email addresses in character tokens"""
        for token in src_iter:
            if token['type'] == 'Characters':
                text = token['data']
                new_tokens = []
                end = 0

                # For each email address we find in the text
                for match in self.email_re.finditer(text):
                    if match.start() > end:
                        new_tokens.append(
                            {u'type': u'Characters', u'data': text[end:match.start()]}
                        )

                    # Run attributes through the callbacks to see what we
                    # should do with this match
                    attrs = {
                        (None, u'href'): u'mailto:%s' % match.group(0),
                        u'_text': match.group(0)
                    }
                    attrs = self.apply_callbacks(attrs, True)

                    if attrs is None:
                        # Just add the text--but not as a link
                        new_tokens.append(
                            {u'type': u'Characters', u'data': match.group(0)}
                        )

                    else:
                        # Add an "a" tag for the new link
                        _text = attrs.pop(u'_text', '')
                        attrs = alphabetize_attributes(attrs)
                        new_tokens.extend([
                            {u'type': u'StartTag', u'name': u'a', u'data': attrs},
                            {u'type': u'Characters', u'data': force_unicode(_text)},
                            {u'type': u'EndTag', u'name': 'a'}
                        ])

                    end = match.end()

                if new_tokens:
                    # Yield the adjusted set of tokens and then continue
                    # through the loop
                    if end < len(text):
                        new_tokens.append({u'type': u'Characters', u'data': text[end:]})

                    for new_token in new_tokens:
                        yield new_token

                    continue

            yield token

    def strip_non_url_bits(self, fragment):
        """Strips non-url bits from the url

        This accounts for over-eager matching by the regex.

        """
        prefix = suffix = ''

        while fragment:
            # Try removing ( from the beginning and, if it's balanced, from the
            # end, too
            if fragment.startswith(u'('):
                prefix = prefix + u'('
                fragment = fragment[1:]

                if fragment.endswith(u')'):
                    suffix = u')' + suffix
                    fragment = fragment[:-1]
                continue

            # Now try extraneous things from the end. For example, sometimes we
            # pick up ) at the end of a url, but the url is in a parenthesized
            # phrase like:
            #
            #     "i looked at the site (at http://example.com)"
            if fragment.endswith(u')') and u'(' not in fragment:
                fragment = fragment[:-1]
                suffix = u')' + suffix
                continue

            # Handle commas
            if fragment.endswith(u','):
                fragment = fragment[:-1]
                suffix = u',' + suffix
                continue

            # Handle periods
            if fragment.endswith(u'.'):
                fragment = fragment[:-1]
                suffix = u'.' + suffix
                continue

            # Nothing matched, so we're done
            break

        return fragment, prefix, suffix
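
    # For example, the method above peels punctuation off an over-eager match
    # like this (an illustrative trace, not executed code):
    #
    #     strip_non_url_bits(u'(http://example.com).')
    #     # -> (u'http://example.com', u'(', u').')
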

    def handle_links(self, src_iter):
        """Handle links in character tokens"""
        in_a = False  # happens if parse_email=True and an email was found
        for token in src_iter:
            if in_a:
                if token['type'] == 'EndTag' and token['name'] == 'a':
                    in_a = False
                yield token
                continue
            elif token['type'] == 'StartTag' and token['name'] == 'a':
                in_a = True
                yield token
                continue
            if token['type'] == 'Characters':
                text = token['data']
                new_tokens = []
                end = 0

                for match in self.url_re.finditer(text):
                    if match.start() > end:
                        new_tokens.append(
                            {u'type': u'Characters', u'data': text[end:match.start()]}
                        )

                    url = match.group(0)
                    prefix = suffix = ''

                    # Sometimes we pick up too much in the url match, so look for
                    # bits we should drop and remove them from the match
                    url, prefix, suffix = self.strip_non_url_bits(url)

                    # If there's no protocol, add one
                    if PROTO_RE.search(url):
                        href = url
                    else:
                        href = u'http://%s' % url

                    attrs = {
                        (None, u'href'): href,
                        u'_text': url
                    }
                    attrs = self.apply_callbacks(attrs, True)

                    if attrs is None:
                        # Just add the text
                        new_tokens.append(
                            {u'type': u'Characters', u'data': prefix + url + suffix}
                        )

                    else:
                        # Add the "a" tag!
                        if prefix:
                            new_tokens.append(
                                {u'type': u'Characters', u'data': prefix}
                            )

                        _text = attrs.pop(u'_text', '')
                        attrs = alphabetize_attributes(attrs)

                        new_tokens.extend([
                            {u'type': u'StartTag', u'name': u'a', u'data': attrs},
                            {u'type': u'Characters', u'data': force_unicode(_text)},
                            {u'type': u'EndTag', u'name': 'a'},
                        ])

                        if suffix:
                            new_tokens.append(
                                {u'type': u'Characters', u'data': suffix}
                            )

                    end = match.end()

                if new_tokens:
                    # Yield the adjusted set of tokens and then continue
                    # through the loop
                    if end < len(text):
                        new_tokens.append({u'type': u'Characters', u'data': text[end:]})

                    for new_token in new_tokens:
                        yield new_token

                    continue

            yield token

    def handle_a_tag(self, token_buffer):
        """Handle the "a" tag

        This could adjust the link or drop it altogether depending on what the
        callbacks return.

        This yields the new set of tokens.

        """
        a_token = token_buffer[0]
        if a_token['data']:
            attrs = a_token['data']
        else:
            attrs = {}
        text = self.extract_character_data(token_buffer)
        attrs['_text'] = text

        attrs = self.apply_callbacks(attrs, False)

        if attrs is None:
            # We're dropping the "a" tag and everything else and replacing
            # it with character data. So emit that token.
            yield {'type': 'Characters', 'data': text}

        else:
            new_text = attrs.pop('_text', '')
            a_token['data'] = alphabetize_attributes(attrs)

            if text == new_text:
                # The callbacks didn't change the text, so we yield the new "a"
                # token, then whatever else was there, then the end "a" token
                yield a_token
                for mem in token_buffer[1:]:
                    yield mem

            else:
                # If the callbacks changed the text, then we're going to drop
                # all the tokens between the start and end "a" tags and replace
                # it with the new text
                yield a_token
                yield {'type': 'Characters', 'data': force_unicode(new_text)}
                yield token_buffer[-1]

    def __iter__(self):
        in_a = False
        in_skip_tag = None

        token_buffer = []

        for token in super(LinkifyFilter, self).__iter__():
            if in_a:
                # Handle the case where we're in an "a" tag--we want to buffer tokens
                # until we hit an end "a" tag.
                if token['type'] == 'EndTag' and token['name'] == 'a':
                    # Add the end tag to the token buffer and then handle them
                    # and yield anything returned
                    token_buffer.append(token)
                    for new_token in self.handle_a_tag(token_buffer):
                        yield new_token

                    # Clear "a" related state and continue since we've yielded all
                    # the tokens we're going to yield
                    in_a = False
                    token_buffer = []
                else:
                    token_buffer.append(token)

                continue

            if token['type'] in ['StartTag', 'EmptyTag']:
                if token['name'] in self.skip_tags:
                    # Skip tags start a "special mode" where we don't linkify
                    # anything until the end tag.
                    in_skip_tag = token['name']

                elif token['name'] == 'a':
                    # The "a" tag is special--we switch to a slurp mode and
                    # slurp all the tokens until the end "a" tag and then
                    # figure out what to do with them there.
                    in_a = True
                    token_buffer.append(token)

                    # We buffer the start tag, so we don't want to yield it,
                    # yet
                    continue

            elif in_skip_tag and self.skip_tags:
                # NOTE(willkg): We put this clause here since in_a and
                # switching in and out of in_a takes precedence.
                if token['type'] == 'EndTag' and token['name'] == in_skip_tag:
                    in_skip_tag = None

            elif not in_a and not in_skip_tag and token['type'] == 'Characters':
                new_stream = iter([token])
                if self.parse_email:
                    new_stream = self.handle_email_addresses(new_stream)

                new_stream = self.handle_links(new_stream)

                for token in new_stream:
                    yield token

                # We've already yielded this token, so continue
                continue

            yield token
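

# A minimal sketch of wiring LinkifyFilter into the html5lib pipeline by hand,
# mirroring what Linker.linkify() does above. Guarded so that importing this
# module is unaffected; the input string is purely illustrative.
if __name__ == '__main__':
    parser = html5lib_shim.BleachHTMLParser(
        tags=html5lib_shim.HTML_TAGS,
        strip=False,
        consume_entities=True,
        namespaceHTMLElements=False,
    )
    walker = html5lib_shim.getTreeWalker('etree')
    serializer = html5lib_shim.BleachHTMLSerializer(
        quote_attr_values='always',
        omit_optional_tags=False,
        sanitize=False,
        alphabetical_attributes=False,
    )

    dom = parser.parseFragment(u'see example.com (or mail admin@example.com)')
    filtered = LinkifyFilter(
        source=walker(dom),
        callbacks=DEFAULT_CALLBACKS,
        parse_email=True,
    )
    print(serializer.render(filtered))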