You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

338 lines
11 KiB

4 years ago
  1. # -*- coding: utf-8 -*-
  2. """
  3. Functions for dealing with markup text
  4. """
  5. import warnings
  6. import re
  7. import six
  8. from six import moves
  9. from w3lib.util import to_bytes, to_unicode
  10. from w3lib.url import safe_url_string
# Matches an HTML entity reference: a named entity (&nbsp;), a decimal
# character reference (&#160;) or a hexadecimal one (&#xA0;), with an
# optional trailing semicolon captured separately (browsers tolerate the
# missing ';', and replace_entities uses that group to decide removal).
_ent_re = re.compile(r'&((?P<named>[a-z\d]+)|#(?P<dec>\d+)|#x(?P<hex>[a-f\d]+))(?P<semicolon>;?)', re.IGNORECASE)
# Matches any markup tag (opening, closing, comment/doctype start); DOTALL
# lets a tag span multiple lines.
_tag_re = re.compile(r'<[a-zA-Z\/!].*?>', re.DOTALL)
# Extracts the quoted href value from a <base> element.
_baseurl_re = re.compile(six.u(r'<base\s[^>]*href\s*=\s*[\"\']\s*([^\"\'\s]+)\s*[\"\']'), re.I)
# Extracts interval and URL from a <meta http-equiv="refresh"> element,
# e.g. content="5; url=http://example.com".
_meta_refresh_re = re.compile(six.u(r'<meta\s[^>]*http-equiv[^>]*refresh[^>]*content\s*=\s*(?P<quote>["\'])(?P<int>(\d*\.)?\d+)\s*;\s*url=\s*(?P<url>.*?)(?P=quote)'), re.DOTALL | re.IGNORECASE)
# Matches a complete CDATA section; named groups expose the delimiters and
# the enclosed data separately (used by unquote_markup).
_cdata_re = re.compile(r'((?P<cdata_s><!\[CDATA\[)(?P<cdata_d>.*?)(?P<cdata_e>\]\]>))', re.DOTALL)
# The five "space characters" defined by the HTML5 spec
# (https://www.w3.org/TR/html5/infrastructure.html#space-character).
HTML5_WHITESPACE = ' \t\n\r\x0c'
  17. def remove_entities(text, keep=(), remove_illegal=True, encoding='utf-8'):
  18. r"""
  19. .. warning::
  20. This function is deprecated and will be removed in future.
  21. Please use :func:`replace_entities` instead.
  22. """
  23. warnings.warn(
  24. "`w3lib.html.remove_entities` function is deprecated and "
  25. "will be removed in future releases. Please use "
  26. "`w3lib.html.replace_entities` instead.",
  27. DeprecationWarning
  28. )
  29. return replace_entities(text, keep, remove_illegal, encoding)
def replace_entities(text, keep=(), remove_illegal=True, encoding='utf-8'):
    u"""Remove entities from the given `text` by converting them to their
    corresponding unicode character.

    `text` can be a unicode string or a byte string encoded in the given
    `encoding` (which defaults to 'utf-8').

    If `keep` is passed (with a list of entity names) those entities will
    be kept (they won't be removed).

    It supports both numeric entities (``&#nnnn;`` and ``&#hhhh;``)
    and named entities (such as ``&nbsp;`` or ``&gt;``).

    If `remove_illegal` is ``True``, entities that can't be converted are removed.
    If `remove_illegal` is ``False``, entities that can't be converted are kept "as
    is". For more information see the tests.

    Always returns a unicode string (with the entities removed).

    >>> import w3lib.html
    >>> w3lib.html.replace_entities(b'Price: &pound;100')
    u'Price: \\xa3100'
    >>> print(w3lib.html.replace_entities(b'Price: &pound;100'))
    Price: £100
    >>>

    """

    def convert_entity(m):
        # Exactly one of 'named', 'dec' or 'hex' is non-empty for any match
        # of _ent_re; 'semicolon' records whether the entity was terminated.
        groups = m.groupdict()
        if groups.get('dec'):
            number = int(groups['dec'], 10)
        elif groups.get('hex'):
            number = int(groups['hex'], 16)
        elif groups.get('named'):
            entity_name = groups['named']
            if entity_name.lower() in keep:
                # Entity is whitelisted: emit it untouched.
                return m.group(0)
            else:
                # Try the exact name first, then the lowercased form
                # (name2codepoint is case-sensitive, e.g. 'AMP' vs 'amp').
                number = (moves.html_entities.name2codepoint.get(entity_name) or
                          moves.html_entities.name2codepoint.get(entity_name.lower()))
        if number is not None:
            # Numeric character references in the 80-9F range are typically
            # interpreted by browsers as representing the characters mapped
            # to bytes 80-9F in the Windows-1252 encoding. For more info
            # see: http://en.wikipedia.org/wiki/Character_encodings_in_HTML
            try:
                if 0x80 <= number <= 0x9f:
                    return six.int2byte(number).decode('cp1252')
                else:
                    return six.unichr(number)
            except ValueError:
                # Codepoint out of unichr's range (narrow builds) or not
                # mapped in cp1252: fall through to the illegal-entity path.
                pass
        # Unconvertible entity: drop it only when remove_illegal is set AND
        # it was properly ';'-terminated; otherwise keep the raw text.
        return u'' if remove_illegal and groups.get('semicolon') else m.group(0)

    return _ent_re.sub(convert_entity, to_unicode(text, encoding))
  77. def has_entities(text, encoding=None):
  78. return bool(_ent_re.search(to_unicode(text, encoding)))
  79. def replace_tags(text, token='', encoding=None):
  80. """Replace all markup tags found in the given `text` by the given token.
  81. By default `token` is an empty string so it just removes all tags.
  82. `text` can be a unicode string or a regular string encoded as `encoding`
  83. (or ``'utf-8'`` if `encoding` is not given.)
  84. Always returns a unicode string.
  85. Examples:
  86. >>> import w3lib.html
  87. >>> w3lib.html.replace_tags(u'This text contains <a>some tag</a>')
  88. u'This text contains some tag'
  89. >>> w3lib.html.replace_tags('<p>Je ne parle pas <b>fran\\xe7ais</b></p>', ' -- ', 'latin-1')
  90. u' -- Je ne parle pas -- fran\\xe7ais -- -- '
  91. >>>
  92. """
  93. return _tag_re.sub(token, to_unicode(text, encoding))
  94. _REMOVECOMMENTS_RE = re.compile(u'<!--.*?-->', re.DOTALL)
  95. def remove_comments(text, encoding=None):
  96. """ Remove HTML Comments.
  97. >>> import w3lib.html
  98. >>> w3lib.html.remove_comments(b"test <!--textcoment--> whatever")
  99. u'test whatever'
  100. >>>
  101. """
  102. text = to_unicode(text, encoding)
  103. return _REMOVECOMMENTS_RE.sub(u'', text)
  104. def remove_tags(text, which_ones=(), keep=(), encoding=None):
  105. """ Remove HTML Tags only.
  106. `which_ones` and `keep` are both tuples, there are four cases:
  107. ============== ============= ==========================================
  108. ``which_ones`` ``keep`` what it does
  109. ============== ============= ==========================================
  110. **not empty** empty remove all tags in ``which_ones``
  111. empty **not empty** remove all tags except the ones in ``keep``
  112. empty empty remove all tags
  113. **not empty** **not empty** not allowed
  114. ============== ============= ==========================================
  115. Remove all tags:
  116. >>> import w3lib.html
  117. >>> doc = '<div><p><b>This is a link:</b> <a href="http://www.example.com">example</a></p></div>'
  118. >>> w3lib.html.remove_tags(doc)
  119. u'This is a link: example'
  120. >>>
  121. Keep only some tags:
  122. >>> w3lib.html.remove_tags(doc, keep=('div',))
  123. u'<div>This is a link: example</div>'
  124. >>>
  125. Remove only specific tags:
  126. >>> w3lib.html.remove_tags(doc, which_ones=('a','b'))
  127. u'<div><p>This is a link: example</p></div>'
  128. >>>
  129. You can't remove some and keep some:
  130. >>> w3lib.html.remove_tags(doc, which_ones=('a',), keep=('p',))
  131. Traceback (most recent call last):
  132. File "<stdin>", line 1, in <module>
  133. File "/usr/local/lib/python2.7/dist-packages/w3lib/html.py", line 101, in remove_tags
  134. assert not (which_ones and keep), 'which_ones and keep can not be given at the same time'
  135. AssertionError: which_ones and keep can not be given at the same time
  136. >>>
  137. """
  138. assert not (which_ones and keep), 'which_ones and keep can not be given at the same time'
  139. which_ones = {tag.lower() for tag in which_ones}
  140. keep = {tag.lower() for tag in keep}
  141. def will_remove(tag):
  142. tag = tag.lower()
  143. if which_ones:
  144. return tag in which_ones
  145. else:
  146. return tag not in keep
  147. def remove_tag(m):
  148. tag = m.group(1)
  149. return u'' if will_remove(tag) else m.group(0)
  150. regex = '</?([^ >/]+).*?>'
  151. retags = re.compile(regex, re.DOTALL | re.IGNORECASE)
  152. return retags.sub(remove_tag, to_unicode(text, encoding))
  153. def remove_tags_with_content(text, which_ones=(), encoding=None):
  154. """Remove tags and their content.
  155. `which_ones` is a tuple of which tags to remove including their content.
  156. If is empty, returns the string unmodified.
  157. >>> import w3lib.html
  158. >>> doc = '<div><p><b>This is a link:</b> <a href="http://www.example.com">example</a></p></div>'
  159. >>> w3lib.html.remove_tags_with_content(doc, which_ones=('b',))
  160. u'<div><p> <a href="http://www.example.com">example</a></p></div>'
  161. >>>
  162. """
  163. text = to_unicode(text, encoding)
  164. if which_ones:
  165. tags = '|'.join([r'<%s.*?</%s>|<%s\s*/>' % (tag, tag, tag) for tag in which_ones])
  166. retags = re.compile(tags, re.DOTALL | re.IGNORECASE)
  167. text = retags.sub(u'', text)
  168. return text
  169. def replace_escape_chars(text, which_ones=('\n', '\t', '\r'), replace_by=u'', \
  170. encoding=None):
  171. """Remove escape characters.
  172. `which_ones` is a tuple of which escape characters we want to remove.
  173. By default removes ``\\n``, ``\\t``, ``\\r``.
  174. `replace_by` is the string to replace the escape characters by.
  175. It defaults to ``''``, meaning the escape characters are removed.
  176. """
  177. text = to_unicode(text, encoding)
  178. for ec in which_ones:
  179. text = text.replace(ec, to_unicode(replace_by, encoding))
  180. return text
  181. def unquote_markup(text, keep=(), remove_illegal=True, encoding=None):
  182. """
  183. This function receives markup as a text (always a unicode string or
  184. a UTF-8 encoded string) and does the following:
  185. 1. removes entities (except the ones in `keep`) from any part of it
  186. that is not inside a CDATA
  187. 2. searches for CDATAs and extracts their text (if any) without modifying it.
  188. 3. removes the found CDATAs
  189. """
  190. def _get_fragments(txt, pattern):
  191. offset = 0
  192. for match in pattern.finditer(txt):
  193. match_s, match_e = match.span(1)
  194. yield txt[offset:match_s]
  195. yield match
  196. offset = match_e
  197. yield txt[offset:]
  198. text = to_unicode(text, encoding)
  199. ret_text = u''
  200. for fragment in _get_fragments(text, _cdata_re):
  201. if isinstance(fragment, six.string_types):
  202. # it's not a CDATA (so we try to remove its entities)
  203. ret_text += replace_entities(fragment, keep=keep, remove_illegal=remove_illegal)
  204. else:
  205. # it's a CDATA (so we just extract its content)
  206. ret_text += fragment.group('cdata_d')
  207. return ret_text
  208. def get_base_url(text, baseurl='', encoding='utf-8'):
  209. """Return the base url if declared in the given HTML `text`,
  210. relative to the given base url.
  211. If no base url is found, the given `baseurl` is returned.
  212. """
  213. text = to_unicode(text, encoding)
  214. m = _baseurl_re.search(text)
  215. if m:
  216. return moves.urllib.parse.urljoin(
  217. safe_url_string(baseurl),
  218. safe_url_string(m.group(1), encoding=encoding)
  219. )
  220. else:
  221. return safe_url_string(baseurl)
  222. def get_meta_refresh(text, baseurl='', encoding='utf-8', ignore_tags=('script', 'noscript')):
  223. """Return the http-equiv parameter of the HTML meta element from the given
  224. HTML text and return a tuple ``(interval, url)`` where interval is an integer
  225. containing the delay in seconds (or zero if not present) and url is a
  226. string with the absolute url to redirect.
  227. If no meta redirect is found, ``(None, None)`` is returned.
  228. """
  229. if six.PY2:
  230. baseurl = to_bytes(baseurl, encoding)
  231. try:
  232. text = to_unicode(text, encoding)
  233. except UnicodeDecodeError:
  234. print(text)
  235. raise
  236. text = remove_tags_with_content(text, ignore_tags)
  237. text = remove_comments(replace_entities(text))
  238. m = _meta_refresh_re.search(text)
  239. if m:
  240. interval = float(m.group('int'))
  241. url = safe_url_string(m.group('url').strip(' "\''), encoding)
  242. url = moves.urllib.parse.urljoin(baseurl, url)
  243. return interval, url
  244. else:
  245. return None, None
  246. def strip_html5_whitespace(text):
  247. r"""
  248. Strip all leading and trailing space characters (as defined in
  249. https://www.w3.org/TR/html5/infrastructure.html#space-character).
  250. Such stripping is useful e.g. for processing HTML element attributes which
  251. contain URLs, like ``href``, ``src`` or form ``action`` - HTML5 standard
  252. defines them as "valid URL potentially surrounded by spaces"
  253. or "valid non-empty URL potentially surrounded by spaces".
  254. >>> strip_html5_whitespace(' hello\n')
  255. 'hello'
  256. """
  257. return text.strip(HTML5_WHITESPACE)