339 lines
11 KiB
Python
339 lines
11 KiB
Python
|
# -*- coding: utf-8 -*-
|
||
|
"""
|
||
|
Functions for dealing with markup text
|
||
|
"""
|
||
|
|
||
|
import warnings
|
||
|
import re
|
||
|
import six
|
||
|
from six import moves
|
||
|
|
||
|
from w3lib.util import to_bytes, to_unicode
|
||
|
from w3lib.url import safe_url_string
|
||
|
|
||
|
# Character references: named (``&name``), decimal (``&#123``) or
# hexadecimal (``&#x1f``), each with an optional trailing semicolon that is
# captured separately so callers can tell terminated references apart.
_ent_re = re.compile(
    r'&((?P<named>[a-z\d]+)|#(?P<dec>\d+)|#x(?P<hex>[a-f\d]+))(?P<semicolon>;?)',
    re.IGNORECASE
)

# Anything that looks like a tag: ``<`` followed by a letter, ``/`` or ``!``,
# then the shortest run of characters (newlines included) up to ``>``.
_tag_re = re.compile(
    r'<[a-zA-Z\/!].*?>',
    re.DOTALL
)

# ``<base ... href="...">``; group 1 is the quoted href value without the
# whitespace surrounding it.
_baseurl_re = re.compile(
    six.u(r'<base\s[^>]*href\s*=\s*[\"\']\s*([^\"\'\s]+)\s*[\"\']'),
    re.IGNORECASE
)

# ``<meta http-equiv="refresh" content="<int>; url=<url>">``; the ``quote``
# group makes sure the content attribute is closed by the quote that opened it.
_meta_refresh_re = re.compile(
    six.u(r'<meta\s[^>]*http-equiv[^>]*refresh[^>]*content\s*=\s*(?P<quote>["\'])(?P<int>(\d*\.)?\d+)\s*;\s*url=\s*(?P<url>.*?)(?P=quote)'),
    re.DOTALL | re.IGNORECASE
)

# A whole CDATA section (group 1) with its opening marker, payload and
# closing marker available as named groups.
_cdata_re = re.compile(
    r'((?P<cdata_s><!\[CDATA\[)(?P<cdata_d>.*?)(?P<cdata_e>\]\]>))',
    re.DOTALL
)

# Space, tab, line feed, carriage return and form feed.
HTML5_WHITESPACE = ' \t\n\r\x0c'
|
||
|
|
||
|
|
||
|
def remove_entities(text, keep=(), remove_illegal=True, encoding='utf-8'):
    r"""Deprecated alias of :func:`replace_entities`.

    .. warning::

        This function is deprecated and will be removed in future.
        Please use :func:`replace_entities` instead.

    Emits a :class:`DeprecationWarning` and then forwards `text`, `keep`,
    `remove_illegal` and `encoding` unchanged to :func:`replace_entities`,
    returning whatever that function returns.
    """
    warnings.warn(
        "`w3lib.html.remove_entities` function is deprecated and "
        "will be removed in future releases. Please use "
        "`w3lib.html.replace_entities` instead.",
        DeprecationWarning,
        # Attribute the warning to the caller's line instead of this wrapper,
        # so users can find the call site they need to migrate.
        stacklevel=2
    )

    return replace_entities(text, keep, remove_illegal, encoding)
|
||
|
|
||
|
def replace_entities(text, keep=(), remove_illegal=True, encoding='utf-8'):
    u"""Remove entities from the given `text` by converting them to their
    corresponding unicode character.

    `text` can be a unicode string or a byte string encoded in the given
    `encoding` (which defaults to 'utf-8').

    If `keep` is passed (with a list of entity names) those entities will
    be kept (they won't be removed).

    It supports both numeric entities (``&#nnnn;`` and ``&#hhhh;``)
    and named entities (such as ``&nbsp;`` or ``&gt;``).

    If `remove_illegal` is ``True``, entities that can't be converted are
    removed, provided they are terminated by a semicolon. If `remove_illegal`
    is ``False`` (or the semicolon is missing), entities that can't be
    converted are kept "as is". For more information see the tests.

    Always returns a unicode string (with the entities removed).

    >>> import w3lib.html
    >>> w3lib.html.replace_entities(b'Price: &pound;100')
    u'Price: \\xa3100'
    >>> print(w3lib.html.replace_entities(b'Price: &pound;100'))
    Price: £100
    >>>

    """

    def convert_entity(m):
        groups = m.groupdict()
        number = None
        if groups.get('dec'):
            try:
                number = int(groups['dec'], 10)
            except ValueError:
                # Interpreters that cap int-from-string conversion reject
                # absurdly long digit runs; treat those as unconvertible
                # entities rather than letting the error escape.
                number = None
        elif groups.get('hex'):
            number = int(groups['hex'], 16)
        elif groups.get('named'):
            entity_name = groups['named']
            if entity_name.lower() in keep:
                return m.group(0)
            # Try the exact spelling first (entity names are case-sensitive),
            # then fall back to the lower-cased name.
            number = (moves.html_entities.name2codepoint.get(entity_name) or
                      moves.html_entities.name2codepoint.get(entity_name.lower()))

        if number is not None:
            # Numeric character references in the 80-9F range are typically
            # interpreted by browsers as representing the characters mapped
            # to bytes 80-9F in the Windows-1252 encoding. For more info
            # see: http://en.wikipedia.org/wiki/Character_encodings_in_HTML
            try:
                if 0x80 <= number <= 0x9f:
                    return six.int2byte(number).decode('cp1252')
                return six.unichr(number)
            except (ValueError, OverflowError):
                # ValueError: code point outside the valid range, or a byte
                # with no cp1252 mapping. OverflowError: the number does not
                # even fit the C integer accepted by chr()/unichr().
                pass

        # Unconvertible entity: drop it only when asked to and when it is
        # properly terminated; otherwise leave the original text untouched.
        return u'' if remove_illegal and groups.get('semicolon') else m.group(0)

    return _ent_re.sub(convert_entity, to_unicode(text, encoding))
|
||
|
|
||
|
def has_entities(text, encoding=None):
    """Tell whether `text` contains anything that looks like an entity.

    `text` is converted with ``to_unicode(text, encoding)`` and searched with
    the module-level entity pattern, which recognises named, decimal and
    hexadecimal references with or without a trailing semicolon.

    Returns ``True`` when at least one match is found, ``False`` otherwise.
    """
    decoded = to_unicode(text, encoding)
    return _ent_re.search(decoded) is not None
|
||
|
|
||
|
def replace_tags(text, token='', encoding=None):
    """Substitute `token` for every markup tag found in `text`.

    With the default empty `token` the tags are simply removed.

    `text` can be a unicode string or a regular string encoded as `encoding`
    (or ``'utf-8'`` if `encoding` is not given.)

    The result is always a unicode string.

    Examples:

    >>> import w3lib.html
    >>> w3lib.html.replace_tags(u'This text contains <a>some tag</a>')
    u'This text contains some tag'
    >>> w3lib.html.replace_tags('<p>Je ne parle pas <b>fran\\xe7ais</b></p>', ' -- ', 'latin-1')
    u' -- Je ne parle pas  -- fran\\xe7ais --  -- '
    >>>

    """
    decoded = to_unicode(text, encoding)
    return _tag_re.sub(token, decoded)
|
||
|
|
||
|
|
||
|
# ``<!-- ... -->`` with the shortest possible body; DOTALL lets a comment
# span several lines.
_REMOVECOMMENTS_RE = re.compile(u'<!--.*?-->', re.DOTALL)


def remove_comments(text, encoding=None):
    """Strip every ``<!-- ... -->`` comment out of `text`.

    `text` is first converted with ``to_unicode(text, encoding)``; the
    returned value is the converted text with all comments deleted. Text
    around a comment is left untouched, including its whitespace.

    >>> import w3lib.html
    >>> w3lib.html.remove_comments(b"test <!--textcoment--> whatever")
    u'test  whatever'
    >>>

    """
    return _REMOVECOMMENTS_RE.sub(u'', to_unicode(text, encoding))
|
||
|
|
||
|
def remove_tags(text, which_ones=(), keep=(), encoding=None):
    """Strip HTML tags from `text`, leaving the content between them.

    `which_ones` and `keep` are iterables of tag names (compared
    case-insensitively). They select what is removed:

    ============== ============= ===========================================
    ``which_ones`` ``keep``      what it does
    ============== ============= ===========================================
    **not empty**  empty         remove all tags in ``which_ones``
    empty          **not empty** remove all tags except the ones in ``keep``
    empty          empty         remove all tags
    **not empty**  **not empty** not allowed
    ============== ============= ===========================================

    Remove all tags:

    >>> import w3lib.html
    >>> doc = '<div><p><b>This is a link:</b> <a href="http://www.example.com">example</a></p></div>'
    >>> w3lib.html.remove_tags(doc)
    u'This is a link: example'
    >>>

    Keep only some tags:

    >>> w3lib.html.remove_tags(doc, keep=('div',))
    u'<div>This is a link: example</div>'
    >>>

    Remove only specific tags:

    >>> w3lib.html.remove_tags(doc, which_ones=('a','b'))
    u'<div><p>This is a link: example</p></div>'
    >>>

    You can't remove some and keep some:

    >>> w3lib.html.remove_tags(doc, which_ones=('a',), keep=('p',))
    Traceback (most recent call last):
        ...
    AssertionError: which_ones and keep can not be given at the same time
    >>>

    """
    assert not (which_ones and keep), 'which_ones and keep can not be given at the same time'

    doomed = {name.lower() for name in which_ones}
    spared = {name.lower() for name in keep}

    def _substitute(match):
        # Group 1 is the tag name: everything after ``<`` / ``</`` up to the
        # first space, ``>`` or ``/``.
        name = match.group(1).lower()
        if doomed:
            drop = name in doomed
        else:
            drop = name not in spared
        if drop:
            return u''
        return match.group(0)

    tag_pattern = re.compile('</?([^ >/]+).*?>', re.DOTALL | re.IGNORECASE)
    return tag_pattern.sub(_substitute, to_unicode(text, encoding))
|
||
|
|
||
|
def remove_tags_with_content(text, which_ones=(), encoding=None):
    """Remove tags and their content.

    `which_ones` is a tuple of which tags to remove including their content.
    If is empty, returns the string unmodified (after conversion to unicode).

    For every name in `which_ones`, both the paired form
    (``<tag ...> ... </tag>``, matched non-greedily and across lines) and the
    self-closing form (``<tag/>``) are removed. Matching is case-insensitive.

    >>> import w3lib.html
    >>> doc = '<div><p><b>This is a link:</b> <a href="http://www.example.com">example</a></p></div>'
    >>> w3lib.html.remove_tags_with_content(doc, which_ones=('b',))
    u'<div><p> <a href="http://www.example.com">example</a></p></div>'
    >>>

    """
    text = to_unicode(text, encoding)
    if which_ones:
        # The ``\b`` after the opening tag name stops a short name from
        # matching the prefix of a longer one: without it, asking for ``b``
        # would start a match at ``<br>`` or ``<body>`` and swallow
        # everything up to the next ``</b>``.
        tags = '|'.join([r'<%s\b.*?</%s>|<%s\s*/>' % (tag, tag, tag) for tag in which_ones])
        retags = re.compile(tags, re.DOTALL | re.IGNORECASE)
        text = retags.sub(u'', text)
    return text
|
||
|
|
||
|
|
||
|
def replace_escape_chars(text, which_ones=('\n', '\t', '\r'), replace_by=u'',
                         encoding=None):
    """Remove escape characters.

    `which_ones` is a tuple of which escape characters we want to remove.
    By default removes ``\\n``, ``\\t``, ``\\r``.

    `replace_by` is the string to replace the escape characters by.
    It defaults to ``''``, meaning the escape characters are removed.

    Both `text` and `replace_by` are converted with
    ``to_unicode(..., encoding)``; the converted, substituted text is
    returned.

    """
    text = to_unicode(text, encoding)
    if not which_ones:
        # Nothing to substitute, so `replace_by` is never looked at.
        return text

    # The replacement does not depend on the character being replaced, so
    # convert it once instead of once per escape character.
    replacement = to_unicode(replace_by, encoding)
    for ec in which_ones:
        text = text.replace(ec, replacement)
    return text
|
||
|
|
||
|
def unquote_markup(text, keep=(), remove_illegal=True, encoding=None):
    """
    Unquote markup `text`, treating CDATA sections and ordinary text
    differently:

    1. text outside CDATA sections has its entities replaced (except the
       ones in `keep`; `remove_illegal` is passed on to the entity replacer)
    2. each CDATA section is replaced by its raw content, which is not
       modified in any way
    3. the CDATA markers themselves are dropped

    `text` is converted with ``to_unicode(text, encoding)`` before
    processing. The pieces are concatenated in document order and returned
    as a single string.

    """
    text = to_unicode(text, encoding)

    pieces = []
    cursor = 0
    for cdata in _cdata_re.finditer(text):
        # Group 1 spans the whole CDATA section, markers included.
        start, end = cdata.span(1)
        # Ordinary markup that precedes this CDATA section.
        pieces.append(
            replace_entities(text[cursor:start], keep=keep, remove_illegal=remove_illegal)
        )
        # CDATA payload is copied verbatim.
        pieces.append(cdata.group('cdata_d'))
        cursor = end
    # Whatever follows the last CDATA section (or the whole text if none).
    pieces.append(
        replace_entities(text[cursor:], keep=keep, remove_illegal=remove_illegal)
    )
    return u''.join(pieces)
|
||
|
|
||
|
def get_base_url(text, baseurl='', encoding='utf-8'):
    """Return the base url declared in the given HTML `text`, resolved
    against the given `baseurl`.

    The first ``<base ... href="...">`` found in `text` is used. Both
    `baseurl` and the declared href are passed through ``safe_url_string``
    before being joined.

    If `text` declares no base url, the ``safe_url_string`` form of
    `baseurl` is returned.

    """
    match = _baseurl_re.search(to_unicode(text, encoding))
    if match is None:
        return safe_url_string(baseurl)

    safe_base = safe_url_string(baseurl)
    safe_href = safe_url_string(match.group(1), encoding=encoding)
    return moves.urllib.parse.urljoin(safe_base, safe_href)
|
||
|
|
||
|
def get_meta_refresh(text, baseurl='', encoding='utf-8', ignore_tags=('script', 'noscript')):
    """Look for a ``<meta http-equiv="refresh">`` element in the given HTML
    `text` and return a tuple ``(interval, url)``, where interval is a float
    with the delay in seconds and url is a string with the redirect target
    joined onto `baseurl`.

    Before searching, the elements named in `ignore_tags` are removed
    together with their content, entities are replaced and comments are
    stripped, so a refresh declared inside any of those is not reported.

    If no meta redirect is found, ``(None, None)`` is returned.

    Raises :class:`UnicodeDecodeError` if `text` cannot be decoded with
    `encoding`.

    """
    if six.PY2:
        # NOTE(review): presumably keeps `baseurl` the same type as the url
        # built below when joining on Python 2 -- confirm.
        baseurl = to_bytes(baseurl, encoding)

    # A decode failure propagates to the caller as-is; the document is no
    # longer echoed to stdout on the way out.
    text = to_unicode(text, encoding)
    text = remove_tags_with_content(text, ignore_tags)
    text = remove_comments(replace_entities(text))

    m = _meta_refresh_re.search(text)
    if m is None:
        return None, None

    interval = float(m.group('int'))
    # The url may carry its own quotes or padding inside the content value.
    url = safe_url_string(m.group('url').strip(' "\''), encoding)
    url = moves.urllib.parse.urljoin(baseurl, url)
    return interval, url
|
||
|
|
||
|
|
||
|
def strip_html5_whitespace(text):
    r"""
    Return `text` without its leading and trailing HTML5 space characters
    (space, tab, line feed, carriage return and form feed; see
    https://www.w3.org/TR/html5/infrastructure.html#space-character).

    This is handy when processing HTML attributes that hold URLs, such as
    ``href``, ``src`` or form ``action``: the HTML5 standard defines them as
    "valid URL potentially surrounded by spaces" or
    "valid non-empty URL potentially surrounded by spaces".

    >>> strip_html5_whitespace(' hello\n')
    'hello'
    """
    stripped = text.strip(HTML5_WHITESPACE)
    return stripped
|