You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

338 lines
11 KiB

4 years ago
  1. # -*- coding: utf-8 -*-
  2. """
  3. Functions for dealing with markup text
  4. """
  5. import warnings
  6. import re
  7. import six
  8. from six import moves
  9. from w3lib.util import to_bytes, to_unicode
  10. from w3lib.url import safe_url_string
# Matches an HTML entity reference: a named entity (&nbsp;), a decimal
# character reference (&#160;) or a hexadecimal one (&#xA0;), with an
# optional trailing semicolon captured separately (browsers tolerate the
# missing ';', and replace_entities uses that group to decide removal).
_ent_re = re.compile(r'&((?P<named>[a-z\d]+)|#(?P<dec>\d+)|#x(?P<hex>[a-f\d]+))(?P<semicolon>;?)', re.IGNORECASE)
# Matches any markup tag (opening, closing, comment/doctype start); DOTALL
# lets a tag span multiple lines.
_tag_re = re.compile(r'<[a-zA-Z\/!].*?>', re.DOTALL)
# Extracts the quoted href value from a <base> element.
_baseurl_re = re.compile(six.u(r'<base\s[^>]*href\s*=\s*[\"\']\s*([^\"\'\s]+)\s*[\"\']'), re.I)
# Extracts interval and URL from a <meta http-equiv="refresh"> element,
# e.g. content="5; url=http://example.com".
_meta_refresh_re = re.compile(six.u(r'<meta\s[^>]*http-equiv[^>]*refresh[^>]*content\s*=\s*(?P<quote>["\'])(?P<int>(\d*\.)?\d+)\s*;\s*url=\s*(?P<url>.*?)(?P=quote)'), re.DOTALL | re.IGNORECASE)
# Matches a complete CDATA section; named groups expose the delimiters and
# the enclosed data separately (used by unquote_markup).
_cdata_re = re.compile(r'((?P<cdata_s><!\[CDATA\[)(?P<cdata_d>.*?)(?P<cdata_e>\]\]>))', re.DOTALL)
# The five "space characters" defined by the HTML5 spec
# (https://www.w3.org/TR/html5/infrastructure.html#space-character).
HTML5_WHITESPACE = ' \t\n\r\x0c'
  17. def remove_entities(text, keep=(), remove_illegal=True, encoding='utf-8'):
  18. r"""
  19. .. warning::
  20. This function is deprecated and will be removed in future.
  21. Please use :func:`replace_entities` instead.
  22. """
  23. warnings.warn(
  24. "`w3lib.html.remove_entities` function is deprecated and "
  25. "will be removed in future releases. Please use "
  26. "`w3lib.html.replace_entities` instead.",
  27. DeprecationWarning
  28. )
  29. return replace_entities(text, keep, remove_illegal, encoding)
def replace_entities(text, keep=(), remove_illegal=True, encoding='utf-8'):
    u"""Remove entities from the given `text` by converting them to their
    corresponding unicode character.

    `text` can be a unicode string or a byte string encoded in the given
    `encoding` (which defaults to 'utf-8').

    If `keep` is passed (with a list of entity names) those entities will
    be kept (they won't be removed).

    It supports both numeric entities (``&#nnnn;`` and ``&#hhhh;``)
    and named entities (such as ``&nbsp;`` or ``&gt;``).

    If `remove_illegal` is ``True``, entities that can't be converted are removed.
    If `remove_illegal` is ``False``, entities that can't be converted are kept "as
    is". For more information see the tests.

    Always returns a unicode string (with the entities removed).

    >>> import w3lib.html
    >>> w3lib.html.replace_entities(b'Price: &pound;100')
    u'Price: \\xa3100'
    >>> print(w3lib.html.replace_entities(b'Price: &pound;100'))
    Price: £100
    >>>

    """

    def convert_entity(m):
        # Exactly one of 'named', 'dec' or 'hex' is non-empty for any match
        # of _ent_re; 'semicolon' records whether the entity was terminated.
        groups = m.groupdict()
        if groups.get('dec'):
            number = int(groups['dec'], 10)
        elif groups.get('hex'):
            number = int(groups['hex'], 16)
        elif groups.get('named'):
            entity_name = groups['named']
            if entity_name.lower() in keep:
                # Entity is whitelisted: emit it untouched.
                return m.group(0)
            else:
                # Try the exact name first, then the lowercased form
                # (name2codepoint is case-sensitive, e.g. 'AMP' vs 'amp').
                number = (moves.html_entities.name2codepoint.get(entity_name) or
                          moves.html_entities.name2codepoint.get(entity_name.lower()))
        if number is not None:
            # Numeric character references in the 80-9F range are typically
            # interpreted by browsers as representing the characters mapped
            # to bytes 80-9F in the Windows-1252 encoding. For more info
            # see: http://en.wikipedia.org/wiki/Character_encodings_in_HTML
            try:
                if 0x80 <= number <= 0x9f:
                    return six.int2byte(number).decode('cp1252')
                else:
                    return six.unichr(number)
            except ValueError:
                # Codepoint out of unichr's range (narrow builds) or not
                # mapped in cp1252: fall through to the illegal-entity path.
                pass
        # Unconvertible entity: drop it only when remove_illegal is set AND
        # it was properly ';'-terminated; otherwise keep the raw text.
        return u'' if remove_illegal and groups.get('semicolon') else m.group(0)

    return _ent_re.sub(convert_entity, to_unicode(text, encoding))
  77. def has_entities(text, encoding=None):
  78. return bool(_ent_re.search(to_unicode(text, encoding)))
  79. def replace_tags(text, token='', encoding=None):
  80. """Replace all markup tags found in the given `text` by the given token.
  81. By default `token` is an empty string so it just removes all tags.
  82. `text` can be a unicode string or a regular string encoded as `encoding`
  83. (or ``'utf-8'`` if `encoding` is not given.)
  84. Always returns a unicode string.
  85. Examples:
  86. >>> import w3lib.html
  87. >>> w3lib.html.replace_tags(u'This text contains <a>some tag</a>')
  88. u'This text contains some tag'
  89. >>> w3lib.html.replace_tags('<p>Je ne parle pas <b>fran\\xe7ais</b></p>', ' -- ', 'latin-1')
  90. u' -- Je ne parle pas -- fran\\xe7ais -- -- '
  91. >>>
  92. """
  93. return _tag_re.sub(token, to_unicode(text, encoding))
  94. _REMOVECOMMENTS_RE = re.compile(u'<!--.*?-->', re.DOTALL)
  95. def remove_comments(text, encoding=None):
  96. """ Remove HTML Comments.
  97. >>> import w3lib.html
  98. >>> w3lib.html.remove_comments(b"test <!--textcoment--> whatever")
  99. u'test whatever'
  100. >>>
  101. """
  102. text = to_unicode(text, encoding)
  103. return _REMOVECOMMENTS_RE.sub(u'', text)
  104. def remove_tags(text, which_ones=(), keep=(), encoding=None):
  105. """ Remove HTML Tags only.
  106. `which_ones` and `keep` are both tuples, there are four cases:
  107. ============== ============= ==========================================
  108. ``which_ones`` ``keep`` what it does
  109. ============== ============= ==========================================
  110. **not empty** empty remove all tags in ``which_ones``
  111. empty **not empty** remove all tags except the ones in ``keep``
  112. empty empty remove all tags
  113. **not empty** **not empty** not allowed
  114. ============== ============= ==========================================
  115. Remove all tags:
  116. >>> import w3lib.html
  117. >>> doc = '<div><p><b>This is a link:</b> <a href="http://www.example.com">example</a></p></div>'
  118. >>> w3lib.html.remove_tags(doc)
  119. u'This is a link: example'
  120. >>>
  121. Keep only some tags:
  122. >>> w3lib.html.remove_tags(doc, keep=('div',))
  123. u'<div>This is a link: example</div>'
  124. >>>
  125. Remove only specific tags:
  126. >>> w3lib.html.remove_tags(doc, which_ones=('a','b'))
  127. u'<div><p>This is a link: example</p></div>'
  128. >>>
  129. You can't remove some and keep some:
  130. >>> w3lib.html.remove_tags(doc, which_ones=('a',), keep=('p',))
  131. Traceback (most recent call last):
  132. File "<stdin>", line 1, in <module>
  133. File "/usr/local/lib/python2.7/dist-packages/w3lib/html.py", line 101, in remove_tags
  134. assert not (which_ones and keep), 'which_ones and keep can not be given at the same time'
  135. AssertionError: which_ones and keep can not be given at the same time
  136. >>>
  137. """
  138. assert not (which_ones and keep), 'which_ones and keep can not be given at the same time'
  139. which_ones = {tag.lower() for tag in which_ones}
  140. keep = {tag.lower() for tag in keep}
  141. def will_remove(tag):
  142. tag = tag.lower()
  143. if which_ones:
  144. return tag in which_ones
  145. else:
  146. return tag not in keep
  147. def remove_tag(m):
  148. tag = m.group(1)
  149. return u'' if will_remove(tag) else m.group(0)
  150. regex = '</?([^ >/]+).*?>'
  151. retags = re.compile(regex, re.DOTALL | re.IGNORECASE)
  152. return retags.sub(remove_tag, to_unicode(text, encoding))
  153. def remove_tags_with_content(text, which_ones=(), encoding=None):
  154. """Remove tags and their content.
  155. `which_ones` is a tuple of which tags to remove including their content.
  156. If is empty, returns the string unmodified.
  157. >>> import w3lib.html
  158. >>> doc = '<div><p><b>This is a link:</b> <a href="http://www.example.com">example</a></p></div>'
  159. >>> w3lib.html.remove_tags_with_content(doc, which_ones=('b',))
  160. u'<div><p> <a href="http://www.example.com">example</a></p></div>'
  161. >>>
  162. """
  163. text = to_unicode(text, encoding)
  164. if which_ones:
  165. tags = '|'.join([r'<%s.*?</%s>|<%s\s*/>' % (tag, tag, tag) for tag in which_ones])
  166. retags = re.compile(tags, re.DOTALL | re.IGNORECASE)
  167. text = retags.sub(u'', text)
  168. return text
  169. def replace_escape_chars(text, which_ones=('\n', '\t', '\r'), replace_by=u'', \
  170. encoding=None):
  171. """Remove escape characters.
  172. `which_ones` is a tuple of which escape characters we want to remove.
  173. By default removes ``\\n``, ``\\t``, ``\\r``.
  174. `replace_by` is the string to replace the escape characters by.
  175. It defaults to ``''``, meaning the escape characters are removed.
  176. """
  177. text = to_unicode(text, encoding)
  178. for ec in which_ones:
  179. text = text.replace(ec, to_unicode(replace_by, encoding))
  180. return text
  181. def unquote_markup(text, keep=(), remove_illegal=True, encoding=None):
  182. """
  183. This function receives markup as a text (always a unicode string or
  184. a UTF-8 encoded string) and does the following:
  185. 1. removes entities (except the ones in `keep`) from any part of it
  186. that is not inside a CDATA
  187. 2. searches for CDATAs and extracts their text (if any) without modifying it.
  188. 3. removes the found CDATAs
  189. """
  190. def _get_fragments(txt, pattern):
  191. offset = 0
  192. for match in pattern.finditer(txt):
  193. match_s, match_e = match.span(1)
  194. yield txt[offset:match_s]
  195. yield match
  196. offset = match_e
  197. yield txt[offset:]
  198. text = to_unicode(text, encoding)
  199. ret_text = u''
  200. for fragment in _get_fragments(text, _cdata_re):
  201. if isinstance(fragment, six.string_types):
  202. # it's not a CDATA (so we try to remove its entities)
  203. ret_text += replace_entities(fragment, keep=keep, remove_illegal=remove_illegal)
  204. else:
  205. # it's a CDATA (so we just extract its content)
  206. ret_text += fragment.group('cdata_d')
  207. return ret_text
  208. def get_base_url(text, baseurl='', encoding='utf-8'):
  209. """Return the base url if declared in the given HTML `text`,
  210. relative to the given base url.
  211. If no base url is found, the given `baseurl` is returned.
  212. """
  213. text = to_unicode(text, encoding)
  214. m = _baseurl_re.search(text)
  215. if m:
  216. return moves.urllib.parse.urljoin(
  217. safe_url_string(baseurl),
  218. safe_url_string(m.group(1), encoding=encoding)
  219. )
  220. else:
  221. return safe_url_string(baseurl)
  222. def get_meta_refresh(text, baseurl='', encoding='utf-8', ignore_tags=('script', 'noscript')):
  223. """Return the http-equiv parameter of the HTML meta element from the given
  224. HTML text and return a tuple ``(interval, url)`` where interval is an integer
  225. containing the delay in seconds (or zero if not present) and url is a
  226. string with the absolute url to redirect.
  227. If no meta redirect is found, ``(None, None)`` is returned.
  228. """
  229. if six.PY2:
  230. baseurl = to_bytes(baseurl, encoding)
  231. try:
  232. text = to_unicode(text, encoding)
  233. except UnicodeDecodeError:
  234. print(text)
  235. raise
  236. text = remove_tags_with_content(text, ignore_tags)
  237. text = remove_comments(replace_entities(text))
  238. m = _meta_refresh_re.search(text)
  239. if m:
  240. interval = float(m.group('int'))
  241. url = safe_url_string(m.group('url').strip(' "\''), encoding)
  242. url = moves.urllib.parse.urljoin(baseurl, url)
  243. return interval, url
  244. else:
  245. return None, None
  246. def strip_html5_whitespace(text):
  247. r"""
  248. Strip all leading and trailing space characters (as defined in
  249. https://www.w3.org/TR/html5/infrastructure.html#space-character).
  250. Such stripping is useful e.g. for processing HTML element attributes which
  251. contain URLs, like ``href``, ``src`` or form ``action`` - HTML5 standard
  252. defines them as "valid URL potentially surrounded by spaces"
  253. or "valid non-empty URL potentially surrounded by spaces".
  254. >>> strip_html5_whitespace(' hello\n')
  255. 'hello'
  256. """
  257. return text.strip(HTML5_WHITESPACE)