from __future__ import unicode_literals
import re

import six

from bleach import callbacks as linkify_callbacks
from bleach import html5lib_shim
from bleach.utils import alphabetize_attributes, force_unicode


#: List of default callbacks
DEFAULT_CALLBACKS = [linkify_callbacks.nofollow]
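
# A callback receives the link's attrs dict (mapping ``(namespace, name)`` ->
# value, plus the ``_text`` pseudo-attribute) and an ``is_new`` flag, and
# returns an adjusted attrs dict, or ``None`` to drop the link. An illustrative
# sketch (not part of bleach itself):
#
#     def target_blank(attrs, new=False):
#         attrs[(None, u'target')] = u'_blank'
#         return attrs
#
#     linker = Linker(callbacks=DEFAULT_CALLBACKS + [target_blank])
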
TLDS = """ac ad ae aero af ag ai al am an ao aq ar arpa as asia at au aw ax az
       ba bb bd be bf bg bh bi biz bj bm bn bo br bs bt bv bw by bz ca cat
       cc cd cf cg ch ci ck cl cm cn co com coop cr cu cv cx cy cz de dj dk
       dm do dz ec edu ee eg er es et eu fi fj fk fm fo fr ga gb gd ge gf gg
       gh gi gl gm gn gov gp gq gr gs gt gu gw gy hk hm hn hr ht hu id ie il
       im in info int io iq ir is it je jm jo jobs jp ke kg kh ki km kn kp
       kr kw ky kz la lb lc li lk lr ls lt lu lv ly ma mc md me mg mh mil mk
       ml mm mn mo mobi mp mq mr ms mt mu museum mv mw mx my mz na name nc ne
       net nf ng ni nl no np nr nu nz om org pa pe pf pg ph pk pl pm pn post
       pr pro ps pt pw py qa re ro rs ru rw sa sb sc sd se sg sh si sj sk sl
       sm sn so sr ss st su sv sx sy sz tc td tel tf tg th tj tk tl tm tn to
       tp tr travel tt tv tw tz ua ug uk us uy uz va vc ve vg vi vn vu wf ws
       xn xxx ye yt yu za zm zw""".split()

# Make sure that .com doesn't get matched by .co first
TLDS.reverse()


def build_url_re(tlds=TLDS, protocols=html5lib_shim.allowed_protocols):
    """Builds the url regex used by linkifier

    If you want a different set of tlds or allowed protocols, pass those in
    and stomp on the existing ``url_re``::

        from bleach import linkifier

        my_url_re = linkifier.build_url_re(my_tlds_list, my_protocols)

        linker = LinkifyFilter(url_re=my_url_re)

    """
    return re.compile(
        r"""\(*  # Match any opening parentheses.
        \b(?<![@.])(?:(?:{0}):/{{0,3}}(?:(?:\w+:)?\w+@)?)?  # http://
        ([\w-]+\.)+(?:{1})(?:\:[0-9]+)?(?!\.\w)\b   # xx.yy.tld(:##)?
        (?:[/?][^\s\{{\}}\|\\\^\[\]`<>"]*)?
            # /path/zz (excluding "unsafe" chars from RFC 1738,
            # except for # and ~, which happen in practice)
        """.format('|'.join(protocols), '|'.join(tlds)),
        re.IGNORECASE | re.VERBOSE | re.UNICODE)


URL_RE = build_url_re()

PROTO_RE = re.compile(r'^[\w-]+:/{0,3}', re.IGNORECASE)

EMAIL_RE = re.compile(
    r"""(?<!//)
    (([-!#$%&'*+/=?^_`{}|~0-9A-Z]+
        (\.[-!#$%&'*+/=?^_`{}|~0-9A-Z]+)*  # dot-atom
    |^"([\001-\010\013\014\016-\037!#-\[\]-\177]
        |\\[\001-\011\013\014\016-\177])*"  # quoted-string
    )@(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+[A-Z]{2,6})  # domain
    """,
    re.IGNORECASE | re.MULTILINE | re.VERBOSE)


class Linker(object):
    """Convert URL-like strings in an HTML fragment to links

    This converts strings that look like URLs, domain names and email
    addresses in text that may be an HTML fragment to links, while preserving:

    1. links already in the string
    2. urls found in attributes
    3. email addresses

    linkify does a best-effort approach and tries to recover from bad
    situations due to crazy text.

    """
    def __init__(self, callbacks=DEFAULT_CALLBACKS, skip_tags=None, parse_email=False,
                 url_re=URL_RE, email_re=EMAIL_RE):
        """Creates a Linker instance

        :arg list callbacks: list of callbacks to run when adjusting tag attributes;
            defaults to ``bleach.linkifier.DEFAULT_CALLBACKS``

        :arg list skip_tags: list of tags that you don't want to linkify the
            contents of; for example, you could set this to ``['pre']`` to skip
            linkifying contents of ``pre`` tags

        :arg bool parse_email: whether or not to linkify email addresses

        :arg re url_re: url matching regex

        :arg re email_re: email matching regex

        :returns: linkified text as unicode

        """
        self.callbacks = callbacks
        self.skip_tags = skip_tags
        self.parse_email = parse_email
        self.url_re = url_re
        self.email_re = email_re

        # Create a parser/tokenizer that allows all HTML tags and escapes
        # anything not in that list.
        self.parser = html5lib_shim.BleachHTMLParser(
            tags=html5lib_shim.HTML_TAGS,
            strip=False,
            consume_entities=True,
            namespaceHTMLElements=False,
        )
        self.walker = html5lib_shim.getTreeWalker('etree')
        self.serializer = html5lib_shim.BleachHTMLSerializer(
            quote_attr_values='always',
            omit_optional_tags=False,

            # linkify does not sanitize
            sanitize=False,

            # linkify alphabetizes
            alphabetical_attributes=False,
        )

    def linkify(self, text):
        """Linkify specified text

        :arg str text: the text to add links to

        :returns: linkified text as unicode

        :raises TypeError: if ``text`` is not a text type

        """
        if not isinstance(text, six.string_types):
            raise TypeError('argument must be of text type')

        text = force_unicode(text)

        if not text:
            return u''

        dom = self.parser.parseFragment(text)
        filtered = LinkifyFilter(
            source=self.walker(dom),
            callbacks=self.callbacks,
            skip_tags=self.skip_tags,
            parse_email=self.parse_email,
            url_re=self.url_re,
            email_re=self.email_re,
        )
        return self.serializer.render(filtered)
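
# Typical usage, a sketch assuming the default ``nofollow`` callback
# (``bleach.linkify()`` wraps this class with the same defaults); the output
# shown is approximate:
#
#     linker = Linker()
#     linker.linkify(u'visit example.com')
#     # -> u'visit <a href="http://example.com" rel="nofollow">example.com</a>'
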

class LinkifyFilter(html5lib_shim.Filter):
    """html5lib filter that linkifies text

    This will do the following:

    * convert email addresses into links
    * convert urls into links
    * edit existing links by running them through callbacks--the default is to
      add a ``rel="nofollow"``

    This filter can be used anywhere html5lib filters can be used.

    """
    def __init__(self, source, callbacks=None, skip_tags=None, parse_email=False,
                 url_re=URL_RE, email_re=EMAIL_RE):
        """Creates a LinkifyFilter instance

        :arg TreeWalker source: stream

        :arg list callbacks: list of callbacks to run when adjusting tag attributes;
            defaults to ``bleach.linkifier.DEFAULT_CALLBACKS``

        :arg list skip_tags: list of tags that you don't want to linkify the
            contents of; for example, you could set this to ``['pre']`` to skip
            linkifying contents of ``pre`` tags

        :arg bool parse_email: whether or not to linkify email addresses

        :arg re url_re: url matching regex

        :arg re email_re: email matching regex

        """
        super(LinkifyFilter, self).__init__(source)

        self.callbacks = callbacks or []
        self.skip_tags = skip_tags or []
        self.parse_email = parse_email

        self.url_re = url_re
        self.email_re = email_re

    def apply_callbacks(self, attrs, is_new):
        """Given an attrs dict and an is_new bool, runs through callbacks

        Callbacks can return an adjusted attrs dict or ``None``. In the case of
        ``None``, we stop going through callbacks and return that and the link
        gets dropped.

        :arg dict attrs: map of ``(namespace, name)`` -> ``value``

        :arg bool is_new: whether or not this link was added by linkify

        :returns: adjusted attrs dict or ``None``

        """
        for cb in self.callbacks:
            attrs = cb(attrs, is_new)
            if attrs is None:
                return None
        return attrs

    def extract_character_data(self, token_list):
        """Extracts and squashes character sequences in a token stream"""
        # FIXME(willkg): This is a terrible idea. What it does is drop all the
        # tags from the token list and merge the Characters and SpaceCharacters
        # tokens into a single text.
        #
        # So something like this::
        #
        #     "<span>" "<b>" "some text" "</b>" "</span>"
        #
        # gets converted to "some text".
        #
        # This gets used to figure out the ``_text`` fauxttribute value for
        # linkify callables.
        #
        # I'm not really sure how else to support that ``_text`` fauxttribute and
        # maintain some modicum of backwards compatibility with previous versions
        # of Bleach.

        out = []
        for token in token_list:
            token_type = token['type']
            if token_type in ['Characters', 'SpaceCharacters']:
                out.append(token['data'])

        return u''.join(out)

    def handle_email_addresses(self, src_iter):
        """Handle email addresses in character tokens"""
        for token in src_iter:
            if token['type'] == 'Characters':
                text = token['data']
                new_tokens = []
                end = 0

                # For each email address we find in the text
                for match in self.email_re.finditer(text):
                    if match.start() > end:
                        new_tokens.append(
                            {u'type': u'Characters', u'data': text[end:match.start()]}
                        )

                    # Run attributes through the callbacks to see what we
                    # should do with this match
                    attrs = {
                        (None, u'href'): u'mailto:%s' % match.group(0),
                        u'_text': match.group(0)
                    }
                    attrs = self.apply_callbacks(attrs, True)

                    if attrs is None:
                        # Just add the text--but not as a link
                        new_tokens.append(
                            {u'type': u'Characters', u'data': match.group(0)}
                        )

                    else:
                        # Add an "a" tag for the new link
                        _text = attrs.pop(u'_text', '')
                        attrs = alphabetize_attributes(attrs)
                        new_tokens.extend([
                            {u'type': u'StartTag', u'name': u'a', u'data': attrs},
                            {u'type': u'Characters', u'data': force_unicode(_text)},
                            {u'type': u'EndTag', u'name': 'a'}
                        ])

                    end = match.end()

                if new_tokens:
                    # Yield the adjusted set of tokens and then continue
                    # through the loop
                    if end < len(text):
                        new_tokens.append({u'type': u'Characters', u'data': text[end:]})

                    for new_token in new_tokens:
                        yield new_token

                    continue

            yield token

    def strip_non_url_bits(self, fragment):
        """Strips non-url bits from the url

        This accounts for over-eager matching by the regex.

        """
        prefix = suffix = ''

        while fragment:
            # Try removing ( from the beginning and, if it's balanced, from the
            # end, too
            if fragment.startswith(u'('):
                prefix = prefix + u'('
                fragment = fragment[1:]

                if fragment.endswith(u')'):
                    suffix = u')' + suffix
                    fragment = fragment[:-1]
                continue

            # Now try extraneous things from the end. For example, sometimes we
            # pick up ) at the end of a url, but the url is in a parenthesized
            # phrase like:
            #
            #     "i looked at the site (at http://example.com)"
            if fragment.endswith(u')') and u'(' not in fragment:
                fragment = fragment[:-1]
                suffix = u')' + suffix
                continue

            # Handle commas
            if fragment.endswith(u','):
                fragment = fragment[:-1]
                suffix = u',' + suffix
                continue

            # Handle periods
            if fragment.endswith(u'.'):
                fragment = fragment[:-1]
                suffix = u'.' + suffix
                continue

            # Nothing matched, so we're done
            break

        return fragment, prefix, suffix
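
    # For example, the method above peels punctuation off an over-eager match
    # like this (an illustrative trace, not executed code):
    #
    #     strip_non_url_bits(u'(http://example.com).')
    #     # -> (u'http://example.com', u'(', u').')
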

    def handle_links(self, src_iter):
        """Handle links in character tokens"""
        in_a = False  # happens if parse_email=True and an email was found
        for token in src_iter:
            if in_a:
                if token['type'] == 'EndTag' and token['name'] == 'a':
                    in_a = False
                yield token
                continue
            elif token['type'] == 'StartTag' and token['name'] == 'a':
                in_a = True
                yield token
                continue
            if token['type'] == 'Characters':
                text = token['data']
                new_tokens = []
                end = 0

                for match in self.url_re.finditer(text):
                    if match.start() > end:
                        new_tokens.append(
                            {u'type': u'Characters', u'data': text[end:match.start()]}
                        )

                    url = match.group(0)
                    prefix = suffix = ''

                    # Sometimes we pick up too much in the url match, so look for
                    # bits we should drop and remove them from the match
                    url, prefix, suffix = self.strip_non_url_bits(url)

                    # If there's no protocol, add one
                    if PROTO_RE.search(url):
                        href = url
                    else:
                        href = u'http://%s' % url

                    attrs = {
                        (None, u'href'): href,
                        u'_text': url
                    }
                    attrs = self.apply_callbacks(attrs, True)

                    if attrs is None:
                        # Just add the text
                        new_tokens.append(
                            {u'type': u'Characters', u'data': prefix + url + suffix}
                        )

                    else:
                        # Add the "a" tag!
                        if prefix:
                            new_tokens.append(
                                {u'type': u'Characters', u'data': prefix}
                            )

                        _text = attrs.pop(u'_text', '')
                        attrs = alphabetize_attributes(attrs)

                        new_tokens.extend([
                            {u'type': u'StartTag', u'name': u'a', u'data': attrs},
                            {u'type': u'Characters', u'data': force_unicode(_text)},
                            {u'type': u'EndTag', u'name': 'a'},
                        ])

                        if suffix:
                            new_tokens.append(
                                {u'type': u'Characters', u'data': suffix}
                            )

                    end = match.end()

                if new_tokens:
                    # Yield the adjusted set of tokens and then continue
                    # through the loop
                    if end < len(text):
                        new_tokens.append({u'type': u'Characters', u'data': text[end:]})

                    for new_token in new_tokens:
                        yield new_token

                    continue

            yield token

    def handle_a_tag(self, token_buffer):
        """Handle the "a" tag

        This could adjust the link or drop it altogether depending on what the
        callbacks return.

        This yields the new set of tokens.

        """
        a_token = token_buffer[0]
        if a_token['data']:
            attrs = a_token['data']
        else:
            attrs = {}
        text = self.extract_character_data(token_buffer)
        attrs['_text'] = text

        attrs = self.apply_callbacks(attrs, False)

        if attrs is None:
            # We're dropping the "a" tag and everything else and replacing
            # it with character data. So emit that token.
            yield {'type': 'Characters', 'data': text}

        else:
            new_text = attrs.pop('_text', '')
            a_token['data'] = alphabetize_attributes(attrs)

            if text == new_text:
                # The callbacks didn't change the text, so we yield the new "a"
                # token, then whatever else was there, then the end "a" token
                yield a_token
                for mem in token_buffer[1:]:
                    yield mem

            else:
                # If the callbacks changed the text, then we're going to drop
                # all the tokens between the start and end "a" tags and replace
                # it with the new text
                yield a_token
                yield {'type': 'Characters', 'data': force_unicode(new_text)}
                yield token_buffer[-1]

    def __iter__(self):
        in_a = False
        in_skip_tag = None

        token_buffer = []

        for token in super(LinkifyFilter, self).__iter__():
            if in_a:
                # Handle the case where we're in an "a" tag--we want to buffer tokens
                # until we hit an end "a" tag.
                if token['type'] == 'EndTag' and token['name'] == 'a':
                    # Add the end tag to the token buffer and then handle them
                    # and yield anything returned
                    token_buffer.append(token)
                    for new_token in self.handle_a_tag(token_buffer):
                        yield new_token

                    # Clear "a" related state and continue since we've yielded all
                    # the tokens we're going to yield
                    in_a = False
                    token_buffer = []
                else:
                    token_buffer.append(token)

                continue

            if token['type'] in ['StartTag', 'EmptyTag']:
                if token['name'] in self.skip_tags:
                    # Skip tags start a "special mode" where we don't linkify
                    # anything until the end tag.
                    in_skip_tag = token['name']

                elif token['name'] == 'a':
                    # The "a" tag is special--we switch to a slurp mode and
                    # slurp all the tokens until the end "a" tag and then
                    # figure out what to do with them there.
                    in_a = True
                    token_buffer.append(token)

                    # We buffer the start tag, so we don't want to yield it,
                    # yet
                    continue

            elif in_skip_tag and self.skip_tags:
                # NOTE(willkg): We put this clause here since in_a and
                # switching in and out of in_a takes precedence.
                if token['type'] == 'EndTag' and token['name'] == in_skip_tag:
                    in_skip_tag = None

            elif not in_a and not in_skip_tag and token['type'] == 'Characters':
                new_stream = iter([token])
                if self.parse_email:
                    new_stream = self.handle_email_addresses(new_stream)

                new_stream = self.handle_links(new_stream)

                for token in new_stream:
                    yield token

                # We've already yielded this token, so continue
                continue

            yield token
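

# A minimal sketch of wiring LinkifyFilter into the html5lib pipeline by hand,
# mirroring what Linker.linkify() does above. Guarded so that importing this
# module is unaffected; the input string is purely illustrative.
if __name__ == '__main__':
    parser = html5lib_shim.BleachHTMLParser(
        tags=html5lib_shim.HTML_TAGS,
        strip=False,
        consume_entities=True,
        namespaceHTMLElements=False,
    )
    walker = html5lib_shim.getTreeWalker('etree')
    serializer = html5lib_shim.BleachHTMLSerializer(
        quote_attr_values='always',
        omit_optional_tags=False,
        sanitize=False,
        alphabetical_attributes=False,
    )

    dom = parser.parseFragment(u'see example.com (or mail admin@example.com)')
    filtered = LinkifyFilter(
        source=walker(dom),
        callbacks=DEFAULT_CALLBACKS,
        parse_email=True,
    )
    print(serializer.render(filtered))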