You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

1184 lines
36 KiB

4 years ago
  1. # coding: utf-8
  2. """
  3. mistune
  4. ~~~~~~~
  5. The fastest markdown parser in pure Python with renderer feature.
  6. :copyright: (c) 2014 - 2018 by Hsiaoming Yang.
  7. """
  8. import re
  9. import inspect
# Package metadata.
__version__ = '0.8.4'
__author__ = 'Hsiaoming Yang <me@lepture.com>'
# Public names of this module.
__all__ = [
    'BlockGrammar', 'BlockLexer',
    'InlineGrammar', 'InlineLexer',
    'Renderer', 'Markdown',
    'markdown', 'escape',
]

# Runs of whitespace; ``_keyify`` collapses each run to a single space.
_key_pattern = re.compile(r'\s+')
# Any non-word character.
# NOTE(review): not referenced anywhere in the visible source.
_nonalpha_pattern = re.compile(r'\W')
# An "&" that does not already start an entity-like sequence
# (``&name;`` or ``&#name;``); used by ``escape`` when smart_amp is on.
_escape_pattern = re.compile(r'&(?!#?\w+;)')
# Windows (\r\n) and bare \r line endings, normalized by ``preprocessing``.
_newline_pattern = re.compile(r'\r\n|\r')
# Leading "> " marker at the start of every line of a block quote.
_block_quote_leading_pattern = re.compile(r'^ *> ?', flags=re.M)
# The four-space indent at the start of every line of an indented code block.
_block_code_leading_pattern = re.compile(r'^ {4}', re.M)

# Tag names treated as inline-level; they are excluded from ``_block_tag``
# and are the only tags whose content ``InlineLexer`` re-parses.
_inline_tags = [
    'a', 'em', 'strong', 'small', 's', 'cite', 'q', 'dfn', 'abbr', 'data',
    'time', 'code', 'var', 'samp', 'kbd', 'sub', 'sup', 'i', 'b', 'u', 'mark',
    'ruby', 'rt', 'rp', 'bdi', 'bdo', 'span', 'br', 'wbr', 'ins', 'del',
    'img', 'font',
]
# Block tags whose content ``Markdown.output_open_html`` never re-parses.
_pre_tags = ['pre', 'script', 'style']
# Regex fragment placed after a tag name: requires a word boundary and rejects
# names followed by ":/" or by optional punctuation and "@".
_valid_end = r'(?!:/|[^\w\s@]*@)\b'
# Regex fragment for one attribute step: a single name character optionally
# followed by ``=value`` (double-quoted, single-quoted or bare). It is always
# used inside a repeated group, so multi-character names match across
# several repetitions.
_valid_attr = r'''\s*[a-zA-Z\-](?:\s*\=\s*(?:"[^"]*"|'[^']*'|[^\s'">]+))?'''
# Regex fragment for a tag name that is not one of ``_inline_tags``.
_block_tag = r'(?!(?:%s)\b)\w+%s' % ('|'.join(_inline_tags), _valid_end)
# URL scheme prefixes that ``escape_link`` refuses to emit.
_scheme_blacklist = ('javascript:', 'vbscript:')
  35. def _pure_pattern(regex):
  36. pattern = regex.pattern
  37. if pattern.startswith('^'):
  38. pattern = pattern[1:]
  39. return pattern
  40. def _keyify(key):
  41. key = escape(key.lower(), quote=True)
  42. return _key_pattern.sub(' ', key)
  43. def escape(text, quote=False, smart_amp=True):
  44. """Replace special characters "&", "<" and ">" to HTML-safe sequences.
  45. The original cgi.escape will always escape "&", but you can control
  46. this one for a smart escape amp.
  47. :param quote: if set to True, " and ' will be escaped.
  48. :param smart_amp: if set to False, & will always be escaped.
  49. """
  50. if smart_amp:
  51. text = _escape_pattern.sub('&amp;', text)
  52. else:
  53. text = text.replace('&', '&amp;')
  54. text = text.replace('<', '&lt;')
  55. text = text.replace('>', '&gt;')
  56. if quote:
  57. text = text.replace('"', '&quot;')
  58. text = text.replace("'", '&#39;')
  59. return text
  60. def escape_link(url):
  61. """Remove dangerous URL schemes like javascript: and escape afterwards."""
  62. lower_url = url.lower().strip('\x00\x1a \n\r\t')
  63. for scheme in _scheme_blacklist:
  64. if re.sub(r'[^A-Za-z0-9\/:]+', '', lower_url).startswith(scheme):
  65. return ''
  66. return escape(url, quote=True, smart_amp=False)
  67. def preprocessing(text, tab=4):
  68. text = _newline_pattern.sub('\n', text)
  69. text = text.expandtabs(tab)
  70. text = text.replace('\u2424', '\n')
  71. pattern = re.compile(r'^ +$', re.M)
  72. return pattern.sub('', text)
class BlockGrammar(object):
    """Grammars for block level tokens.

    Every attribute is a compiled regular expression anchored at the start
    of the remaining text. ``BlockLexer`` looks rules up by attribute name,
    so the names here must match the entries of its rule lists.
    """

    # Link reference definition: ``[key]: <link> "title"``.
    # Groups: 1 = key, 2 = link, 3 = optional title.
    def_links = re.compile(
        r'^ *\[([^^\]]+)\]: *'  # [key]:
        r'<?([^\s>]+)>?'  # <link> or link
        r'(?: +["(]([^\n]+)[")])? *(?:\n+|$)'
    )
    # Footnote definition: ``[^key]: text`` plus indented continuation lines.
    # Groups: 1 = key, 2 = footnote text.
    def_footnotes = re.compile(
        r'^\[\^([^\]]+)\]: *('
        r'[^\n]*(?:\n+|$)'  # [^key]:
        r'(?: {1,}[^\n]*(?:\n+|$))*'
        r')'
    )

    # One or more blank lines.
    newline = re.compile(r'^\n+')
    # Consecutive lines indented by four spaces.
    block_code = re.compile(r'^( {4}[^\n]+\n*)+')
    # Fenced code. Groups: 1 = fence marker, 2 = optional language, 3 = code.
    fences = re.compile(
        r'^ *(`{3,}|~{3,}) *([^`\s]+)? *\n'  # ```lang
        r'([\s\S]+?)\s*'
        r'\1 *(?:\n+|$)'  # ```
    )
    # Horizontal rule: three or more of - * _ optionally separated by spaces.
    hrule = re.compile(r'^ {0,3}[-*_](?: *[-*_]){2,} *(?:\n+|$)')
    # ATX heading. Groups: 1 = the "#" run, 2 = heading text.
    heading = re.compile(r'^ *(#{1,6}) *([^\n]+?) *#* *(?:\n+|$)')
    # Setext heading. Groups: 1 = heading text, 2 = underline character.
    lheading = re.compile(r'^([^\n]+)\n *(=|-)+ *(?:\n+|$)')
    # Block quote: lines starting with ">" plus their continuation lines.
    block_quote = re.compile(r'^( *>[^\n]+(\n[^\n]+)*\n*)+')
    # A whole list. Groups: 1 = indent, 2 = bullet, 3 = unordered marker.
    # The alternatives after the lazy body describe where the list ends.
    list_block = re.compile(
        r'^( *)(?=[*+-]|\d+\.)(([*+-])?(?:\d+\.)?) [\s\S]+?'
        r'(?:'
        r'\n+(?=\1?(?:[-*_] *){3,}(?:\n+|$))'  # hrule
        r'|\n+(?=%s)'  # def links
        r'|\n+(?=%s)'  # def footnotes
        r'|\n+(?=\1(?(3)\d+\.|[*+-]) )'  # heterogeneous bullet
        r'|\n{2,}'
        r'(?! )'
        r'(?!\1(?:[*+-]|\d+\.) )\n*'
        r'|'
        r'\s*$)' % (
            _pure_pattern(def_links),
            _pure_pattern(def_footnotes),
        )
    )
    # One list item (applied with ``findall``).
    # Groups: 1 = the whole item, 2 = its indent.
    list_item = re.compile(
        r'^(( *)(?:[*+-]|\d+\.) [^\n]*'
        r'(?:\n(?!\2(?:[*+-]|\d+\.) )[^\n]*)*)',
        flags=re.M
    )
    # The bullet (with surrounding spaces) at the start of a list item.
    list_bullet = re.compile(r'^ *(?:[*+-]|\d+\.) +')
    # Paragraph: lines up to the point where another block construct starts.
    # The embedded patterns sit inside group 1 of this expression, so their
    # own back-references are renumbered with ``replace``.
    paragraph = re.compile(
        r'^((?:[^\n]+\n?(?!'
        r'%s|%s|%s|%s|%s|%s|%s|%s|%s'
        r'))+)\n*' % (
            _pure_pattern(fences).replace(r'\1', r'\2'),
            _pure_pattern(list_block).replace(r'\1', r'\3'),
            _pure_pattern(hrule),
            _pure_pattern(heading),
            _pure_pattern(lheading),
            _pure_pattern(block_quote),
            _pure_pattern(def_links),
            _pure_pattern(def_footnotes),
            '<' + _block_tag,
        )
    )
    # Block-level HTML: a comment, an open/close tag pair or a single tag.
    # Groups 1-3 (tag, attributes, content) are set only for the pair form.
    block_html = re.compile(
        r'^ *(?:%s|%s|%s) *(?:\n{2,}|\s*$)' % (
            r'<!--[\s\S]*?-->',
            r'<(%s)((?:%s)*?)>([\s\S]*?)<\/\1>' % (_block_tag, _valid_attr),
            r'<%s(?:%s)*?\s*\/?>' % (_block_tag, _valid_attr),
        )
    )
    # Table whose rows start with a pipe.
    # Groups: 1 = header row, 2 = alignment row, 3 = body rows.
    table = re.compile(
        r'^ *\|(.+)\n *\|( *[-:]+[-| :]*)\n((?: *\|.*(?:\n|$))*)\n*'
    )
    # Table without leading pipes; same groups as ``table``.
    nptable = re.compile(
        r'^ *(\S.*\|.*)\n *([-:]+ *\|[-| :]*)\n((?:.*\|.*(?:\n|$))*)\n*'
    )
    # Fallback: the rest of the current line.
    text = re.compile(r'^[^\n]+')
  148. class BlockLexer(object):
  149. """Block level lexer for block grammars."""
  150. grammar_class = BlockGrammar
  151. default_rules = [
  152. 'newline', 'hrule', 'block_code', 'fences', 'heading',
  153. 'nptable', 'lheading', 'block_quote',
  154. 'list_block', 'block_html', 'def_links',
  155. 'def_footnotes', 'table', 'paragraph', 'text'
  156. ]
  157. list_rules = (
  158. 'newline', 'block_code', 'fences', 'lheading', 'hrule',
  159. 'block_quote', 'list_block', 'block_html', 'text',
  160. )
  161. footnote_rules = (
  162. 'newline', 'block_code', 'fences', 'heading',
  163. 'nptable', 'lheading', 'hrule', 'block_quote',
  164. 'list_block', 'block_html', 'table', 'paragraph', 'text'
  165. )
  166. def __init__(self, rules=None, **kwargs):
  167. self.tokens = []
  168. self.def_links = {}
  169. self.def_footnotes = {}
  170. if not rules:
  171. rules = self.grammar_class()
  172. self.rules = rules
  173. self._max_recursive_depth = kwargs.get('max_recursive_depth', 6)
  174. self._list_depth = 0
  175. self._blockquote_depth = 0
  176. def __call__(self, text, rules=None):
  177. return self.parse(text, rules)
  178. def parse(self, text, rules=None):
  179. text = text.rstrip('\n')
  180. if not rules:
  181. rules = self.default_rules
  182. def manipulate(text):
  183. for key in rules:
  184. rule = getattr(self.rules, key)
  185. m = rule.match(text)
  186. if not m:
  187. continue
  188. getattr(self, 'parse_%s' % key)(m)
  189. return m
  190. return False # pragma: no cover
  191. while text:
  192. m = manipulate(text)
  193. if m is not False:
  194. text = text[len(m.group(0)):]
  195. continue
  196. if text: # pragma: no cover
  197. raise RuntimeError('Infinite loop at: %s' % text)
  198. return self.tokens
  199. def parse_newline(self, m):
  200. length = len(m.group(0))
  201. if length > 1:
  202. self.tokens.append({'type': 'newline'})
  203. def parse_block_code(self, m):
  204. # clean leading whitespace
  205. code = _block_code_leading_pattern.sub('', m.group(0))
  206. self.tokens.append({
  207. 'type': 'code',
  208. 'lang': None,
  209. 'text': code,
  210. })
  211. def parse_fences(self, m):
  212. self.tokens.append({
  213. 'type': 'code',
  214. 'lang': m.group(2),
  215. 'text': m.group(3),
  216. })
  217. def parse_heading(self, m):
  218. self.tokens.append({
  219. 'type': 'heading',
  220. 'level': len(m.group(1)),
  221. 'text': m.group(2),
  222. })
  223. def parse_lheading(self, m):
  224. """Parse setext heading."""
  225. self.tokens.append({
  226. 'type': 'heading',
  227. 'level': 1 if m.group(2) == '=' else 2,
  228. 'text': m.group(1),
  229. })
  230. def parse_hrule(self, m):
  231. self.tokens.append({'type': 'hrule'})
  232. def parse_list_block(self, m):
  233. bull = m.group(2)
  234. self.tokens.append({
  235. 'type': 'list_start',
  236. 'ordered': '.' in bull,
  237. })
  238. self._list_depth += 1
  239. if self._list_depth > self._max_recursive_depth:
  240. self.tokens.append({'type': 'list_item_start'})
  241. self.parse_text(m)
  242. self.tokens.append({'type': 'list_item_end'})
  243. else:
  244. cap = m.group(0)
  245. self._process_list_item(cap, bull)
  246. self.tokens.append({'type': 'list_end'})
  247. self._list_depth -= 1
  248. def _process_list_item(self, cap, bull):
  249. cap = self.rules.list_item.findall(cap)
  250. _next = False
  251. length = len(cap)
  252. for i in range(length):
  253. item = cap[i][0]
  254. # remove the bullet
  255. space = len(item)
  256. item = self.rules.list_bullet.sub('', item)
  257. # outdent
  258. if '\n ' in item:
  259. space = space - len(item)
  260. pattern = re.compile(r'^ {1,%d}' % space, flags=re.M)
  261. item = pattern.sub('', item)
  262. # determine whether item is loose or not
  263. loose = _next
  264. if not loose and re.search(r'\n\n(?!\s*$)', item):
  265. loose = True
  266. rest = len(item)
  267. if i != length - 1 and rest:
  268. _next = item[rest-1] == '\n'
  269. if not loose:
  270. loose = _next
  271. if loose:
  272. t = 'loose_item_start'
  273. else:
  274. t = 'list_item_start'
  275. self.tokens.append({'type': t})
  276. # recurse
  277. self.parse(item, self.list_rules)
  278. self.tokens.append({'type': 'list_item_end'})
  279. def parse_block_quote(self, m):
  280. self.tokens.append({'type': 'block_quote_start'})
  281. self._blockquote_depth += 1
  282. if self._blockquote_depth > self._max_recursive_depth:
  283. self.parse_text(m)
  284. else:
  285. # clean leading >
  286. cap = _block_quote_leading_pattern.sub('', m.group(0))
  287. self.parse(cap)
  288. self.tokens.append({'type': 'block_quote_end'})
  289. self._blockquote_depth -= 1
  290. def parse_def_links(self, m):
  291. key = _keyify(m.group(1))
  292. self.def_links[key] = {
  293. 'link': m.group(2),
  294. 'title': m.group(3),
  295. }
  296. def parse_def_footnotes(self, m):
  297. key = _keyify(m.group(1))
  298. if key in self.def_footnotes:
  299. # footnote is already defined
  300. return
  301. self.def_footnotes[key] = 0
  302. self.tokens.append({
  303. 'type': 'footnote_start',
  304. 'key': key,
  305. })
  306. text = m.group(2)
  307. if '\n' in text:
  308. lines = text.split('\n')
  309. whitespace = None
  310. for line in lines[1:]:
  311. space = len(line) - len(line.lstrip())
  312. if space and (not whitespace or space < whitespace):
  313. whitespace = space
  314. newlines = [lines[0]]
  315. for line in lines[1:]:
  316. newlines.append(line[whitespace:])
  317. text = '\n'.join(newlines)
  318. self.parse(text, self.footnote_rules)
  319. self.tokens.append({
  320. 'type': 'footnote_end',
  321. 'key': key,
  322. })
  323. def parse_table(self, m):
  324. item = self._process_table(m)
  325. cells = re.sub(r'(?: *\| *)?\n$', '', m.group(3))
  326. cells = cells.split('\n')
  327. for i, v in enumerate(cells):
  328. v = re.sub(r'^ *\| *| *\| *$', '', v)
  329. cells[i] = re.split(r' *(?<!\\)\| *', v)
  330. item['cells'] = self._process_cells(cells)
  331. self.tokens.append(item)
  332. def parse_nptable(self, m):
  333. item = self._process_table(m)
  334. cells = re.sub(r'\n$', '', m.group(3))
  335. cells = cells.split('\n')
  336. for i, v in enumerate(cells):
  337. cells[i] = re.split(r' *(?<!\\)\| *', v)
  338. item['cells'] = self._process_cells(cells)
  339. self.tokens.append(item)
  340. def _process_table(self, m):
  341. header = re.sub(r'^ *| *\| *$', '', m.group(1))
  342. header = re.split(r' *\| *', header)
  343. align = re.sub(r' *|\| *$', '', m.group(2))
  344. align = re.split(r' *\| *', align)
  345. for i, v in enumerate(align):
  346. if re.search(r'^ *-+: *$', v):
  347. align[i] = 'right'
  348. elif re.search(r'^ *:-+: *$', v):
  349. align[i] = 'center'
  350. elif re.search(r'^ *:-+ *$', v):
  351. align[i] = 'left'
  352. else:
  353. align[i] = None
  354. item = {
  355. 'type': 'table',
  356. 'header': header,
  357. 'align': align,
  358. }
  359. return item
  360. def _process_cells(self, cells):
  361. for i, line in enumerate(cells):
  362. for c, cell in enumerate(line):
  363. # de-escape any pipe inside the cell here
  364. cells[i][c] = re.sub('\\\\\|', '|', cell)
  365. return cells
  366. def parse_block_html(self, m):
  367. tag = m.group(1)
  368. if not tag:
  369. text = m.group(0)
  370. self.tokens.append({
  371. 'type': 'close_html',
  372. 'text': text
  373. })
  374. else:
  375. attr = m.group(2)
  376. text = m.group(3)
  377. self.tokens.append({
  378. 'type': 'open_html',
  379. 'tag': tag,
  380. 'extra': attr,
  381. 'text': text
  382. })
  383. def parse_paragraph(self, m):
  384. text = m.group(1).rstrip('\n')
  385. self.tokens.append({'type': 'paragraph', 'text': text})
  386. def parse_text(self, m):
  387. text = m.group(0)
  388. self.tokens.append({'type': 'text', 'text': text})
class InlineGrammar(object):
    """Grammars for inline level tokens.

    Every attribute is a compiled regular expression anchored at the start
    of the remaining text. ``InlineLexer`` looks rules up by attribute
    name, so the names here must match the entries of its rule lists.
    """

    # Backslash escape; group 1 is the escaped character.
    escape = re.compile(r'^\\([\\`*{}\[\]()#+\-.!_>~|])')  # \* \+ \! ....
    # Inline HTML: a comment, an open/close tag pair or a single tag.
    # Groups 1-3 (tag, attributes, content) are set only for the pair form.
    inline_html = re.compile(
        r'^(?:%s|%s|%s)' % (
            r'<!--[\s\S]*?-->',
            r'<(\w+%s)((?:%s)*?)\s*>([\s\S]*?)<\/\1>' % (
                _valid_end, _valid_attr),
            r'<\w+%s(?:%s)*?\s*\/?>' % (_valid_end, _valid_attr),
        )
    )
    # ``<link>``; group 1 = link, group 2 = "@" or ":".
    autolink = re.compile(r'^<([^ >]+(@|:)[^ >]+)>')
    # ``[text](link "title")``, optionally prefixed with "!".
    # Groups: 1 = text, 2 = "<" if present, 3 = link, 4 = title.
    link = re.compile(
        r'^!?\[('
        r'(?:\[[^^\]]*\]|[^\[\]]|\](?=[^\[]*\]))*'
        r')\]\('
        r'''\s*(<)?([\s\S]*?)(?(2)>)(?:\s+['"]([\s\S]*?)['"])?\s*'''
        r'\)'
    )
    # ``[text][key]``; group 1 = text, group 2 = key (may be empty).
    reflink = re.compile(
        r'^!?\[('
        r'(?:\[[^^\]]*\]|[^\[\]]|\](?=[^\[]*\]))*'
        r')\]\s*\[([^^\]]*)\]'
    )
    # ``[key]`` on its own; group 1 = key.
    nolink = re.compile(r'^!?\[((?:\[[^\]]*\]|[^\[\]])*)\]')
    # Bare http(s) URL; the last character may not be common punctuation.
    url = re.compile(r'''^(https?:\/\/[^\s<]+[^<.,:;"')\]\s])''')
    # Group 1 is set for the ``__`` form, group 2 for the ``**`` form.
    double_emphasis = re.compile(
        r'^_{2}([\s\S]+?)_{2}(?!_)'  # __word__
        r'|'
        r'^\*{2}([\s\S]+?)\*{2}(?!\*)'  # **word**
    )
    # Group 1 is set for the ``_`` form, group 2 for the ``*`` form.
    emphasis = re.compile(
        r'^\b_((?:__|[^_])+?)_\b'  # _word_
        r'|'
        r'^\*((?:\*\*|[^\*])+?)\*(?!\*)'  # *word*
    )
    # Group 1 = the backtick run, group 2 = the code.
    code = re.compile(r'^(`+)\s*([\s\S]*?[^`])\s*\1(?!`)')  # `code`
    # Two or more trailing spaces before a newline that is followed by text.
    linebreak = re.compile(r'^ {2,}\n(?!\s*$)')
    strikethrough = re.compile(r'^~~(?=\S)([\s\S]*?\S)~~')  # ~~word~~
    # ``[^key]`` footnote reference; group 1 = key.
    footnote = re.compile(r'^\[\^([^\]]+)\]')
    # Plain text up to the next character that may start another rule.
    text = re.compile(r'^[\s\S]+?(?=[\\<!\[_*`~]|https?://| {2,}\n|$)')

    def hard_wrap(self):
        """Grammar for hard wrap linebreak. You don't need to add two
        spaces at the end of a line.
        """
        # Both patterns are replaced on the instance only; the class
        # attributes stay untouched.
        self.linebreak = re.compile(r'^ *\n(?!\s*$)')
        self.text = re.compile(
            r'^[\s\S]+?(?=[\\<!\[_*`~]|https?://| *\n|$)'
        )
  438. class InlineLexer(object):
  439. """Inline level lexer for inline grammars."""
  440. grammar_class = InlineGrammar
  441. default_rules = [
  442. 'escape', 'inline_html', 'autolink', 'url',
  443. 'footnote', 'link', 'reflink', 'nolink',
  444. 'double_emphasis', 'emphasis', 'code',
  445. 'linebreak', 'strikethrough', 'text',
  446. ]
  447. inline_html_rules = [
  448. 'escape', 'inline_html', 'autolink', 'url', 'link', 'reflink',
  449. 'nolink', 'double_emphasis', 'emphasis', 'code',
  450. 'linebreak', 'strikethrough', 'text',
  451. ]
  452. def __init__(self, renderer, rules=None, **kwargs):
  453. self.renderer = renderer
  454. self.links = {}
  455. self.footnotes = {}
  456. self.footnote_index = 0
  457. if not rules:
  458. rules = self.grammar_class()
  459. kwargs.update(self.renderer.options)
  460. if kwargs.get('hard_wrap'):
  461. rules.hard_wrap()
  462. self.rules = rules
  463. self._in_link = False
  464. self._in_footnote = False
  465. self._parse_inline_html = kwargs.get('parse_inline_html')
  466. def __call__(self, text, rules=None):
  467. return self.output(text, rules)
  468. def setup(self, links, footnotes):
  469. self.footnote_index = 0
  470. self.links = links or {}
  471. self.footnotes = footnotes or {}
  472. def output(self, text, rules=None):
  473. text = text.rstrip('\n')
  474. if not rules:
  475. rules = list(self.default_rules)
  476. if self._in_footnote and 'footnote' in rules:
  477. rules.remove('footnote')
  478. output = self.renderer.placeholder()
  479. def manipulate(text):
  480. for key in rules:
  481. pattern = getattr(self.rules, key)
  482. m = pattern.match(text)
  483. if not m:
  484. continue
  485. self.line_match = m
  486. out = getattr(self, 'output_%s' % key)(m)
  487. if out is not None:
  488. return m, out
  489. return False # pragma: no cover
  490. while text:
  491. ret = manipulate(text)
  492. if ret is not False:
  493. m, out = ret
  494. output += out
  495. text = text[len(m.group(0)):]
  496. continue
  497. if text: # pragma: no cover
  498. raise RuntimeError('Infinite loop at: %s' % text)
  499. return output
  500. def output_escape(self, m):
  501. text = m.group(1)
  502. return self.renderer.escape(text)
  503. def output_autolink(self, m):
  504. link = m.group(1)
  505. if m.group(2) == '@':
  506. is_email = True
  507. else:
  508. is_email = False
  509. return self.renderer.autolink(link, is_email)
  510. def output_url(self, m):
  511. link = m.group(1)
  512. if self._in_link:
  513. return self.renderer.text(link)
  514. return self.renderer.autolink(link, False)
  515. def output_inline_html(self, m):
  516. tag = m.group(1)
  517. if self._parse_inline_html and tag in _inline_tags:
  518. text = m.group(3)
  519. if tag == 'a':
  520. self._in_link = True
  521. text = self.output(text, rules=self.inline_html_rules)
  522. self._in_link = False
  523. else:
  524. text = self.output(text, rules=self.inline_html_rules)
  525. extra = m.group(2) or ''
  526. html = '<%s%s>%s</%s>' % (tag, extra, text, tag)
  527. else:
  528. html = m.group(0)
  529. return self.renderer.inline_html(html)
  530. def output_footnote(self, m):
  531. key = _keyify(m.group(1))
  532. if key not in self.footnotes:
  533. return None
  534. if self.footnotes[key]:
  535. return None
  536. self.footnote_index += 1
  537. self.footnotes[key] = self.footnote_index
  538. return self.renderer.footnote_ref(key, self.footnote_index)
  539. def output_link(self, m):
  540. return self._process_link(m, m.group(3), m.group(4))
  541. def output_reflink(self, m):
  542. key = _keyify(m.group(2) or m.group(1))
  543. if key not in self.links:
  544. return None
  545. ret = self.links[key]
  546. return self._process_link(m, ret['link'], ret['title'])
  547. def output_nolink(self, m):
  548. key = _keyify(m.group(1))
  549. if key not in self.links:
  550. return None
  551. ret = self.links[key]
  552. return self._process_link(m, ret['link'], ret['title'])
  553. def _process_link(self, m, link, title=None):
  554. line = m.group(0)
  555. text = m.group(1)
  556. if line[0] == '!':
  557. return self.renderer.image(link, title, text)
  558. self._in_link = True
  559. text = self.output(text)
  560. self._in_link = False
  561. return self.renderer.link(link, title, text)
  562. def output_double_emphasis(self, m):
  563. text = m.group(2) or m.group(1)
  564. text = self.output(text)
  565. return self.renderer.double_emphasis(text)
  566. def output_emphasis(self, m):
  567. text = m.group(2) or m.group(1)
  568. text = self.output(text)
  569. return self.renderer.emphasis(text)
  570. def output_code(self, m):
  571. text = m.group(2)
  572. return self.renderer.codespan(text)
  573. def output_linebreak(self, m):
  574. return self.renderer.linebreak()
  575. def output_strikethrough(self, m):
  576. text = self.output(m.group(1))
  577. return self.renderer.strikethrough(text)
  578. def output_text(self, m):
  579. text = m.group(0)
  580. return self.renderer.text(text)
  581. class Renderer(object):
  582. """The default HTML renderer for rendering Markdown.
  583. """
  584. def __init__(self, **kwargs):
  585. self.options = kwargs
  586. def placeholder(self):
  587. """Returns the default, empty output value for the renderer.
  588. All renderer methods use the '+=' operator to append to this value.
  589. Default is a string so rendering HTML can build up a result string with
  590. the rendered Markdown.
  591. Can be overridden by Renderer subclasses to be types like an empty
  592. list, allowing the renderer to create a tree-like structure to
  593. represent the document (which can then be reprocessed later into a
  594. separate format like docx or pdf).
  595. """
  596. return ''
  597. def block_code(self, code, lang=None):
  598. """Rendering block level code. ``pre > code``.
  599. :param code: text content of the code block.
  600. :param lang: language of the given code.
  601. """
  602. code = code.rstrip('\n')
  603. if not lang:
  604. code = escape(code, smart_amp=False)
  605. return '<pre><code>%s\n</code></pre>\n' % code
  606. code = escape(code, quote=True, smart_amp=False)
  607. return '<pre><code class="lang-%s">%s\n</code></pre>\n' % (lang, code)
  608. def block_quote(self, text):
  609. """Rendering <blockquote> with the given text.
  610. :param text: text content of the blockquote.
  611. """
  612. return '<blockquote>%s\n</blockquote>\n' % text.rstrip('\n')
  613. def block_html(self, html):
  614. """Rendering block level pure html content.
  615. :param html: text content of the html snippet.
  616. """
  617. if self.options.get('skip_style') and \
  618. html.lower().startswith('<style'):
  619. return ''
  620. if self.options.get('escape'):
  621. return escape(html)
  622. return html
  623. def header(self, text, level, raw=None):
  624. """Rendering header/heading tags like ``<h1>`` ``<h2>``.
  625. :param text: rendered text content for the header.
  626. :param level: a number for the header level, for example: 1.
  627. :param raw: raw text content of the header.
  628. """
  629. return '<h%d>%s</h%d>\n' % (level, text, level)
  630. def hrule(self):
  631. """Rendering method for ``<hr>`` tag."""
  632. if self.options.get('use_xhtml'):
  633. return '<hr />\n'
  634. return '<hr>\n'
  635. def list(self, body, ordered=True):
  636. """Rendering list tags like ``<ul>`` and ``<ol>``.
  637. :param body: body contents of the list.
  638. :param ordered: whether this list is ordered or not.
  639. """
  640. tag = 'ul'
  641. if ordered:
  642. tag = 'ol'
  643. return '<%s>\n%s</%s>\n' % (tag, body, tag)
  644. def list_item(self, text):
  645. """Rendering list item snippet. Like ``<li>``."""
  646. return '<li>%s</li>\n' % text
  647. def paragraph(self, text):
  648. """Rendering paragraph tags. Like ``<p>``."""
  649. return '<p>%s</p>\n' % text.strip(' ')
  650. def table(self, header, body):
  651. """Rendering table element. Wrap header and body in it.
  652. :param header: header part of the table.
  653. :param body: body part of the table.
  654. """
  655. return (
  656. '<table>\n<thead>%s</thead>\n'
  657. '<tbody>\n%s</tbody>\n</table>\n'
  658. ) % (header, body)
  659. def table_row(self, content):
  660. """Rendering a table row. Like ``<tr>``.
  661. :param content: content of current table row.
  662. """
  663. return '<tr>\n%s</tr>\n' % content
  664. def table_cell(self, content, **flags):
  665. """Rendering a table cell. Like ``<th>`` ``<td>``.
  666. :param content: content of current table cell.
  667. :param header: whether this is header or not.
  668. :param align: align of current table cell.
  669. """
  670. if flags['header']:
  671. tag = 'th'
  672. else:
  673. tag = 'td'
  674. align = flags['align']
  675. if not align:
  676. return '<%s>%s</%s>\n' % (tag, content, tag)
  677. return '<%s style="text-align:%s">%s</%s>\n' % (
  678. tag, align, content, tag
  679. )
  680. def double_emphasis(self, text):
  681. """Rendering **strong** text.
  682. :param text: text content for emphasis.
  683. """
  684. return '<strong>%s</strong>' % text
  685. def emphasis(self, text):
  686. """Rendering *emphasis* text.
  687. :param text: text content for emphasis.
  688. """
  689. return '<em>%s</em>' % text
  690. def codespan(self, text):
  691. """Rendering inline `code` text.
  692. :param text: text content for inline code.
  693. """
  694. text = escape(text.rstrip(), smart_amp=False)
  695. return '<code>%s</code>' % text
  696. def linebreak(self):
  697. """Rendering line break like ``<br>``."""
  698. if self.options.get('use_xhtml'):
  699. return '<br />\n'
  700. return '<br>\n'
  701. def strikethrough(self, text):
  702. """Rendering ~~strikethrough~~ text.
  703. :param text: text content for strikethrough.
  704. """
  705. return '<del>%s</del>' % text
  706. def text(self, text):
  707. """Rendering unformatted text.
  708. :param text: text content.
  709. """
  710. if self.options.get('parse_block_html'):
  711. return text
  712. return escape(text)
  713. def escape(self, text):
  714. """Rendering escape sequence.
  715. :param text: text content.
  716. """
  717. return escape(text)
  718. def autolink(self, link, is_email=False):
  719. """Rendering a given link or email address.
  720. :param link: link content or email address.
  721. :param is_email: whether this is an email or not.
  722. """
  723. text = link = escape_link(link)
  724. if is_email:
  725. link = 'mailto:%s' % link
  726. return '<a href="%s">%s</a>' % (link, text)
  727. def link(self, link, title, text):
  728. """Rendering a given link with content and title.
  729. :param link: href link for ``<a>`` tag.
  730. :param title: title content for `title` attribute.
  731. :param text: text content for description.
  732. """
  733. link = escape_link(link)
  734. if not title:
  735. return '<a href="%s">%s</a>' % (link, text)
  736. title = escape(title, quote=True)
  737. return '<a href="%s" title="%s">%s</a>' % (link, title, text)
  738. def image(self, src, title, text):
  739. """Rendering a image with title and text.
  740. :param src: source link of the image.
  741. :param title: title text of the image.
  742. :param text: alt text of the image.
  743. """
  744. src = escape_link(src)
  745. text = escape(text, quote=True)
  746. if title:
  747. title = escape(title, quote=True)
  748. html = '<img src="%s" alt="%s" title="%s"' % (src, text, title)
  749. else:
  750. html = '<img src="%s" alt="%s"' % (src, text)
  751. if self.options.get('use_xhtml'):
  752. return '%s />' % html
  753. return '%s>' % html
  754. def inline_html(self, html):
  755. """Rendering span level pure html content.
  756. :param html: text content of the html snippet.
  757. """
  758. if self.options.get('escape'):
  759. return escape(html)
  760. return html
  761. def newline(self):
  762. """Rendering newline element."""
  763. return ''
  764. def footnote_ref(self, key, index):
  765. """Rendering the ref anchor of a footnote.
  766. :param key: identity key for the footnote.
  767. :param index: the index count of current footnote.
  768. """
  769. html = (
  770. '<sup class="footnote-ref" id="fnref-%s">'
  771. '<a href="#fn-%s">%d</a></sup>'
  772. ) % (escape(key), escape(key), index)
  773. return html
  774. def footnote_item(self, key, text):
  775. """Rendering a footnote item.
  776. :param key: identity key for the footnote.
  777. :param text: text content of the footnote.
  778. """
  779. back = (
  780. '<a href="#fnref-%s" class="footnote">&#8617;</a>'
  781. ) % escape(key)
  782. text = text.rstrip()
  783. if text.endswith('</p>'):
  784. text = re.sub(r'<\/p>$', r'%s</p>' % back, text)
  785. else:
  786. text = '%s<p>%s</p>' % (text, back)
  787. html = '<li id="fn-%s">%s</li>\n' % (escape(key), text)
  788. return html
  789. def footnotes(self, text):
  790. """Wrapper for all footnotes.
  791. :param text: contents of all footnotes.
  792. """
  793. html = '<div class="footnotes">\n%s<ol>%s</ol>\n</div>\n'
  794. return html % (self.hrule(), text)
  795. class Markdown(object):
  796. """The Markdown parser.
  797. :param renderer: An instance of ``Renderer``.
  798. :param inline: An inline lexer class or instance.
  799. :param block: A block lexer class or instance.
  800. """
  801. def __init__(self, renderer=None, inline=None, block=None, **kwargs):
  802. if not renderer:
  803. renderer = Renderer(**kwargs)
  804. else:
  805. kwargs.update(renderer.options)
  806. self.renderer = renderer
  807. if inline and inspect.isclass(inline):
  808. inline = inline(renderer, **kwargs)
  809. if block and inspect.isclass(block):
  810. block = block(**kwargs)
  811. if inline:
  812. self.inline = inline
  813. else:
  814. self.inline = InlineLexer(renderer, **kwargs)
  815. self.block = block or BlockLexer(BlockGrammar())
  816. self.footnotes = []
  817. self.tokens = []
  818. # detect if it should parse text in block html
  819. self._parse_block_html = kwargs.get('parse_block_html')
  820. def __call__(self, text):
  821. return self.parse(text)
  822. def render(self, text):
  823. """Render the Markdown text.
  824. :param text: markdown formatted text content.
  825. """
  826. return self.parse(text)
  827. def parse(self, text):
  828. out = self.output(preprocessing(text))
  829. keys = self.block.def_footnotes
  830. # reset block
  831. self.block.def_links = {}
  832. self.block.def_footnotes = {}
  833. # reset inline
  834. self.inline.links = {}
  835. self.inline.footnotes = {}
  836. if not self.footnotes:
  837. return out
  838. footnotes = filter(lambda o: keys.get(o['key']), self.footnotes)
  839. self.footnotes = sorted(
  840. footnotes, key=lambda o: keys.get(o['key']), reverse=True
  841. )
  842. body = self.renderer.placeholder()
  843. while self.footnotes:
  844. note = self.footnotes.pop()
  845. body += self.renderer.footnote_item(
  846. note['key'], note['text']
  847. )
  848. out += self.renderer.footnotes(body)
  849. return out
  850. def pop(self):
  851. if not self.tokens:
  852. return None
  853. self.token = self.tokens.pop()
  854. return self.token
  855. def peek(self):
  856. if self.tokens:
  857. return self.tokens[-1]
  858. return None # pragma: no cover
  859. def output(self, text, rules=None):
  860. self.tokens = self.block(text, rules)
  861. self.tokens.reverse()
  862. self.inline.setup(self.block.def_links, self.block.def_footnotes)
  863. out = self.renderer.placeholder()
  864. while self.pop():
  865. out += self.tok()
  866. return out
  867. def tok(self):
  868. t = self.token['type']
  869. # sepcial cases
  870. if t.endswith('_start'):
  871. t = t[:-6]
  872. return getattr(self, 'output_%s' % t)()
  873. def tok_text(self):
  874. text = self.token['text']
  875. while self.peek()['type'] == 'text':
  876. text += '\n' + self.pop()['text']
  877. return self.inline(text)
  878. def output_newline(self):
  879. return self.renderer.newline()
  880. def output_hrule(self):
  881. return self.renderer.hrule()
  882. def output_heading(self):
  883. return self.renderer.header(
  884. self.inline(self.token['text']),
  885. self.token['level'],
  886. self.token['text'],
  887. )
  888. def output_code(self):
  889. return self.renderer.block_code(
  890. self.token['text'], self.token['lang']
  891. )
  892. def output_table(self):
  893. aligns = self.token['align']
  894. aligns_length = len(aligns)
  895. cell = self.renderer.placeholder()
  896. # header part
  897. header = self.renderer.placeholder()
  898. for i, value in enumerate(self.token['header']):
  899. align = aligns[i] if i < aligns_length else None
  900. flags = {'header': True, 'align': align}
  901. cell += self.renderer.table_cell(self.inline(value), **flags)
  902. header += self.renderer.table_row(cell)
  903. # body part
  904. body = self.renderer.placeholder()
  905. for i, row in enumerate(self.token['cells']):
  906. cell = self.renderer.placeholder()
  907. for j, value in enumerate(row):
  908. align = aligns[j] if j < aligns_length else None
  909. flags = {'header': False, 'align': align}
  910. cell += self.renderer.table_cell(self.inline(value), **flags)
  911. body += self.renderer.table_row(cell)
  912. return self.renderer.table(header, body)
  913. def output_block_quote(self):
  914. body = self.renderer.placeholder()
  915. while self.pop()['type'] != 'block_quote_end':
  916. body += self.tok()
  917. return self.renderer.block_quote(body)
  918. def output_list(self):
  919. ordered = self.token['ordered']
  920. body = self.renderer.placeholder()
  921. while self.pop()['type'] != 'list_end':
  922. body += self.tok()
  923. return self.renderer.list(body, ordered)
  924. def output_list_item(self):
  925. body = self.renderer.placeholder()
  926. while self.pop()['type'] != 'list_item_end':
  927. if self.token['type'] == 'text':
  928. body += self.tok_text()
  929. else:
  930. body += self.tok()
  931. return self.renderer.list_item(body)
  932. def output_loose_item(self):
  933. body = self.renderer.placeholder()
  934. while self.pop()['type'] != 'list_item_end':
  935. body += self.tok()
  936. return self.renderer.list_item(body)
  937. def output_footnote(self):
  938. self.inline._in_footnote = True
  939. body = self.renderer.placeholder()
  940. key = self.token['key']
  941. while self.pop()['type'] != 'footnote_end':
  942. body += self.tok()
  943. self.footnotes.append({'key': key, 'text': body})
  944. self.inline._in_footnote = False
  945. return self.renderer.placeholder()
  946. def output_close_html(self):
  947. text = self.token['text']
  948. return self.renderer.block_html(text)
  949. def output_open_html(self):
  950. text = self.token['text']
  951. tag = self.token['tag']
  952. if self._parse_block_html and tag not in _pre_tags:
  953. text = self.inline(text, rules=self.inline.inline_html_rules)
  954. extra = self.token.get('extra') or ''
  955. html = '<%s%s>%s</%s>' % (tag, extra, text, tag)
  956. return self.renderer.block_html(html)
  957. def output_paragraph(self):
  958. return self.renderer.paragraph(self.inline(self.token['text']))
  959. def output_text(self):
  960. return self.renderer.paragraph(self.tok_text())
  961. def markdown(text, escape=True, **kwargs):
  962. """Render markdown formatted text to html.
  963. :param text: markdown formatted text content.
  964. :param escape: if set to False, all html tags will not be escaped.
  965. :param use_xhtml: output with xhtml tags.
  966. :param hard_wrap: if set to True, it will use the GFM line breaks feature.
  967. :param parse_block_html: parse text only in block level html.
  968. :param parse_inline_html: parse text only in inline level html.
  969. """
  970. return Markdown(escape=escape, **kwargs)(text)