You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

877 lines
31 KiB

4 years ago
  1. # -*- coding: utf-8 -*-
  2. """
  3. pygments.lexer
  4. ~~~~~~~~~~~~~~
  5. Base lexer classes.
  6. :copyright: Copyright 2006-2019 by the Pygments team, see AUTHORS.
  7. :license: BSD, see LICENSE for details.
  8. """
  9. import re
  10. import sys
  11. import time
  12. from pygments.filter import apply_filters, Filter
  13. from pygments.filters import get_filter_by_name
  14. from pygments.token import Error, Text, Other, _TokenType
  15. from pygments.util import get_bool_opt, get_int_opt, get_list_opt, \
  16. make_analysator, Future, guess_decode
  17. from pygments.regexopt import regex_opt
#: Names exported by ``from pygments.lexer import *``.
__all__ = ['Lexer', 'RegexLexer', 'ExtendedRegexLexer', 'DelegatingLexer',
           'LexerContext', 'include', 'inherit', 'bygroups', 'using', 'this',
           'default', 'words']

#: BOM signature -> codec name, tried in order.  The four-byte UTF-32
#: signatures must come before the two-byte UTF-16 ones, because the
#: UTF-16 BOMs are prefixes of the UTF-32 ones.
_encoding_map = [(b'\xef\xbb\xbf', 'utf-8'),
                 (b'\xff\xfe\0\0', 'utf-32'),
                 (b'\0\0\xfe\xff', 'utf-32be'),
                 (b'\xff\xfe', 'utf-16'),
                 (b'\xfe\xff', 'utf-16be')]

#: Default ``analyse_text`` implementation: claims no confidence (0.0)
#: for any input, so lexers without their own analyser never win guessing.
_default_analyse = staticmethod(lambda x: 0.0)
  27. class LexerMeta(type):
  28. """
  29. This metaclass automagically converts ``analyse_text`` methods into
  30. static methods which always return float values.
  31. """
  32. def __new__(mcs, name, bases, d):
  33. if 'analyse_text' in d:
  34. d['analyse_text'] = make_analysator(d['analyse_text'])
  35. return type.__new__(mcs, name, bases, d)
class Lexer(metaclass=LexerMeta):
    """
    Lexer for a specific language.

    Basic options recognized:
    ``stripnl``
        Strip leading and trailing newlines from the input (default: True).
    ``stripall``
        Strip all leading and trailing whitespace from the input
        (default: False).
    ``ensurenl``
        Make sure that the input ends with a newline (default: True). This
        is required for some lexers that consume input linewise.

        .. versionadded:: 1.3
    ``tabsize``
        If given and greater than 0, expand tabs in the input (default: 0).
    ``encoding``
        If given, must be an encoding name. This encoding will be used to
        convert the input string to Unicode, if it is not already a Unicode
        string (default: ``'guess'``, which uses a simple UTF-8 / Locale /
        Latin1 detection. Can also be ``'chardet'`` to use the chardet
        library, if it is installed.
    ``inencoding``
        Overrides the ``encoding`` if given.
    """

    #: Name of the lexer
    name = None

    #: Shortcuts for the lexer
    aliases = []

    #: File name globs
    filenames = []

    #: Secondary file name globs
    alias_filenames = []

    #: MIME types
    mimetypes = []

    #: Priority, should multiple lexers match and no content is provided
    priority = 0

    def __init__(self, **options):
        self.options = options
        self.stripnl = get_bool_opt(options, 'stripnl', True)
        self.stripall = get_bool_opt(options, 'stripall', False)
        self.ensurenl = get_bool_opt(options, 'ensurenl', True)
        self.tabsize = get_int_opt(options, 'tabsize', 0)
        self.encoding = options.get('encoding', 'guess')
        # 'inencoding' (if set and truthy) takes precedence over 'encoding'.
        self.encoding = options.get('inencoding') or self.encoding
        self.filters = []
        for filter_ in get_list_opt(options, 'filters', ()):
            self.add_filter(filter_)

    def __repr__(self):
        if self.options:
            return '<pygments.lexers.%s with %r>' % (self.__class__.__name__,
                                                     self.options)
        else:
            return '<pygments.lexers.%s>' % self.__class__.__name__

    def add_filter(self, filter_, **options):
        """
        Add a new stream filter to this lexer.

        ``filter_`` may be a ``Filter`` instance or a filter name that is
        looked up via ``get_filter_by_name`` (with ``options`` forwarded).
        """
        if not isinstance(filter_, Filter):
            filter_ = get_filter_by_name(filter_, **options)
        self.filters.append(filter_)

    def analyse_text(text):
        """
        Has to return a float between ``0`` and ``1`` that indicates
        if a lexer wants to highlight this text. Used by ``guess_lexer``.
        If this method returns ``0`` it won't highlight it in any case, if
        it returns ``1`` highlighting with this lexer is guaranteed.

        The `LexerMeta` metaclass automatically wraps this function so
        that it works like a static method (no ``self`` or ``cls``
        parameter) and the return value is automatically converted to
        `float`. If the return value is an object that is boolean `False`
        it's the same as if the return values was ``0.0``.
        """

    def get_tokens(self, text, unfiltered=False):
        """
        Return an iterable of (tokentype, value) pairs generated from
        `text`. If `unfiltered` is set to `True`, the filtering mechanism
        is bypassed even if filters are defined.

        Also preprocess the text, i.e. expand tabs and strip it if
        wanted and applies registered filters.
        """
        if not isinstance(text, str):
            # Input is bytes: decode it according to self.encoding.
            if self.encoding == 'guess':
                text, _ = guess_decode(text)
            elif self.encoding == 'chardet':
                try:
                    import chardet
                except ImportError:
                    raise ImportError('To enable chardet encoding guessing, '
                                      'please install the chardet library '
                                      'from http://chardet.feedparser.org/')
                # check for BOM first
                decoded = None
                for bom, encoding in _encoding_map:
                    if text.startswith(bom):
                        decoded = text[len(bom):].decode(encoding, 'replace')
                        break
                # no BOM found, so use chardet
                if decoded is None:
                    enc = chardet.detect(text[:1024])  # Guess using first 1KB
                    decoded = text.decode(enc.get('encoding') or 'utf-8',
                                          'replace')
                text = decoded
            else:
                text = text.decode(self.encoding)
                # Drop a leading BOM character left over after decoding.
                if text.startswith(u'\ufeff'):
                    text = text[len(u'\ufeff'):]
        else:
            # Already a str: only strip a leading BOM character if present.
            if text.startswith(u'\ufeff'):
                text = text[len(u'\ufeff'):]

        # text now *is* a unicode string
        # Normalize all line endings to '\n' before any stripping.
        text = text.replace('\r\n', '\n')
        text = text.replace('\r', '\n')
        if self.stripall:
            text = text.strip()
        elif self.stripnl:
            text = text.strip('\n')
        if self.tabsize > 0:
            text = text.expandtabs(self.tabsize)
        if self.ensurenl and not text.endswith('\n'):
            text += '\n'

        def streamer():
            # Drop the index component: (index, type, value) -> (type, value).
            for _, t, v in self.get_tokens_unprocessed(text):
                yield t, v
        stream = streamer()
        if not unfiltered:
            stream = apply_filters(stream, self.filters, self)
        return stream

    def get_tokens_unprocessed(self, text):
        """
        Return an iterable of (index, tokentype, value) pairs where "index"
        is the starting position of the token within the input text.

        In subclasses, implement this method as a generator to
        maximize effectiveness.
        """
        raise NotImplementedError
  171. class DelegatingLexer(Lexer):
  172. """
  173. This lexer takes two lexer as arguments. A root lexer and
  174. a language lexer. First everything is scanned using the language
  175. lexer, afterwards all ``Other`` tokens are lexed using the root
  176. lexer.
  177. The lexers from the ``template`` lexer package use this base lexer.
  178. """
  179. def __init__(self, _root_lexer, _language_lexer, _needle=Other, **options):
  180. self.root_lexer = _root_lexer(**options)
  181. self.language_lexer = _language_lexer(**options)
  182. self.needle = _needle
  183. Lexer.__init__(self, **options)
  184. def get_tokens_unprocessed(self, text):
  185. buffered = ''
  186. insertions = []
  187. lng_buffer = []
  188. for i, t, v in self.language_lexer.get_tokens_unprocessed(text):
  189. if t is self.needle:
  190. if lng_buffer:
  191. insertions.append((len(buffered), lng_buffer))
  192. lng_buffer = []
  193. buffered += v
  194. else:
  195. lng_buffer.append((i, t, v))
  196. if lng_buffer:
  197. insertions.append((len(buffered), lng_buffer))
  198. return do_insertions(insertions,
  199. self.root_lexer.get_tokens_unprocessed(buffered))
  200. # ------------------------------------------------------------------------------
  201. # RegexLexer and ExtendedRegexLexer
  202. #
  203. class include(str): # pylint: disable=invalid-name
  204. """
  205. Indicates that a state should include rules from another state.
  206. """
  207. pass
  208. class _inherit:
  209. """
  210. Indicates the a state should inherit from its superclass.
  211. """
  212. def __repr__(self):
  213. return 'inherit'
  214. inherit = _inherit() # pylint: disable=invalid-name
  215. class combined(tuple): # pylint: disable=invalid-name
  216. """
  217. Indicates a state combined from multiple states.
  218. """
  219. def __new__(cls, *args):
  220. return tuple.__new__(cls, args)
  221. def __init__(self, *args):
  222. # tuple.__init__ doesn't do anything
  223. pass
  224. class _PseudoMatch:
  225. """
  226. A pseudo match object constructed from a string.
  227. """
  228. def __init__(self, start, text):
  229. self._text = text
  230. self._start = start
  231. def start(self, arg=None):
  232. return self._start
  233. def end(self, arg=None):
  234. return self._start + len(self._text)
  235. def group(self, arg=None):
  236. if arg:
  237. raise IndexError('No such group')
  238. return self._text
  239. def groups(self):
  240. return (self._text,)
  241. def groupdict(self):
  242. return {}
  243. def bygroups(*args):
  244. """
  245. Callback that yields multiple actions for each group in the match.
  246. """
  247. def callback(lexer, match, ctx=None):
  248. for i, action in enumerate(args):
  249. if action is None:
  250. continue
  251. elif type(action) is _TokenType:
  252. data = match.group(i + 1)
  253. if data:
  254. yield match.start(i + 1), action, data
  255. else:
  256. data = match.group(i + 1)
  257. if data is not None:
  258. if ctx:
  259. ctx.pos = match.start(i + 1)
  260. for item in action(lexer,
  261. _PseudoMatch(match.start(i + 1), data), ctx):
  262. if item:
  263. yield item
  264. if ctx:
  265. ctx.pos = match.end()
  266. return callback
  267. class _This:
  268. """
  269. Special singleton used for indicating the caller class.
  270. Used by ``using``.
  271. """
  272. this = _This()
  273. def using(_other, **kwargs):
  274. """
  275. Callback that processes the match with a different lexer.
  276. The keyword arguments are forwarded to the lexer, except `state` which
  277. is handled separately.
  278. `state` specifies the state that the new lexer will start in, and can
  279. be an enumerable such as ('root', 'inline', 'string') or a simple
  280. string which is assumed to be on top of the root state.
  281. Note: For that to work, `_other` must not be an `ExtendedRegexLexer`.
  282. """
  283. gt_kwargs = {}
  284. if 'state' in kwargs:
  285. s = kwargs.pop('state')
  286. if isinstance(s, (list, tuple)):
  287. gt_kwargs['stack'] = s
  288. else:
  289. gt_kwargs['stack'] = ('root', s)
  290. if _other is this:
  291. def callback(lexer, match, ctx=None):
  292. # if keyword arguments are given the callback
  293. # function has to create a new lexer instance
  294. if kwargs:
  295. # XXX: cache that somehow
  296. kwargs.update(lexer.options)
  297. lx = lexer.__class__(**kwargs)
  298. else:
  299. lx = lexer
  300. s = match.start()
  301. for i, t, v in lx.get_tokens_unprocessed(match.group(), **gt_kwargs):
  302. yield i + s, t, v
  303. if ctx:
  304. ctx.pos = match.end()
  305. else:
  306. def callback(lexer, match, ctx=None):
  307. # XXX: cache that somehow
  308. kwargs.update(lexer.options)
  309. lx = _other(**kwargs)
  310. s = match.start()
  311. for i, t, v in lx.get_tokens_unprocessed(match.group(), **gt_kwargs):
  312. yield i + s, t, v
  313. if ctx:
  314. ctx.pos = match.end()
  315. return callback
  316. class default:
  317. """
  318. Indicates a state or state action (e.g. #pop) to apply.
  319. For example default('#pop') is equivalent to ('', Token, '#pop')
  320. Note that state tuples may be used as well.
  321. .. versionadded:: 2.0
  322. """
  323. def __init__(self, state):
  324. self.state = state
  325. class words(Future):
  326. """
  327. Indicates a list of literal words that is transformed into an optimized
  328. regex that matches any of the words.
  329. .. versionadded:: 2.0
  330. """
  331. def __init__(self, words, prefix='', suffix=''):
  332. self.words = words
  333. self.prefix = prefix
  334. self.suffix = suffix
  335. def get(self):
  336. return regex_opt(self.words, prefix=self.prefix, suffix=self.suffix)
class RegexLexerMeta(LexerMeta):
    """
    Metaclass for RegexLexer, creates the self._tokens attribute from
    self.tokens on the first instantiation.
    """

    def _process_regex(cls, regex, rflags, state):
        """Preprocess the regular expression component of a token definition."""
        # Futures (e.g. ``words``) are resolved to their pattern string here.
        if isinstance(regex, Future):
            regex = regex.get()
        # The bound ``match`` method is stored, not the pattern object.
        return re.compile(regex, rflags).match

    def _process_token(cls, token):
        """Preprocess the token component of a token definition."""
        assert type(token) is _TokenType or callable(token), \
            'token type must be simple type or callable, not %r' % (token,)
        return token

    def _process_new_state(cls, new_state, unprocessed, processed):
        """Preprocess the state transition action of a token definition.

        Returns either a tuple of state names to push, a negative int
        meaning "pop that many states", or the literal string '#push'.
        """
        if isinstance(new_state, str):
            # an existing state
            if new_state == '#pop':
                return -1
            elif new_state in unprocessed:
                return (new_state,)
            elif new_state == '#push':
                return new_state
            elif new_state[:5] == '#pop:':
                # '#pop:n' pops n states at once.
                return -int(new_state[5:])
            else:
                assert False, 'unknown new state %r' % new_state
        elif isinstance(new_state, combined):
            # combine a new state from existing ones
            tmp_state = '_tmp_%d' % cls._tmpname
            cls._tmpname += 1
            itokens = []
            for istate in new_state:
                assert istate != new_state, 'circular state ref %r' % istate
                itokens.extend(cls._process_state(unprocessed,
                                                  processed, istate))
            # The synthesized state is registered under a unique name.
            processed[tmp_state] = itokens
            return (tmp_state,)
        elif isinstance(new_state, tuple):
            # push more than one state
            for istate in new_state:
                assert (istate in unprocessed or
                        istate in ('#pop', '#push')), \
                    'unknown new state ' + istate
            return new_state
        else:
            assert False, 'unknown new state def %r' % new_state

    def _process_state(cls, unprocessed, processed, state):
        """Preprocess a single state definition.

        Fills ``processed[state]`` with (matcher, token, new_state)
        triples and returns that list; memoized via ``processed``.
        """
        assert type(state) is str, "wrong state name %r" % state
        assert state[0] != '#', "invalid state name %r" % state
        if state in processed:
            return processed[state]
        tokens = processed[state] = []
        rflags = cls.flags
        for tdef in unprocessed[state]:
            if isinstance(tdef, include):
                # it's a state reference
                assert tdef != state, "circular state reference %r" % state
                tokens.extend(cls._process_state(unprocessed, processed,
                                                 str(tdef)))
                continue
            if isinstance(tdef, _inherit):
                # should be processed already, but may not in the case of:
                # 1. the state has no counterpart in any parent
                # 2. the state includes more than one 'inherit'
                continue
            if isinstance(tdef, default):
                # A default rule compiles to an empty-pattern matcher with
                # no token, so it always fires and only changes state.
                new_state = cls._process_new_state(tdef.state, unprocessed, processed)
                tokens.append((re.compile('').match, None, new_state))
                continue

            assert type(tdef) is tuple, "wrong rule def %r" % tdef

            try:
                rex = cls._process_regex(tdef[0], rflags, state)
            except Exception as err:
                raise ValueError("uncompilable regex %r in state %r of %r: %s" %
                                 (tdef[0], state, cls, err))

            token = cls._process_token(tdef[1])

            if len(tdef) == 2:
                # No state transition given.
                new_state = None
            else:
                new_state = cls._process_new_state(tdef[2],
                                                   unprocessed, processed)

            tokens.append((rex, token, new_state))
        return tokens

    def process_tokendef(cls, name, tokendefs=None):
        """Preprocess a dictionary of token definitions."""
        processed = cls._all_tokens[name] = {}
        tokendefs = tokendefs or cls.tokens[name]
        for state in list(tokendefs):
            cls._process_state(tokendefs, processed, state)
        return processed

    def get_tokendefs(cls):
        """
        Merge tokens from superclasses in MRO order, returning a single tokendef
        dictionary.

        Any state that is not defined by a subclass will be inherited
        automatically. States that *are* defined by subclasses will, by
        default, override that state in the superclass. If a subclass wishes to
        inherit definitions from a superclass, it can use the special value
        "inherit", which will cause the superclass' state definition to be
        included at that point in the state.
        """
        tokens = {}
        # Maps state name -> index of its pending 'inherit' marker.
        inheritable = {}
        for c in cls.__mro__:
            toks = c.__dict__.get('tokens', {})

            for state, items in toks.items():
                curitems = tokens.get(state)
                if curitems is None:
                    # N.b. because this is assigned by reference, sufficiently
                    # deep hierarchies are processed incrementally (e.g. for
                    # A(B), B(C), C(RegexLexer), B will be premodified so X(B)
                    # will not see any inherits in B).
                    tokens[state] = items
                    try:
                        inherit_ndx = items.index(inherit)
                    except ValueError:
                        continue
                    inheritable[state] = inherit_ndx
                    continue

                inherit_ndx = inheritable.pop(state, None)
                if inherit_ndx is None:
                    continue

                # Replace the "inherit" value with the items
                curitems[inherit_ndx:inherit_ndx+1] = items
                try:
                    # N.b. this is the index in items (that is, the superclass
                    # copy), so offset required when storing below.
                    new_inh_ndx = items.index(inherit)
                except ValueError:
                    pass
                else:
                    inheritable[state] = inherit_ndx + new_inh_ndx

        return tokens

    def __call__(cls, *args, **kwds):
        """Instantiate cls after preprocessing its token definitions."""
        if '_tokens' not in cls.__dict__:
            cls._all_tokens = {}
            cls._tmpname = 0
            if hasattr(cls, 'token_variants') and cls.token_variants:
                # don't process yet
                pass
            else:
                cls._tokens = cls.process_tokendef('', cls.get_tokendefs())

        return type.__call__(cls, *args, **kwds)
class RegexLexer(Lexer, metaclass=RegexLexerMeta):
    """
    Base for simple stateful regular expression-based lexers.
    Simplifies the lexing process so that you need only
    provide a list of states and regular expressions.
    """

    #: Flags for compiling the regular expressions.
    #: Defaults to MULTILINE.
    flags = re.MULTILINE

    #: Dict of ``{'state': [(regex, tokentype, new_state), ...], ...}``
    #:
    #: The initial state is 'root'.
    #: ``new_state`` can be omitted to signify no state transition.
    #: If it is a string, the state is pushed on the stack and changed.
    #: If it is a tuple of strings, all states are pushed on the stack and
    #: the current state will be the topmost.
    #: It can also be ``combined('state1', 'state2', ...)``
    #: to signify a new, anonymous state combined from the rules of two
    #: or more existing ones.
    #: Furthermore, it can be '#pop' to signify going back one step in
    #: the state stack, or '#push' to push the current state on the stack
    #: again.
    #:
    #: The tuple can also be replaced with ``include('state')``, in which
    #: case the rules from the state named by the string are included in the
    #: current one.
    tokens = {}

    def get_tokens_unprocessed(self, text, stack=('root',)):
        """
        Split ``text`` into (tokentype, text) pairs.

        ``stack`` is the inital stack (default: ``['root']``)
        """
        pos = 0
        tokendefs = self._tokens
        statestack = list(stack)
        statetokens = tokendefs[statestack[-1]]
        while 1:
            # Try each rule of the current state in order; first match wins.
            for rexmatch, action, new_state in statetokens:
                m = rexmatch(text, pos)
                if m:
                    if action is not None:
                        if type(action) is _TokenType:
                            yield pos, action, m.group()
                        else:
                            # Callback action: yields its own token triples.
                            for item in action(self, m):
                                yield item
                    pos = m.end()
                    if new_state is not None:
                        # state transition
                        if isinstance(new_state, tuple):
                            for state in new_state:
                                if state == '#pop':
                                    if len(statestack) > 1:
                                        statestack.pop()
                                elif state == '#push':
                                    statestack.append(statestack[-1])
                                else:
                                    statestack.append(state)
                        elif isinstance(new_state, int):
                            # pop, but keep at least one state on the stack
                            # (random code leading to unexpected pops should
                            # not allow exceptions)
                            if abs(new_state) >= len(statestack):
                                del statestack[1:]
                            else:
                                del statestack[new_state:]
                        elif new_state == '#push':
                            statestack.append(statestack[-1])
                        else:
                            assert False, "wrong state def: %r" % new_state
                        statetokens = tokendefs[statestack[-1]]
                    break
            else:
                # We are here only if all state tokens have been considered
                # and there was not a match on any of them.
                try:
                    if text[pos] == '\n':
                        # at EOL, reset state to "root"
                        statestack = ['root']
                        statetokens = tokendefs['root']
                        yield pos, Text, u'\n'
                        pos += 1
                        continue
                    # Emit the offending character as an Error token and
                    # move on so lexing never gets stuck.
                    yield pos, Error, text[pos]
                    pos += 1
                except IndexError:
                    # End of text reached.
                    break
  572. class LexerContext:
  573. """
  574. A helper object that holds lexer position data.
  575. """
  576. def __init__(self, text, pos, stack=None, end=None):
  577. self.text = text
  578. self.pos = pos
  579. self.end = end or len(text) # end=0 not supported ;-)
  580. self.stack = stack or ['root']
  581. def __repr__(self):
  582. return 'LexerContext(%r, %r, %r)' % (
  583. self.text, self.pos, self.stack)
  584. class ExtendedRegexLexer(RegexLexer):
  585. """
  586. A RegexLexer that uses a context object to store its state.
  587. """
  588. def get_tokens_unprocessed(self, text=None, context=None):
  589. """
  590. Split ``text`` into (tokentype, text) pairs.
  591. If ``context`` is given, use this lexer context instead.
  592. """
  593. tokendefs = self._tokens
  594. if not context:
  595. ctx = LexerContext(text, 0)
  596. statetokens = tokendefs['root']
  597. else:
  598. ctx = context
  599. statetokens = tokendefs[ctx.stack[-1]]
  600. text = ctx.text
  601. while 1:
  602. for rexmatch, action, new_state in statetokens:
  603. m = rexmatch(text, ctx.pos, ctx.end)
  604. if m:
  605. if action is not None:
  606. if type(action) is _TokenType:
  607. yield ctx.pos, action, m.group()
  608. ctx.pos = m.end()
  609. else:
  610. for item in action(self, m, ctx):
  611. yield item
  612. if not new_state:
  613. # altered the state stack?
  614. statetokens = tokendefs[ctx.stack[-1]]
  615. # CAUTION: callback must set ctx.pos!
  616. if new_state is not None:
  617. # state transition
  618. if isinstance(new_state, tuple):
  619. for state in new_state:
  620. if state == '#pop':
  621. if len(ctx.stack) > 1:
  622. ctx.stack.pop()
  623. elif state == '#push':
  624. ctx.stack.append(ctx.stack[-1])
  625. else:
  626. ctx.stack.append(state)
  627. elif isinstance(new_state, int):
  628. # see RegexLexer for why this check is made
  629. if abs(new_state) >= len(ctx.stack):
  630. del ctx.state[1:]
  631. else:
  632. del ctx.stack[new_state:]
  633. elif new_state == '#push':
  634. ctx.stack.append(ctx.stack[-1])
  635. else:
  636. assert False, "wrong state def: %r" % new_state
  637. statetokens = tokendefs[ctx.stack[-1]]
  638. break
  639. else:
  640. try:
  641. if ctx.pos >= ctx.end:
  642. break
  643. if text[ctx.pos] == '\n':
  644. # at EOL, reset state to "root"
  645. ctx.stack = ['root']
  646. statetokens = tokendefs['root']
  647. yield ctx.pos, Text, u'\n'
  648. ctx.pos += 1
  649. continue
  650. yield ctx.pos, Error, text[ctx.pos]
  651. ctx.pos += 1
  652. except IndexError:
  653. break
  654. def do_insertions(insertions, tokens):
  655. """
  656. Helper for lexers which must combine the results of several
  657. sublexers.
  658. ``insertions`` is a list of ``(index, itokens)`` pairs.
  659. Each ``itokens`` iterable should be inserted at position
  660. ``index`` into the token stream given by the ``tokens``
  661. argument.
  662. The result is a combined token stream.
  663. TODO: clean up the code here.
  664. """
  665. insertions = iter(insertions)
  666. try:
  667. index, itokens = next(insertions)
  668. except StopIteration:
  669. # no insertions
  670. for item in tokens:
  671. yield item
  672. return
  673. realpos = None
  674. insleft = True
  675. # iterate over the token stream where we want to insert
  676. # the tokens from the insertion list.
  677. for i, t, v in tokens:
  678. # first iteration. store the postition of first item
  679. if realpos is None:
  680. realpos = i
  681. oldi = 0
  682. while insleft and i + len(v) >= index:
  683. tmpval = v[oldi:index - i]
  684. yield realpos, t, tmpval
  685. realpos += len(tmpval)
  686. for it_index, it_token, it_value in itokens:
  687. yield realpos, it_token, it_value
  688. realpos += len(it_value)
  689. oldi = index - i
  690. try:
  691. index, itokens = next(insertions)
  692. except StopIteration:
  693. insleft = False
  694. break # not strictly necessary
  695. yield realpos, t, v[oldi:]
  696. realpos += len(v) - oldi
  697. # leftover tokens
  698. while insleft:
  699. # no normal tokens, set realpos to zero
  700. realpos = realpos or 0
  701. for p, t, v in itokens:
  702. yield realpos, t, v
  703. realpos += len(v)
  704. try:
  705. index, itokens = next(insertions)
  706. except StopIteration:
  707. insleft = False
  708. break # not strictly necessary
  709. class ProfilingRegexLexerMeta(RegexLexerMeta):
  710. """Metaclass for ProfilingRegexLexer, collects regex timing info."""
  711. def _process_regex(cls, regex, rflags, state):
  712. if isinstance(regex, words):
  713. rex = regex_opt(regex.words, prefix=regex.prefix,
  714. suffix=regex.suffix)
  715. else:
  716. rex = regex
  717. compiled = re.compile(rex, rflags)
  718. def match_func(text, pos, endpos=sys.maxsize):
  719. info = cls._prof_data[-1].setdefault((state, rex), [0, 0.0])
  720. t0 = time.time()
  721. res = compiled.match(text, pos, endpos)
  722. t1 = time.time()
  723. info[0] += 1
  724. info[1] += t1 - t0
  725. return res
  726. return match_func
class ProfilingRegexLexer(RegexLexer, metaclass=ProfilingRegexLexerMeta):
    """Drop-in replacement for RegexLexer that does profiling of its regexes."""

    # Stack of per-run profiling dicts; a stack because using(this) can
    # trigger nested get_tokens_unprocessed() calls.
    _prof_data = []
    # Column of the report rows to sort by (4 = time per call).
    _prof_sort_index = 4  # defaults to time per call

    def get_tokens_unprocessed(self, text, stack=('root',)):
        # this needs to be a stack, since using(this) will produce nested calls
        self.__class__._prof_data.append({})
        for tok in RegexLexer.get_tokens_unprocessed(self, text, stack):
            yield tok
        # All tokens consumed: pop this run's data and print the report.
        rawdata = self.__class__._prof_data.pop()
        # Rows: (state, truncated pattern, ncalls, total ms, ms per call).
        data = sorted(((s, repr(r).strip('u\'').replace('\\\\', '\\')[:65],
                        n, 1000 * t, 1000 * t / n)
                       for ((s, r), (n, t)) in rawdata.items()),
                      key=lambda x: x[self._prof_sort_index],
                      reverse=True)
        sum_total = sum(x[3] for x in data)

        print()
        print('Profiling result for %s lexing %d chars in %.3f ms' %
              (self.__class__.__name__, len(text), sum_total))
        print('=' * 110)
        print('%-20s %-64s ncalls tottime percall' % ('state', 'regex'))
        print('-' * 110)
        for d in data:
            print('%-20s %-65s %5d %8.4f %8.4f' % d)
        print('=' * 110)