import string
import warnings
from json import loads

from jmespath.exceptions import LexerError, EmptyExpressionError


class Lexer(object):
    START_IDENTIFIER = set(string.ascii_letters + '_')
    VALID_IDENTIFIER = set(string.ascii_letters + string.digits + '_')
    VALID_NUMBER = set(string.digits)
    WHITESPACE = set(" \t\n\r")
    SIMPLE_TOKENS = {
        '.': 'dot',
        '*': 'star',
        ']': 'rbracket',
        ',': 'comma',
        ':': 'colon',
        '@': 'current',
        '(': 'lparen',
        ')': 'rparen',
        '{': 'lbrace',
        '}': 'rbrace',
    }
    def tokenize(self, expression):
        """Yield token dicts ({'type', 'value', 'start', 'end'}) for the
        given JMESPath expression, terminated by an 'eof' token."""
        self._initialize_for_expression(expression)
        while self._current is not None:
            if self._current in self.SIMPLE_TOKENS:
                # Single-character tokens map directly to a token type.
                yield {'type': self.SIMPLE_TOKENS[self._current],
                       'value': self._current,
                       'start': self._position, 'end': self._position + 1}
                self._next()
            elif self._current in self.START_IDENTIFIER:
                start = self._position
                buff = self._current
                while self._next() in self.VALID_IDENTIFIER:
                    buff += self._current
                yield {'type': 'unquoted_identifier', 'value': buff,
                       'start': start, 'end': start + len(buff)}
            elif self._current in self.WHITESPACE:
                self._next()
            elif self._current == '[':
                # '[' can begin '[]' (flatten), '[?' (filter), or a
                # plain left bracket.
                start = self._position
                next_char = self._next()
                if next_char == ']':
                    self._next()
                    yield {'type': 'flatten', 'value': '[]',
                           'start': start, 'end': start + 2}
                elif next_char == '?':
                    self._next()
                    yield {'type': 'filter', 'value': '[?',
                           'start': start, 'end': start + 2}
                else:
                    yield {'type': 'lbracket', 'value': '[',
                           'start': start, 'end': start + 1}
            elif self._current == "'":
                yield self._consume_raw_string_literal()
            elif self._current == '|':
                yield self._match_or_else('|', 'or', 'pipe')
            elif self._current == '&':
                yield self._match_or_else('&', 'and', 'expref')
            elif self._current == '`':
                yield self._consume_literal()
            elif self._current in self.VALID_NUMBER:
                start = self._position
                buff = self._consume_number()
                yield {'type': 'number', 'value': int(buff),
                       'start': start, 'end': start + len(buff)}
            elif self._current == '-':
                # Negative number.
                start = self._position
                buff = self._consume_number()
                if len(buff) > 1:
                    yield {'type': 'number', 'value': int(buff),
                           'start': start, 'end': start + len(buff)}
                else:
                    raise LexerError(lexer_position=start,
                                     lexer_value=buff,
                                     message="Unknown token '%s'" % buff)
            elif self._current == '"':
                yield self._consume_quoted_identifier()
            elif self._current == '<':
                yield self._match_or_else('=', 'lte', 'lt')
            elif self._current == '>':
                yield self._match_or_else('=', 'gte', 'gt')
            elif self._current == '!':
                yield self._match_or_else('=', 'ne', 'not')
            elif self._current == '=':
                if self._next() == '=':
                    yield {'type': 'eq', 'value': '==',
                           'start': self._position - 1, 'end': self._position}
                    self._next()
                else:
                    if self._current is None:
                        # If we're at the EOF, we never advanced
                        # the position so we don't need to rewind
                        # it back one location.
                        position = self._position
                    else:
                        position = self._position - 1
                    raise LexerError(
                        lexer_position=position,
                        lexer_value='=',
                        message="Unknown token '='")
            else:
                raise LexerError(lexer_position=self._position,
                                 lexer_value=self._current,
                                 message="Unknown token %s" % self._current)
        yield {'type': 'eof', 'value': '',
               'start': self._length, 'end': self._length}
    def _consume_number(self):
        start = self._position
        buff = self._current
        while self._next() in self.VALID_NUMBER:
            buff += self._current
        return buff

    def _initialize_for_expression(self, expression):
        if not expression:
            raise EmptyExpressionError()
        self._position = 0
        self._expression = expression
        self._chars = list(self._expression)
        self._current = self._chars[self._position]
        self._length = len(self._expression)

    def _next(self):
        if self._position == self._length - 1:
            self._current = None
        else:
            self._position += 1
            self._current = self._chars[self._position]
        return self._current

    def _consume_until(self, delimiter):
        # Consume until the delimiter is reached,
        # allowing for the delimiter to be escaped with "\".
        start = self._position
        buff = ''
        self._next()
        while self._current != delimiter:
            if self._current == '\\':
                buff += '\\'
                self._next()
            if self._current is None:
                # We're at the EOF.
                raise LexerError(lexer_position=start,
                                 lexer_value=self._expression[start:],
                                 message="Unclosed %s delimiter" % delimiter)
            buff += self._current
            self._next()
        # Skip the closing delimiter.
        self._next()
        return buff

    def _consume_literal(self):
        start = self._position
        lexeme = self._consume_until('`').replace('\\`', '`')
        try:
            # Assume it is valid JSON and attempt to parse.
            parsed_json = loads(lexeme)
        except ValueError:
            try:
                # Invalid JSON values should be converted to quoted
                # JSON strings during the JEP-12 deprecation period.
                parsed_json = loads('"%s"' % lexeme.lstrip())
                warnings.warn("deprecated string literal syntax",
                              PendingDeprecationWarning)
            except ValueError:
                raise LexerError(lexer_position=start,
                                 lexer_value=self._expression[start:],
                                 message="Bad token %s" % lexeme)
        token_len = self._position - start
        return {'type': 'literal', 'value': parsed_json,
                'start': start, 'end': token_len}

    def _consume_quoted_identifier(self):
        start = self._position
        lexeme = '"' + self._consume_until('"') + '"'
        try:
            token_len = self._position - start
            return {'type': 'quoted_identifier', 'value': loads(lexeme),
                    'start': start, 'end': token_len}
        except ValueError as e:
            error_message = str(e).split(':')[0]
            raise LexerError(lexer_position=start,
                             lexer_value=lexeme,
                             message=error_message)

    def _consume_raw_string_literal(self):
        start = self._position
        lexeme = self._consume_until("'").replace("\\'", "'")
        token_len = self._position - start
        return {'type': 'literal', 'value': lexeme,
                'start': start, 'end': token_len}

    def _match_or_else(self, expected, match_type, else_type):
        start = self._position
        current = self._current
        next_char = self._next()
        if next_char == expected:
            self._next()
            return {'type': match_type, 'value': current + next_char,
                    'start': start, 'end': start + 1}
        return {'type': else_type, 'value': current,
                'start': start, 'end': start}
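
# Usage sketch (illustrative, not part of the original module): tokenize()
# is a generator of token dicts, so iterating it lexes the expression on
# demand.
if __name__ == '__main__':
    lexer = Lexer()
    for token in lexer.tokenize("foo[0].bar"):
        print(token)
    # Expected token stream, ending with the 'eof' sentinel:
    #   {'type': 'unquoted_identifier', 'value': 'foo', 'start': 0, 'end': 3}
    #   {'type': 'lbracket', 'value': '[', 'start': 3, 'end': 4}
    #   {'type': 'number', 'value': 0, 'start': 4, 'end': 5}
    #   {'type': 'rbracket', 'value': ']', 'start': 5, 'end': 6}
    #   {'type': 'dot', 'value': '.', 'start': 6, 'end': 7}
    #   {'type': 'unquoted_identifier', 'value': 'bar', 'start': 7, 'end': 10}
    #   {'type': 'eof', 'value': '', 'start': 10, 'end': 10}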