import string
import warnings
from json import loads

from jmespath.exceptions import LexerError, EmptyExpressionError


class Lexer(object):
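    """Tokenizer for JMESPath expressions.

    ``tokenize`` is a generator that yields token dicts with ``type``,
    ``value``, ``start``, and ``end`` keys and finishes with an ``eof``
    token.
    """
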
    START_IDENTIFIER = set(string.ascii_letters + '_')
    VALID_IDENTIFIER = set(string.ascii_letters + string.digits + '_')
    VALID_NUMBER = set(string.digits)
    WHITESPACE = set(" \t\n\r")
    SIMPLE_TOKENS = {
        '.': 'dot',
        '*': 'star',
        ']': 'rbracket',
        ',': 'comma',
        ':': 'colon',
        '@': 'current',
        '(': 'lparen',
        ')': 'rparen',
        '{': 'lbrace',
        '}': 'rbrace',
    }

    def tokenize(self, expression):
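        """Yield tokens for ``expression``.

        Raises ``EmptyExpressionError`` if the expression is empty and
        ``LexerError`` when an unknown or malformed token is encountered.
        """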
        self._initialize_for_expression(expression)
        while self._current is not None:
            if self._current in self.SIMPLE_TOKENS:
                yield {'type': self.SIMPLE_TOKENS[self._current],
                       'value': self._current,
                       'start': self._position, 'end': self._position + 1}
                self._next()
            elif self._current in self.START_IDENTIFIER:
                start = self._position
                buff = self._current
                while self._next() in self.VALID_IDENTIFIER:
                    buff += self._current
                yield {'type': 'unquoted_identifier', 'value': buff,
                       'start': start, 'end': start + len(buff)}
            elif self._current in self.WHITESPACE:
                self._next()
            elif self._current == '[':
                start = self._position
                next_char = self._next()
                if next_char == ']':
                    self._next()
                    yield {'type': 'flatten', 'value': '[]',
                           'start': start, 'end': start + 2}
                elif next_char == '?':
                    self._next()
                    yield {'type': 'filter', 'value': '[?',
                           'start': start, 'end': start + 2}
                else:
                    yield {'type': 'lbracket', 'value': '[',
                           'start': start, 'end': start + 1}
            elif self._current == "'":
                yield self._consume_raw_string_literal()
            elif self._current == '|':
                yield self._match_or_else('|', 'or', 'pipe')
            elif self._current == '&':
                yield self._match_or_else('&', 'and', 'expref')
            elif self._current == '`':
                yield self._consume_literal()
            elif self._current in self.VALID_NUMBER:
                start = self._position
                buff = self._consume_number()
                yield {'type': 'number', 'value': int(buff),
                       'start': start, 'end': start + len(buff)}
            elif self._current == '-':
                # Negative number.
                start = self._position
                buff = self._consume_number()
                if len(buff) > 1:
                    yield {'type': 'number', 'value': int(buff),
                           'start': start, 'end': start + len(buff)}
                else:
                    raise LexerError(lexer_position=start,
                                     lexer_value=buff,
                                     message="Unknown token '%s'" % buff)
            elif self._current == '"':
                yield self._consume_quoted_identifier()
            elif self._current == '<':
                yield self._match_or_else('=', 'lte', 'lt')
            elif self._current == '>':
                yield self._match_or_else('=', 'gte', 'gt')
            elif self._current == '!':
                yield self._match_or_else('=', 'ne', 'not')
            elif self._current == '=':
                if self._next() == '=':
                    yield {'type': 'eq', 'value': '==',
                           'start': self._position - 1, 'end': self._position}
                    self._next()
                else:
                    if self._current is None:
                        # If we're at the EOF, we never advanced
                        # the position so we don't need to rewind
                        # it back one location.
                        position = self._position
                    else:
                        position = self._position - 1
                    raise LexerError(
                        lexer_position=position,
                        lexer_value='=',
                        message="Unknown token '='")
            else:
                raise LexerError(lexer_position=self._position,
                                 lexer_value=self._current,
                                 message="Unknown token %s" % self._current)
        yield {'type': 'eof', 'value': '',
               'start': self._length, 'end': self._length}

    def _consume_number(self):
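        """Consume a run of digits starting at the current character."""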
        start = self._position
        buff = self._current
        while self._next() in self.VALID_NUMBER:
            buff += self._current
        return buff

    def _initialize_for_expression(self, expression):
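        """Reset lexer state for a new expression, rejecting empty input."""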
        if not expression:
            raise EmptyExpressionError()
        self._position = 0
        self._expression = expression
        self._chars = list(self._expression)
        self._current = self._chars[self._position]
        self._length = len(self._expression)

    def _next(self):
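        """Advance one character and return it, or None at end of input."""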
        if self._position == self._length - 1:
            self._current = None
        else:
            self._position += 1
            self._current = self._chars[self._position]
        return self._current

    def _consume_until(self, delimiter):
        # Consume until the delimiter is reached,
        # allowing for the delimiter to be escaped with "\".
        start = self._position
        buff = ''
        self._next()
        while self._current != delimiter:
            if self._current == '\\':
                buff += '\\'
                self._next()
            if self._current is None:
                # We're at the EOF.
                raise LexerError(lexer_position=start,
                                 lexer_value=self._expression[start:],
                                 message="Unclosed %s delimiter" % delimiter)
            buff += self._current
            self._next()
        # Skip the closing delimiter.
        self._next()
        return buff

    def _consume_literal(self):
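        """Lex a backtick-delimited JSON literal into a ``literal`` token."""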
        start = self._position
        lexeme = self._consume_until('`').replace('\\`', '`')
        try:
            # Assume it is valid JSON and attempt to parse.
            parsed_json = loads(lexeme)
        except ValueError:
            try:
                # Invalid JSON values should be converted to quoted
                # JSON strings during the JEP-12 deprecation period.
                parsed_json = loads('"%s"' % lexeme.lstrip())
                warnings.warn("deprecated string literal syntax",
                              PendingDeprecationWarning)
            except ValueError:
                raise LexerError(lexer_position=start,
                                 lexer_value=self._expression[start:],
                                 message="Bad token %s" % lexeme)
        token_len = self._position - start
        return {'type': 'literal', 'value': parsed_json,
                'start': start, 'end': token_len}

    def _consume_quoted_identifier(self):
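        """Lex a double-quoted identifier, decoded as a JSON string."""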
        start = self._position
        lexeme = '"' + self._consume_until('"') + '"'
        try:
            token_len = self._position - start
            return {'type': 'quoted_identifier', 'value': loads(lexeme),
                    'start': start, 'end': token_len}
        except ValueError as e:
            error_message = str(e).split(':')[0]
            raise LexerError(lexer_position=start,
                             lexer_value=lexeme,
                             message=error_message)

    def _consume_raw_string_literal(self):
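        """Lex a single-quoted raw string into a ``literal`` token."""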
        start = self._position
        lexeme = self._consume_until("'").replace("\\'", "'")
        token_len = self._position - start
        return {'type': 'literal', 'value': lexeme,
                'start': start, 'end': token_len}

    def _match_or_else(self, expected, match_type, else_type):
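        """Lex a one- or two-character operator.

        If the character after the current one equals ``expected``, emit
        ``match_type``; otherwise emit ``else_type`` for the single char.
        """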
        start = self._position
        current = self._current
        next_char = self._next()
        if next_char == expected:
            self._next()
            return {'type': match_type, 'value': current + next_char,
                    'start': start, 'end': start + 1}
        return {'type': else_type, 'value': current,
                'start': start, 'end': start}
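

# Minimal usage sketch (illustrative, not part of the module's public API);
# it assumes the jmespath package is importable so the exceptions import
# above resolves. The expression below exercises identifiers, a filter,
# a comparator, a JSON literal, and a pipe.
if __name__ == '__main__':
    for token in Lexer().tokenize('foo[?a > `1`] | bar'):
        print(token)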