|
|
- import string
- import warnings
- from json import loads
-
- from jmespath.exceptions import LexerError, EmptyExpressionError
-
-
class Lexer(object):
    """Tokenizer for JMESPath expressions.

    ``tokenize`` is a generator yielding token dicts of the form
    ``{'type': ..., 'value': ..., 'start': ..., 'end': ...}``, where
    ``start``/``end`` are character offsets into the expression and
    ``end`` is exclusive (slice-style).  The final token is always
    ``eof`` with a zero-width span at the end of the expression.
    """

    # Characters that may begin an unquoted identifier.
    START_IDENTIFIER = set(string.ascii_letters + '_')
    # Characters allowed after the first character of an unquoted identifier.
    VALID_IDENTIFIER = set(string.ascii_letters + string.digits + '_')
    VALID_NUMBER = set(string.digits)
    WHITESPACE = set(" \t\n\r")
    # Single characters that map directly to a token type.
    SIMPLE_TOKENS = {
        '.': 'dot',
        '*': 'star',
        ']': 'rbracket',
        ',': 'comma',
        ':': 'colon',
        '@': 'current',
        '(': 'lparen',
        ')': 'rparen',
        '{': 'lbrace',
        '}': 'rbrace',
    }

    def tokenize(self, expression):
        """Yield tokens for ``expression``.

        :param expression: the JMESPath expression string to scan.
        :raises EmptyExpressionError: if ``expression`` is empty/falsy.
        :raises LexerError: on any unrecognized or malformed token.
        """
        self._initialize_for_expression(expression)
        while self._current is not None:
            if self._current in self.SIMPLE_TOKENS:
                yield {'type': self.SIMPLE_TOKENS[self._current],
                       'value': self._current,
                       'start': self._position, 'end': self._position + 1}
                self._next()
            elif self._current in self.START_IDENTIFIER:
                start = self._position
                buff = self._current
                while self._next() in self.VALID_IDENTIFIER:
                    buff += self._current
                yield {'type': 'unquoted_identifier', 'value': buff,
                       'start': start, 'end': start + len(buff)}
            elif self._current in self.WHITESPACE:
                self._next()
            elif self._current == '[':
                # '[' may start one of three tokens: '[]', '[?', or '['.
                start = self._position
                next_char = self._next()
                if next_char == ']':
                    self._next()
                    yield {'type': 'flatten', 'value': '[]',
                           'start': start, 'end': start + 2}
                elif next_char == '?':
                    self._next()
                    yield {'type': 'filter', 'value': '[?',
                           'start': start, 'end': start + 2}
                else:
                    yield {'type': 'lbracket', 'value': '[',
                           'start': start, 'end': start + 1}
            elif self._current == "'":
                yield self._consume_raw_string_literal()
            elif self._current == '|':
                yield self._match_or_else('|', 'or', 'pipe')
            elif self._current == '&':
                yield self._match_or_else('&', 'and', 'expref')
            elif self._current == '`':
                yield self._consume_literal()
            elif self._current in self.VALID_NUMBER:
                start = self._position
                buff = self._consume_number()
                yield {'type': 'number', 'value': int(buff),
                       'start': start, 'end': start + len(buff)}
            elif self._current == '-':
                # Negative number; a bare '-' with no digits is an error.
                start = self._position
                buff = self._consume_number()
                if len(buff) > 1:
                    yield {'type': 'number', 'value': int(buff),
                           'start': start, 'end': start + len(buff)}
                else:
                    raise LexerError(lexer_position=start,
                                     lexer_value=buff,
                                     message="Unknown token '%s'" % buff)
            elif self._current == '"':
                yield self._consume_quoted_identifier()
            elif self._current == '<':
                yield self._match_or_else('=', 'lte', 'lt')
            elif self._current == '>':
                yield self._match_or_else('=', 'gte', 'gt')
            elif self._current == '!':
                yield self._match_or_else('=', 'ne', 'not')
            elif self._current == '=':
                if self._next() == '=':
                    # BUGFIX: '==' is two characters wide; the exclusive
                    # end is one past the second '=' (was self._position,
                    # which covered only the first '=').
                    yield {'type': 'eq', 'value': '==',
                           'start': self._position - 1,
                           'end': self._position + 1}
                    self._next()
                else:
                    if self._current is None:
                        # If we're at the EOF, we never advanced
                        # the position so we don't need to rewind
                        # it back one location.
                        position = self._position
                    else:
                        position = self._position - 1
                    raise LexerError(
                        lexer_position=position,
                        lexer_value='=',
                        message="Unknown token '='")
            else:
                raise LexerError(lexer_position=self._position,
                                 lexer_value=self._current,
                                 message="Unknown token %s" % self._current)
        yield {'type': 'eof', 'value': '',
               'start': self._length, 'end': self._length}

    def _consume_number(self):
        """Consume and return the digits (with optional leading '-')
        starting at the current position."""
        start = self._position
        buff = self._current
        while self._next() in self.VALID_NUMBER:
            buff += self._current
        return buff

    def _initialize_for_expression(self, expression):
        # Reset all scanner state for a fresh expression.
        if not expression:
            raise EmptyExpressionError()
        self._position = 0
        self._expression = expression
        self._chars = list(self._expression)
        self._current = self._chars[self._position]
        self._length = len(self._expression)

    def _next(self):
        """Advance one character; set and return ``None`` at EOF."""
        if self._position == self._length - 1:
            self._current = None
        else:
            self._position += 1
            self._current = self._chars[self._position]
        return self._current

    def _consume_until(self, delimiter):
        # Consume until the delimiter is reached,
        # allowing for the delimiter to be escaped with "\".
        start = self._position
        buff = ''
        self._next()
        while self._current != delimiter:
            if self._current == '\\':
                buff += '\\'
                self._next()
            if self._current is None:
                # We're at the EOF.
                raise LexerError(lexer_position=start,
                                 lexer_value=self._expression[start:],
                                 message="Unclosed %s delimiter" % delimiter)
            buff += self._current
            self._next()
        # Skip the closing delimiter.
        self._next()
        return buff

    def _consume_literal(self):
        """Consume a backtick-delimited JSON literal token."""
        start = self._position
        lexeme = self._consume_until('`').replace('\\`', '`')
        try:
            # Assume it is valid JSON and attempt to parse.
            parsed_json = loads(lexeme)
        except ValueError:
            try:
                # Invalid JSON values should be converted to quoted
                # JSON strings during the JEP-12 deprecation period.
                parsed_json = loads('"%s"' % lexeme.lstrip())
                warnings.warn("deprecated string literal syntax",
                              PendingDeprecationWarning)
            except ValueError:
                raise LexerError(lexer_position=start,
                                 lexer_value=self._expression[start:],
                                 message="Bad token %s" % lexeme)
        token_len = self._position - start
        # BUGFIX: 'end' must be an offset into the expression
        # (start + length), not the bare token length.
        return {'type': 'literal', 'value': parsed_json,
                'start': start, 'end': start + token_len}

    def _consume_quoted_identifier(self):
        """Consume a double-quoted (JSON string) identifier token."""
        start = self._position
        lexeme = '"' + self._consume_until('"') + '"'
        try:
            token_len = self._position - start
            # BUGFIX: 'end' must be start + length, not the bare length,
            # otherwise end < start for any token past position 0.
            return {'type': 'quoted_identifier', 'value': loads(lexeme),
                    'start': start, 'end': start + token_len}
        except ValueError as e:
            error_message = str(e).split(':')[0]
            raise LexerError(lexer_position=start,
                             lexer_value=lexeme,
                             message=error_message)

    def _consume_raw_string_literal(self):
        """Consume a single-quoted raw string literal token."""
        start = self._position
        lexeme = self._consume_until("'").replace("\\'", "'")
        token_len = self._position - start
        # BUGFIX: 'end' must be start + length, not the bare length.
        return {'type': 'literal', 'value': lexeme,
                'start': start, 'end': start + token_len}

    def _match_or_else(self, expected, match_type, else_type):
        """Return a two-char ``match_type`` token if the next character
        is ``expected``, otherwise a one-char ``else_type`` token."""
        start = self._position
        current = self._current
        next_char = self._next()
        if next_char == expected:
            self._next()
            # BUGFIX: a two-character token has exclusive end start + 2
            # (was start + 1, which covered only the first character).
            return {'type': match_type, 'value': current + next_char,
                    'start': start, 'end': start + 2}
        # BUGFIX: a one-character token ends at start + 1 (was start,
        # which described a zero-width span).
        return {'type': else_type, 'value': current,
                'start': start, 'end': start + 1}
|