527 lines
19 KiB
Python
527 lines
19 KiB
Python
"""Top down operator precedence parser.

This is an implementation of Vaughan R. Pratt's
"Top Down Operator Precedence" parser.
(http://dl.acm.org/citation.cfm?doid=512927.512931).

These are some additional resources that help explain the
general idea behind a Pratt parser:

* http://effbot.org/zone/simple-top-down-parsing.htm
* http://javascript.crockford.com/tdop/tdop.html

A few notes on the implementation.

* All the nud/led tokens are on the Parser class itself, and are dispatched
  using getattr().  This keeps all the parsing logic contained to a single
  class.
* We use two passes through the data.  One to create a list of tokens,
  then one pass through the tokens to create the AST.  While the lexer actually
  yields tokens, we convert it to a list so we can easily implement two tokens
  of lookahead.  A previous implementation used a fixed circular buffer, but it
  was significantly slower.  Also, the average jmespath expression typically
  does not have a large number of tokens, so this is not an issue.  And
  interestingly enough, creating a token list first is actually faster than
  consuming from the token iterator one token at a time.

"""
|
|
import random
|
|
|
|
from jmespath import lexer
|
|
from jmespath.compat import with_repr_method
|
|
from jmespath import ast
|
|
from jmespath import exceptions
|
|
from jmespath import visitor
|
|
|
|
|
|
class Parser(object):
    """Top down operator precedence (Pratt) parser for JMESPath expressions.

    ``parse()`` tokenizes an expression with ``lexer.Lexer``, builds an AST
    out of ``ast`` nodes and returns it wrapped in a ``ParsedResult``.

    Prefix ("nud") and infix ("led") handlers are methods named
    ``_token_nud_<type>`` / ``_token_led_<type>`` and are looked up with
    ``getattr()`` in ``_expression()``.

    Parsed results are memoized in the class-level ``_CACHE`` dict, which
    is shared by every ``Parser`` instance.
    """

    # Left binding power of each token type.  ``_expression`` keeps
    # consuming infix tokens while their binding power is greater than the
    # binding power it was called with.
    BINDING_POWER = {
        'eof': 0,
        'unquoted_identifier': 0,
        'quoted_identifier': 0,
        'literal': 0,
        'rbracket': 0,
        'rparen': 0,
        'comma': 0,
        'rbrace': 0,
        'number': 0,
        'current': 0,
        'expref': 0,
        'colon': 0,
        'pipe': 1,
        'or': 2,
        'and': 3,
        'eq': 5,
        'gt': 5,
        'lt': 5,
        'gte': 5,
        'lte': 5,
        'ne': 5,
        'flatten': 9,
        # Everything above stops a projection.
        'star': 20,
        'filter': 21,
        'dot': 40,
        'not': 45,
        'lbrace': 50,
        'lbracket': 55,
        'lparen': 60,
    }
    # The maximum binding power for a token that can stop
    # a projection.
    _PROJECTION_STOP = 10
    # The _MAX_SIZE most recent expressions are cached in
    # _CACHE dict.
    _CACHE = {}
    _MAX_SIZE = 128

    def __init__(self, lookahead=2):
        """Create a parser.

        :param lookahead: Initial size of the token buffer.  The buffer is
            replaced by the full token list on every ``_parse()`` call.
        """
        self.tokenizer = None
        self._tokens = [None] * lookahead
        self._buffer_size = lookahead
        self._index = 0

    def parse(self, expression):
        """Parse ``expression`` and return the (possibly cached) result.

        :param expression: The JMESPath expression string.
        :return: The ``ParsedResult`` for the expression.
        :raises: Whatever ``_do_parse()`` raises for an invalid expression;
            failed parses are not cached.
        """
        cached = self._CACHE.get(expression)
        if cached is not None:
            return cached
        parsed_result = self._do_parse(expression)
        self._CACHE[expression] = parsed_result
        # Evict only after inserting so the newest entry is always stored;
        # eviction picks random victims, see _free_cache_entries().
        if len(self._CACHE) > self._MAX_SIZE:
            self._free_cache_entries()
        return parsed_result

    def _do_parse(self, expression):
        """Run ``_parse()`` and attach ``expression`` to any parse error.

        The errors are raised deep inside the parser where the original
        expression text is not at hand, so it is filled in here before
        the exception is re-raised.
        """
        try:
            return self._parse(expression)
        except exceptions.LexerError as e:
            e.expression = expression
            raise
        except exceptions.IncompleteExpressionError as e:
            e.set_expression(expression)
            raise
        except exceptions.ParseError as e:
            e.expression = expression
            raise

    def _parse(self, expression):
        """Tokenize ``expression`` and parse the full token list.

        :raises exceptions.ParseError: If tokens remain after a complete
            expression has been parsed.
        """
        self.tokenizer = lexer.Lexer().tokenize(expression)
        # Materialize the token stream so lookahead is a plain list index.
        self._tokens = list(self.tokenizer)
        self._index = 0
        parsed = self._expression(binding_power=0)
        if self._current_token() != 'eof':
            t = self._lookahead_token(0)
            raise exceptions.ParseError(t['start'], t['value'], t['type'],
                                        "Unexpected token: %s" % t['value'])
        return ParsedResult(expression, parsed)

    def _expression(self, binding_power=0):
        """Core Pratt loop: parse one expression.

        The current token is consumed and handed to its nud handler, then
        led handlers are applied for as long as the next token binds more
        tightly than ``binding_power``.

        :return: The AST node for the parsed expression.
        """
        left_token = self._lookahead_token(0)
        self._advance()
        nud_function = getattr(
            self, '_token_nud_%s' % left_token['type'],
            self._error_nud_token)
        left = nud_function(left_token)
        current_token = self._current_token()
        while binding_power < self.BINDING_POWER[current_token]:
            led = getattr(self, '_token_led_%s' % current_token, None)
            if led is None:
                error_token = self._lookahead_token(0)
                # Always raises, which terminates the loop.
                self._error_led_token(error_token)
            else:
                self._advance()
                left = led(left)
                current_token = self._current_token()
        return left

    # ------------------------------------------------------------------
    # nud handlers.  Each one is called by _expression() with the token
    # that was just consumed.
    # ------------------------------------------------------------------

    def _token_nud_literal(self, token):
        """Build a literal node from the token value."""
        return ast.literal(token['value'])

    def _token_nud_unquoted_identifier(self, token):
        """Build a field node from an unquoted identifier."""
        return ast.field(token['value'])

    def _token_nud_quoted_identifier(self, token):
        """Build a field node from a quoted identifier.

        :raises exceptions.ParseError: If the identifier is immediately
            followed by ``(``.
        """
        field = ast.field(token['value'])
        # You can't have a quoted identifier as a function
        # name.
        if self._current_token() == 'lparen':
            t = self._lookahead_token(0)
            # Report the error at the offending token instead of a
            # hard-coded position 0.
            raise exceptions.ParseError(
                t['start'], t['value'], t['type'],
                'Quoted identifier not allowed for function names.')
        return field

    def _token_nud_star(self, token):
        """Parse a leading ``*`` as a value projection on the identity."""
        left = ast.identity()
        if self._current_token() == 'rbracket':
            right = ast.identity()
        else:
            right = self._parse_projection_rhs(self.BINDING_POWER['star'])
        return ast.value_projection(left, right)

    def _token_nud_filter(self, token):
        """Parse a leading filter as a filter applied to the identity."""
        return self._token_led_filter(ast.identity())

    def _token_nud_lbrace(self, token):
        """Parse a leading ``{`` as a multi-select hash."""
        return self._parse_multi_select_hash()

    def _token_nud_lparen(self, token):
        """Parse a parenthesized expression; the ``)`` is required."""
        expression = self._expression()
        self._match('rparen')
        return expression

    def _token_nud_flatten(self, token):
        """Parse a leading flatten as a projection over the identity."""
        left = ast.flatten(ast.identity())
        right = self._parse_projection_rhs(
            self.BINDING_POWER['flatten'])
        return ast.projection(left, right)

    def _token_nud_not(self, token):
        """Parse a negation and wrap the operand in a not-expression."""
        expr = self._expression(self.BINDING_POWER['not'])
        return ast.not_expression(expr)

    def _token_nud_lbracket(self, token):
        """Parse a leading ``[``.

        Depending on what follows this is an index/slice on the identity,
        a ``[*]`` projection, or a multi-select list.
        """
        if self._current_token() in ('number', 'colon'):
            right = self._parse_index_expression()
            # We could optimize this and remove the identity() node.
            # We don't really need an index_expression node, we can
            # just use emit an index node here if we're not dealing
            # with a slice.
            return self._project_if_slice(ast.identity(), right)
        elif self._current_token() == 'star' and \
                self._lookahead(1) == 'rbracket':
            # Skip over the '*' and the ']'.
            self._advance()
            self._advance()
            right = self._parse_projection_rhs(self.BINDING_POWER['star'])
            return ast.projection(ast.identity(), right)
        else:
            return self._parse_multi_select_list()

    def _parse_index_expression(self):
        """Parse what follows ``[`` as either ``[number]`` or a slice.

        :return: An index node or a slice node.
        """
        # We're here:
        # [<current>
        #  ^
        #  | current token
        if (self._lookahead(0) == 'colon' or
                self._lookahead(1) == 'colon'):
            return self._parse_slice_expression()
        else:
            # Parse the syntax [number]
            node = ast.index(self._lookahead_token(0)['value'])
            self._advance()
            self._match('rbracket')
            return node

    def _parse_slice_expression(self):
        """Parse ``start:end:step]`` into a slice node.

        :raises exceptions.ParseError: On a third colon or on any token
            other than a number, a colon or the closing bracket.
        """
        # [start:end:step]
        # Where start, end, and step are optional.
        # The last colon is optional as well.
        parts = [None, None, None]
        index = 0
        current_token = self._current_token()
        while current_token != 'rbracket' and index < 3:
            if current_token == 'colon':
                index += 1
                if index == 3:
                    self._raise_parse_error_for_token(
                        self._lookahead_token(0), 'syntax error')
                self._advance()
            elif current_token == 'number':
                parts[index] = self._lookahead_token(0)['value']
                self._advance()
            else:
                self._raise_parse_error_for_token(
                    self._lookahead_token(0), 'syntax error')
            current_token = self._current_token()
        self._match('rbracket')
        return ast.slice(*parts)

    def _token_nud_current(self, token):
        """Build a current-node node."""
        return ast.current_node()

    def _token_nud_expref(self, token):
        """Parse an expression reference and wrap it in an expref node."""
        expression = self._expression(self.BINDING_POWER['expref'])
        return ast.expref(expression)

    # ------------------------------------------------------------------
    # led handlers.  Each one is called by _expression() with the AST
    # built so far, after the operator token has been consumed.
    # ------------------------------------------------------------------

    def _token_led_dot(self, left):
        """Parse ``left.<rhs>`` as a subexpression or value projection."""
        if self._current_token() != 'star':
            right = self._parse_dot_rhs(self.BINDING_POWER['dot'])
            if left['type'] == 'subexpression':
                # Extend the existing subexpression in place instead of
                # nesting a new node.
                left['children'].append(right)
                return left
            else:
                return ast.subexpression([left, right])
        else:
            # We're creating a projection.
            self._advance()
            right = self._parse_projection_rhs(
                self.BINDING_POWER['dot'])
            return ast.value_projection(left, right)

    def _token_led_pipe(self, left):
        """Parse the right side of a pipe and build a pipe node."""
        right = self._expression(self.BINDING_POWER['pipe'])
        return ast.pipe(left, right)

    def _token_led_or(self, left):
        """Parse the right side of an or and build an or-expression."""
        right = self._expression(self.BINDING_POWER['or'])
        return ast.or_expression(left, right)

    def _token_led_and(self, left):
        """Parse the right side of an and and build an and-expression."""
        right = self._expression(self.BINDING_POWER['and'])
        return ast.and_expression(left, right)

    def _token_led_lparen(self, left):
        """Parse a function call; ``left`` must be a field node (the name).

        :raises exceptions.ParseError: If ``left`` is not a field node.
        """
        if left['type'] != 'field':
            #  0 - first func arg or closing paren.
            # -1 - '(' token
            # -2 - invalid function "name".
            prev_t = self._lookahead_token(-2)
            raise exceptions.ParseError(
                prev_t['start'], prev_t['value'], prev_t['type'],
                "Invalid function name '%s'" % prev_t['value'])
        name = left['value']
        args = []
        while self._current_token() != 'rparen':
            expression = self._expression()
            if self._current_token() == 'comma':
                self._match('comma')
            args.append(expression)
        self._match('rparen')
        function_node = ast.function_expression(name, args)
        return function_node

    def _token_led_filter(self, left):
        """Parse a filter condition up to ``]`` and build the projection."""
        # Filters are projections.
        condition = self._expression(0)
        self._match('rbracket')
        if self._current_token() == 'flatten':
            right = ast.identity()
        else:
            right = self._parse_projection_rhs(self.BINDING_POWER['filter'])
        return ast.filter_projection(left, right, condition)

    def _token_led_eq(self, left):
        """Parse an ``eq`` comparison."""
        return self._parse_comparator(left, 'eq')

    def _token_led_ne(self, left):
        """Parse an ``ne`` comparison."""
        return self._parse_comparator(left, 'ne')

    def _token_led_gt(self, left):
        """Parse a ``gt`` comparison."""
        return self._parse_comparator(left, 'gt')

    def _token_led_gte(self, left):
        """Parse a ``gte`` comparison."""
        return self._parse_comparator(left, 'gte')

    def _token_led_lt(self, left):
        """Parse an ``lt`` comparison."""
        return self._parse_comparator(left, 'lt')

    def _token_led_lte(self, left):
        """Parse an ``lte`` comparison."""
        return self._parse_comparator(left, 'lte')

    def _token_led_flatten(self, left):
        """Flatten ``left`` and project the following expression over it."""
        left = ast.flatten(left)
        right = self._parse_projection_rhs(
            self.BINDING_POWER['flatten'])
        return ast.projection(left, right)

    def _token_led_lbracket(self, left):
        """Parse ``left[...]`` as an index, a slice or a ``[*]`` projection."""
        token = self._lookahead_token(0)
        if token['type'] in ('number', 'colon'):
            right = self._parse_index_expression()
            if left['type'] == 'index_expression':
                # Optimization: if the left node is an index expr,
                # we can avoid creating another node and instead just add
                # the right node as a child of the left.
                left['children'].append(right)
                return left
            else:
                return self._project_if_slice(left, right)
        else:
            # We have a projection
            self._match('star')
            self._match('rbracket')
            right = self._parse_projection_rhs(self.BINDING_POWER['star'])
            return ast.projection(left, right)

    # ------------------------------------------------------------------
    # Shared parsing helpers.
    # ------------------------------------------------------------------

    def _project_if_slice(self, left, right):
        """Combine ``left`` and ``right`` into an index expression.

        When ``right`` is a slice node the index expression becomes the
        left side of a projection; otherwise it is returned as is.
        """
        index_expr = ast.index_expression([left, right])
        if right['type'] == 'slice':
            return ast.projection(
                index_expr,
                self._parse_projection_rhs(self.BINDING_POWER['star']))
        else:
            return index_expr

    def _parse_comparator(self, left, comparator):
        """Parse the right operand of ``comparator`` and build its node."""
        right = self._expression(self.BINDING_POWER[comparator])
        return ast.comparator(comparator, left, right)

    def _parse_multi_select_list(self):
        """Parse comma separated expressions up to the closing ``]``."""
        expressions = []
        while True:
            expression = self._expression()
            expressions.append(expression)
            if self._current_token() == 'rbracket':
                break
            else:
                self._match('comma')
        self._match('rbracket')
        return ast.multi_select_list(expressions)

    def _parse_multi_select_hash(self):
        """Parse ``key: expression`` pairs up to the closing ``}``."""
        pairs = []
        while True:
            key_token = self._lookahead_token(0)
            # Before getting the token value, verify it's
            # an identifier.
            self._match_multiple_tokens(
                token_types=['quoted_identifier', 'unquoted_identifier'])
            key_name = key_token['value']
            self._match('colon')
            value = self._expression(0)
            node = ast.key_val_pair(key_name=key_name, node=value)
            pairs.append(node)
            if self._current_token() == 'comma':
                self._match('comma')
            elif self._current_token() == 'rbrace':
                self._match('rbrace')
                break
        return ast.multi_select_dict(nodes=pairs)

    def _parse_projection_rhs(self, binding_power):
        """Parse the right hand side of a projection.

        :return: The identity node when the next token stops the
            projection, otherwise the parsed right hand side.
        :raises exceptions.ParseError: If the next token cannot continue
            a projection.
        """
        current = self._current_token()
        if self.BINDING_POWER[current] < self._PROJECTION_STOP:
            # Tokens binding below _PROJECTION_STOP end the projection.
            right = ast.identity()
        elif current in ('lbracket', 'filter'):
            right = self._expression(binding_power)
        elif current == 'dot':
            self._match('dot')
            right = self._parse_dot_rhs(binding_power)
        else:
            self._raise_parse_error_for_token(self._lookahead_token(0),
                                              'syntax error')
        return right

    def _parse_dot_rhs(self, binding_power):
        """Parse whatever is allowed to follow a ``.``.

        :raises exceptions.ParseError: If the next token is not an
            identifier, ``*``, ``[`` or ``{``.
        """
        # From the grammar:
        # expression '.' ( identifier /
        #                  multi-select-list /
        #                  multi-select-hash /
        #                  function-expression /
        #                  *
        # In terms of tokens that means that after a '.',
        # you can have:
        lookahead = self._current_token()
        # Common case "foo.bar", so first check for an identifier.
        if lookahead in ('quoted_identifier', 'unquoted_identifier', 'star'):
            return self._expression(binding_power)
        elif lookahead == 'lbracket':
            self._match('lbracket')
            return self._parse_multi_select_list()
        elif lookahead == 'lbrace':
            self._match('lbrace')
            return self._parse_multi_select_hash()
        else:
            t = self._lookahead_token(0)
            allowed = ['quoted_identifier', 'unquoted_identifier',
                       'lbracket', 'lbrace']
            msg = (
                "Expecting: %s, got: %s" % (allowed, t['type'])
            )
            self._raise_parse_error_for_token(t, msg)

    # ------------------------------------------------------------------
    # Error helpers and token stream primitives.
    # ------------------------------------------------------------------

    def _error_nud_token(self, token):
        """Fallback nud handler for tokens that cannot start an expression.

        :raises exceptions.IncompleteExpressionError: If ``token`` is eof.
        :raises exceptions.ParseError: For any other token.
        """
        if token['type'] == 'eof':
            raise exceptions.IncompleteExpressionError(
                token['start'], token['value'], token['type'])
        self._raise_parse_error_for_token(token, 'invalid token')

    def _error_led_token(self, token):
        """Raise a parse error for a token that has no led handler."""
        self._raise_parse_error_for_token(token, 'invalid token')

    def _match(self, token_type=None):
        """Consume the current token if its type is ``token_type``.

        :raises exceptions.IncompleteExpressionError: If the current
            token is eof instead.
        :raises exceptions.ParseError: On any other mismatch.
        """
        if self._current_token() == token_type:
            self._advance()
        else:
            self._raise_parse_error_maybe_eof(
                token_type, self._lookahead_token(0))

    def _match_multiple_tokens(self, token_types):
        """Consume the current token if its type is in ``token_types``.

        Raises the same errors as ``_match()`` on a mismatch.
        """
        if self._current_token() not in token_types:
            self._raise_parse_error_maybe_eof(
                token_types, self._lookahead_token(0))
        self._advance()

    def _advance(self):
        """Move to the next token."""
        self._index += 1

    def _current_token(self):
        """Return the type of the current token."""
        return self._tokens[self._index]['type']

    def _lookahead(self, number):
        """Return the type of the token ``number`` places ahead."""
        return self._tokens[self._index + number]['type']

    def _lookahead_token(self, number):
        """Return the whole token ``number`` places ahead (0 = current)."""
        return self._tokens[self._index + number]

    def _raise_parse_error_for_token(self, token, reason):
        """Raise ``ParseError`` located at ``token`` with message ``reason``."""
        lex_position = token['start']
        actual_value = token['value']
        actual_type = token['type']
        raise exceptions.ParseError(lex_position, actual_value,
                                    actual_type, reason)

    def _raise_parse_error_maybe_eof(self, expected_type, token):
        """Raise the error for an unexpected ``token``.

        :raises exceptions.IncompleteExpressionError: If ``token`` is eof.
        :raises exceptions.ParseError: Otherwise, with a message naming
            the expected and the actual token type.
        """
        lex_position = token['start']
        actual_value = token['value']
        actual_type = token['type']
        if actual_type == 'eof':
            raise exceptions.IncompleteExpressionError(
                lex_position, actual_value, actual_type)
        message = 'Expecting: %s, got: %s' % (expected_type,
                                              actual_type)
        raise exceptions.ParseError(
            lex_position, actual_value, actual_type, message)

    def _free_cache_entries(self):
        """Evict up to half of ``_MAX_SIZE`` randomly chosen cache entries."""
        # random.sample() needs a real sequence: passing a dict view
        # raises TypeError on Python 3.11+.
        keys = list(self._CACHE)
        # Never ask for more samples than there are keys.
        sample_size = min(len(keys), self._MAX_SIZE // 2)
        for key in random.sample(keys, sample_size):
            # pop() tolerates a key that was already removed through
            # another Parser sharing the class-level cache.
            self._CACHE.pop(key, None)

    @classmethod
    def purge(cls):
        """Clear the expression compilation cache."""
        cls._CACHE.clear()
|
|
|
|
|
|
@with_repr_method
class ParsedResult(object):
    """Pairs an expression string with the AST that was parsed from it."""

    def __init__(self, expression, parsed):
        """Store the original ``expression`` and its ``parsed`` AST."""
        self.parsed = parsed
        self.expression = expression

    def search(self, value, options=None):
        """Evaluate the parsed AST against ``value``.

        A new ``visitor.TreeInterpreter`` is built from ``options`` for
        each call and its ``visit()`` result is returned.
        """
        return visitor.TreeInterpreter(options).visit(self.parsed, value)

    def _render_dot_file(self):
        """Render the parsed AST as a dot file.

        This is an internal method: the AST is an implementation detail
        and is subject to change.  It is meant for troubleshooting and
        development only and is not part of the public supported API.
        Use at your own risk.

        """
        return visitor.GraphvizVisitor().visit(self.parsed)

    def __repr__(self):
        """Return the repr of the parsed AST."""
        return repr(self.parsed)
|