import re
|
|
import ply.lex as lex
|
|
|
|
# PLY lexer state declarations. 'instring' is exclusive: while it is
# active, only t_instring_* rules run. It is entered on '(' and used to
# accumulate PostScript/PDF-style (...) string literals, which require
# paren balancing that a single regex cannot express.
states = (
    ('instring', 'exclusive'),
)
|
|
|
|
# Token names produced by this lexer (required by PLY). STRING is never
# matched directly; it is synthesized by t_instring_rparen when a (...)
# string closes.
tokens = (
    'COMMENT', 'HEXSTRING', 'INT', 'FLOAT', 'LITERAL', 'KEYWORD', 'STRING', 'OPERATOR'
)
|
|
|
|
# Characters that end a token: parens, angle/square/curly brackets,
# slash, percent, and whitespace. Written for use inside a regex
# character class (see delimiter_end below).
delimiter = r'\(\)\<\>\[\]\{\}\/\%\s'
# Zero-width lookahead: the next character is a delimiter, or we are at
# end of input. Appended to number/literal/keyword rules so they do not
# consume the delimiter itself.
delimiter_end = r'(?=[%s]|$)' % delimiter
|
|
|
|
def t_COMMENT(t):
    # Stricter historical variant (matched only '%!' header lines),
    # kept for reference:
    # r'^%!.+\n'
    r'%.*\n'
    # The raw string above is the rule's regex (PLY reads it from the
    # docstring): '%' through end of line. Returning None (via pass)
    # tells PLY to discard the comment entirely.
    pass
|
|
|
|
# Strips the whitespace that may appear between digits inside a <...>
# hex string before it is decoded.
RE_SPC = re.compile(r'\s')
# Splits a cleaned hex string into two-digit pairs; the '|.' fallback
# picks up a trailing odd digit so it is still decoded on its own.
RE_HEX_PAIR = re.compile(r'[0-9a-fA-F]{2}|.')
|
|
@lex.TOKEN(r'<[0-9A-Fa-f\s]*>')
def t_HEXSTRING(t):
    # Decode a <...> hex string token. Embedded whitespace is ignored,
    # digits are consumed in pairs (a trailing odd digit is decoded on
    # its own), and the result is exposed as str when it is pure ASCII.
    stripped = RE_SPC.sub('', t.value[1:-1])
    decoded = bytes(int(pair, 16) for pair in RE_HEX_PAIR.findall(stripped))
    try:
        t.value = decoded.decode('ascii')
    except UnicodeDecodeError:
        # Non-ASCII payload: hand back the raw bytes rather than guess
        # at an encoding.
        t.value = decoded
    return t
|
|
|
|
@lex.TOKEN(r'(\-|\+)?[0-9]+' + delimiter_end)
def t_INT(t):
    # An optionally signed digit run terminated by a delimiter; the
    # token value becomes a Python int.
    t.value = int(t.value, 10)
    return t
|
|
|
|
@lex.TOKEN(r'(\-|\+)?([0-9]+\.|[0-9]*\.[0-9]+|[0-9]+\.[0-9]*)((e|E)[0-9]+)?' + delimiter_end)
def t_FLOAT(t):
    # A real number: digits with a mandatory '.', plus an optional
    # unsigned exponent. Must be tried before t_INT would swallow the
    # integer part (PLY orders function rules by definition order).
    t.value = float(t.value)
    return t
|
|
|
|
RE_LITERAL_HEX = re.compile(r'#[0-9A-Fa-f]{2}')
|
|
@lex.TOKEN(r'/.+?' + delimiter_end)
def t_LITERAL(t):
    # A /name literal. If the name contains '#xx' escapes we must
    # de-hex them; any lone '#' left over is then dropped.
    def _dehex(match):
        # '#41' -> 'A': strip the '#', hex-decode the pair, and map the
        # byte 1:1 to a character via latin-1.
        return bytes.fromhex(match.group(0)[1:]).decode('latin-1')

    name = RE_LITERAL_HEX.sub(_dehex, t.value[1:])
    # Discard any '#' that was not followed by two hex digits.
    t.value = name.replace('#', '')
    return t
|
|
|
|
def t_OPERATOR(t):
    r'{|}|<<|>>|\[|\]'
    # Structural operators: procedure braces, dict markers, and array
    # brackets. Defined as a function (not a t_OPERATOR string rule) so
    # PLY tries it before the catch-all t_KEYWORD string rule.
    return t
|
|
|
|
t_KEYWORD = r'.+?' + delimiter_end
|
|
|
|
def t_instring(t):
    r'\('
    # An opening paren begins a (...) string literal: reset the chunk
    # buffer, remember where the string started (restored as t.lexpos
    # on the eventual STRING token), start the paren-balance depth at 1,
    # and switch the lexer into the exclusive 'instring' state.
    t.lexer.value_buffer = []
    t.lexer.string_startpos = t.lexpos
    t.lexer.level = 1
    t.lexer.begin('instring')
|
|
|
|
# The parens situation: it's complicated. We can have both escaped parens and unescaped parens.
# If they're escaped, there's nothing special: we unescape them and add them to the string. If
# they're not escaped, we have to count how many of them there are, to know when a rparen is the
# end of the string. The regular expression for this is messy, so instead, when we hit a paren,
# we look at whether the previous buffer chunk ended with a backslash. If it did, we don't do
# paren balancing.
|
|
|
|
def t_instring_lparen(t):
    r'\('
    # A '(' inside a string. If the text accumulated so far ends with a
    # backslash, the paren was escaped: drop the backslash and do not
    # count it toward nesting. Otherwise the nesting depth goes up.
    # NOTE(review): a literal backslash written as '\\' is already
    # collapsed to a single '\' by t_instring_contents, so '\\(' would
    # also look escaped here and skip balancing — confirm this matches
    # the intended grammar.
    is_escaped = t.lexer.value_buffer and t.lexer.value_buffer[-1].endswith('\\')
    if is_escaped:
        # Remove the escaping backslash from the buffered text.
        t.lexer.value_buffer[-1] = t.lexer.value_buffer[-1][:-1]
    else:
        t.lexer.level +=1
    # Either way the paren itself is part of the string's content.
    t.lexer.value_buffer.append('(')
|
|
|
|
def t_instring_rparen(t):
    r'\)'
    # A ')' inside a string. Escaped (previous chunk ends with '\'):
    # drop the backslash and keep the paren as literal content.
    # Unescaped: close one nesting level; depth 0 ends the string.
    is_escaped = t.lexer.value_buffer and t.lexer.value_buffer[-1].endswith('\\')
    if is_escaped:
        t.lexer.value_buffer[-1] = t.lexer.value_buffer[-1][:-1]
    else:
        t.lexer.level -=1

    if t.lexer.level == 0:
        # String complete: join the buffered chunks into one value.
        t.value = ''.join(t.lexer.value_buffer)
        if any(ord(c) > 0x7f for c in t.value):
            # Non-ASCII content should be kept as bytes; latin-1 maps
            # each code point 0-255 to the identical byte value.
            t.value = t.value.encode('latin-1')
        t.type = "STRING"
        # Report the token as starting at the opening '('.
        t.lexpos = t.lexer.string_startpos
        t.lexer.begin('INITIAL')
        return t
    else:
        # Still nested (or the paren was escaped): it is content.
        t.lexer.value_buffer.append(')')
|
|
|
|
# Simple one-character escapes: \b \t \n \f \r and the escaped backslash.
RE_STRING_ESCAPE = re.compile(r'\\[btnfr\\]')
# Octal escapes: a backslash followed by 1-3 octal digits.
RE_STRING_OCTAL = re.compile(r'\\[0-7]{1,3}')
# Line continuations: a backslash immediately before a line break hides
# the break. The \r\n alternative MUST come first: regex alternation is
# tried left to right, and the previous order (\\\n|\\\r|\\\r\n) matched
# only '\\\r' out of a backslash-CRLF sequence, leaving a stray '\n' in
# the string.
RE_STRING_LINE_CONT = re.compile(r'\\\r\n|\\\r|\\\n')
# Escape letter -> replacement character, used by repl_string_escape.
ESC_STRING = { 'b': '\b', 't': '\t', 'n': '\n', 'f': '\f', 'r': '\r', '\\': '\\' }
|
|
|
|
def repl_string_escape(m):
    # re.sub callback: map a matched simple escape (backslash plus one
    # of b/t/n/f/r/backslash) to its replacement character.
    escape_letter = m.group(0)[1]
    return ESC_STRING[escape_letter]
|
|
|
|
def repl_string_octal(m):
    """re.sub callback: replace an octal escape (e.g. '\\101') with its char.

    Values above 0xff cannot be represented in the latin-1 byte range the
    lexer targets, so such an escape is returned untouched.
    """
    i = int(m.group(0)[1:], 8)
    # 0xff (255) itself IS encodable in latin-1; the previous strict `<`
    # comparison wrongly left '\377' unconverted (off-by-one).
    if i <= 0xff:
        return chr(i)
    else:
        return m.group(0)
|
|
|
|
def t_instring_contents(t):
    r'[^()]+'
    # Everything between parens: resolve escapes, then buffer the chunk
    # until the closing paren produces the STRING token.
    # NOTE(review): the substitution order is significant and subtle.
    # RE_STRING_ESCAPE first collapses an escaped backslash (two chars)
    # to a single backslash; if octal digits follow it in the source,
    # RE_STRING_OCTAL will then re-read that backslash as starting an
    # octal escape — confirm this matches the intended grammar.
    s = t.value
    s = RE_STRING_ESCAPE.sub(repl_string_escape, s)
    s = RE_STRING_OCTAL.sub(repl_string_octal, s)
    s = RE_STRING_LINE_CONT.sub('', s)
    t.lexer.value_buffer.append(s)
|
|
|
|
# Inside a (...) string nothing is skipped implicitly: every character
# is significant content.
t_instring_ignore = ''
# In the default state, whitespace between tokens is discarded.
t_ignore = ' \t\r\n'
|
|
|
|
# Error handling rule
|
|
def t_error(t):
    """Report an unexpected character and resynchronize.

    PLY calls this with the remaining input in ``t.value``; only the
    first character is bad, so skip one position and keep lexing.
    """
    # %r already wraps its argument in quotes; the previous format
    # string ("...'%r'") printed a redundant second pair around them.
    print("Illegal character %r" % t.value[0])
    t.lexer.skip(1)

# The same recovery is used inside (...) strings.
t_instring_error = t_error
|
|
|
|
lexer = lex.lex()
|