"""PLY lexer rules for a PostScript/PDF-style token stream.

Tokens produced: comments (discarded), hex strings, integers, floats,
literal names (/Name), a handful of structural operators, parenthesised
strings and, as a catch-all, keywords.

Parenthesised strings are scanned in the exclusive 'instring' state so that
nested parentheses and backslash escapes can be handled by dedicated rules.

NOTE: PLY tries function rules in the order they are defined in this file,
and only afterwards the string rules, so the order of the rule functions
below is significant.
"""
import re

import ply.lex as lex

# While in the exclusive 'instring' state only the t_instring_* rules apply.
states = (
    ('instring', 'exclusive'),
)

tokens = (
    'COMMENT',
    'HEXSTRING',
    'INT',
    'FLOAT',
    'LITERAL',
    'KEYWORD',
    'STRING',
    'OPERATOR',
)

# Characters that terminate a number, name or keyword, written so that they
# can be dropped into a regex character class.
delimiter = r'\(\)\<\>\[\]\{\}\/\%\s'
# Zero-width lookahead: the token must be followed by a delimiter or the end
# of the input (the delimiter itself is not consumed).
delimiter_end = r'(?=[%s]|$)' % delimiter


# A '%' starts a comment that runs to the end of the line. Nothing is
# returned, so comments never reach the parser.
def t_COMMENT(t):
    # r'^%!.+\n'
    r'%.*\n'
    pass


RE_SPC = re.compile(r'\s')
RE_HEX_PAIR = re.compile(r'[0-9a-fA-F]{2}|.')


# Hexadecimal string such as <48 65 6C 6C 6F>. Whitespace between digits is
# ignored. The value becomes a str when the decoded bytes are pure ASCII and
# stays a bytes object otherwise.
@lex.TOKEN(r'<[0-9A-Fa-f\s]*>')
def t_HEXSTRING(t):
    digits = RE_SPC.sub('', t.value[1:-1])
    if len(digits) % 2:
        # A missing final digit is defined to be 0 (<901FA> is 90 1F A0), so
        # pad instead of reading the lone digit as a low nibble.
        digits += '0'
    token_bytes = bytes(int(pair, 16) for pair in RE_HEX_PAIR.findall(digits))
    try:
        t.value = token_bytes.decode('ascii')
    except UnicodeDecodeError:
        # should be kept as bytes
        t.value = token_bytes
    return t


# Optionally signed integer; the value is converted to int.
@lex.TOKEN(r'(\-|\+)?[0-9]+' + delimiter_end)
def t_INT(t):
    t.value = int(t.value)
    return t


# Optionally signed real number with an optional unsigned exponent; the value
# is converted to float.
@lex.TOKEN(r'(\-|\+)?([0-9]+\.|[0-9]*\.[0-9]+|[0-9]+\.[0-9]*)((e|E)[0-9]+)?' + delimiter_end)
def t_FLOAT(t):
    t.value = float(t.value)
    return t


RE_LITERAL_HEX = re.compile(r'#[0-9A-Fa-f]{2}')


def _dehex_literal_char(m):
    """Turn one '#xx' match into the character with that hex code."""
    return bytes.fromhex(m.group(0)[1:]).decode('latin-1')


# Literal name such as /Name. The leading slash is dropped and '#xx' hex
# escapes are decoded.
@lex.TOKEN(r'/.+?' + delimiter_end)
def t_LITERAL(t):
    name = t.value[1:]
    # If there are '#' chars in the literal, we must de-hex it.
    name = RE_LITERAL_HEX.sub(_dehex_literal_char, name)
    # Any lone '#' that is still left is not a valid escape: remove it.
    t.value = name.replace('#', '')
    return t


# Structural operators: procedure braces, dictionary brackets, array brackets.
def t_OPERATOR(t):
    r'{|}|<<|>>|\[|\]'
    return t


# Catch-all: anything up to the next delimiter that no rule above matched.
t_KEYWORD = r'.+?' + delimiter_end


# An opening paren in the INITIAL state starts a string: reset the per-string
# bookkeeping kept on the lexer and switch to the 'instring' state. No token
# is emitted here; the STRING token is produced by t_instring_rparen.
def t_instring(t):
    r'\('
    t.lexer.value_buffer = []
    t.lexer.string_startpos = t.lexpos
    t.lexer.level = 1
    # True when the text just before the next paren ended with an escaping
    # backslash (see t_instring_contents).
    t.lexer.paren_escaped = False
    t.lexer.begin('instring')


# The parens situation: it's complicated. A string may contain both escaped
# and unescaped parens. An escaped paren is just a character of the string.
# Unescaped parens must be balanced, so we count them to know which closing
# paren ends the string. Whether a paren is escaped cannot be decided from the
# already-unescaped buffer (an escaped backslash also leaves a trailing
# backslash there), so t_instring_contents works it out from the raw text and
# records the answer in t.lexer.paren_escaped.
def t_instring_lparen(t):
    r'\('
    if t.lexer.paren_escaped:
        t.lexer.paren_escaped = False
    else:
        t.lexer.level += 1
    t.lexer.value_buffer.append('(')


# A closing paren is a plain character when escaped or when it closes a nested
# paren; when it brings the nesting level back to 0 it ends the string and the
# STRING token is returned. The value is a str, or latin-1 encoded bytes when
# it contains any character above 0x7f.
def t_instring_rparen(t):
    r'\)'
    if t.lexer.paren_escaped:
        t.lexer.paren_escaped = False
    else:
        t.lexer.level -= 1

    if t.lexer.level == 0:
        t.value = ''.join(t.lexer.value_buffer)
        if any(ord(c) > 0x7f for c in t.value):
            t.value = t.value.encode('latin-1')
        t.type = "STRING"
        # Report the position of the opening paren, not of the closing one.
        t.lexpos = t.lexer.string_startpos
        t.lexer.begin('INITIAL')
        return t
    else:
        t.lexer.value_buffer.append(')')


RE_STRING_ESCAPE = re.compile(r'\\[btnfr\\]')
RE_STRING_OCTAL = re.compile(r'\\[0-7]{1,3}')
# The two-character CRLF form must be tried first, otherwise the lone-CR
# alternative wins and the LF is left behind in the string.
RE_STRING_LINE_CONT = re.compile(r'\\\r\n|\\\n|\\\r')
# Every backslash sequence handled inside a string, as one pattern, so that a
# single left-to-right pass can decode them without one substitution
# re-reading the output of another.
RE_STRING_BACKSLASH = re.compile(
    r'\\(?:(?P<char>[btnfr\\])|(?P<octal>[0-7]{1,3})|(?P<eol>\r\n|\n|\r))'
)
ESC_STRING = {
    'b': '\b',
    't': '\t',
    'n': '\n',
    'f': '\f',
    'r': '\r',
    '\\': '\\',
}


def repl_string_escape(m):
    """Return the character denoted by a single-letter escape match."""
    return ESC_STRING[m.group(0)[1]]


def repl_string_octal(m):
    """Return the character denoted by an octal escape match.

    Codes above 0xff cannot be encoded as latin-1 (the encoding used for the
    final STRING value), so such escapes are returned unchanged.
    """
    code = int(m.group(0)[1:], 8)
    if code <= 0xff:
        return chr(code)
    return m.group(0)


def _repl_string_backslash(m):
    """Decode one RE_STRING_BACKSLASH match; line continuations vanish."""
    if m.group('char') is not None:
        return ESC_STRING[m.group('char')]
    if m.group('octal') is not None:
        return repl_string_octal(m)
    return ''


# A run of characters between parens. Because the run is greedy, whatever
# follows it is a paren (or the end of input). An odd number of trailing
# backslashes therefore means the last one escapes that paren: it is dropped
# here and remembered in t.lexer.paren_escaped. An even number is made of
# escaped backslashes only and leaves the paren unescaped.
def t_instring_contents(t):
    r'[^()]+'
    raw = t.value
    trailing_backslashes = len(raw) - len(raw.rstrip('\\'))
    if trailing_backslashes % 2:
        raw = raw[:-1]
        t.lexer.paren_escaped = True
    t.lexer.value_buffer.append(RE_STRING_BACKSLASH.sub(_repl_string_backslash, raw))


# Inside a string nothing is skipped; between tokens whitespace is.
t_instring_ignore = ''
t_ignore = ' \t\r\n'


# Error handling rule: report the offending character and skip over it.
def t_error(t):
    print("Illegal character '%r'" % t.value[0])
    t.lexer.skip(1)


t_instring_error = t_error

lexer = lex.lex()