import re
import ply.lex as lex
states = (
    ('instring', 'exclusive'),
)

tokens = (
    'COMMENT', 'HEXSTRING', 'INT', 'FLOAT', 'LITERAL', 'KEYWORD', 'STRING', 'OPERATOR'
)

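# Descriptive note (added, not in the original source): these are the
# PostScript/PDF delimiter characters plus whitespace, and 'delimiter_end' is
# a lookahead asserting that a token stops right before one of them or at the
# end of the input.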
delimiter = r'\(\)\<\>\[\]\{\}\/\%\s'
delimiter_end = r'(?=[%s]|$)' % delimiter
def t_COMMENT(t):
    # r'^%!.+\n'
    r'%.*\n'
    pass

RE_SPC = re.compile(r'\s')
RE_HEX_PAIR = re.compile(r'[0-9a-fA-F]{2}|.')
@lex.TOKEN(r'<[0-9A-Fa-f\s]*>')
def t_HEXSTRING(t):
    cleaned = RE_SPC.sub('', t.value[1:-1])
    pairs = RE_HEX_PAIR.findall(cleaned)
    token_bytes = bytes([int(pair, 16) for pair in pairs])
    try:
        t.value = token_bytes.decode('ascii')
    except UnicodeDecodeError:
        # should be kept as bytes
        t.value = token_bytes
    return t

@lex.TOKEN(r'(\-|\+)?[0-9]+' + delimiter_end)
def t_INT(t):
    t.value = int(t.value)
    return t

@lex.TOKEN(r'(\-|\+)?([0-9]+\.|[0-9]*\.[0-9]+|[0-9]+\.[0-9]*)((e|E)[0-9]+)?' + delimiter_end)
def t_FLOAT(t):
    t.value = float(t.value)
    return t

RE_LITERAL_HEX = re.compile(r'#[0-9A-Fa-f]{2}')
@lex.TOKEN(r'/.+?' + delimiter_end)
def t_LITERAL(t):
    newvalue = t.value[1:]
    # If there are '#' chars in the literal, we must de-hex them
    def re_sub(m):
        # convert the hex pair (without the '#' char) to a byte and decode it
        return bytes.fromhex(m.group(0)[1:]).decode('latin-1')
    newvalue = RE_LITERAL_HEX.sub(re_sub, newvalue)
    # If there are any lone '#' chars left, remove them
    newvalue = newvalue.replace('#', '')
    t.value = newvalue
    return t

def t_OPERATOR(t):
    r'{|}|<<|>>|\[|\]'
    return t

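# Descriptive note (added, not in the original source): this is a catch-all
# rule matching any run of characters up to the next delimiter. Because it is
# a string rule rather than a function, ply only tries it after all of the
# function-based rules above have failed to match.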
t_KEYWORD = r'.+?' + delimiter_end
def t_instring(t):
    r'\('
    t.lexer.value_buffer = []
    t.lexer.string_startpos = t.lexpos
    t.lexer.level = 1
    t.lexer.begin('instring')

# The parens situation: it's complicated. We can have both escaped parens and unescaped parens.
# If they're escaped, there's nothing special: we unescape them and add them to the string. If
# they're not escaped, we have to count how many of them there are, to know when a rparen is the
# end of the string. The regular expression for this is messy, so what we do is: when we hit
# a paren, we look at whether the previous buffer chunk ended with a backslash. If it did, we
# don't do paren balancing.
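# For example (illustrative, not from the original comments): in '(a \( b)'
# the inner paren is escaped, so the backslash is stripped, the paren goes
# into the buffer as-is and the level stays at 1; the first unescaped ')'
# then ends the string as 'a ( b'. In '(a (b) c)' the inner parens are
# unescaped, so the level goes 1 -> 2 -> 1 -> 0 and only the last ')' emits
# the STRING token.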
def t_instring_lparen(t):
    r'\('
    is_escaped = t.lexer.value_buffer and t.lexer.value_buffer[-1].endswith('\\')
    if is_escaped:
        t.lexer.value_buffer[-1] = t.lexer.value_buffer[-1][:-1]
    else:
        t.lexer.level += 1
    t.lexer.value_buffer.append('(')

def t_instring_rparen(t):
    r'\)'
    is_escaped = t.lexer.value_buffer and t.lexer.value_buffer[-1].endswith('\\')
    if is_escaped:
        t.lexer.value_buffer[-1] = t.lexer.value_buffer[-1][:-1]
    else:
        t.lexer.level -= 1
    if t.lexer.level == 0:
        t.value = ''.join(t.lexer.value_buffer)
        if any(ord(c) > 0x7f for c in t.value):
            t.value = t.value.encode('latin-1')
        t.type = "STRING"
        t.lexpos = t.lexer.string_startpos
        t.lexer.begin('INITIAL')
        return t
    else:
        t.lexer.value_buffer.append(')')

RE_STRING_ESCAPE = re.compile(r'\\[btnfr\\]')
RE_STRING_OCTAL = re.compile(r'\\[0-7]{1,3}')
RE_STRING_LINE_CONT = re.compile(r'\\\r\n|\\\r|\\\n')  # try CRLF first so the whole continuation is removed
ESC_STRING = { 'b': '\b', 't': '\t', 'n': '\n', 'f': '\f', 'r': '\r', '\\': '\\' }
def repl_string_escape(m):
    return ESC_STRING[m.group(0)[1]]

def repl_string_octal(m):
    i = int(m.group(0)[1:], 8)
    if i <= 0xff:  # we never want to go above 255 because it's not encodable in latin-1
        return chr(i)
    else:
        return m.group(0)

def t_instring_contents(t):
    r'[^()]+'
    s = t.value
    s = RE_STRING_ESCAPE.sub(repl_string_escape, s)
    s = RE_STRING_OCTAL.sub(repl_string_octal, s)
    s = RE_STRING_LINE_CONT.sub('', s)
    t.lexer.value_buffer.append(s)

t_instring_ignore = ''
t_ignore = ' \t\r\n'
# Error handling rule
def t_error(t):
    print("Illegal character %r" % t.value[0])
    t.lexer.skip(1)

t_instring_error = t_error
lexer = lex.lex()
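
# A minimal usage sketch (added here for illustration, not part of the
# original module). The sample input below is made up; it exercises comments,
# name literals with hex escapes, hex strings, nested/escaped parens, numbers
# and keywords, then prints each token's type and value.
if __name__ == '__main__':
    sample = "% a comment\n/Name#20Two <48656C6C6F> (nested (parens) and \\) escape) 42 -1.5 def"
    lexer.input(sample)
    for tok in lexer:
        print(tok.type, repr(tok.value))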