You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

137 lines
4.0 KiB

4 years ago
  1. import re
  2. import ply.lex as lex
  # Lexer states declared to PLY: string bodies are scanned in a separate,
  # exclusive 'instring' state.
  states = (('instring', 'exclusive'),)

  # Token type names declared to PLY.
  tokens = (
      'COMMENT',
      'HEXSTRING',
      'INT',
      'FLOAT',
      'LITERAL',
      'KEYWORD',
      'STRING',
      'OPERATOR',
  )

  # Body of a regex character class listing the characters that end a token:
  # parens, angle brackets, square brackets, braces, '/', '%' and whitespace.
  delimiter = r'\(\)\<\>\[\]\{\}\/\%\s'

  # Zero-width lookahead appended to token patterns: the next character must
  # be a delimiter, or the input must end here.
  delimiter_end = r'(?=[' + delimiter + r']|$)'
  def t_COMMENT(t):
      r'%[^\r\n]*'
      # A comment runs from '%' up to (not including) the end of the line and
      # is discarded: returning None produces no token.
      #
      # The pattern stops at the first CR or LF rather than requiring a
      # trailing '\n'. That way a comment on the last line of the input (no
      # final newline) is still recognised, and in CR-terminated input the
      # comment does not swallow the following lines. The line ending itself
      # is left in the input and skipped by t_ignore.
      return None
  RE_SPC = re.compile(r'\s')
  # No longer used by t_HEXSTRING; kept so that code importing it keeps working.
  RE_HEX_PAIR = re.compile(r'[0-9a-fA-F]{2}|.')


  @lex.TOKEN(r'<[0-9A-Fa-f\s]*>')
  def t_HEXSTRING(t):
      # Decode a hexadecimal string such as <48 65 6C 6C 6F>.
      #
      # The token value becomes a str when the decoded bytes are pure ASCII,
      # otherwise the raw bytes object.
      digits = RE_SPC.sub('', t.value[1:-1])  # drop '<', '>' and whitespace
      if len(digits) % 2:
          # With an odd number of digits the missing final digit is taken to
          # be 0, so <901FA> means 90 1F A0 (not 90 1F 0A).
          digits += '0'
      token_bytes = bytes.fromhex(digits)
      try:
          t.value = token_bytes.decode('ascii')
      except UnicodeDecodeError:
          # should be kept as bytes
          t.value = token_bytes
      return t
  @lex.TOKEN(r'(\-|\+)?[0-9]+' + delimiter_end)
  def t_INT(t):
      # An optionally signed run of decimal digits that is followed by a
      # delimiter or the end of input. The matched text is converted to a
      # Python int and the token is emitted.
      number = int(t.value)
      t.value = number
      return t
  @lex.TOKEN(
      r'(\-|\+)?([0-9]+\.|[0-9]*\.[0-9]+|[0-9]+\.[0-9]*)((e|E)(\-|\+)?[0-9]+)?'
      + delimiter_end
  )
  def t_FLOAT(t):
      # An optionally signed decimal number containing a period, with an
      # optional exponent, followed by a delimiter or the end of input.
      #
      # The exponent may carry its own sign (e.g. 1.0E-5 or 2.5e+3), as in
      # PostScript real-number syntax; without that, such numbers would not
      # match this rule and would be lexed as keywords instead.
      t.value = float(t.value)
      return t
  # Kept for backward compatibility with code that imports it.
  RE_LITERAL_HEX = re.compile(r'#[0-9A-Fa-f]{2}')
  # A '#' together with the two hex digits it introduces, when they are present.
  _RE_LITERAL_HASH = re.compile(r'#([0-9A-Fa-f]{2})?')


  @lex.TOKEN(r'/.+?' + delimiter_end)
  def t_LITERAL(t):
      # A literal name: '/' followed by characters up to the next delimiter.
      # The token value is the name without the leading '/', with '#xx' hex
      # escapes decoded.

      def decode_hash(m):
          # '#xx' -> the character with that code; a lone '#' is dropped.
          hex_digits = m.group(1)
          if hex_digits is None:
              return ''
          return bytes.fromhex(hex_digits).decode('latin-1')

      # Decoding and dropping happen in a single pass so that a '#' character
      # produced by decoding '#23' is kept. Stripping every '#' afterwards
      # would delete it (/A#23B must yield 'A#B', not 'AB').
      t.value = _RE_LITERAL_HASH.sub(decode_hash, t.value[1:])
      return t
  def t_OPERATOR(t):
      r'{|}|<<|>>|\[|\]'
      # Structural punctuation: braces, double angle brackets and square
      # brackets. The matched text is passed through unchanged as the value.
      return t
  # Catch-all rule: the shortest run of characters that is followed by a
  # delimiter or the end of input.
  t_KEYWORD = r'.+?%s' % delimiter_end
  def t_instring(t):
      r'\('
      # An opening paren starts a string: reset the per-string bookkeeping
      # kept on the lexer object and switch to the 'instring' state. Nothing
      # is returned, so the paren itself produces no token.
      lx = t.lexer
      lx.level = 1                      # paren nesting depth
      lx.string_startpos = t.lexpos     # where the string began
      lx.value_buffer = []              # decoded pieces of the string so far
      lx.begin('instring')
  # Parenthesis handling inside strings. A string may contain both escaped and
  # unescaped parens. An escaped paren is ordinary text: we drop the backslash
  # and add the paren to the string. Unescaped parens have to be counted, so
  # that we know which closing paren really ends the string. Expressing this
  # as a single regular expression is messy, so instead, whenever we hit a
  # paren, we look at whether the previously buffered chunk ends with a
  # backslash. If it does, the paren is escaped and we don't do paren
  # balancing for it.
  def t_instring_lparen(t):
      r'\('
      lx = t.lexer
      chunks = lx.value_buffer
      if chunks and chunks[-1].endswith('\\'):
          # Escaped paren: remove the backslash, nesting depth is unchanged.
          chunks[-1] = chunks[-1][:-1]
      else:
          lx.level += 1
      chunks.append('(')
  def t_instring_rparen(t):
      r'\)'
      # A closing paren inside a string. It is either escaped (plain text),
      # closes a nested paren (plain text, depth decreases), or closes the
      # string itself, in which case the STRING token is emitted.
      lx = t.lexer
      chunks = lx.value_buffer
      if chunks and chunks[-1].endswith('\\'):
          # Escaped paren: remove the backslash, nesting depth is unchanged.
          chunks[-1] = chunks[-1][:-1]
      else:
          lx.level -= 1

      if lx.level != 0:
          # Still inside the string: the paren is part of its text.
          chunks.append(')')
          return None

      # Depth reached zero: the string is complete.
      joined = ''.join(chunks)
      t.value = joined
      if max(map(ord, joined), default=0) > 0x7f:
          # Non-ASCII content is handed over as bytes instead of str.
          t.value = joined.encode('latin-1')
      t.type = "STRING"
      t.lexpos = lx.string_startpos  # report the position of the opening paren
      lx.begin('INITIAL')
      return t
  RE_STRING_ESCAPE = re.compile(r'\\[btnfr\\]')
  RE_STRING_OCTAL = re.compile(r'\\[0-7]{1,3}')
  # Backslash followed by a line ending. CRLF is listed first so that a
  # backslash-CRLF continuation is removed whole instead of leaving the LF.
  RE_STRING_LINE_CONT = re.compile(r'\\\r\n|\\\n|\\\r')
  # The three kinds of escape combined, so a string chunk can be decoded in a
  # single left-to-right pass.
  _RE_STRING_ANY_ESCAPE = re.compile('|'.join(
      regex.pattern
      for regex in (RE_STRING_ESCAPE, RE_STRING_OCTAL, RE_STRING_LINE_CONT)
  ))
  ESC_STRING = {'b': '\b', 't': '\t', 'n': '\n', 'f': '\f', 'r': '\r', '\\': '\\'}


  def repl_string_escape(m):
      # Map a backslash + letter escape (or an escaped backslash) to its character.
      return ESC_STRING[m.group(0)[1]]


  def repl_string_octal(m):
      # Map a backslash + 1-3 octal digits escape to the character with that code.
      i = int(m.group(0)[1:], 8)
      if i <= 0xff:  # 0xff (\377) is the largest value that fits in one byte
          return chr(i)
      # Larger values cannot be encoded as a single byte; leave the text as is.
      return m.group(0)


  def _repl_string_any_escape(m):
      # Dispatch one match of _RE_STRING_ANY_ESCAPE to the right decoder.
      marker = m.group(0)[1]
      if marker in ESC_STRING:
          return repl_string_escape(m)
      if marker in '01234567':
          return repl_string_octal(m)
      return ''  # backslash + line ending: line continuation, removed


  def t_instring_contents(t):
      r'[^()]+'
      # A run of string text between parens: decode its escape sequences and
      # append the result to the lexer's value buffer. No token is produced.
      raw = t.value
      # Decode in a single pass. Running the substitutions one after another
      # would re-interpret the output of an earlier one: an escaped backslash
      # followed by "101" would first become a backslash and then be read as
      # the octal escape for 'A'.
      s = _RE_STRING_ANY_ESCAPE.sub(_repl_string_any_escape, raw)
      chunks = t.lexer.value_buffer
      chunks.append(s)
      # The paren rules treat a chunk ending in a backslash as "the next paren
      # is escaped". That is only right when the raw text ends in an odd
      # number of backslashes (the last one is unpaired). When the trailing
      # backslash is a literal one (decoded from an escaped backslash or from
      # an octal escape), append an empty chunk so the paren rules do not see
      # it; the joined string is unaffected.
      trailing_backslashes = len(raw) - len(raw.rstrip('\\'))
      if s.endswith('\\') and trailing_backslashes % 2 == 0:
          chunks.append('')
  # Characters skipped between tokens outside of strings: space, tab and line
  # endings.
  t_ignore = ' \t\r\n'
  # Inside a string nothing is skipped.
  t_instring_ignore = ''
  # Error handling rule
  def t_error(t):
      # Report the character at the start of the unmatched input, then skip
      # exactly that one character so lexing can continue after it.
      bad_char = t.value[0]
      print("Illegal character '%r'" % bad_char)
      t.lexer.skip(1)
  # The 'instring' state reports errors the same way as the default state.
  t_instring_error = t_error

  # Build the lexer from the rules defined above in this module.
  lexer = lex.lex()