basabuuka_prototyp/venv/lib/python3.5/site-packages/pdfminer/pslexer.py

138 lines
4 KiB
Python
Raw Normal View History

2020-08-16 19:36:44 +02:00
import re
import ply.lex as lex
# Besides ply's default state, the lexer declares one exclusive state,
# 'instring', which is active while a parenthesised string is being scanned.
states = (('instring', 'exclusive'),)

# Names of the token types declared to ply.
tokens = (
    'COMMENT',
    'HEXSTRING',
    'INT',
    'FLOAT',
    'LITERAL',
    'KEYWORD',
    'STRING',
    'OPERATOR',
)

# Body of a regex character class listing the characters that end a bare token.
delimiter = r'\(\)\<\>\[\]\{\}\/\%\s'
# Zero-width lookahead appended to token regexes: the next character must be
# a delimiter, or `$` must match.
delimiter_end = '(?=[' + delimiter + ']|$)'
def t_COMMENT(t):
    r'%.*\n'
    # The string above is this rule's regex (ply reads it from the docstring):
    # a '%' followed by everything up to and including the next newline.
    # Nothing is returned, so the matched comment produces no token.
    return None
# Any single whitespace character (stripped from hex strings before decoding).
RE_SPC = re.compile(r'\s')
# Two hex digits, or failing that any single leftover character.
RE_HEX_PAIR = re.compile(r'[0-9a-fA-F]{2}|.')


@lex.TOKEN(r'<[0-9A-Fa-f\s]*>')
def t_HEXSTRING(t):
    # Decode a <...> hex string into its character data.
    #
    # The angle brackets and all whitespace are dropped, then the digits are
    # consumed two at a time; a lone trailing digit is converted on its own
    # (so a final '5' becomes byte 0x05).
    # The token value is a str when every byte is ASCII, otherwise the raw
    # bytes object.
    hex_digits = RE_SPC.sub('', t.value[1:-1])
    raw = bytes(int(chunk, 16) for chunk in RE_HEX_PAIR.findall(hex_digits))
    try:
        t.value = raw.decode('ascii')
    except UnicodeDecodeError:
        # Not representable as ASCII text: hand the bytes through untouched.
        t.value = raw
    return t
@lex.TOKEN(r'(\-|\+)?[0-9]+' + delimiter_end)
def t_INT(t):
    # An optionally signed run of decimal digits that is followed by a
    # delimiter; the token value becomes the corresponding Python int.
    digits = t.value
    t.value = int(digits)
    return t
@lex.TOKEN(r'(\-|\+)?([0-9]+\.|[0-9]*\.[0-9]+|[0-9]+\.[0-9]*)((e|E)[0-9]+)?' + delimiter_end)
def t_FLOAT(t):
    # An optionally signed number containing a decimal point, with an optional
    # unsigned exponent, followed by a delimiter; the token value becomes the
    # corresponding Python float.
    number_text = t.value
    t.value = float(number_text)
    return t
# A '#' followed by exactly two hex digits (kept for any external users).
RE_LITERAL_HEX = re.compile(r'#[0-9A-Fa-f]{2}')
# A '#', capturing the two hex digits after it when they are present.
_RE_LITERAL_HASH = re.compile(r'#([0-9A-Fa-f]{2})?')


@lex.TOKEN(r'/.+?' + delimiter_end)
def t_LITERAL(t):
    # Turn a /Name literal into its name: the leading slash is dropped, every
    # '#xx' escape is replaced by the character with that hex code, and any
    # '#' that is not part of such an escape is removed.
    #
    # Escapes and lone hashes are handled in one pass. Decoding first and
    # stripping '#' afterwards would also strip a '#' that was itself produced
    # by decoding '#23'.
    def re_sub(m):
        hex_digits = m.group(1)
        if hex_digits is None:
            # Lone '#': not followed by two hex digits, so it is dropped.
            return ''
        # One byte, mapped 1:1 onto the first 256 code points.
        return bytes.fromhex(hex_digits).decode('latin-1')

    t.value = _RE_LITERAL_HASH.sub(re_sub, t.value[1:])
    return t
def t_OPERATOR(t):
    r'{|}|<<|>>|\[|\]'
    # The string above is this rule's regex: one of  {  }  <<  >>  [  ].
    # The token is returned unchanged, so its value is the matched text.
    return t
# Catch-all rule: the shortest non-empty run of characters that is followed by
# a delimiter (or where `$` matches). Per the ply documentation, string rules
# are tried after all function rules, so this only sees text the other rules
# did not claim.
t_KEYWORD = r'.+?' + delimiter_end
def t_instring(t):
    r'\('
    # An opening paren starts a string. Set up the per-string bookkeeping on
    # the lexer and switch it to the 'instring' state. No token is returned
    # here; the STRING token is produced when the matching ')' is found.
    lexer = t.lexer
    lexer.string_startpos = t.lexpos  # where the finished token will point
    lexer.level = 1                   # current paren nesting depth
    lexer.value_buffer = []           # pieces of the string collected so far
    lexer.begin('instring')
# The parens situation: it's complicated. We can have both escaped and unescaped parens.
# If they're escaped, there's nothing special: we unescape them and add them to the string.
# If they're not escaped, we have to count how many of them there are, to know when an rparen
# is the end of the string. A regular expression for this would be a mess, so what we do is:
# when we hit a paren, we check whether the previous buffer entry ends with a backslash.
# If it does, we don't do paren balancing.
def t_instring_lparen(t):
    r'\('
    # A '(' inside a string. If the previously buffered piece ends with a
    # backslash, the paren is escaped: drop that backslash and leave the
    # nesting depth alone. Otherwise the paren opens a nested level.
    # Either way the '(' itself is part of the string.
    pieces = t.lexer.value_buffer
    if pieces and pieces[-1].endswith('\\'):
        pieces[-1] = pieces[-1][:-1]
    else:
        t.lexer.level += 1
    pieces.append('(')
def t_instring_rparen(t):
    r'\)'
    # A ')' inside a string. An escaped paren (previous buffered piece ends
    # with a backslash) loses its backslash and does not affect nesting; an
    # unescaped one closes a level.
    lexer = t.lexer
    pieces = lexer.value_buffer
    if pieces and pieces[-1].endswith('\\'):
        pieces[-1] = pieces[-1][:-1]
    else:
        lexer.level -= 1

    if lexer.level != 0:
        # Still inside the string: the paren is ordinary content.
        pieces.append(')')
        return None

    # Depth reached zero: this paren ends the string. Emit one STRING token
    # positioned at the opening paren and go back to the default state.
    t.value = ''.join(pieces)
    if any(ch > '\x7f' for ch in t.value):
        # Non-ASCII content is delivered as bytes rather than str.
        t.value = t.value.encode('latin-1')
    t.type = "STRING"
    t.lexpos = lexer.string_startpos
    lexer.begin('INITIAL')
    return t
# Backslash sequences recognised inside a (...) string.
# A backslash followed by one of b, t, n, f, r or another backslash.
RE_STRING_ESCAPE = re.compile(r'\\[btnfr\\]')
# A backslash followed by one to three octal digits.
RE_STRING_OCTAL = re.compile(r'\\[0-7]{1,3}')
# A backslash followed by an end-of-line (line continuation). The
# two-character CRLF ending has to be tried before a lone CR: with CR first,
# the CR branch always wins and the LF of a CRLF pair is left in the string.
RE_STRING_LINE_CONT = re.compile(r'\\(?:\r\n|\n|\r)')
# Maps the character following a backslash to the character it stands for.
ESC_STRING = {
    'b': '\b',
    't': '\t',
    'n': '\n',
    'f': '\f',
    'r': '\r',
    '\\': '\\',
}


def repl_string_escape(m):
    """Return the replacement for one matched backslash escape.

    *m* is a regex match whose text is a backslash followed by the escape
    character; that second character is looked up in ``ESC_STRING``.
    """
    escape_char = m.group(0)[1]
    return ESC_STRING[escape_char]
def repl_string_octal(m):
    """Return the replacement for one matched octal escape.

    *m* is a regex match whose text is a backslash followed by octal digits.
    Codes 0 through 0xff are replaced by the character with that code. Larger
    codes cannot be encoded as a single latin-1 byte later on, so the escape
    text is returned unchanged.
    """
    code = int(m.group(0)[1:], 8)
    # 0xff itself is a valid single-byte value, hence the inclusive bound.
    if code <= 0xff:
        return chr(code)
    return m.group(0)
# One pattern covering every backslash sequence, so that each sequence is
# decoded exactly once in a single left-to-right scan.
_RE_STRING_ANY_ESCAPE = re.compile('|'.join(
    regex.pattern
    for regex in (RE_STRING_ESCAPE, RE_STRING_OCTAL, RE_STRING_LINE_CONT)
))


def _repl_string_any_escape(m):
    # Route one matched backslash sequence to the matching replacement.
    marker = m.group(0)[1]
    if marker in ESC_STRING:
        return repl_string_escape(m)
    if marker in '01234567':
        return repl_string_octal(m)
    # Backslash + end-of-line: a line continuation contributes nothing.
    return ''


def t_instring_contents(t):
    r'[^()]+'
    # A run of string content without parens: decode its backslash sequences
    # and append the result to the lexer's buffer. No token is returned.
    #
    # Decoding is one pass on purpose. Running the escape, octal and
    # line-continuation substitutions one after another lets a later pass
    # re-read the output of an earlier one: an escaped backslash followed by
    # digits ('\\101') would first become '\101' and then wrongly 'A'.
    decoded = _RE_STRING_ANY_ESCAPE.sub(_repl_string_any_escape, t.value)
    t.lexer.value_buffer.append(decoded)
# In the 'instring' state no characters are skipped between matches.
t_instring_ignore = ''
# In the default state spaces, tabs, carriage returns and newlines between
# tokens are skipped.
t_ignore = ' \t\r\n'
# Error handling rule
def t_error(t):
    # Called when no rule matches at the current position: report the
    # offending character on stdout, then step over it so lexing can go on.
    offending = t.value[0]
    message = "Illegal character '%r'" % offending
    print(message)
    t.lexer.skip(1)
# Errors inside a string are handled by the same function as errors elsewhere.
t_instring_error = t_error
# Build the lexer; lex.lex() is called without arguments here.
lexer = lex.lex()