basabuuka_prototyp/venv/lib/python3.5/site-packages/pdfminer/psparser.py

288 lines
8.5 KiB
Python
Raw Normal View History

2020-08-16 19:36:44 +02:00
import re
import logging
from .utils import choplist
from . import pslexer
# Default for handle_error()'s `strict` parameter: False means problems are logged, not raised.
STRICT = False
## PS Exceptions
##
class PSException(Exception):
    """Root of the PostScript parser's exception hierarchy."""


class PSEOF(PSException):
    """Raised when the end of the input data has been reached."""


class PSSyntaxError(PSException):
    """Signals a malformed PostScript construct."""


class PSTypeError(PSException):
    """Signals an object of an unexpected type."""


class PSValueError(PSException):
    """Error category for invalid values."""
def handle_error(exctype, msg, strict=STRICT):
    """Report a parsing problem, either by raising or by logging.

    exctype -- exception class to instantiate when raising.
    msg     -- message given to the exception or to the log.
    strict  -- when true, raise exctype(msg); otherwise emit msg through
               logging.warning() and return normally.
    """
    if not strict:
        logging.warning(msg)
        return
    raise exctype(msg)
## Basic PostScript Types
##
class PSObject:
    """Common ancestor of every PostScript- or PDF-related data type."""
class PSLiteral(PSObject):
    """A PostScript literal.

    Literals act as identifiers: variable names, property names and
    dictionary keys.  They are case sensitive and are written with a
    leading slash sign (e.g. "/Name").

    Note: never instantiate PSLiteral directly.  Obtain instances through
    PSLiteralTable.intern() instead.
    """

    def __init__(self, name):
        # Keep the name exactly as supplied by the caller.
        self.name = name

    def __repr__(self):
        # Rendered with the slash prefix used in PostScript source.
        text = self.name
        return '/%s' % text
class PSKeyword(PSObject):
    """A PostScript keyword.

    Keywords are a small set of predefined words.  They express the
    commands and directives of PostScript and also mark content
    boundaries.

    Note: never instantiate PSKeyword directly.  Obtain instances through
    PSKeywordTable.intern() instead.
    """

    def __init__(self, name):
        # Keep the name exactly as supplied by the caller.
        self.name = name

    def __repr__(self):
        # A keyword is rendered as its bare name.
        text = self.name
        return text
class PSSymbolTable:
    """Intern table for PSLiteral/PSKeyword objects.

    Each distinct name maps to exactly one object, so interned objects
    can be compared for identity with the "is" operator.
    """

    def __init__(self, klass):
        # klass is called with a name to build the object for a new entry.
        self.klass = klass
        self.dict = {}

    def intern(self, name):
        """Return the object registered for name, creating it on first use."""
        try:
            return self.dict[name]
        except KeyError:
            pass
        # First time this name is seen: build the object and remember it.
        symbol = self.klass(name)
        self.dict[name] = symbol
        return symbol
# One intern table per object kind, so that each name maps to a single shared instance.
PSLiteralTable = PSSymbolTable(PSLiteral)
PSKeywordTable = PSSymbolTable(PSKeyword)
# Short aliases for interning literals and keywords.
LIT = PSLiteralTable.intern
KWD = PSKeywordTable.intern
# Pre-interned keywords for the procedure, array and dictionary delimiters.
KEYWORD_PROC_BEGIN = KWD('{')
KEYWORD_PROC_END = KWD('}')
KEYWORD_ARRAY_BEGIN = KWD('[')
KEYWORD_ARRAY_END = KWD(']')
KEYWORD_DICT_BEGIN = KWD('<<')
KEYWORD_DICT_END = KWD('>>')
def literal_name(x):
    """Return the name of the PSLiteral x.

    For any other object the problem is reported through handle_error()
    as a PSTypeError; if that call returns, str(x) is returned instead.
    """
    if isinstance(x, PSLiteral):
        return x.name
    handle_error(PSTypeError, 'Literal required: %r' % x)
    return str(x)
def keyword_name(x):
    """Return the name of the PSKeyword x.

    For any other object the problem is reported through handle_error()
    as a PSTypeError; if that call returns, str(x) is returned instead.
    """
    if isinstance(x, PSKeyword):
        return x.name
    handle_error(PSTypeError, 'Keyword required: %r' % x)
    return str(x)
## About PSParser, bytes and strings and all that
##
## Most of the contents (well, maybe not in size, but in "parsing effort") of a PDF file is text,
## but in some cases, namely streams, there's binary data involved. What we do is that we read the
## data as latin-1. When binary data is encountered, we have to re-encode it as latin-1 as well.
## About reading all data at once
## There used to be a buffering mechanism in place, but it made everything rather complicated and
## all these string buffering operations, especially with the ply lexer, ended up being rather slow.
## We read the whole thing in memory now. Sure, some PDFs are rather large, but computers today
## have lots of memory. At first, I wanted to use a mmap, but these are binary and making them work
## with the ply lexer was very complicated. Maybe one day.
# Matches a single line terminator: CRLF, a lone CR, or a lone LF (CRLF is tried first).
EOL = re.compile(r'\r\n|\r|\n', re.MULTILINE)
class PSBaseParser:
    """Most basic PostScript parser that performs only tokenization.

    The whole input is read into memory when the parser is created and is
    handed to a private clone of the pslexer lexer.
    """

    def __init__(self, fp):
        """Read all of fp and prepare the lexer.

        fp -- object with a read() method.  A bytes result is decoded as
              latin-1; any other result is used unchanged.
        """
        data = fp.read()
        if isinstance(data, bytes):
            data = data.decode('latin-1')
        self.data = data
        self.lex = pslexer.lexer.clone()
        self.lex.input(data)

    def _convert_token(self, token):
        """Convert a pslexer token into the value handed to callers.

        KEYWORD/OPERATOR tokens become True or False for 'true'/'false'
        and an interned PSKeyword otherwise; LITERAL tokens become an
        interned PSLiteral; every other token yields its raw value.
        """
        if token.type in {'KEYWORD', 'OPERATOR'}:
            if token.value == 'true':
                return True
            if token.value == 'false':
                return False
            return KWD(token.value)
        if token.type == 'LITERAL':
            return LIT(token.value)
        return token.value

    def flush(self):
        """Hook invoked by close(); does nothing in this class."""
        pass

    def close(self):
        """Flush and drop the references to the lexer and the data.

        Calling close() again on an already closed parser is a no-op
        rather than an AttributeError.
        """
        if not hasattr(self, 'lex') and not hasattr(self, 'data'):
            # Already closed: nothing left to flush or release.
            return
        self.flush()
        for attr in ('lex', 'data'):
            try:
                delattr(self, attr)
            except AttributeError:
                pass

    def setpos(self, newpos):
        """Move the lexer to offset newpos.

        Raises PSEOF when newpos is at or beyond the end of the data.
        """
        if newpos >= self.lex.lexlen:
            raise PSEOF()
        self.lex.lexpos = newpos

    def nextline(self):
        """Return (start, line) for the text up to the next line terminator.

        The returned line includes its terminator and the lexer position
        is advanced past it.  Raises PSEOF when no terminator is found
        from the current position onwards.
        """
        m = EOL.search(self.data, pos=self.lex.lexpos)
        if m is None:
            raise PSEOF()
        start = self.lex.lexpos
        s = self.data[start:m.end()]
        self.lex.lexpos = m.end()
        return (start, s)

    def nexttoken(self):
        """Return (position, value) for the next token.

        Raises PSEOF when the lexer has no more tokens.
        """
        token = self.lex.token()
        if token is None:
            raise PSEOF()
        tokenpos = token.lexpos
        return (tokenpos, self._convert_token(token))
class PSStackParser(PSBaseParser):
    """Tokenizer with an operand stack that assembles composite objects.

    Plain tokens are collected on the current stack as (pos, value)
    pairs.  Arrays, dictionaries and procedures open a nested stack that
    is folded back into its parent when the closing delimiter is seen.
    Finished objects are handed out by nextobject() once they have been
    registered with add_results().
    """

    def __init__(self, fp):
        PSBaseParser.__init__(self, fp)
        self.reset()

    def reset(self):
        """Discard all parsing state: contexts, current stack and results."""
        self.context = []
        self.curtype = None
        self.curstack = []
        self.results = []

    def setpos(self, newpos):
        """Move to offset newpos and reset the parsing state."""
        PSBaseParser.setpos(self, newpos)
        self.reset()

    def push(self, *objs):
        """Append objs to the current stack."""
        self.curstack.extend(objs)

    def pop(self, n):
        """Remove and return the last n entries of the current stack.

        A non-positive n removes nothing and returns an empty list.
        """
        if n <= 0:
            # curstack[-0:] is the whole list, so without this guard
            # pop(0) would return and clear the entire stack.
            return []
        objs = self.curstack[-n:]
        self.curstack[-n:] = []
        return objs

    def popall(self):
        """Remove and return every entry of the current stack."""
        objs = self.curstack
        self.curstack = []
        return objs

    def add_results(self, *objs):
        """Queue objs to be returned by nextobject()."""
        self.results.extend(objs)

    def start_type(self, pos, type):
        """Open a nested container of the given type that starts at pos.

        The current type and stack are saved on the context list and a
        fresh, empty stack becomes current.
        """
        self.context.append((pos, self.curtype, self.curstack))
        (self.curtype, self.curstack) = (type, [])

    def end_type(self, type):
        """Close the innermost container and return (pos, objs).

        pos is the position recorded by start_type() and objs holds the
        collected values with their positions stripped.  Raises
        PSTypeError when the open container is not of the given type; the
        parsing state is left untouched in that case.
        """
        if self.curtype != type:
            raise PSTypeError('Type mismatch: %r != %r' % (self.curtype, type))
        objs = [obj for (_, obj) in self.curstack]
        (pos, self.curtype, self.curstack) = self.context.pop()
        return (pos, objs)

    def do_keyword(self, pos, token):
        """Hook called for every keyword that is not a delimiter.

        Does nothing in this class.
        """
        pass

    def nextobject(self):
        """Return the next queued result object.

        Tokens are consumed until at least one result is available.
        Numbers, booleans, strings and literals are pushed as
        (pos, token) pairs.  Arrays and procedures are pushed as
        (pos, list) pairs.  Dictionaries are pushed as (pos, dict) pairs
        keyed by literal name, with entries whose value is None left out.
        Any other keyword is passed to do_keyword().  After each token
        handled outside of any open container, flush() is called.

        Results only appear through add_results(), which this class never
        calls itself; do_keyword() and flush() are the hooks where that
        can happen.  PSEOF from nexttoken() propagates to the caller.
        """
        while not self.results:
            (pos, token) = self.nexttoken()
            if isinstance(token, (int, float, bool, str, bytes, PSLiteral)):
                # normal token
                self.push((pos, token))
            elif token == KEYWORD_ARRAY_BEGIN:
                # begin array
                self.start_type(pos, 'a')
            elif token == KEYWORD_ARRAY_END:
                # end array
                try:
                    self.push(self.end_type('a'))
                except PSTypeError as e:
                    handle_error(type(e), str(e))
            elif token == KEYWORD_DICT_BEGIN:
                # begin dictionary
                self.start_type(pos, 'd')
            elif token == KEYWORD_DICT_END:
                # end dictionary
                try:
                    (pos, objs) = self.end_type('d')
                    if len(objs) % 2 != 0:
                        handle_error(PSSyntaxError, 'Invalid dictionary construct: %r' % objs)
                    # construct a Python dictionary.
                    d = dict((literal_name(k), v) for (k, v) in choplist(2, objs) if v is not None)
                    self.push((pos, d))
                except PSTypeError as e:
                    handle_error(type(e), str(e))
            elif token == KEYWORD_PROC_BEGIN:
                # begin proc
                self.start_type(pos, 'p')
            elif token == KEYWORD_PROC_END:
                # end proc
                try:
                    self.push(self.end_type('p'))
                except PSTypeError as e:
                    handle_error(type(e), str(e))
            else:
                logging.debug('do_keyword: pos=%r, token=%r, stack=%r', pos, token, self.curstack)
                self.do_keyword(pos, token)
            if not self.context:
                # Only flush at top level, never inside an open container.
                self.flush()
        obj = self.results.pop(0)
        logging.debug('nextobject: %r', obj)
        return obj