import re
|
|
import logging
|
|
|
|
from .utils import choplist
|
|
from . import pslexer
|
|
|
|
STRICT = False
|
|
|
|
|
|
## PS Exceptions
|
|
##
|
|
class PSException(Exception):
    """Root of the exception hierarchy for PostScript parsing errors."""
|
|
class PSEOF(PSException):
    """Raised when the parser runs past the end of the input data."""
|
|
class PSSyntaxError(PSException):
    """Raised (in strict mode) on malformed PostScript constructs."""
|
|
class PSTypeError(PSException):
    """Raised (in strict mode) when an object has an unexpected PS type."""
|
|
class PSValueError(PSException):
    """Raised when a PostScript value is invalid for its context."""
|
|
|
|
def handle_error(exctype, msg, strict=None):
    """Report a parsing problem: raise in strict mode, log otherwise.

    :param exctype: exception class to raise when strict.
    :param msg: message for the exception or the log record.
    :param strict: overrides the module-level STRICT flag; when None the
        current value of STRICT is consulted at call time.  (The previous
        ``strict=STRICT`` default froze the flag's value at import time,
        so toggling STRICT afterwards had no effect.)
    """
    if strict is None:
        strict = STRICT
    if strict:
        raise exctype(msg)
    logging.warning(msg)
|
|
|
|
## Basic PostScript Types
|
|
##
|
|
|
|
class PSObject:

    """Common ancestor of every PS/PDF-related data type in this module."""
|
|
|
|
|
|
class PSLiteral(PSObject):

    """A PostScript literal name.

    Literals act as identifiers: variable names, property names and
    dictionary keys.  They are case sensitive and written with a leading
    slash in source form (e.g. "/Name").

    Note: never construct a PSLiteral yourself; always obtain one via
    PSLiteralTable.intern() so identity comparison remains valid.
    """

    def __init__(self, name):
        self.name = name

    def __repr__(self):
        return '/' + self.name
|
|
|
|
|
|
class PSKeyword(PSObject):

    """A PostScript keyword.

    Keywords form the small, predefined vocabulary through which
    PostScript expresses commands and directives; they also mark
    content boundaries.

    Note: never construct a PSKeyword yourself; always obtain one via
    PSKeywordTable.intern() so identity comparison remains valid.
    """

    def __init__(self, name):
        self.name = name

    def __repr__(self):
        return self.name
|
|
|
|
|
|
class PSSymbolTable:

    """Interning table for PSLiteral/PSKeyword objects.

    Each name maps to exactly one instance, so interned objects can be
    compared for identity with the "is" operator.
    """

    def __init__(self, klass):
        # name -> canonical instance of klass
        self.dict = {}
        self.klass = klass

    def intern(self, name):
        """Return the canonical instance for *name*, creating it on first use.

        Uses a single EAFP dict lookup instead of the previous
        ``in``-then-index pair (two hash lookups on every hit).
        """
        try:
            return self.dict[name]
        except KeyError:
            lit = self.klass(name)
            self.dict[name] = lit
            return lit
|
|
|
|
# Global interning tables: every literal/keyword name maps to exactly one
# object, which lets parsed tokens be compared by identity.
PSLiteralTable = PSSymbolTable(PSLiteral)
PSKeywordTable = PSSymbolTable(PSKeyword)
# Shorthand constructors used throughout this module.
LIT = PSLiteralTable.intern
KWD = PSKeywordTable.intern
# Pre-interned keywords for the structural delimiters that
# PSStackParser.nextobject() recognizes.
KEYWORD_PROC_BEGIN = KWD('{')
KEYWORD_PROC_END = KWD('}')
KEYWORD_ARRAY_BEGIN = KWD('[')
KEYWORD_ARRAY_END = KWD(']')
KEYWORD_DICT_BEGIN = KWD('<<')
KEYWORD_DICT_END = KWD('>>')
|
|
|
|
|
|
def literal_name(x):
    """Return the name of PSLiteral *x*.

    Non-literals trigger handle_error (raising in strict mode) and
    otherwise degrade to str(x).
    """
    if isinstance(x, PSLiteral):
        return x.name
    handle_error(PSTypeError, 'Literal required: %r' % x)
    return str(x)
|
|
|
|
def keyword_name(x):
    """Return the name of PSKeyword *x*.

    Non-keywords trigger handle_error (raising in strict mode) and
    otherwise degrade to str(x).
    """
    if isinstance(x, PSKeyword):
        return x.name
    handle_error(PSTypeError, 'Keyword required: %r' % x)
    return str(x)
|
|
|
|
|
|
## About PSParser, bytes and strings and all that
|
|
##
|
|
## Most of the contents (well, maybe not in size, but in "parsing effort") of a PDF file is text,
|
|
## but in some cases, namely streams, there's binary data involved. What we do is that we read the
|
|
## data as latin-1. When binary data is encountered, we have to re-encode it as latin-1 as well.
|
|
|
|
## About reading all data at once
|
|
## There used to be a buffering mechanism in place, but it made everything rather complicated and
|
|
## all these string buffering operations, especially with the ply lexer, ended up being rather slow.
|
|
## We read the whole thing in memory now. Sure, some PDFs are rather large, but computers today
|
|
## have lots of memory. At first, I wanted to use a mmap, but these are binary and making them work
|
|
## with the ply lexer was very complicated. Maybe one day.
|
|
|
|
# Matches one line terminator of any convention: CRLF, bare CR, or bare LF.
EOL = re.compile(r'\r\n|\r|\n', re.MULTILINE)
|
|
class PSBaseParser:

    """Most basic PostScript parser that performs only tokenization."""

    def __init__(self, fp):
        # Read everything up front; binary input is decoded as latin-1 so
        # every byte value survives the round-trip through str.
        raw = fp.read()
        if isinstance(raw, bytes):
            raw = raw.decode('latin-1')
        self.data = raw
        self.lex = pslexer.lexer.clone()
        self.lex.input(raw)

    def _convert_token(self, token):
        """Translate a raw pslexer token into the value this parser yields."""
        if token.type in ('KEYWORD', 'OPERATOR'):
            value = token.value
            if value == 'true':
                return True
            if value == 'false':
                return False
            return KWD(value)
        if token.type == 'LITERAL':
            return LIT(token.value)
        return token.value

    def flush(self):
        # Hook for subclasses; the base tokenizer holds no pending state.
        pass

    def close(self):
        self.flush()
        del self.lex
        del self.data

    def setpos(self, newpos):
        """Reposition the lexer at *newpos*; raises PSEOF past the end."""
        if newpos >= self.lex.lexlen:
            raise PSEOF()
        self.lex.lexpos = newpos

    def nextline(self):
        """Consume one line; return (start_pos, text including terminator)."""
        begin = self.lex.lexpos
        match = EOL.search(self.data, pos=begin)
        if match is None:
            raise PSEOF()
        end = match.end()
        self.lex.lexpos = end
        return (begin, self.data[begin:end])

    def nexttoken(self):
        """Return (position, converted token); raises PSEOF when exhausted."""
        raw = self.lex.token()
        if raw is None:
            raise PSEOF()
        return (raw.lexpos, self._convert_token(raw))
|
|
|
|
|
|
class PSStackParser(PSBaseParser):

    """Builds composite objects (arrays, dicts, procedures) from the token
    stream produced by PSBaseParser, using an explicit container stack.
    """

    def __init__(self, fp):
        PSBaseParser.__init__(self, fp)
        self.reset()

    def reset(self):
        """Discard any partially-built containers and pending results."""
        self.context = []    # stack of (pos, type, stack) for open containers
        self.curtype = None  # type char ('a'/'d'/'p') of the innermost open container
        self.curstack = []   # (pos, obj) pairs collected at the current nesting level
        self.results = []    # completed objects waiting to be returned by nextobject()

    def setpos(self, newpos):
        # Seeking invalidates any partially parsed structure, so reset.
        PSBaseParser.setpos(self, newpos)
        self.reset()

    def push(self, *objs):
        self.curstack.extend(objs)

    def pop(self, n):
        # Remove and return the topmost n entries of the current stack.
        objs = self.curstack[-n:]
        self.curstack[-n:] = []
        return objs

    def popall(self):
        # Remove and return every entry of the current stack.
        objs = self.curstack
        self.curstack = []
        return objs

    def add_results(self, *objs):
        # logging.debug('add_results: %r', objs)
        self.results.extend(objs)

    def start_type(self, pos, type):
        """Open a new container of the given type char; save the outer level."""
        self.context.append((pos, self.curtype, self.curstack))
        (self.curtype, self.curstack) = (type, [])
        # logging.debug('start_type: pos=%r, type=%r', pos, type)

    def end_type(self, type):
        """Close the innermost container, verifying its type char matches.

        Returns (pos, objs) where pos is where the container was opened and
        objs are its collected contents (positions stripped).
        Raises PSTypeError on mismatched delimiters.
        """
        if self.curtype != type:
            raise PSTypeError('Type mismatch: %r != %r' % (self.curtype, type))
        objs = [ obj for (_,obj) in self.curstack ]
        (pos, self.curtype, self.curstack) = self.context.pop()
        # logging.debug('end_type: pos=%r, type=%r, objs=%r', pos, type, objs)
        return (pos, objs)

    def do_keyword(self, pos, token):
        # Hook for subclasses to handle keywords other than the structural
        # delimiters consumed by nextobject().
        pass

    def nextobject(self):
        """Yields a list of objects.

        Returns keywords, literals, strings, numbers, arrays and dictionaries.
        Arrays and dictionaries are represented as Python lists and dictionaries.
        """
        # Keep consuming tokens until at least one complete top-level object
        # has been flushed into self.results.
        while not self.results:
            (pos, token) = self.nexttoken()
            #print (pos,token), (self.curtype, self.curstack)
            if isinstance(token, (int, float, bool, str, bytes, PSLiteral)):
                # normal token
                self.push((pos, token))
            elif token == KEYWORD_ARRAY_BEGIN:
                # begin array
                self.start_type(pos, 'a')
            elif token == KEYWORD_ARRAY_END:
                # end array
                try:
                    self.push(self.end_type('a'))
                except PSTypeError as e:
                    handle_error(type(e), str(e))
            elif token == KEYWORD_DICT_BEGIN:
                # begin dictionary
                self.start_type(pos, 'd')
            elif token == KEYWORD_DICT_END:
                # end dictionary
                try:
                    (pos, objs) = self.end_type('d')
                    # An odd number of entries means a key without a value.
                    if len(objs) % 2 != 0:
                        handle_error(PSSyntaxError, 'Invalid dictionary construct: %r' % objs)
                    # construct a Python dictionary.
                    d = dict( (literal_name(k), v) for (k,v) in choplist(2, objs) if v is not None )
                    self.push((pos, d))
                except PSTypeError as e:
                    handle_error(type(e), str(e))
            elif token == KEYWORD_PROC_BEGIN:
                # begin proc
                self.start_type(pos, 'p')
            elif token == KEYWORD_PROC_END:
                # end proc
                try:
                    self.push(self.end_type('p'))
                except PSTypeError as e:
                    handle_error(type(e), str(e))
            else:
                logging.debug('do_keyword: pos=%r, token=%r, stack=%r', pos, token, self.curstack)
                self.do_keyword(pos, token)
            # Only flush to results when no container is open, i.e. the
            # stack holds complete top-level objects.
            if self.context:
                continue
            else:
                self.flush()
        obj = self.results.pop(0)
        logging.debug('nextobject: %r', obj)
        return obj
|