You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

287 lines
8.5 KiB

4 years ago
  1. import re
  2. import logging
  3. from .utils import choplist
  4. from . import pslexer
# Module-wide default strictness used by handle_error(): a true value makes
# parse problems raise, a false value only logs them as warnings.
STRICT = False
  6. ## PS Exceptions
  7. ##
  8. class PSException(Exception): pass
  9. class PSEOF(PSException): pass
  10. class PSSyntaxError(PSException): pass
  11. class PSTypeError(PSException): pass
  12. class PSValueError(PSException): pass
  13. def handle_error(exctype, msg, strict=STRICT):
  14. if strict:
  15. raise exctype(msg)
  16. else:
  17. logging.warning(msg)
  18. ## Basic PostScript Types
  19. ##
  20. class PSObject:
  21. """Base class for all PS or PDF-related data types."""
  22. class PSLiteral(PSObject):
  23. """A class that represents a PostScript literal.
  24. Postscript literals are used as identifiers, such as
  25. variable names, property names and dictionary keys.
  26. Literals are case sensitive and denoted by a preceding
  27. slash sign (e.g. "/Name")
  28. Note: Do not create an instance of PSLiteral directly.
  29. Always use PSLiteralTable.intern().
  30. """
  31. def __init__(self, name):
  32. self.name = name
  33. def __repr__(self):
  34. return '/%s' % self.name
  35. class PSKeyword(PSObject):
  36. """A class that represents a PostScript keyword.
  37. PostScript keywords are a dozen of predefined words.
  38. Commands and directives in PostScript are expressed by keywords.
  39. They are also used to denote the content boundaries.
  40. Note: Do not create an instance of PSKeyword directly.
  41. Always use PSKeywordTable.intern().
  42. """
  43. def __init__(self, name):
  44. self.name = name
  45. def __repr__(self):
  46. return self.name
  47. class PSSymbolTable:
  48. """A utility class for storing PSLiteral/PSKeyword objects.
  49. Interned objects can be checked its identity with "is" operator.
  50. """
  51. def __init__(self, klass):
  52. self.dict = {}
  53. self.klass = klass
  54. def intern(self, name):
  55. if name in self.dict:
  56. lit = self.dict[name]
  57. else:
  58. lit = self.klass(name)
  59. self.dict[name] = lit
  60. return lit
  61. PSLiteralTable = PSSymbolTable(PSLiteral)
  62. PSKeywordTable = PSSymbolTable(PSKeyword)
  63. LIT = PSLiteralTable.intern
  64. KWD = PSKeywordTable.intern
  65. KEYWORD_PROC_BEGIN = KWD('{')
  66. KEYWORD_PROC_END = KWD('}')
  67. KEYWORD_ARRAY_BEGIN = KWD('[')
  68. KEYWORD_ARRAY_END = KWD(']')
  69. KEYWORD_DICT_BEGIN = KWD('<<')
  70. KEYWORD_DICT_END = KWD('>>')
  71. def literal_name(x):
  72. if not isinstance(x, PSLiteral):
  73. handle_error(PSTypeError, 'Literal required: %r' % x)
  74. return str(x)
  75. return x.name
  76. def keyword_name(x):
  77. if not isinstance(x, PSKeyword):
  78. handle_error(PSTypeError, 'Keyword required: %r' % x)
  79. return str(x)
  80. return x.name
  81. ## About PSParser, bytes and strings and all that
  82. ##
  83. ## Most of the contents (well, maybe not in size, but in "parsing effort") of a PDF file is text,
  84. ## but in some cases, namely streams, there's binary data involved. What we do is that we read the
  85. ## data as latin-1. When binary data is encountered, we have to re-encode it as latin-1 as well.
  86. ## About reading all data at once
  87. ## There used to be a buffering mechanism in place, but it made everything rather complicated and
  88. ## all these string buffering operations, especially with the ply lexer, ended up being rather slow.
  89. ## We read the whole thing in memory now. Sure, some PDFs are rather large, but computers today
  90. ## have lots of memory. At first, I wanted to use a mmap, but these are binary and making them work
  91. ## with the ply lexer was very complicated. Maybe one day.
  92. EOL = re.compile(r'\r\n|\r|\n', re.MULTILINE)
  93. class PSBaseParser:
  94. """Most basic PostScript parser that performs only tokenization.
  95. """
  96. def __init__(self, fp):
  97. data = fp.read()
  98. if isinstance(data, bytes):
  99. data = data.decode('latin-1')
  100. self.data = data
  101. self.lex = pslexer.lexer.clone()
  102. self.lex.input(data)
  103. def _convert_token(self, token):
  104. # converts `token` which comes from pslexer to a normal token.
  105. if token.type in {'KEYWORD', 'OPERATOR'}:
  106. if token.value == 'true':
  107. return True
  108. elif token.value == 'false':
  109. return False
  110. else:
  111. return KWD(token.value)
  112. elif token.type == 'LITERAL':
  113. return LIT(token.value)
  114. else:
  115. return token.value
  116. def flush(self):
  117. pass
  118. def close(self):
  119. self.flush()
  120. del self.lex
  121. del self.data
  122. def setpos(self, newpos):
  123. if newpos >= self.lex.lexlen:
  124. raise PSEOF()
  125. self.lex.lexpos = newpos
  126. def nextline(self):
  127. m = EOL.search(self.data, pos=self.lex.lexpos)
  128. if m is None:
  129. raise PSEOF()
  130. start = self.lex.lexpos
  131. s = self.data[start:m.end()]
  132. self.lex.lexpos = m.end()
  133. return (start, s)
  134. def nexttoken(self):
  135. token = self.lex.token()
  136. if token is None:
  137. raise PSEOF()
  138. tokenpos = token.lexpos
  139. return (tokenpos, self._convert_token(token))
  140. class PSStackParser(PSBaseParser):
  141. def __init__(self, fp):
  142. PSBaseParser.__init__(self, fp)
  143. self.reset()
  144. def reset(self):
  145. self.context = []
  146. self.curtype = None
  147. self.curstack = []
  148. self.results = []
  149. def setpos(self, newpos):
  150. PSBaseParser.setpos(self, newpos)
  151. self.reset()
  152. def push(self, *objs):
  153. self.curstack.extend(objs)
  154. def pop(self, n):
  155. objs = self.curstack[-n:]
  156. self.curstack[-n:] = []
  157. return objs
  158. def popall(self):
  159. objs = self.curstack
  160. self.curstack = []
  161. return objs
  162. def add_results(self, *objs):
  163. # logging.debug('add_results: %r', objs)
  164. self.results.extend(objs)
  165. def start_type(self, pos, type):
  166. self.context.append((pos, self.curtype, self.curstack))
  167. (self.curtype, self.curstack) = (type, [])
  168. # logging.debug('start_type: pos=%r, type=%r', pos, type)
  169. def end_type(self, type):
  170. if self.curtype != type:
  171. raise PSTypeError('Type mismatch: %r != %r' % (self.curtype, type))
  172. objs = [ obj for (_,obj) in self.curstack ]
  173. (pos, self.curtype, self.curstack) = self.context.pop()
  174. # logging.debug('end_type: pos=%r, type=%r, objs=%r', pos, type, objs)
  175. return (pos, objs)
  176. def do_keyword(self, pos, token):
  177. pass
  178. def nextobject(self):
  179. """Yields a list of objects.
  180. Returns keywords, literals, strings, numbers, arrays and dictionaries.
  181. Arrays and dictionaries are represented as Python lists and dictionaries.
  182. """
  183. while not self.results:
  184. (pos, token) = self.nexttoken()
  185. #print (pos,token), (self.curtype, self.curstack)
  186. if isinstance(token, (int, float, bool, str, bytes, PSLiteral)):
  187. # normal token
  188. self.push((pos, token))
  189. elif token == KEYWORD_ARRAY_BEGIN:
  190. # begin array
  191. self.start_type(pos, 'a')
  192. elif token == KEYWORD_ARRAY_END:
  193. # end array
  194. try:
  195. self.push(self.end_type('a'))
  196. except PSTypeError as e:
  197. handle_error(type(e), str(e))
  198. elif token == KEYWORD_DICT_BEGIN:
  199. # begin dictionary
  200. self.start_type(pos, 'd')
  201. elif token == KEYWORD_DICT_END:
  202. # end dictionary
  203. try:
  204. (pos, objs) = self.end_type('d')
  205. if len(objs) % 2 != 0:
  206. handle_error(PSSyntaxError, 'Invalid dictionary construct: %r' % objs)
  207. # construct a Python dictionary.
  208. d = dict( (literal_name(k), v) for (k,v) in choplist(2, objs) if v is not None )
  209. self.push((pos, d))
  210. except PSTypeError as e:
  211. handle_error(type(e), str(e))
  212. elif token == KEYWORD_PROC_BEGIN:
  213. # begin proc
  214. self.start_type(pos, 'p')
  215. elif token == KEYWORD_PROC_END:
  216. # end proc
  217. try:
  218. self.push(self.end_type('p'))
  219. except PSTypeError as e:
  220. handle_error(type(e), str(e))
  221. else:
  222. logging.debug('do_keyword: pos=%r, token=%r, stack=%r', pos, token, self.curstack)
  223. self.do_keyword(pos, token)
  224. if self.context:
  225. continue
  226. else:
  227. self.flush()
  228. obj = self.results.pop(0)
  229. logging.debug('nextobject: %r', obj)
  230. return obj