You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

809 lines
29 KiB

4 years ago
  1. import io
  2. import re
  3. import struct
  4. import hashlib as md5
  5. import logging
  6. from .psparser import PSStackParser, PSSyntaxError, PSEOF, literal_name, LIT, KWD, handle_error
  7. from .pdftypes import (PDFException, PDFTypeError, PDFNotImplementedError, PDFStream, PDFObjRef,
  8. resolve1, decipher_all, int_value, str_value, list_value, dict_value, stream_value)
  9. from .arcfour import Arcfour
  10. from .utils import choplist, nunpack, decode_text, ObjIdRange
  11. logger = logging.getLogger(__name__)
  ## Exceptions
  ##
  class PDFSyntaxError(PDFException):
      """Raised when the PDF data does not have the expected structure."""


  class PDFNoValidXRef(PDFSyntaxError):
      """Raised when a cross-reference table or stream cannot be read."""


  class PDFNoOutlines(PDFException):
      """Raised when the document catalog has no /Outlines entry."""


  class PDFDestinationNotFound(PDFException):
      """Raised when a named destination cannot be found."""


  class PDFAlreadyParsed(PDFException):
      """Raised when a second whole-file parse is requested."""


  class PDFEncryptionError(PDFException):
      """Raised when the encryption parameters are not supported."""


  class PDFPasswordIncorrect(PDFEncryptionError):
      """Raised when the supplied password fails authentication."""
  # Predefined name literals; the code below compares against them with ``is``.
  (
      LITERAL_OBJSTM,
      LITERAL_XREF,
      LITERAL_PAGE,
      LITERAL_PAGES,
      LITERAL_CATALOG,
  ) = [LIT(name) for name in ('ObjStm', 'XRef', 'Page', 'Pages', 'Catalog')]
  class PDFBaseXRef:
      """Common interface of the cross-reference table implementations.

      The defaults describe a table that knows no objects at all; concrete
      tables override the three accessors below.
      """

      def get_trailer(self):
          """Return the trailer dictionary; subclasses must provide it."""
          raise NotImplementedError()

      def get_objids(self):
          """Return the object ids known to this table (none by default)."""
          return list()

      def get_pos(self, objid):
          """Return the location of ``objid``.

          Raises:
              KeyError: always here, because the base table holds no entries.
          """
          raise KeyError(objid)
  class PDFXRef(PDFBaseXRef):
      """Plain-text cross-reference table together with its trailer.

      Attributes:
          offsets: maps an object id to a ``(genno, pos)`` tuple.
          trailer: the trailer dictionary accumulated by ``load_trailer``.
      """

      KEYWORD_TRAILER = KWD('trailer')
      # Start of an indirect object ("<objid> <genno> obj"), used by the
      # fallback scan.
      PDFOBJ_CUE = re.compile(r'^(\d+)\s+(\d+)\s+obj\b')

      def __init__(self):
          self.offsets = {}
          self.trailer = {}

      def load(self, parser):
          """Read the xref subsections from ``parser``, then the trailer.

          Each subsection starts with a "<start> <count>" line followed by
          ``count`` entries of the form "<pos> <genno> <n|f>".  Only in-use
          ('n') entries are recorded in ``self.offsets``.

          Raises:
              PDFNoValidXRef: on EOF or on any line that does not have the
                  expected shape, including non-numeric offset/generation
                  fields, so that callers catching this exception can fall
                  back to another strategy.
          """
          while True:
              try:
                  (pos, line) = parser.nextline()
              except PSEOF:
                  raise PDFNoValidXRef('Unexpected EOF - file corrupted?')
              if not line.strip():
                  continue
              if line.startswith('trailer'):
                  # Rewind so that load_trailer() sees the keyword itself.
                  parser.setpos(pos)
                  break
              fields = line.strip().split(' ')
              if len(fields) != 2:
                  raise PDFNoValidXRef('Trailer not found: %r: line=%r' % (parser, line))
              try:
                  (start, nobjs) = [int(value) for value in fields]
              except ValueError:
                  raise PDFNoValidXRef('Invalid line: %r: line=%r' % (parser, line))
              for objid in range(start, start+nobjs):
                  try:
                      (_, line) = parser.nextline()
                  except PSEOF:
                      raise PDFNoValidXRef('Unexpected EOF - file corrupted?')
                  fields = line.strip().split(' ')
                  if len(fields) != 3:
                      raise PDFNoValidXRef('Invalid XRef format: %r, line=%r' % (parser, line))
                  (offset, genno, use) = fields
                  if use != 'n':
                      continue
                  try:
                      self.offsets[objid] = (int(genno), int(offset))
                  except ValueError:
                      # A bare ValueError would not be recognised as a broken
                      # xref by callers that catch PDFNoValidXRef.
                      raise PDFNoValidXRef('Invalid XRef format: %r, line=%r' % (parser, line))
          logger.debug('xref objects: %r', self.offsets)
          self.load_trailer(parser)

      def load_trailer(self, parser):
          """Read the ``trailer`` keyword and its dictionary from ``parser``.

          The dictionary is merged into ``self.trailer``.

          Raises:
              PDFNoValidXRef: if the next token is not the trailer keyword,
                  or if EOF is hit and the parser stack holds no object.
          """
          try:
              (_, kwd) = parser.nexttoken()
              if kwd is not self.KEYWORD_TRAILER:
                  # Explicit check instead of ``assert``: assertions are
                  # stripped when Python runs with -O.
                  raise PDFNoValidXRef('Trailer keyword expected: %r' % (kwd,))
              (_, dic) = parser.nextobject()
          except PSEOF:
              # EOF right after the dictionary: take what is on the stack.
              leftover = parser.pop(1)
              if not leftover:
                  raise PDFNoValidXRef('Unexpected EOF - file corrupted')
              (_, dic) = leftover[0]
          self.trailer.update(dict_value(dic))

      def load_fallback(self, parser, debug=0):
          """Rebuild the table by scanning the data for "N G obj" lines.

          Scanning starts at position 0 and stops at EOF or at the first
          line starting with ``trailer``, which is then loaded.  Entries are
          stored with generation 0.  ``debug`` is unused and kept only for
          interface compatibility.
          """
          parser.setpos(0)
          while True:
              try:
                  (pos, line) = parser.nextline()
              except PSEOF:
                  break
              if line.startswith('trailer'):
                  parser.setpos(pos)
                  self.load_trailer(parser)
                  logger.debug('trailer: %r', self.get_trailer())
                  break
              m = self.PDFOBJ_CUE.match(line)
              if not m:
                  continue
              (objid, _) = m.groups()
              self.offsets[int(objid)] = (0, pos)

      def get_trailer(self):
          """Return the trailer dictionary."""
          return self.trailer

      def get_objids(self):
          """Return an iterator over the object ids in this table."""
          return iter(self.offsets)

      def get_pos(self, objid):
          """Return ``(None, pos)`` for ``objid``.

          The first element is always None here; ``PDFDocument._getobj``
          treats a false first element as "read the object from the file".

          Raises:
              KeyError: if ``objid`` is not in this table.
          """
          (_, pos) = self.offsets[objid]
          return (None, pos)
  class PDFXRefStream(PDFBaseXRef):
      """Cross-reference information stored in a stream object.

      Attributes:
          data: the decoded stream data, a sequence of fixed-size entries.
          entlen: the size of one entry (``fl1 + fl2 + fl3``).
          fl1, fl2, fl3: the three field widths taken from the /W entry.
          objid_ranges: ObjIdRange objects built from the /Index entry.
          trailer: the stream's attribute dictionary.
      """

      def __init__(self):
          self.data = None
          self.entlen = None
          self.fl1 = self.fl2 = self.fl3 = None
          self.objid_ranges = []
          # Initialised here so that get_trailer() works before load().
          self.trailer = {}

      def __repr__(self):
          # %r rather than %d: the widths are None until load() has run.
          return '<PDFXRefStream: fields=%r,%r,%r>' % (self.fl1, self.fl2, self.fl3)

      def load(self, parser):
          """Read an xref stream object ("N G obj <stream>") from ``parser``.

          Raises:
              PDFNoValidXRef: if the object is not a stream of type /XRef.
              PDFSyntaxError: if /Index has an odd number of items.
          """
          parser.nexttoken()  # object id (ignored)
          parser.nexttoken()  # generation number (ignored)
          parser.nexttoken()  # keyword following them (ignored)
          (_, stream) = parser.nextobject()
          # .get() so that a missing /Type is reported as an invalid xref
          # instead of leaking a KeyError.
          if not isinstance(stream, PDFStream) or stream.get('Type') is not LITERAL_XREF:
              raise PDFNoValidXRef('Invalid PDF stream spec.')
          size = stream['Size']
          index_array = stream.get('Index', (0, size))
          if len(index_array) % 2 != 0:
              raise PDFSyntaxError('Invalid index number')
          self.objid_ranges.extend(
              ObjIdRange(start, nobjs)
              for (start, nobjs) in choplist(2, index_array))
          (self.fl1, self.fl2, self.fl3) = stream['W']
          self.data = stream.get_data()
          self.entlen = self.fl1+self.fl2+self.fl3
          self.trailer = stream.attrs
          if logger.isEnabledFor(logging.DEBUG):
              logger.debug('xref stream: objid=%s, fields=%d,%d,%d',
                  ', '.join(map(repr, self.objid_ranges)), self.fl1, self.fl2, self.fl3)

      def get_trailer(self):
          """Return the trailer dictionary (the stream attributes)."""
          return self.trailer

      def get_objids(self):
          """Yield every object id covered by the /Index ranges."""
          for objid_range in self.objid_ranges:
              first = objid_range.get_start_id()
              last = objid_range.get_end_id()
              for objid in range(first, last+1):
                  yield objid

      def get_pos(self, objid):
          """Return the location of ``objid``.

          Returns:
              ``(None, pos)`` for an entry of type 1, or
              ``(stream_objid, index)`` for an entry of type 2.

          Raises:
              KeyError: if ``objid`` is outside every range, its entry is
                  cut off by the end of the data, or the entry has any
                  other type (e.g. a free object).
          """
          offset = 0
          for objid_range in self.objid_ranges:
              first = objid_range.get_start_id()
              if first <= objid <= objid_range.get_end_id():
                  offset += objid - first
                  break
              offset += objid_range.get_nobjs()
          else:
              raise KeyError(objid)
          start = self.entlen * offset
          ent = self.data[start:start+self.entlen]
          if len(ent) < self.entlen:
              # The data ends before this entry; do not decode a partial one.
              raise KeyError(objid)
          kind = nunpack(ent[:self.fl1], 1)
          if kind == 1:
              pos = nunpack(ent[self.fl1:self.fl1+self.fl2])
              return (None, pos)
          if kind == 2:
              strmid = nunpack(ent[self.fl1:self.fl1+self.fl2])
              index = nunpack(ent[self.fl1+self.fl2:])
              return (strmid, index)
          # this is a free object
          raise KeyError(objid)
  class PDFPage:
      """An object that holds the information about a page.

      A PDFPage object is merely a convenience class that has a set
      of keys and values, which describe the properties of a page
      and point to its contents.

      Attributes:
        doc: a PDFDocument object.
        pageid: any Python object that can uniquely identify the page.
        attrs: a dictionary of page attributes.
        contents: a list of the resolved /Contents values of the page.
        lastmod: the last modified time of the page.
        resources: the resolved /Resources entry ({} when it is absent).
        mediabox: the physical size of the page.
        cropbox: the crop rectangle of the page (the media box if absent).
        rotate: the page rotation (in degree), normalised with ``% 360``.
        annots: the page annotations (the raw /Annots entry).
        beads: a chain that represents natural reading order (raw /B entry).
      """

      def __init__(self, doc, pageid, attrs):
          """Initialize a page object.

          doc: a PDFDocument object.
          pageid: any Python object that can uniquely identify the page.
          attrs: a dictionary of page attributes.

          Raises KeyError when /MediaBox is missing.
          """
          self.doc = doc
          self.pageid = pageid
          self.attrs = dict_value(attrs)
          self.lastmod = resolve1(self.attrs.get('LastModified'))
          # A page without any /Resources gets an empty dict instead of
          # failing with KeyError.
          self.resources = resolve1(self.attrs.get('Resources', {}))
          self.mediabox = resolve1(self.attrs['MediaBox'])
          if 'CropBox' in self.attrs:
              self.cropbox = resolve1(self.attrs['CropBox'])
          else:
              self.cropbox = self.mediabox
          # /Rotate may be an indirect reference; resolve it before doing
          # arithmetic on it.
          self.rotate = (resolve1(self.attrs.get('Rotate', 0))+360) % 360
          self.annots = self.attrs.get('Annots')
          self.beads = self.attrs.get('B')
          if 'Contents' in self.attrs:
              contents = resolve1(self.attrs['Contents'])
          else:
              contents = []
          if not isinstance(contents, list):
              # A single value is wrapped so callers always get a list.
              contents = [contents]
          self.contents = contents

      def __repr__(self):
          return '<PDFPage: Resources=%r, MediaBox=%r>' % (self.resources, self.mediabox)
  class PDFDocument:
      """PDFDocument object represents a PDF document.

      Since a PDF file can be very big, normally it is not loaded at
      once. So PDF document has to cooperate with a PDF parser in order to
      dynamically import the data as processing goes.

      Typical usage:
        doc = PDFDocument()
        doc.set_parser(parser)
        doc.initialize(password)
        obj = doc.getobj(objid)
      """

      KEYWORD_OBJ = KWD('obj')
      # Padding string appended to the password before hashing.
      PASSWORD_PADDING = b'(\xbfN^Nu\x8aAd\x00NV\xff\xfa\x01\x08..\x00\xb6\xd0h>\x80/\x0c\xa9\xfedSiz'
      # Page attributes that a node takes over from its parent when it does
      # not define them itself.
      INHERITABLE_ATTRS = {'Resources', 'MediaBox', 'CropBox', 'Rotate'}

      def __init__(self, caching=True):
          """Create an empty document.

          caching: when true, objects read through _getobj() are remembered.
          """
          self.caching = caching
          self.xrefs = []
          self.info = []
          self.catalog = None
          self.encryption = None
          self.decipher = None
          self._parser = None
          self._cached_objs = {}   # objid -> object
          self._parsed_objs = {}   # object-stream id -> list of parsed items
          self._parsed_everything = False

      def _parse_next_object(self, parser):
          """Skip to the next ``obj`` keyword and read the object after it.

          Returns ``(objid, genno, obj)`` where objid and genno are the two
          tokens that preceded the keyword.
          """
          # This is a bit awkward and I suspect that it could be a lot more elegant, but it would
          # require refactoring the parsing process and I don't want to do that yet.
          stack = []
          (_, token) = parser.nexttoken()
          while token is not self.KEYWORD_OBJ:
              stack.append(token)
              (_, token) = parser.nexttoken()
          objid = stack[-2]
          genno = stack[-1]
          (_, obj) = parser.nextobject()
          return objid, genno, obj

      def _parse_objstream(self, stream):
          """Cache every object contained in the object stream ``stream``."""
          # ObjStm have a special organization. First, the param "N" tells how many objs we have in
          # there. Then, they start with a list of N pairs of integers whose first item is the
          # object id, and then the actual objects come in.
          # NOTE(review): the second integer of each pair is the object's byte offset according
          # to the PDF specification; it is not needed here because objects are read in order.
          parser = PDFStreamParser(stream.get_data())
          parser.set_document(self)
          objcount = stream['N']
          objids = []
          for _ in range(objcount):
              (_, objid) = parser.nextobject()
              parser.nextobject()  # second integer of the pair, unused
              objids.append(objid)
          # Now we should be at the point where we read objects
          for objid in objids:
              (_, obj) = parser.nextobject()
              self._cached_objs[objid] = obj

      def _parse_whole(self, parser):
          """Read every object from ``parser`` into the cache until EOF."""
          while True:
              try:
                  objid, genno, obj = self._parse_next_object(parser)
              except PSEOF:
                  break
              self._cached_objs[objid] = obj
              if isinstance(obj, PDFStream) and obj.get('Type') is LITERAL_OBJSTM:
                  obj.set_objid(objid, genno)
                  try:
                      self._parse_objstream(obj)
                  except PSEOF:
                      # The object stream itself ran out of data.  That says
                      # nothing about the enclosing file, so keep scanning.
                      pass

      def _parse_everything(self):
          """Parse the whole file once, caching every object found.

          Raises PDFAlreadyParsed if this has been done before.
          """
          # Sometimes, we have malformed xref, but we still want to manage to read the PDF. In cases
          # like these, the last resort is to read all objects at once so that our object reference
          # can finally be resolved. This is slower than the normal method, so only use this when the
          # xref tables are corrupt/wrong/whatever.
          if self._parsed_everything:
              raise PDFAlreadyParsed()
          parser = self._parser
          parser.setpos(0)
          parser.reset()
          self._parse_whole(parser)
          self._parsed_everything = True

      def _load_from_objstream(self, objid, strmid, index):
          """Fetch item ``index`` of object stream ``strmid``.

          Returns ``(genno, obj)`` with genno 0, or None when the stream
          object cannot be obtained.
          """
          stream = self.getobj(strmid)
          if stream is None:
              return None
          stream = stream_value(stream)
          if stream.get('Type') is not LITERAL_OBJSTM:
              handle_error(PDFSyntaxError, 'Not a stream object: %r' % stream)
          try:
              n = stream['N']
          except KeyError:
              handle_error(PDFSyntaxError, 'N is not defined: %r' % stream)
              n = 0
          if strmid in self._parsed_objs:
              objs = self._parsed_objs[strmid]
          else:
              parser = PDFStreamParser(stream.get_data())
              parser.set_document(self)
              objs = []
              try:
                  while True:
                      (_, item) = parser.nextobject()
                      objs.append(item)
              except PSEOF:
                  pass
              if self.caching:
                  self._parsed_objs[strmid] = objs
          # The parsed list starts with n pairs of integers, hence n*2.
          try:
              obj = objs[n*2+index]
          except IndexError:
              raise PDFSyntaxError('Invalid object number: objid=%r' % (objid))
          if isinstance(obj, PDFStream):
              obj.set_objid(objid, 0)
          return (0, obj)

      def _load_from_file(self, objid, index):
          """Read the object whose header starts at file offset ``index``.

          Returns ``(genno, obj)``, or None when the offset is out of bounds
          or EOF is reached while reading the object body.
          """
          parser = self._parser
          try:
              parser.setpos(index)
          except PSEOF:
              handle_error(PSEOF, 'Parser index out of bounds')
              return None
          (_, objid1) = parser.nexttoken()  # objid
          (_, genno) = parser.nexttoken()  # genno
          (_, kwd) = parser.nexttoken()
          # #### hack around malformed pdf files
          if objid1 != objid:
              # The offset did not point at "objid genno obj".  Skip forward
              # to the next ``obj`` keyword and take the token right before
              # it as the generation number.
              seen = [objid1, genno, kwd]
              while kwd is not self.KEYWORD_OBJ:
                  (_, kwd) = parser.nexttoken()
                  seen.append(kwd)
              candidate = seen[-2]
              # Fall back to 0 (the value also used for cached objects)
              # when that token is not a number.
              genno = candidate if isinstance(candidate, int) else 0
          # #### end hack around malformed pdf files
          if kwd is not self.KEYWORD_OBJ:
              raise PDFSyntaxError('Invalid object spec: offset=%r' % index)
          try:
              (_, obj) = parser.nextobject()
          except PSEOF:
              return None
          if isinstance(obj, PDFStream):
              obj.set_objid(objid, genno)
          return (genno, obj)

      def _getobj(self, objid):
          """Return the object ``objid``, or None when it cannot be read.

          Raises:
              PDFException: if no xref has been loaded yet.
              PDFSyntaxError: if the object header or stream index is invalid.
          """
          if not self.xrefs:
              raise PDFException('PDFDocument is not initialized')
          if objid in self._cached_objs:
              genno = 0
              obj = self._cached_objs[objid]
          else:
              strmid, index = self.find_obj_ref(objid)
              if index is None:
                  handle_error(PDFSyntaxError, 'Cannot locate objid=%r' % objid)
                  # return null for a nonexistent reference.
                  return None
              if strmid:
                  found = self._load_from_objstream(objid, strmid, index)
              else:
                  found = self._load_from_file(objid, index)
              if found is None:
                  return None
              (genno, obj) = found
              if self.caching:
                  self._cached_objs[objid] = obj
          if self.decipher:
              obj = decipher_all(self.decipher, objid, genno, obj)
          return obj

      def set_parser(self, parser):
          "Set the document to use a given PDFParser object."
          if self._parser:
              return
          self._parser = parser
          # Retrieve the information of each header that was appended
          # (maybe multiple times) at the end of the document.
          self.xrefs = parser.read_xref()
          for xref in self.xrefs:
              trailer = xref.get_trailer()
              if not trailer:
                  continue
              # If there's an encryption info, remember it.
              if 'Encrypt' in trailer:
                  self.encryption = (list_value(trailer['ID']),
                                     dict_value(trailer['Encrypt']))
              if 'Info' in trailer:
                  self.info.append(dict_value(trailer['Info']))
              if 'Root' in trailer:
                  # Every PDF file must have exactly one /Root dictionary.
                  self.catalog = dict_value(trailer['Root'])
                  break
          else:
              raise PDFSyntaxError('No /Root object! - Is this really a PDF?')
          if self.catalog.get('Type') is not LITERAL_CATALOG:
              handle_error(PDFSyntaxError, 'Catalog not found!')

      def initialize(self, password=''):
          """Perform the initialization with a given password.

          This step is mandatory even if there's no password associated
          with the document.  It sets ``is_printable``, ``is_modifiable``
          and ``is_extractable`` and, for encrypted documents, installs the
          RC4 decipher function.

          Raises:
              PDFEncryptionError: for an unknown filter, algorithm or revision.
              PDFNotImplementedError: for revision 4.
              PDFPasswordIncorrect: if the password does not authenticate.
          """
          if not self.encryption:
              self.is_printable = self.is_modifiable = self.is_extractable = True
              return
          (docid, param) = self.encryption
          if literal_name(param.get('Filter')) != 'Standard':
              raise PDFEncryptionError('Unknown filter: param=%r' % param)
          version = int_value(param.get('V', 0))
          if version not in (1, 2):
              raise PDFEncryptionError('Unknown algorithm: param=%r' % param)
          length = int_value(param.get('Length', 40))  # Key length (bits)
          owner_entry = str_value(param['O'])
          revision = int_value(param['R'])  # Revision
          # Only revisions 2 and 3 are implemented below; anything under 2
          # would otherwise fall through with no check value computed.
          if revision < 2 or 5 <= revision:
              raise PDFEncryptionError('Unknown revision: %r' % revision)
          user_entry = str_value(param['U'])
          perms = int_value(param['P'])
          self.is_printable = bool(perms & 4)
          self.is_modifiable = bool(perms & 8)
          self.is_extractable = bool(perms & 16)
          # Algorithm 3.2
          # XXX is latin-1 the correct encoding???
          password = password.encode('latin-1')
          password = (password+self.PASSWORD_PADDING)[:32]  # 1
          digest = md5.md5(password)  # 2
          digest.update(owner_entry)  # 3
          # /P may be written as a negative number or as its unsigned 32-bit
          # equivalent; masking yields the same four bytes for both.
          digest.update(struct.pack('<L', perms & 0xffffffff))  # 4
          digest.update(docid[0])  # 5
          if 4 <= revision:
              # 6
              raise PDFNotImplementedError('Revision 4 encryption is currently unsupported')
          if 3 <= revision:
              # 8
              for _ in range(50):
                  digest = md5.md5(digest.digest()[:length//8])
          key = digest.digest()[:length//8]
          if revision == 2:
              # Algorithm 3.4
              u1 = Arcfour(key).process(self.PASSWORD_PADDING)
              is_authenticated = (u1 == user_entry)
          else:
              # Algorithm 3.5 (revision 3)
              digest = md5.md5(self.PASSWORD_PADDING)  # 2
              digest.update(docid[0])  # 3
              x = Arcfour(key).process(digest.digest()[:16])  # 4
              for i in range(1, 19+1):
                  k = bytes(c ^ i for c in key)
                  x = Arcfour(k).process(x)
              u1 = x+x  # 32bytes total
              is_authenticated = (u1[:16] == user_entry[:16])
          if not is_authenticated:
              raise PDFPasswordIncorrect
          self.decrypt_key = key
          self.decipher = self.decrypt_rc4  # XXX may be AES

      def decrypt_rc4(self, objid, genno, data):
          """Decrypt ``data`` with the RC4 key derived for (objid, genno)."""
          key = self.decrypt_key + struct.pack('<L', objid)[:3] + struct.pack('<L', genno)[:2]
          digest = md5.md5(key)
          key = digest.digest()[:min(len(key), 16)]
          return Arcfour(key).process(data)

      def readobj(self):
          """Read the next object at current position.

          The object doesn't have to start exactly where we are. We'll read the first
          object that comes to us.
          """
          return self._parse_next_object(self._parser)

      def find_obj_ref(self, objid):
          """Return ``(strmid, index)`` from the first xref that knows ``objid``.

          Returns ``(None, None)`` when no xref has an entry for it.
          """
          for xref in self.xrefs:
              try:
                  (strmid, index) = xref.get_pos(objid)
              except KeyError:
                  continue
              return strmid, index
          # return null for a nonexistent reference.
          return None, None

      def getobj(self, objid):
          """Return the object ``objid``.

          When the normal lookup yields None, the whole file is parsed once
          as a last resort and the lookup is repeated.
          """
          result = self._getobj(objid)
          if result is None:
              try:
                  self._parse_everything()
                  result = self._getobj(objid)
              except PDFAlreadyParsed:
                  result = None
          return result

      def get_pages(self):
          """Yield a PDFPage for every /Page node of the page tree.

          Raises PDFException (on first iteration) if no xref is loaded.
          """
          if not self.xrefs:
              raise PDFException('PDFDocument is not initialized')

          def search(obj, parent, ancestors):
              try:
                  if isinstance(obj, int):
                      objid = obj
                      tree = dict_value(self.getobj(objid), strict=True).copy()
                  else:
                      objid = obj.objid
                      tree = dict_value(obj, strict=True).copy()
              except PDFTypeError:
                  return
              if objid in ancestors:
                  # A node that lists one of its own ancestors as a kid
                  # would otherwise be followed forever.
                  logger.warning('Page tree loop at objid=%r, skipped', objid)
                  return
              for (k, v) in parent.items():
                  if k in self.INHERITABLE_ATTRS and k not in tree:
                      tree[k] = v
              node_type = tree.get('Type')
              if node_type is LITERAL_PAGES and 'Kids' in tree:
                  logger.debug('Pages: Kids=%r', tree['Kids'])
                  lineage = ancestors | {objid}
                  for c in list_value(tree['Kids']):
                      for x in search(c, tree, lineage):
                          yield x
              elif node_type is LITERAL_PAGE:
                  logger.debug('Page: %r', tree)
                  yield (objid, tree)

          if 'Pages' not in self.catalog:
              return
          for (pageid, tree) in search(self.catalog['Pages'], self.catalog, frozenset()):
              yield PDFPage(self, pageid, tree)

      def get_outlines(self):
          """Return a generator of ``(level, title, dest, action, se)`` tuples.

          Raises PDFNoOutlines if the catalog has no /Outlines entry.
          """
          if 'Outlines' not in self.catalog:
              raise PDFNoOutlines

          def search(entry, level):
              entry = dict_value(entry)
              if 'Title' in entry:
                  if 'A' in entry or 'Dest' in entry:
                      title = decode_text(str_value(entry['Title']))
                      dest = entry.get('Dest')
                      action = entry.get('A')
                      se = entry.get('SE')
                      yield (level, title, dest, action, se)
              if 'First' in entry and 'Last' in entry:
                  # Children come one level deeper.
                  for x in search(entry['First'], level+1):
                      yield x
              if 'Next' in entry:
                  # Siblings stay on the same level.
                  for x in search(entry['Next'], level):
                      yield x

          return search(self.catalog['Outlines'], 0)

      def lookup_name(self, cat, key):
          """Look up ``key`` in the name tree ``cat`` of the catalog's /Names.

          Returns None when ``key`` lies outside the /Limits of the node
          being examined.

          Raises KeyError when /Names or ``cat`` is missing or the key is
          not found.
          """
          try:
              names = dict_value(self.catalog['Names'])
          except (PDFTypeError, KeyError):
              raise KeyError((cat, key))
          # may raise KeyError
          d0 = dict_value(names[cat])

          def lookup(d):
              if 'Limits' in d:
                  (k1, k2) = list_value(d['Limits'])
                  if key < k1 or k2 < key:
                      return None
              if 'Names' in d:
                  # /Names is a flat [key1, value1, key2, value2, ...] list.
                  objs = list_value(d['Names'])
                  entries = dict(choplist(2, objs))
                  return entries[key]
              if 'Kids' in d:
                  for c in list_value(d['Kids']):
                      v = lookup(dict_value(c))
                      if v:
                          return v
              raise KeyError((cat, key))

          return lookup(d0)

      def get_dest(self, name):
          """Return the destination registered under ``name``.

          Raises PDFDestinationNotFound if neither the name tree nor the
          catalog's /Dests dictionary has it.
          """
          try:
              # PDF-1.2 or later
              obj = self.lookup_name('Dests', name)
          except KeyError:
              # PDF-1.1 or prior
              if 'Dests' not in self.catalog:
                  raise PDFDestinationNotFound(name)
              d0 = dict_value(self.catalog['Dests'])
              if name not in d0:
                  raise PDFDestinationNotFound(name)
              obj = d0[name]
          return obj
  class PDFParser(PSStackParser):
      """
      PDFParser fetch PDF objects from a file stream.
      It can handle indirect references by referring to
      a PDF document set by set_document method.
      It also reads XRefs at the end of every PDF file.

      Typical usage:
        parser = PDFParser(fp)
        parser.read_xref()
        parser.set_document(doc)
        parser.seek(offset)
        parser.nextobject()
      """

      KEYWORD_R = KWD('R')
      KEYWORD_NULL = KWD('null')
      KEYWORD_ENDOBJ = KWD('endobj')
      KEYWORD_STREAM = KWD('stream')
      KEYWORD_XREF = KWD('xref')
      KEYWORD_STARTXREF = KWD('startxref')

      # Compiled once here instead of on every call.
      _RE_ENDSTREAM = re.compile(r'(\r\n|\r|\n)endstream')
      # the word 'startxref' followed by a newline followed by digits
      _RE_STARTXREF = re.compile(r'startxref\s*[\r\n]+\s*(\d+)', re.MULTILINE)

      def __init__(self, fp):
          PSStackParser.__init__(self, fp)
          self.doc = None
          self.fallback = False

      def set_document(self, doc):
          """Associates the parser with a PDFDocument object."""
          self.doc = doc

      def do_keyword(self, pos, token):
          """Handles PDF-related keywords."""
          if token in (self.KEYWORD_XREF, self.KEYWORD_STARTXREF):
              self.add_results(*self.pop(1))
          elif token is self.KEYWORD_ENDOBJ:
              self.add_results(*self.pop(4))
          elif token is self.KEYWORD_NULL:
              # null object
              self.push((pos, None))
          elif token is self.KEYWORD_R:
              # reference to indirect object
              try:
                  ((_, objid), (_, genno)) = self.pop(2)
                  (objid, genno) = (int(objid), int(genno))
                  obj = PDFObjRef(self.doc, objid, genno)
                  self.push((pos, obj))
              except (PSSyntaxError, ValueError, TypeError):
                  # Best effort: an 'R' without two numeric operands is
                  # dropped.  ValueError/TypeError cover a short stack and
                  # operands that int() rejects.
                  pass
          elif token is self.KEYWORD_STREAM:
              self._do_stream(pos)
          else:
              # others
              self.push((pos, token))

      def _do_stream(self, pos):
          """Build a PDFStream from the dictionary on the stack and the data at ``pos``."""
          ((_, dic),) = self.pop(1)
          dic = dict_value(dic)
          try:
              objlen = int_value(dic['Length'])
          except KeyError:
              handle_error(PDFSyntaxError, '/Length is undefined: %r' % dic)
              objlen = 0
          self.setpos(pos)
          try:
              (_, line) = self.nextline()  # 'stream'
          except PSEOF:
              handle_error(PDFSyntaxError, 'Unexpected EOF')
              return
          pos += len(line)
          endpos = pos + objlen
          if 'endstream' not in self.data[endpos:endpos+len('endstream')+2]:
              # /Length does not lead to 'endstream'; search for it instead.
              m = self._RE_ENDSTREAM.search(self.data, pos)
              if m is None:
                  raise PDFSyntaxError("stream with no endstream")
              endpos = m.start()
          data = self.data[pos:endpos].encode('latin-1')
          self.setpos(endpos)
          self.nexttoken()  # consume 'endstream'
          # XXX limit objlen not to exceed object boundary
          # No document may be attached yet (e.g. an xref stream read by
          # read_xref() before set_document() is called).
          decipher = self.doc.decipher if self.doc is not None else None
          obj = PDFStream(dic, data, decipher)
          self.push((pos, obj))

      def find_xref(self):
          """Internal function used to locate the first XRef."""
          # try at the end, then try the whole file.
          tail_start = max(0, len(self.data)-4096)
          m = self._RE_STARTXREF.findall(self.data, tail_start)
          if not m:
              m = self._RE_STARTXREF.findall(self.data)
          if not m:
              raise PDFNoValidXRef('Unexpected EOF')
          logger.debug('xref found: pos=%r', m[-1])
          return int(m[-1])

      # read xref table
      def read_xref_from(self, start, xrefs, _visited=None):
          """Reads XRefs from the given location.

          The xref found at ``start`` is appended to ``xrefs``; its
          /XRefStm and /Prev entries are then followed.  ``_visited`` is
          internal: it holds the offsets already read so that a chain
          pointing back at itself is followed only once.

          Raises PDFNoValidXRef if no token can be read at ``start``.
          """
          if _visited is None:
              _visited = set()
          if start in _visited:
              logger.debug('xref at pos=%r already read, skipped', start)
              return
          _visited.add(start)
          self.setpos(start)
          self.reset()
          try:
              (pos, token) = self.nexttoken()
          except PSEOF:
              raise PDFNoValidXRef('Unexpected EOF')
          if isinstance(token, int):
              # XRefStream: PDF-1.5
              self.setpos(pos)
              self.reset()
              xref = PDFXRefStream()
              xref.load(self)
          else:
              if token is self.KEYWORD_XREF:
                  self.nextline()
              xref = PDFXRef()
              xref.load(self)
          xrefs.append(xref)
          trailer = xref.get_trailer()
          logger.debug('trailer: %r', trailer)
          if 'XRefStm' in trailer:
              pos = int_value(trailer['XRefStm'])
              self.read_xref_from(pos, xrefs, _visited)
          if 'Prev' in trailer:
              # find previous xref
              pos = int_value(trailer['Prev'])
              self.read_xref_from(pos, xrefs, _visited)

      # read xref tables and trailers
      def read_xref(self):
          """Reads all the XRefs in the PDF file and returns them.

          When no valid xref can be read, ``self.fallback`` is set and a
          table rebuilt by scanning the file is appended instead.
          """
          xrefs = []
          try:
              pos = self.find_xref()
              self.read_xref_from(pos, xrefs)
          except PDFNoValidXRef:
              # fallback
              logger.debug('no xref, fallback')
              self.fallback = True
              xref = PDFXRef()
              xref.load_fallback(self)
              xrefs.append(xref)
          return xrefs
  class PDFStreamParser(PDFParser):
      """
      Parser for PDF data that is already held in memory.

      The data is wrapped in an ``io.BytesIO`` and handed to PDFParser.
      Of the PDF keywords only ``R`` (indirect reference) is interpreted;
      a reference to a PDF document is needed for that because the
      resulting PDFObjRef points back into the document.
      """

      def __init__(self, data):
          PDFParser.__init__(self, io.BytesIO(data))

      def flush(self):
          """Move everything left on the stack to the results."""
          leftovers = self.popall()
          self.add_results(*leftovers)

      def do_keyword(self, pos, token):
          """Turn ``objid genno R`` into a PDFObjRef; push anything else as is."""
          if token is not self.KEYWORD_R:
              # others
              self.push((pos, token))
              return
          # reference to indirect object
          try:
              ((_, objid), (_, genno)) = self.pop(2)
              (objid, genno) = (int(objid), int(genno))
              self.push((pos, PDFObjRef(self.doc, objid, genno)))
          except PSSyntaxError:
              pass