You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

708 lines
23 KiB

4 years ago
  1. import io
  2. import re
  3. import logging
  4. from .cmapdb import CMapDB, CMap
  5. from .psparser import PSTypeError, PSEOF
  6. from .psparser import PSKeyword, literal_name, keyword_name
  7. from .psparser import PSStackParser
  8. from .psparser import LIT, KWD, handle_error
  9. from .pdftypes import (PDFException, PDFStream, PDFObjRef, resolve1, list_value, dict_value,
  10. stream_value)
  11. from .pdffont import PDFFontError, PDFType1Font, PDFTrueTypeFont, PDFType3Font, PDFCIDFont
  12. from .pdfparser import PDFDocument, PDFParser
  13. from .pdfcolor import PDFColorSpace, PREDEFINED_COLORSPACE
  14. from .utils import choplist
  15. from .utils import mult_matrix, MATRIX_IDENTITY
# Module-level logger, named after this module via __name__.
logger = logging.getLogger(__name__)
  17. ## Exceptions
  18. ##
  19. class PDFResourceError(PDFException): pass
  20. class PDFInterpreterError(PDFException): pass
## Constants
##
# Literal objects built once with LIT(); code in this module compares
# parsed values against them with ``is`` rather than ``==``.
LITERAL_PDF = LIT('PDF')
LITERAL_TEXT = LIT('Text')
LITERAL_FONT = LIT('Font')
LITERAL_FORM = LIT('Form')
LITERAL_IMAGE = LIT('Image')
  28. class PDFTextState:
  29. def __init__(self):
  30. self.font = None
  31. self.fontsize = 0
  32. self.charspace = 0
  33. self.wordspace = 0
  34. self.scaling = 100
  35. self.leading = 0
  36. self.render = 0
  37. self.rise = 0
  38. self.reset()
  39. # self.matrix is set
  40. # self.linematrix is set
  41. def __repr__(self):
  42. return ('<PDFTextState: font=%r, fontsize=%r, charspace=%r, wordspace=%r, '
  43. ' scaling=%r, leading=%r, render=%r, rise=%r, '
  44. ' matrix=%r, linematrix=%r>' %
  45. (self.font, self.fontsize, self.charspace, self.wordspace,
  46. self.scaling, self.leading, self.render, self.rise,
  47. self.matrix, self.linematrix))
  48. def copy(self):
  49. obj = PDFTextState()
  50. obj.font = self.font
  51. obj.fontsize = self.fontsize
  52. obj.charspace = self.charspace
  53. obj.wordspace = self.wordspace
  54. obj.scaling = self.scaling
  55. obj.leading = self.leading
  56. obj.render = self.render
  57. obj.rise = self.rise
  58. obj.matrix = self.matrix
  59. obj.linematrix = self.linematrix
  60. return obj
  61. def reset(self):
  62. self.matrix = MATRIX_IDENTITY
  63. self.linematrix = (0, 0)
  64. class PDFGraphicState:
  65. def __init__(self):
  66. self.linewidth = 0
  67. self.linecap = None
  68. self.linejoin = None
  69. self.miterlimit = None
  70. self.dash = None
  71. self.intent = None
  72. self.flatness = None
  73. def copy(self):
  74. obj = PDFGraphicState()
  75. obj.linewidth = self.linewidth
  76. obj.linecap = self.linecap
  77. obj.linejoin = self.linejoin
  78. obj.miterlimit = self.miterlimit
  79. obj.dash = self.dash
  80. obj.intent = self.intent
  81. obj.flatness = self.flatness
  82. return obj
  83. def __repr__(self):
  84. return ('<PDFGraphicState: linewidth=%r, linecap=%r, linejoin=%r, '
  85. ' miterlimit=%r, dash=%r, intent=%r, flatness=%r>' %
  86. (self.linewidth, self.linecap, self.linejoin,
  87. self.miterlimit, self.dash, self.intent, self.flatness))
  88. class PDFResourceManager:
  89. """Repository of shared resources.
  90. ResourceManager facilitates reuse of shared resources
  91. such as fonts and images so that large objects are not
  92. allocated multiple times.
  93. """
  94. def __init__(self, caching=True):
  95. self.caching = caching
  96. self._cached_fonts = {}
  97. def get_procset(self, procs):
  98. for proc in procs:
  99. if proc is LITERAL_PDF:
  100. pass
  101. elif proc is LITERAL_TEXT:
  102. pass
  103. else:
  104. #raise PDFResourceError('ProcSet %r is not supported.' % proc)
  105. pass
  106. def get_cmap(self, cmapname, strict=False):
  107. try:
  108. return CMapDB.get_cmap(cmapname)
  109. except CMapDB.CMapNotFound:
  110. if strict: raise
  111. return CMap()
  112. def get_font(self, objid, spec):
  113. if objid and objid in self._cached_fonts:
  114. font = self._cached_fonts[objid]
  115. else:
  116. # logger.debug('get_font: create: objid=%r, spec=%r', objid, spec)
  117. if spec['Type'] is not LITERAL_FONT:
  118. handle_error(PDFFontError, 'Type is not /Font')
  119. # Create a Font object.
  120. if 'Subtype' in spec:
  121. subtype = literal_name(spec['Subtype'])
  122. else:
  123. handle_error(PDFFontError, 'Font Subtype is not specified.')
  124. subtype = 'Type1'
  125. if subtype in ('Type1', 'MMType1'):
  126. # Type1 Font
  127. font = PDFType1Font(self, spec)
  128. elif subtype == 'TrueType':
  129. # TrueType Font
  130. font = PDFTrueTypeFont(self, spec)
  131. elif subtype == 'Type3':
  132. # Type3 Font
  133. font = PDFType3Font(self, spec)
  134. elif subtype in ('CIDFontType0', 'CIDFontType2'):
  135. # CID Font
  136. font = PDFCIDFont(self, spec)
  137. elif subtype == 'Type0':
  138. # Type0 Font
  139. dfonts = list_value(spec['DescendantFonts'])
  140. assert dfonts
  141. subspec = dict_value(dfonts[0]).copy()
  142. for k in ('Encoding', 'ToUnicode'):
  143. if k in spec:
  144. subspec[k] = resolve1(spec[k])
  145. font = self.get_font(None, subspec)
  146. else:
  147. handle_error(PDFFontError, 'Invalid Font spec: %r' % spec)
  148. font = PDFType1Font(self, spec) # this is so wrong!
  149. if objid and self.caching:
  150. self._cached_fonts[objid] = font
  151. return font
  152. class PDFContentParser(PSStackParser):
  153. def __init__(self, streams):
  154. fp = io.StringIO()
  155. for stream in streams:
  156. stream = stream_value(stream)
  157. data = stream.get_data()
  158. if isinstance(data, bytes):
  159. data = data.decode('latin-1')
  160. fp.write(data)
  161. fp.seek(0)
  162. PSStackParser.__init__(self, fp)
  163. def get_inline_data(self, pos, target='EI'):
  164. currpos = pos
  165. i = 0
  166. data = ''
  167. while i <= len(target):
  168. if i:
  169. c = self.data[currpos]
  170. data += c
  171. currpos += 1
  172. if len(target) <= i and c.isspace():
  173. i += 1
  174. elif i < len(target) and c == target[i]:
  175. i += 1
  176. else:
  177. i = 0
  178. else:
  179. j = self.data.index(target[0], currpos)
  180. data += self.data[currpos:j+1]
  181. currpos = j+1
  182. i = 1
  183. data = data[:-(len(target)+1)] # strip the last part
  184. data = re.sub(r'(\x0d\x0a|[\x0d\x0a])$', '', data)
  185. return (pos, data)
  186. def flush(self):
  187. self.add_results(*self.popall())
  188. KEYWORD_BI = KWD('BI')
  189. KEYWORD_ID = KWD('ID')
  190. KEYWORD_EI = KWD('EI')
  191. def do_keyword(self, pos, token):
  192. if token is self.KEYWORD_BI:
  193. # inline image within a content stream
  194. self.start_type(pos, 'inline')
  195. elif token is self.KEYWORD_ID:
  196. try:
  197. (_, objs) = self.end_type('inline')
  198. if len(objs) % 2 != 0:
  199. raise PSTypeError('Invalid dictionary construct: %r' % objs)
  200. d = dict( (literal_name(k), v) for (k,v) in choplist(2, objs) )
  201. (pos, data) = self.get_inline_data(pos+len('ID '))
  202. obj = PDFStream(d, data)
  203. self.push((pos, obj))
  204. self.push((pos, self.KEYWORD_EI))
  205. except PSTypeError as e:
  206. handle_error(type(e), str(e))
  207. else:
  208. self.push((pos, token))
  209. class PDFPageInterpreter:
  210. def __init__(self, rsrcmgr, device):
  211. self.rsrcmgr = rsrcmgr
  212. self.device = device
  213. def dup(self):
  214. return PDFPageInterpreter(self.rsrcmgr, self.device)
  215. # init_resources(resources):
  216. # Prepare the fonts and XObjects listed in the Resource attribute.
  217. def init_resources(self, resources):
  218. self.resources = resources
  219. self.fontmap = {}
  220. self.xobjmap = {}
  221. self.csmap = PREDEFINED_COLORSPACE.copy()
  222. if not resources:
  223. return
  224. def get_colorspace(spec):
  225. if spec is None:
  226. return PREDEFINED_COLORSPACE['DeviceRGB']
  227. if isinstance(spec, list):
  228. name = literal_name(spec[0])
  229. else:
  230. name = literal_name(spec)
  231. if name == 'ICCBased' and isinstance(spec, list) and 2 <= len(spec):
  232. return PDFColorSpace(name, stream_value(spec[1])['N'])
  233. elif name == 'DeviceN' and isinstance(spec, list) and 2 <= len(spec):
  234. return PDFColorSpace(name, len(list_value(spec[1])))
  235. else:
  236. return PREDEFINED_COLORSPACE[name]
  237. for (k,v) in dict_value(resources).items():
  238. # logger.debug('Resource: %r: %r', k,v)
  239. if k == 'Font':
  240. for (fontid,spec) in dict_value(v).items():
  241. objid = None
  242. if isinstance(spec, PDFObjRef):
  243. objid = spec.objid
  244. spec = dict_value(spec)
  245. if spec:
  246. self.fontmap[fontid] = self.rsrcmgr.get_font(objid, spec)
  247. elif k == 'ColorSpace':
  248. for (csid,spec) in dict_value(v).items():
  249. self.csmap[csid] = get_colorspace(resolve1(spec))
  250. elif k == 'ProcSet':
  251. self.rsrcmgr.get_procset(list_value(v))
  252. elif k == 'XObject':
  253. for (xobjid,xobjstrm) in dict_value(v).items():
  254. self.xobjmap[xobjid] = xobjstrm
  255. # init_state(ctm)
  256. # Initialize the text and graphic states for rendering a page.
  257. def init_state(self, ctm):
  258. # gstack: stack for graphical states.
  259. self.gstack = []
  260. self.ctm = ctm
  261. self.device.set_ctm(self.ctm)
  262. self.textstate = PDFTextState()
  263. self.graphicstate = PDFGraphicState()
  264. self.curpath = []
  265. # argstack: stack for command arguments.
  266. self.argstack = []
  267. # set some global states.
  268. self.scs = self.ncs = None
  269. if self.csmap:
  270. self.scs = self.ncs = list(self.csmap.values())[0]
  271. def push(self, obj):
  272. self.argstack.append(obj)
  273. def pop(self, n):
  274. if n == 0:
  275. return []
  276. x = self.argstack[-n:]
  277. self.argstack = self.argstack[:-n]
  278. return x
  279. def get_current_state(self):
  280. return (self.ctm, self.textstate.copy(), self.graphicstate.copy())
  281. def set_current_state(self, state):
  282. (self.ctm, self.textstate, self.graphicstate) = state
  283. self.device.set_ctm(self.ctm)
  284. # gsave
  285. def do_q(self):
  286. self.gstack.append(self.get_current_state())
  287. # grestore
  288. def do_Q(self):
  289. if self.gstack:
  290. self.set_current_state(self.gstack.pop())
  291. # concat-matrix
  292. def do_cm(self, a1, b1, c1, d1, e1, f1):
  293. self.ctm = mult_matrix((a1,b1,c1,d1,e1,f1), self.ctm)
  294. self.device.set_ctm(self.ctm)
  295. # setlinewidth
  296. def do_w(self, linewidth):
  297. self.graphicstate.linewidth = linewidth
  298. # setlinecap
  299. def do_J(self, linecap):
  300. self.graphicstate.linecap = linecap
  301. # setlinejoin
  302. def do_j(self, linejoin):
  303. self.graphicstate.linejoin = linejoin
  304. # setmiterlimit
  305. def do_M(self, miterlimit):
  306. self.graphicstate.miterlimit = miterlimit
  307. # setdash
  308. def do_d(self, dash, phase):
  309. self.graphicstate.dash = (dash, phase)
  310. # setintent
  311. def do_ri(self, intent):
  312. self.graphicstate.intent = intent
  313. # setflatness
  314. def do_i(self, flatness):
  315. self.graphicstate.flatness = flatness
  316. # load-gstate
  317. def do_gs(self, name):
  318. #XXX
  319. pass
  320. # moveto
  321. def do_m(self, x, y):
  322. self.curpath.append(('m',x,y))
  323. # lineto
  324. def do_l(self, x, y):
  325. self.curpath.append(('l',x,y))
  326. # curveto
  327. def do_c(self, x1, y1, x2, y2, x3, y3):
  328. self.curpath.append(('c',x1,y1,x2,y2,x3,y3))
  329. # urveto
  330. def do_v(self, x2, y2, x3, y3):
  331. self.curpath.append(('v',x2,y2,x3,y3))
  332. # rveto
  333. def do_y(self, x1, y1, x3, y3):
  334. self.curpath.append(('y',x1,y1,x3,y3))
  335. # closepath
  336. def do_h(self):
  337. self.curpath.append(('h',))
  338. # rectangle
  339. def do_re(self, x, y, w, h):
  340. self.curpath.append(('m',x,y))
  341. self.curpath.append(('l',x+w,y))
  342. self.curpath.append(('l',x+w,y+h))
  343. self.curpath.append(('l',x,y+h))
  344. self.curpath.append(('h',))
  345. # stroke
  346. def do_S(self):
  347. self.device.paint_path(self.graphicstate, True, False, False, self.curpath)
  348. self.curpath = []
  349. # close-and-stroke
  350. def do_s(self):
  351. self.do_h()
  352. self.do_S()
  353. # fill
  354. def do_f(self):
  355. self.device.paint_path(self.graphicstate, False, True, False, self.curpath)
  356. self.curpath = []
  357. # fill (obsolete)
  358. do_F = do_f
  359. # fill-even-odd
  360. def do_f_a(self):
  361. self.device.paint_path(self.graphicstate, False, True, True, self.curpath)
  362. self.curpath = []
  363. # fill-and-stroke
  364. def do_B(self):
  365. self.device.paint_path(self.graphicstate, True, True, False, self.curpath)
  366. self.curpath = []
  367. # fill-and-stroke-even-odd
  368. def do_B_a(self):
  369. self.device.paint_path(self.graphicstate, True, True, True, self.curpath)
  370. self.curpath = []
  371. # close-fill-and-stroke
  372. def do_b(self):
  373. self.do_h()
  374. self.do_B()
  375. # close-fill-and-stroke-even-odd
  376. def do_b_a(self):
  377. self.do_h()
  378. self.do_B_a()
  379. # close-only
  380. def do_n(self):
  381. self.curpath = []
  382. # clip
  383. def do_W(self):
  384. pass
  385. # clip-even-odd
  386. def do_W_a(self):
  387. pass
  388. # setcolorspace-stroking
  389. def do_CS(self, name):
  390. self.scs = self.csmap[literal_name(name)]
  391. # setcolorspace-non-strokine
  392. def do_cs(self, name):
  393. self.ncs = self.csmap[literal_name(name)]
  394. # setgray-stroking
  395. def do_G(self, gray):
  396. #self.do_CS(LITERAL_DEVICE_GRAY)
  397. pass
  398. # setgray-non-stroking
  399. def do_g(self, gray):
  400. #self.do_cs(LITERAL_DEVICE_GRAY)
  401. pass
  402. # setrgb-stroking
  403. def do_RG(self, r, g, b):
  404. #self.do_CS(LITERAL_DEVICE_RGB)
  405. pass
  406. # setrgb-non-stroking
  407. def do_rg(self, r, g, b):
  408. #self.do_cs(LITERAL_DEVICE_RGB)
  409. pass
  410. # setcmyk-stroking
  411. def do_K(self, c, m, y, k):
  412. #self.do_CS(LITERAL_DEVICE_CMYK)
  413. pass
  414. # setcmyk-non-stroking
  415. def do_k(self, c, m, y, k):
  416. #self.do_cs(LITERAL_DEVICE_CMYK)
  417. pass
  418. # setcolor
  419. def do_SCN(self):
  420. if self.scs:
  421. n = self.scs.ncomponents
  422. else:
  423. handle_error(PDFInterpreterError, 'No colorspace specified!')
  424. n = 1
  425. self.pop(n)
  426. def do_scn(self):
  427. if self.ncs:
  428. n = self.ncs.ncomponents
  429. else:
  430. handle_error(PDFInterpreterError, 'No colorspace specified!')
  431. n = 1
  432. self.pop(n)
  433. def do_SC(self):
  434. self.do_SCN()
  435. def do_sc(self):
  436. self.do_scn()
  437. # sharing-name
  438. def do_sh(self, name):
  439. pass
  440. # begin-text
  441. def do_BT(self):
  442. self.textstate.reset()
  443. # end-text
  444. def do_ET(self):
  445. pass
  446. # begin-compat
  447. def do_BX(self):
  448. pass
  449. # end-compat
  450. def do_EX(self):
  451. pass
  452. # marked content operators
  453. def do_MP(self, tag):
  454. self.device.do_tag(tag)
  455. def do_DP(self, tag, props):
  456. self.device.do_tag(tag, props)
  457. def do_BMC(self, tag):
  458. self.device.begin_tag(tag)
  459. def do_BDC(self, tag, props):
  460. self.device.begin_tag(tag, props)
  461. def do_EMC(self):
  462. self.device.end_tag()
  463. # setcharspace
  464. def do_Tc(self, space):
  465. self.textstate.charspace = space
  466. # setwordspace
  467. def do_Tw(self, space):
  468. self.textstate.wordspace = space
  469. # textscale
  470. def do_Tz(self, scale):
  471. self.textstate.scaling = scale
  472. # setleading
  473. def do_TL(self, leading):
  474. self.textstate.leading = -leading
  475. # selectfont
  476. def do_Tf(self, fontid, fontsize):
  477. try:
  478. self.textstate.font = self.fontmap[literal_name(fontid)]
  479. except KeyError:
  480. handle_error(PDFInterpreterError, 'Undefined Font id: %r' % fontid)
  481. return
  482. self.textstate.fontsize = fontsize
  483. # setrendering
  484. def do_Tr(self, render):
  485. self.textstate.render = render
  486. # settextrise
  487. def do_Ts(self, rise):
  488. self.textstate.rise = rise
  489. # text-move
  490. def do_Td(self, tx, ty):
  491. (a,b,c,d,e,f) = self.textstate.matrix
  492. self.textstate.matrix = (a,b,c,d,tx*a+ty*c+e,tx*b+ty*d+f)
  493. self.textstate.linematrix = (0, 0)
  494. #print >>sys.stderr, 'Td(%r,%r): %r' % (tx,ty,self.textstate)
  495. # text-move
  496. def do_TD(self, tx, ty):
  497. (a,b,c,d,e,f) = self.textstate.matrix
  498. self.textstate.matrix = (a,b,c,d,tx*a+ty*c+e,tx*b+ty*d+f)
  499. self.textstate.leading = ty
  500. self.textstate.linematrix = (0, 0)
  501. #print >>sys.stderr, 'TD(%r,%r): %r' % (tx,ty,self.textstate)
  502. # textmatrix
  503. def do_Tm(self, a,b,c,d,e,f):
  504. self.textstate.matrix = (a,b,c,d,e,f)
  505. self.textstate.linematrix = (0, 0)
  506. # nextline
  507. def do_T_a(self):
  508. (a,b,c,d,e,f) = self.textstate.matrix
  509. self.textstate.matrix = (a,b,c,d,self.textstate.leading*c+e,self.textstate.leading*d+f)
  510. self.textstate.linematrix = (0, 0)
  511. # show-pos
  512. def do_TJ(self, seq):
  513. #print >>sys.stderr, 'TJ(%r): %r' % (seq,self.textstate)
  514. if self.textstate.font is None:
  515. handle_error(PDFInterpreterError, 'No font specified!')
  516. return
  517. self.device.render_string(self.textstate, seq)
  518. # show
  519. def do_Tj(self, s):
  520. self.do_TJ([s])
  521. # quote
  522. def do__q(self, s):
  523. self.do_T_a()
  524. self.do_TJ([s])
  525. # doublequote
  526. def do__w(self, aw, ac, s):
  527. self.do_Tw(aw)
  528. self.do_Tc(ac)
  529. self.do_TJ([s])
  530. # inline image
  531. def do_BI(self): # never called
  532. pass
  533. def do_ID(self): # never called
  534. pass
  535. def do_EI(self, obj):
  536. try:
  537. if 'W' in obj and 'H' in obj:
  538. iobjid = str(id(obj))
  539. self.device.begin_figure(iobjid, (0,0,1,1), MATRIX_IDENTITY)
  540. self.device.render_image(iobjid, obj)
  541. self.device.end_figure(iobjid)
  542. except TypeError:
  543. # Sometimes, 'obj' is a PSLiteral. I'm not sure why, but I'm guessing it's because it's
  544. # malformed or something. We can just ignore the thing.
  545. logger.warning("Malformed inline image")
  546. # invoke an XObject
  547. def do_Do(self, xobjid):
  548. xobjid = literal_name(xobjid)
  549. try:
  550. xobj = stream_value(self.xobjmap[xobjid])
  551. except KeyError:
  552. handle_error(PDFInterpreterError, 'Undefined xobject id: %r' % xobjid)
  553. return
  554. logger.debug('Processing xobj: %r', xobj)
  555. subtype = xobj.get('Subtype')
  556. if subtype is LITERAL_FORM and 'BBox' in xobj:
  557. interpreter = self.dup()
  558. bbox = list_value(xobj['BBox'])
  559. matrix = list_value(xobj.get('Matrix', MATRIX_IDENTITY))
  560. # According to PDF reference 1.7 section 4.9.1, XObjects in
  561. # earlier PDFs (prior to v1.2) use the page's Resources entry
  562. # instead of having their own Resources entry.
  563. resources = dict_value(xobj.get('Resources')) or self.resources.copy()
  564. self.device.begin_figure(xobjid, bbox, matrix)
  565. interpreter.render_contents(resources, [xobj], ctm=mult_matrix(matrix, self.ctm))
  566. self.device.end_figure(xobjid)
  567. elif subtype is LITERAL_IMAGE and 'Width' in xobj and 'Height' in xobj:
  568. self.device.begin_figure(xobjid, (0,0,1,1), MATRIX_IDENTITY)
  569. self.device.render_image(xobjid, xobj)
  570. self.device.end_figure(xobjid)
  571. else:
  572. # unsupported xobject type.
  573. pass
  574. def process_page(self, page):
  575. logger.debug('Processing page: %r', page)
  576. (x0,y0,x1,y1) = page.mediabox
  577. if page.rotate == 90:
  578. ctm = (0,-1,1,0, -y0,x1)
  579. elif page.rotate == 180:
  580. ctm = (-1,0,0,-1, x1,y1)
  581. elif page.rotate == 270:
  582. ctm = (0,1,-1,0, y1,-x0)
  583. else:
  584. ctm = (1,0,0,1, -x0,-y0)
  585. self.device.begin_page(page, ctm)
  586. self.render_contents(page.resources, page.contents, ctm=ctm)
  587. self.device.end_page(page)
  588. # render_contents(resources, streams, ctm)
  589. # Render the content streams.
  590. # This method may be called recursively.
  591. def render_contents(self, resources, streams, ctm=MATRIX_IDENTITY):
  592. logger.debug('render_contents: resources=%r, streams=%r, ctm=%r', resources, streams, ctm)
  593. self.init_resources(resources)
  594. self.init_state(ctm)
  595. self.execute(list_value(streams))
  596. def execute(self, streams):
  597. try:
  598. parser = PDFContentParser(streams)
  599. except PSEOF:
  600. # empty page
  601. return
  602. while 1:
  603. try:
  604. (_,obj) = parser.nextobject()
  605. except PSEOF:
  606. break
  607. if isinstance(obj, PSKeyword):
  608. name = keyword_name(obj)
  609. method = 'do_%s' % name.replace('*','_a').replace('"','_w').replace("'",'_q')
  610. if hasattr(self, method):
  611. func = getattr(self, method)
  612. nargs = func.__code__.co_argcount-1
  613. if nargs:
  614. args = self.pop(nargs)
  615. # logger.debug('exec: %s %r', name, args)
  616. if len(args) == nargs:
  617. func(*args)
  618. else:
  619. # logger.debug('exec: %s', name)
  620. func()
  621. else:
  622. handle_error(PDFInterpreterError, 'Unknown operator: %r' % name)
  623. else:
  624. self.push(obj)
  625. class PDFTextExtractionNotAllowed(PDFInterpreterError): pass
  626. def process_pdf(rsrcmgr, device, fp, pagenos=None, maxpages=0, password='',
  627. caching=True, check_extractable=True):
  628. # Create a PDF parser object associated with the file object.
  629. parser = PDFParser(fp)
  630. # Create a PDF document object that stores the document structure.
  631. doc = PDFDocument(caching=caching)
  632. # Connect the parser and document objects.
  633. parser.set_document(doc)
  634. doc.set_parser(parser)
  635. # Supply the document password for initialization.
  636. # (If no password is set, give an empty string.)
  637. doc.initialize(password)
  638. # Check if the document allows text extraction. If not, abort.
  639. if check_extractable and not doc.is_extractable:
  640. raise PDFTextExtractionNotAllowed('Text extraction is not allowed: %r' % fp)
  641. # Create a PDF interpreter object.
  642. interpreter = PDFPageInterpreter(rsrcmgr, device)
  643. # Process each page contained in the document.
  644. for (pageno,page) in enumerate(doc.get_pages()):
  645. if pagenos and (pageno not in pagenos): continue
  646. interpreter.process_page(page)
  647. if maxpages and maxpages <= pageno+1: break