"""Interpreter for PDF page content streams.

Defines the text/graphic state holders, a font resource cache, a content
stream parser that understands inline images, and ``PDFPageInterpreter``,
which dispatches each content-stream operator to a ``do_*`` method and
forwards drawing/text events to a device object.
"""
import io
import re
import logging

from .cmapdb import CMapDB, CMap
from .psparser import PSTypeError, PSEOF
from .psparser import PSKeyword, literal_name, keyword_name
from .psparser import PSStackParser
from .psparser import LIT, KWD, handle_error
from .pdftypes import (PDFException, PDFStream, PDFObjRef, resolve1, list_value, dict_value,
                       stream_value)
from .pdffont import PDFFontError, PDFType1Font, PDFTrueTypeFont, PDFType3Font, PDFCIDFont
from .pdfparser import PDFDocument, PDFParser
from .pdfcolor import PDFColorSpace, PREDEFINED_COLORSPACE
from .utils import choplist
from .utils import mult_matrix, MATRIX_IDENTITY

logger = logging.getLogger(__name__)


##  Exceptions
##
class PDFResourceError(PDFException):
    pass


class PDFInterpreterError(PDFException):
    pass


##  Constants
##
LITERAL_PDF = LIT('PDF')
LITERAL_TEXT = LIT('Text')
LITERAL_FONT = LIT('Font')
LITERAL_FORM = LIT('Form')
LITERAL_IMAGE = LIT('Image')


class PDFTextState:
    """Mutable holder for the text-related parameters set by the T* operators."""

    def __init__(self):
        self.font = None
        self.fontsize = 0
        self.charspace = 0
        self.wordspace = 0
        self.scaling = 100
        self.leading = 0
        self.render = 0
        self.rise = 0
        # reset() sets self.matrix and self.linematrix.
        self.reset()

    def __repr__(self):
        # The format string must carry one %r per value in the tuple; an empty
        # format string here raises TypeError as soon as the state is printed.
        return ('<PDFTextState: font=%r, fontsize=%r, charspace=%r, wordspace=%r, '
                'scaling=%r, leading=%r, render=%r, rise=%r, '
                'matrix=%r, linematrix=%r>' %
                (self.font, self.fontsize, self.charspace, self.wordspace,
                 self.scaling, self.leading, self.render, self.rise,
                 self.matrix, self.linematrix))

    def copy(self):
        """Return a new PDFTextState carrying the same attribute values."""
        obj = PDFTextState()
        obj.font = self.font
        obj.fontsize = self.fontsize
        obj.charspace = self.charspace
        obj.wordspace = self.wordspace
        obj.scaling = self.scaling
        obj.leading = self.leading
        obj.render = self.render
        obj.rise = self.rise
        obj.matrix = self.matrix
        obj.linematrix = self.linematrix
        return obj

    def reset(self):
        """Restore the text matrix to identity and the line matrix to (0, 0)."""
        self.matrix = MATRIX_IDENTITY
        self.linematrix = (0, 0)


class PDFGraphicState:
    """Mutable holder for the line-drawing parameters set by graphics operators."""

    def __init__(self):
        self.linewidth = 0
        self.linecap = None
        self.linejoin = None
        self.miterlimit = None
        self.dash = None
        self.intent = None
        self.flatness = None

    def copy(self):
        """Return a new PDFGraphicState carrying the same attribute values."""
        obj = PDFGraphicState()
        obj.linewidth = self.linewidth
        obj.linecap = self.linecap
        obj.linejoin = self.linejoin
        obj.miterlimit = self.miterlimit
        obj.dash = self.dash
        obj.intent = self.intent
        obj.flatness = self.flatness
        return obj

    def __repr__(self):
        # One %r per value in the tuple; an empty format string raises TypeError.
        return ('<PDFGraphicState: linewidth=%r, linecap=%r, linejoin=%r, '
                'miterlimit=%r, dash=%r, intent=%r, flatness=%r>' %
                (self.linewidth, self.linecap, self.linejoin,
                 self.miterlimit, self.dash, self.intent, self.flatness))


class PDFResourceManager:
    """Repository of shared resources.

    ResourceManager facilitates reuse of shared resources
    such as fonts and images so that large objects are not
    allocated multiple times.
    """

    def __init__(self, caching=True):
        self.caching = caching
        self._cached_fonts = {}

    def get_procset(self, procs):
        """Walk the ProcSet entries; every entry is currently accepted silently."""
        for proc in procs:
            if proc is LITERAL_PDF:
                pass
            elif proc is LITERAL_TEXT:
                pass
            else:
                #raise PDFResourceError('ProcSet %r is not supported.' % proc)
                pass

    def get_cmap(self, cmapname, strict=False):
        """Return the named CMap.

        If the CMap is not found, re-raise when ``strict`` is true, otherwise
        return an empty ``CMap()``.
        """
        try:
            return CMapDB.get_cmap(cmapname)
        except CMapDB.CMapNotFound:
            if strict:
                raise
            return CMap()

    def get_font(self, objid, spec):
        """Return a font object for ``spec``, reusing a cached one when possible.

        ``objid`` is the cache key; a falsy ``objid`` disables both lookup and
        storage. New fonts are stored only when ``self.caching`` is true.
        """
        if objid and objid in self._cached_fonts:
            font = self._cached_fonts[objid]
        else:
            # logger.debug('get_font: create: objid=%r, spec=%r', objid, spec)
            if spec['Type'] is not LITERAL_FONT:
                handle_error(PDFFontError, 'Type is not /Font')
            # Create a Font object.
            if 'Subtype' in spec:
                subtype = literal_name(spec['Subtype'])
            else:
                handle_error(PDFFontError, 'Font Subtype is not specified.')
                # Fallback used when handle_error() returns instead of raising.
                subtype = 'Type1'
            if subtype in ('Type1', 'MMType1'):
                # Type1 Font
                font = PDFType1Font(self, spec)
            elif subtype == 'TrueType':
                # TrueType Font
                font = PDFTrueTypeFont(self, spec)
            elif subtype == 'Type3':
                # Type3 Font
                font = PDFType3Font(self, spec)
            elif subtype in ('CIDFontType0', 'CIDFontType2'):
                # CID Font
                font = PDFCIDFont(self, spec)
            elif subtype == 'Type0':
                # Type0 Font: build from the first descendant font, carrying
                # over the parent's Encoding/ToUnicode entries when present.
                dfonts = list_value(spec['DescendantFonts'])
                assert dfonts
                subspec = dict_value(dfonts[0]).copy()
                for k in ('Encoding', 'ToUnicode'):
                    if k in spec:
                        subspec[k] = resolve1(spec[k])
                font = self.get_font(None, subspec)
            else:
                handle_error(PDFFontError, 'Invalid Font spec: %r' % spec)
                font = PDFType1Font(self, spec)  # this is so wrong!
            if objid and self.caching:
                self._cached_fonts[objid] = font
        return font


class PDFContentParser(PSStackParser):
    """Stack parser over the concatenated data of one or more content streams."""

    def __init__(self, streams):
        # Concatenate all stream payloads into one text buffer; bytes payloads
        # are decoded as latin-1 so every byte maps to exactly one character.
        fp = io.StringIO()
        for stream in streams:
            stream = stream_value(stream)
            data = stream.get_data()
            if isinstance(data, bytes):
                data = data.decode('latin-1')
            fp.write(data)
        fp.seek(0)
        PSStackParser.__init__(self, fp)

    def get_inline_data(self, pos, target='EI'):
        """Collect inline-image data starting at ``pos`` up to ``target``.

        Scans ``self.data`` (not defined in this class; presumably provided by
        the parser base class -- TODO confirm) for ``target`` followed by a
        whitespace character. Returns ``(pos, data)`` where ``pos`` is the
        argument unchanged and ``data`` excludes the terminator and one
        trailing CR, LF or CRLF.
        """
        currpos = pos
        # i counts how many characters of target (plus the trailing space)
        # have been matched so far; 0 means "searching for target[0]".
        i = 0
        data = ''
        while i <= len(target):
            if i:
                c = self.data[currpos]
                data += c
                currpos += 1
                if len(target) <= i and c.isspace():
                    i += 1
                elif i < len(target) and c == target[i]:
                    i += 1
                else:
                    i = 0
            else:
                j = self.data.index(target[0], currpos)
                data += self.data[currpos:j+1]
                currpos = j+1
                i = 1
        data = data[:-(len(target)+1)]  # strip the last part
        data = re.sub(r'(\x0d\x0a|[\x0d\x0a])$', '', data)
        return (pos, data)

    def flush(self):
        self.add_results(*self.popall())

    KEYWORD_BI = KWD('BI')
    KEYWORD_ID = KWD('ID')
    KEYWORD_EI = KWD('EI')

    def do_keyword(self, pos, token):
        """Handle a keyword token; BI/ID are turned into an inline-image stream.

        On ``ID`` the objects collected since ``BI`` become the stream
        dictionary, the raw image data is read, and a ``PDFStream`` followed by
        an ``EI`` keyword is pushed. Any other keyword is pushed unchanged.
        """
        if token is self.KEYWORD_BI:
            # inline image within a content stream
            self.start_type(pos, 'inline')
        elif token is self.KEYWORD_ID:
            try:
                (_, objs) = self.end_type('inline')
                if len(objs) % 2 != 0:
                    raise PSTypeError('Invalid dictionary construct: %r' % objs)
                d = {literal_name(k): v for (k, v) in choplist(2, objs)}
                (pos, data) = self.get_inline_data(pos+len('ID '))
                obj = PDFStream(d, data)
                self.push((pos, obj))
                self.push((pos, self.KEYWORD_EI))
            except PSTypeError as e:
                handle_error(type(e), str(e))
        else:
            self.push((pos, token))


class PDFPageInterpreter:
    """Executes content-stream operators and forwards results to a device.

    Each operator ``X`` in a content stream is dispatched to a method named
    ``do_X`` (see ``execute`` for the name mangling of ``*``, ``"`` and ``'``).
    """

    def __init__(self, rsrcmgr, device):
        self.rsrcmgr = rsrcmgr
        self.device = device

    def dup(self):
        """Return a fresh interpreter sharing this one's manager and device."""
        return PDFPageInterpreter(self.rsrcmgr, self.device)

    # init_resources(resources):
    #   Prepare the fonts and XObjects listed in the Resource attribute.
    def init_resources(self, resources):
        self.resources = resources
        self.fontmap = {}
        self.xobjmap = {}
        self.csmap = PREDEFINED_COLORSPACE.copy()
        if not resources:
            return

        def get_colorspace(spec):
            # A missing spec maps to DeviceRGB; ICCBased/DeviceN arrays get a
            # PDFColorSpace built from their second element; anything else
            # must be a predefined name (KeyError otherwise).
            if spec is None:
                return PREDEFINED_COLORSPACE['DeviceRGB']
            if isinstance(spec, list):
                name = literal_name(spec[0])
            else:
                name = literal_name(spec)
            if name == 'ICCBased' and isinstance(spec, list) and 2 <= len(spec):
                return PDFColorSpace(name, stream_value(spec[1])['N'])
            elif name == 'DeviceN' and isinstance(spec, list) and 2 <= len(spec):
                return PDFColorSpace(name, len(list_value(spec[1])))
            else:
                return PREDEFINED_COLORSPACE[name]

        for (k, v) in dict_value(resources).items():
            # logger.debug('Resource: %r: %r', k,v)
            if k == 'Font':
                for (fontid, spec) in dict_value(v).items():
                    objid = None
                    if isinstance(spec, PDFObjRef):
                        objid = spec.objid
                    spec = dict_value(spec)
                    if spec:
                        self.fontmap[fontid] = self.rsrcmgr.get_font(objid, spec)
            elif k == 'ColorSpace':
                for (csid, spec) in dict_value(v).items():
                    self.csmap[csid] = get_colorspace(resolve1(spec))
            elif k == 'ProcSet':
                self.rsrcmgr.get_procset(list_value(v))
            elif k == 'XObject':
                for (xobjid, xobjstrm) in dict_value(v).items():
                    self.xobjmap[xobjid] = xobjstrm

    # init_state(ctm)
    #   Initialize the text and graphic states for rendering a page.
    def init_state(self, ctm):
        # gstack: stack for graphical states.
        self.gstack = []
        self.ctm = ctm
        self.device.set_ctm(self.ctm)
        self.textstate = PDFTextState()
        self.graphicstate = PDFGraphicState()
        self.curpath = []
        # argstack: stack for command arguments.
        self.argstack = []
        # set some global states.
        self.scs = self.ncs = None
        if self.csmap:
            # Default both color spaces to the first entry of csmap.
            self.scs = self.ncs = next(iter(self.csmap.values()))

    def push(self, obj):
        self.argstack.append(obj)

    def pop(self, n):
        """Remove and return the last ``n`` operands (fewer if the stack is short)."""
        if n == 0:
            # argstack[-0:] would return the whole stack, so handle 0 explicitly.
            return []
        x = self.argstack[-n:]
        self.argstack = self.argstack[:-n]
        return x

    def get_current_state(self):
        return (self.ctm, self.textstate.copy(), self.graphicstate.copy())

    def set_current_state(self, state):
        (self.ctm, self.textstate, self.graphicstate) = state
        self.device.set_ctm(self.ctm)

    # gsave
    def do_q(self):
        self.gstack.append(self.get_current_state())

    # grestore
    def do_Q(self):
        # An unbalanced Q (empty stack) is ignored.
        if self.gstack:
            self.set_current_state(self.gstack.pop())

    # concat-matrix
    def do_cm(self, a1, b1, c1, d1, e1, f1):
        self.ctm = mult_matrix((a1, b1, c1, d1, e1, f1), self.ctm)
        self.device.set_ctm(self.ctm)

    # setlinewidth
    def do_w(self, linewidth):
        self.graphicstate.linewidth = linewidth

    # setlinecap
    def do_J(self, linecap):
        self.graphicstate.linecap = linecap

    # setlinejoin
    def do_j(self, linejoin):
        self.graphicstate.linejoin = linejoin

    # setmiterlimit
    def do_M(self, miterlimit):
        self.graphicstate.miterlimit = miterlimit

    # setdash
    def do_d(self, dash, phase):
        self.graphicstate.dash = (dash, phase)

    # setintent
    def do_ri(self, intent):
        self.graphicstate.intent = intent

    # setflatness
    def do_i(self, flatness):
        self.graphicstate.flatness = flatness

    # load-gstate
    def do_gs(self, name):
        #XXX
        pass

    # moveto
    def do_m(self, x, y):
        self.curpath.append(('m', x, y))

    # lineto
    def do_l(self, x, y):
        self.curpath.append(('l', x, y))

    # curveto
    def do_c(self, x1, y1, x2, y2, x3, y3):
        self.curpath.append(('c', x1, y1, x2, y2, x3, y3))

    # curveto variant 'v' (takes points 2 and 3 only)
    def do_v(self, x2, y2, x3, y3):
        self.curpath.append(('v', x2, y2, x3, y3))

    # curveto variant 'y' (takes points 1 and 3 only)
    def do_y(self, x1, y1, x3, y3):
        self.curpath.append(('y', x1, y1, x3, y3))

    # closepath
    def do_h(self):
        self.curpath.append(('h',))

    # rectangle
    def do_re(self, x, y, w, h):
        self.curpath.append(('m', x, y))
        self.curpath.append(('l', x+w, y))
        self.curpath.append(('l', x+w, y+h))
        self.curpath.append(('l', x, y+h))
        self.curpath.append(('h',))

    # stroke
    def do_S(self):
        self.device.paint_path(self.graphicstate, True, False, False, self.curpath)
        self.curpath = []

    # close-and-stroke
    def do_s(self):
        self.do_h()
        self.do_S()

    # fill
    def do_f(self):
        self.device.paint_path(self.graphicstate, False, True, False, self.curpath)
        self.curpath = []

    # fill (obsolete)
    do_F = do_f

    # fill-even-odd
    def do_f_a(self):
        self.device.paint_path(self.graphicstate, False, True, True, self.curpath)
        self.curpath = []

    # fill-and-stroke
    def do_B(self):
        self.device.paint_path(self.graphicstate, True, True, False, self.curpath)
        self.curpath = []

    # fill-and-stroke-even-odd
    def do_B_a(self):
        self.device.paint_path(self.graphicstate, True, True, True, self.curpath)
        self.curpath = []

    # close-fill-and-stroke
    def do_b(self):
        self.do_h()
        self.do_B()

    # close-fill-and-stroke-even-odd
    def do_b_a(self):
        self.do_h()
        self.do_B_a()

    # close-only
    def do_n(self):
        self.curpath = []

    # clip
    def do_W(self):
        pass

    # clip-even-odd
    def do_W_a(self):
        pass

    # setcolorspace-stroking
    def do_CS(self, name):
        self.scs = self.csmap[literal_name(name)]

    # setcolorspace-non-stroking
    def do_cs(self, name):
        self.ncs = self.csmap[literal_name(name)]

    # The color-setting operators below consume their operands but do not
    # record the color anywhere.

    # setgray-stroking
    def do_G(self, gray):
        #self.do_CS(LITERAL_DEVICE_GRAY)
        pass

    # setgray-non-stroking
    def do_g(self, gray):
        #self.do_cs(LITERAL_DEVICE_GRAY)
        pass

    # setrgb-stroking
    def do_RG(self, r, g, b):
        #self.do_CS(LITERAL_DEVICE_RGB)
        pass

    # setrgb-non-stroking
    def do_rg(self, r, g, b):
        #self.do_cs(LITERAL_DEVICE_RGB)
        pass

    # setcmyk-stroking
    def do_K(self, c, m, y, k):
        #self.do_CS(LITERAL_DEVICE_CMYK)
        pass

    # setcmyk-non-stroking
    def do_k(self, c, m, y, k):
        #self.do_cs(LITERAL_DEVICE_CMYK)
        pass

    # setcolor
    def do_SCN(self):
        # The operand count depends on the current stroking color space.
        if self.scs:
            n = self.scs.ncomponents
        else:
            handle_error(PDFInterpreterError, 'No colorspace specified!')
            n = 1
        self.pop(n)

    def do_scn(self):
        # The operand count depends on the current non-stroking color space.
        if self.ncs:
            n = self.ncs.ncomponents
        else:
            handle_error(PDFInterpreterError, 'No colorspace specified!')
            n = 1
        self.pop(n)

    def do_SC(self):
        self.do_SCN()

    def do_sc(self):
        self.do_scn()

    # sharing-name
    def do_sh(self, name):
        pass

    # begin-text
    def do_BT(self):
        self.textstate.reset()

    # end-text
    def do_ET(self):
        pass

    # begin-compat
    def do_BX(self):
        pass

    # end-compat
    def do_EX(self):
        pass

    # marked content operators
    def do_MP(self, tag):
        self.device.do_tag(tag)

    def do_DP(self, tag, props):
        self.device.do_tag(tag, props)

    def do_BMC(self, tag):
        self.device.begin_tag(tag)

    def do_BDC(self, tag, props):
        self.device.begin_tag(tag, props)

    def do_EMC(self):
        self.device.end_tag()

    # setcharspace
    def do_Tc(self, space):
        self.textstate.charspace = space

    # setwordspace
    def do_Tw(self, space):
        self.textstate.wordspace = space

    # textscale
    def do_Tz(self, scale):
        self.textstate.scaling = scale

    # setleading
    def do_TL(self, leading):
        # Stored negated; do_T_a() adds the stored value to the text matrix
        # and do_TD() stores ty directly.
        self.textstate.leading = -leading

    # selectfont
    def do_Tf(self, fontid, fontsize):
        try:
            self.textstate.font = self.fontmap[literal_name(fontid)]
        except KeyError:
            handle_error(PDFInterpreterError, 'Undefined Font id: %r' % fontid)
            return
        self.textstate.fontsize = fontsize

    # setrendering
    def do_Tr(self, render):
        self.textstate.render = render

    # settextrise
    def do_Ts(self, rise):
        self.textstate.rise = rise

    # text-move
    def do_Td(self, tx, ty):
        (a, b, c, d, e, f) = self.textstate.matrix
        self.textstate.matrix = (a, b, c, d, tx*a+ty*c+e, tx*b+ty*d+f)
        self.textstate.linematrix = (0, 0)

    # text-move (also sets the leading to ty)
    def do_TD(self, tx, ty):
        (a, b, c, d, e, f) = self.textstate.matrix
        self.textstate.matrix = (a, b, c, d, tx*a+ty*c+e, tx*b+ty*d+f)
        self.textstate.leading = ty
        self.textstate.linematrix = (0, 0)

    # textmatrix
    def do_Tm(self, a, b, c, d, e, f):
        self.textstate.matrix = (a, b, c, d, e, f)
        self.textstate.linematrix = (0, 0)

    # nextline
    def do_T_a(self):
        (a, b, c, d, e, f) = self.textstate.matrix
        self.textstate.matrix = (a, b, c, d,
                                 self.textstate.leading*c+e,
                                 self.textstate.leading*d+f)
        self.textstate.linematrix = (0, 0)

    # show-pos
    def do_TJ(self, seq):
        if self.textstate.font is None:
            handle_error(PDFInterpreterError, 'No font specified!')
            return
        self.device.render_string(self.textstate, seq)

    # show
    def do_Tj(self, s):
        self.do_TJ([s])

    # quote
    def do__q(self, s):
        self.do_T_a()
        self.do_TJ([s])

    # doublequote
    def do__w(self, aw, ac, s):
        self.do_Tw(aw)
        self.do_Tc(ac)
        self.do_TJ([s])

    # inline image
    def do_BI(self):  # never called
        pass

    def do_ID(self):  # never called
        pass

    def do_EI(self, obj):
        try:
            if 'W' in obj and 'H' in obj:
                iobjid = str(id(obj))
                self.device.begin_figure(iobjid, (0, 0, 1, 1), MATRIX_IDENTITY)
                self.device.render_image(iobjid, obj)
                self.device.end_figure(iobjid)
        except TypeError:
            # Sometimes, 'obj' is a PSLiteral. I'm not sure why, but I'm guessing it's because it's
            # malformed or something. We can just ignore the thing.
            logger.warning("Malformed inline image")

    # invoke an XObject
    def do_Do(self, xobjid):
        xobjid = literal_name(xobjid)
        try:
            xobj = stream_value(self.xobjmap[xobjid])
        except KeyError:
            handle_error(PDFInterpreterError, 'Undefined xobject id: %r' % xobjid)
            return
        logger.debug('Processing xobj: %r', xobj)
        subtype = xobj.get('Subtype')
        if subtype is LITERAL_FORM and 'BBox' in xobj:
            # Form XObjects are rendered by a separate interpreter so this
            # interpreter's state is left untouched.
            interpreter = self.dup()
            bbox = list_value(xobj['BBox'])
            matrix = list_value(xobj.get('Matrix', MATRIX_IDENTITY))
            # According to PDF reference 1.7 section 4.9.1, XObjects in
            # earlier PDFs (prior to v1.2) use the page's Resources entry
            # instead of having their own Resources entry.
            resources = dict_value(xobj.get('Resources')) or self.resources.copy()
            self.device.begin_figure(xobjid, bbox, matrix)
            interpreter.render_contents(resources, [xobj], ctm=mult_matrix(matrix, self.ctm))
            self.device.end_figure(xobjid)
        elif subtype is LITERAL_IMAGE and 'Width' in xobj and 'Height' in xobj:
            self.device.begin_figure(xobjid, (0, 0, 1, 1), MATRIX_IDENTITY)
            self.device.render_image(xobjid, xobj)
            self.device.end_figure(xobjid)
        else:
            # unsupported xobject type.
            pass

    def process_page(self, page):
        """Render one page: pick the CTM from its rotation, then run its contents."""
        logger.debug('Processing page: %r', page)
        (x0, y0, x1, y1) = page.mediabox
        if page.rotate == 90:
            ctm = (0, -1, 1, 0, -y0, x1)
        elif page.rotate == 180:
            ctm = (-1, 0, 0, -1, x1, y1)
        elif page.rotate == 270:
            ctm = (0, 1, -1, 0, y1, -x0)
        else:
            ctm = (1, 0, 0, 1, -x0, -y0)
        self.device.begin_page(page, ctm)
        self.render_contents(page.resources, page.contents, ctm=ctm)
        self.device.end_page(page)

    # render_contents(resources, streams, ctm)
    #   Render the content streams.
    #   This method may be called recursively.
    def render_contents(self, resources, streams, ctm=MATRIX_IDENTITY):
        logger.debug('render_contents: resources=%r, streams=%r, ctm=%r', resources, streams, ctm)
        self.init_resources(resources)
        self.init_state(ctm)
        self.execute(list_value(streams))

    def execute(self, streams):
        """Parse ``streams`` and dispatch every operator to its ``do_*`` method.

        Non-keyword objects are pushed as operands. For a keyword, the method
        name is ``do_`` plus the keyword with ``*`` -> ``_a``, ``"`` -> ``_w``
        and ``'`` -> ``_q``. The number of operands popped is taken from the
        method's positional-argument count; if too few operands are available
        the operator is skipped.
        """
        try:
            parser = PDFContentParser(streams)
        except PSEOF:
            # empty page
            return
        while True:
            try:
                (_, obj) = parser.nextobject()
            except PSEOF:
                break
            if isinstance(obj, PSKeyword):
                name = keyword_name(obj)
                method = 'do_%s' % name.replace('*', '_a').replace('"', '_w').replace("'", '_q')
                if hasattr(self, method):
                    func = getattr(self, method)
                    # Subtract one for 'self'.
                    nargs = func.__code__.co_argcount-1
                    if nargs:
                        args = self.pop(nargs)
                        # logger.debug('exec: %s %r', name, args)
                        if len(args) == nargs:
                            func(*args)
                    else:
                        # logger.debug('exec: %s', name)
                        func()
                else:
                    handle_error(PDFInterpreterError, 'Unknown operator: %r' % name)
            else:
                self.push(obj)


class PDFTextExtractionNotAllowed(PDFInterpreterError):
    pass


def process_pdf(rsrcmgr, device, fp, pagenos=None, maxpages=0, password='', caching=True,
                check_extractable=True):
    """Interpret the pages of the PDF read from ``fp`` and send them to ``device``.

    ``pagenos``, when truthy, restricts processing to the zero-based page
    indexes it contains. ``maxpages``, when non-zero, stops iteration once the
    page at index ``maxpages - 1`` has been reached. Raises
    ``PDFTextExtractionNotAllowed`` when ``check_extractable`` is true and the
    document reports itself as not extractable.
    """
    # Create a PDF parser object associated with the file object.
    parser = PDFParser(fp)
    # Create a PDF document object that stores the document structure.
    doc = PDFDocument(caching=caching)
    # Connect the parser and document objects.
    parser.set_document(doc)
    doc.set_parser(parser)
    # Supply the document password for initialization.
    # (If no password is set, give an empty string.)
    doc.initialize(password)
    # Check if the document allows text extraction. If not, abort.
    if check_extractable and not doc.is_extractable:
        raise PDFTextExtractionNotAllowed('Text extraction is not allowed: %r' % fp)
    # Create a PDF interpreter object.
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    # Process each page contained in the document.
    for (pageno, page) in enumerate(doc.get_pages()):
        if pagenos and (pageno not in pagenos):
            continue
        interpreter.process_page(page)
        if maxpages and maxpages <= pageno+1:
            break