import io
import re
import struct
import hashlib as md5
import logging

from .psparser import PSStackParser, PSSyntaxError, PSEOF, literal_name, LIT, KWD, handle_error
from .pdftypes import (PDFException, PDFTypeError, PDFNotImplementedError, PDFStream, PDFObjRef,
                       resolve1, decipher_all, int_value, str_value, list_value, dict_value,
                       stream_value)
from .arcfour import Arcfour
from .utils import choplist, nunpack, decode_text, ObjIdRange

logger = logging.getLogger(__name__)


## Exceptions
##
class PDFSyntaxError(PDFException):
    pass


class PDFNoValidXRef(PDFSyntaxError):
    pass


class PDFNoOutlines(PDFException):
    pass


class PDFDestinationNotFound(PDFException):
    pass


class PDFAlreadyParsed(PDFException):
    pass


class PDFEncryptionError(PDFException):
    pass


class PDFPasswordIncorrect(PDFEncryptionError):
    pass


# some predefined literals and keywords.
LITERAL_OBJSTM = LIT('ObjStm')
LITERAL_XREF = LIT('XRef')
LITERAL_PAGE = LIT('Page')
LITERAL_PAGES = LIT('Pages')
LITERAL_CATALOG = LIT('Catalog')


class PDFBaseXRef:
    """Common interface of the cross-reference implementations below.

    Subclasses override all three methods; the defaults describe an xref
    with no trailer support and no objects.
    """

    def get_trailer(self):
        """Return the trailer dictionary attached to this xref."""
        raise NotImplementedError

    def get_objids(self):
        """Return an iterable of the object ids this xref knows about."""
        return []

    def get_pos(self, objid):
        """Return a ``(strmid, index)`` pair for *objid*; KeyError if unknown."""
        raise KeyError(objid)


class PDFXRef(PDFBaseXRef):
    """Cross-reference information read from a plain-text ``xref`` table.

    Attributes:
      offsets: maps objid -> (genno, file position).
      trailer: the trailer dictionary, filled in by load_trailer().
    """

    KEYWORD_TRAILER = KWD('trailer')
    PDFOBJ_CUE = re.compile(r'^(\d+)\s+(\d+)\s+obj\b')

    def __init__(self):
        self.offsets = {}
        self.trailer = {}

    def load(self, parser):
        """Read xref subsections from *parser* until the ``trailer`` line.

        Each subsection is a "start count" header line followed by `count`
        three-field entries "pos genno use"; only entries whose `use` field
        is 'n' are recorded.  The trailer is then loaded via load_trailer().

        Raises:
          PDFNoValidXRef: on EOF or on any line that does not have the
            expected shape.
        """
        while True:
            try:
                (pos, line) = parser.nextline()
            except PSEOF:
                raise PDFNoValidXRef('Unexpected EOF - file corrupted?')
            # The emptiness check has to come before the blank-line skip,
            # otherwise it can never fire and an empty read would spin forever.
            if not line:
                raise PDFNoValidXRef('Premature eof: %r' % parser)
            if not line.strip():
                continue
            if line.startswith('trailer'):
                # Rewind so that load_trailer() sees the keyword itself.
                parser.setpos(pos)
                break
            f = line.strip().split(' ')
            if len(f) != 2:
                raise PDFNoValidXRef('Trailer not found: %r: line=%r' % (parser, line))
            try:
                (start, nobjs) = list(map(int, f))
            except ValueError:
                raise PDFNoValidXRef('Invalid line: %r: line=%r' % (parser, line))
            for objid in range(start, start+nobjs):
                try:
                    (_, line) = parser.nextline()
                except PSEOF:
                    raise PDFNoValidXRef('Unexpected EOF - file corrupted?')
                f = line.strip().split(' ')
                if len(f) != 3:
                    raise PDFNoValidXRef('Invalid XRef format: %r, line=%r' % (parser, line))
                (pos, genno, use) = f
                if use != 'n':
                    continue
                try:
                    self.offsets[objid] = (int(genno), int(pos))
                except ValueError:
                    # Non-numeric fields are a malformed table, reported the
                    # same way as the other shape errors in this method.
                    raise PDFNoValidXRef('Invalid XRef format: %r, line=%r' % (parser, line))
        logger.debug('xref objects: %r', self.offsets)
        self.load_trailer(parser)

    def load_trailer(self, parser):
        """Read the ``trailer`` keyword and its dictionary into self.trailer.

        If the parser hits EOF first, the object on top of the parser's
        stack (``parser.pop(1)``) is used instead.

        Raises:
          PDFNoValidXRef: if EOF is hit and the parser has nothing to pop.
        """
        try:
            (_, kwd) = parser.nexttoken()
            # NOTE(review): this check disappears under `python -O`; kept as
            # an assert so the exception type seen by callers is unchanged.
            assert kwd is self.KEYWORD_TRAILER
            (_, dic) = parser.nextobject()
        except PSEOF:
            x = parser.pop(1)
            if not x:
                raise PDFNoValidXRef('Unexpected EOF - file corrupted')
            (_, dic) = x[0]
        self.trailer.update(dict_value(dic))

    def load_fallback(self, parser, debug=0):
        """Rebuild the offsets by scanning the file from position 0.

        Every line matching "<objid> <genno> obj" records that line's
        position for objid (the generation number is stored as 0).  Scanning
        stops at EOF or at the first line starting with 'trailer', which is
        loaded as the trailer.  `debug` is accepted but not used.
        """
        parser.setpos(0)
        while True:
            try:
                (pos, line) = parser.nextline()
            except PSEOF:
                break
            if line.startswith('trailer'):
                parser.setpos(pos)
                self.load_trailer(parser)
                logger.debug('trailer: %r', self.get_trailer())
                break
            m = self.PDFOBJ_CUE.match(line)
            if not m:
                continue
            self.offsets[int(m.group(1))] = (0, pos)

    def get_trailer(self):
        return self.trailer

    def get_objids(self):
        return iter(self.offsets)

    def get_pos(self, objid):
        """Return ``(None, pos)`` for *objid*; KeyError if it is not listed.

        The first element is always None here; PDFXRefStream.get_pos() uses
        that slot for the id of a containing object stream.
        """
        (_, pos) = self.offsets[objid]
        return (None, pos)


class PDFXRefStream(PDFBaseXRef):
    """Cross-reference information read from an /XRef stream object.

    Attributes:
      data: the decoded stream data (fixed-width binary entries).
      fl1, fl2, fl3: byte widths of the three fields of an entry (/W).
      entlen: fl1+fl2+fl3, the size of one entry.
      objid_ranges: ObjIdRange objects built from /Index.
      trailer: the stream's attribute dictionary.
    """

    def __init__(self):
        self.data = None
        self.entlen = None
        self.fl1 = self.fl2 = self.fl3 = None
        self.objid_ranges = []
        # Initialised like PDFXRef.trailer so get_trailer() works before load().
        self.trailer = {}

    def __repr__(self):
        # %r rather than %d: the widths are None until load() has run.
        return '<PDFXRefStream: fields=%r,%r,%r>' % (self.fl1, self.fl2, self.fl3)

    def load(self, parser):
        """Read an "objid genno obj <stream>" sequence and index its entries.

        Raises:
          PDFNoValidXRef: if the object is not a PDFStream of /Type /XRef.
          PDFSyntaxError: if /Index holds an odd number of values.
        """
        parser.nexttoken()  # objid, ignored
        parser.nexttoken()  # genno, ignored
        parser.nexttoken()  # `obj` keyword, ignored
        (_, stream) = parser.nextobject()
        # .get() so that a stream lacking /Type is reported as an invalid
        # xref rather than leaking a KeyError.
        if not isinstance(stream, PDFStream) or stream.get('Type') is not LITERAL_XREF:
            raise PDFNoValidXRef('Invalid PDF stream spec.')
        size = stream['Size']
        # Without /Index there is a single range covering 0..Size-1.
        index_array = stream.get('Index', (0, size))
        if len(index_array) % 2 != 0:
            raise PDFSyntaxError('Invalid index number')
        self.objid_ranges.extend(
            ObjIdRange(start, nobjs) for (start, nobjs) in choplist(2, index_array)
        )
        (self.fl1, self.fl2, self.fl3) = stream['W']
        self.data = stream.get_data()
        self.entlen = self.fl1+self.fl2+self.fl3
        self.trailer = stream.attrs
        if logger.getEffectiveLevel() <= logging.DEBUG:
            # Guarded because building the range list is wasted work otherwise.
            logger.debug('xref stream: objid=%s, fields=%d,%d,%d',
                         ', '.join(map(repr, self.objid_ranges)),
                         self.fl1, self.fl2, self.fl3)

    def get_trailer(self):
        return self.trailer

    def get_objids(self):
        for objid_range in self.objid_ranges:
            yield from range(objid_range.get_start_id(), objid_range.get_end_id()+1)

    def get_pos(self, objid):
        """Locate *objid* in the stream's entry table.

        Returns:
          ``(None, pos)`` for a type-1 entry, or ``(strmid, index)`` for a
          type-2 entry (second and third fields of the entry).

        Raises:
          KeyError: if objid is in no /Index range, or its entry has any
            other type (a free object).
        """
        # Entries are stored range after range, so the entry number is the
        # total size of the preceding ranges plus the offset into this one.
        offset = 0
        for objid_range in self.objid_ranges:
            start_id = objid_range.get_start_id()
            if start_id <= objid <= objid_range.get_end_id():
                offset += objid - start_id
                break
            offset += objid_range.get_nobjs()
        else:
            raise KeyError(objid)
        i = self.entlen * offset
        ent = self.data[i:i+self.entlen]
        field2 = ent[self.fl1:self.fl1+self.fl2]
        field3 = ent[self.fl1+self.fl2:]
        # A zero-width type field means the type defaults to 1.
        f1 = nunpack(ent[:self.fl1], 1)
        if f1 == 1:
            return (None, nunpack(field2))
        elif f1 == 2:
            return (nunpack(field2), nunpack(field3))
        # this is a free object
        raise KeyError(objid)


class PDFPage:
    """An object that holds the information about a page.

    A PDFPage object is merely a convenience class that has a set
    of keys and values, which describe the properties of a page
    and point to its contents.

    Attributes:
      doc: a PDFDocument object.
      pageid: any Python object that can uniquely identify the page.
      attrs: a dictionary of page attributes.
      contents: a list of PDFStream objects that represents the page content.
      lastmod: the last modified time of the page.
      resources: a list of resources used by the page.
      mediabox: the physical size of the page.
      cropbox: the crop rectangle of the page.
      rotate: the page rotation (in degree).
      annots: the page annotations.
      beads: a chain that represents natural reading order.
    """

    def __init__(self, doc, pageid, attrs):
        """Initialize a page object.

        doc: a PDFDocument object.
        pageid: any Python object that can uniquely identify the page.
        attrs: a dictionary of page attributes.

        Raises KeyError if attrs has no 'Resources' or 'MediaBox' entry.
        """
        self.doc = doc
        self.pageid = pageid
        self.attrs = dict_value(attrs)
        self.lastmod = resolve1(self.attrs.get('LastModified'))
        self.resources = resolve1(self.attrs['Resources'])
        self.mediabox = resolve1(self.attrs['MediaBox'])
        if 'CropBox' in self.attrs:
            self.cropbox = resolve1(self.attrs['CropBox'])
        else:
            # No crop box given: fall back to the media box.
            self.cropbox = self.mediabox
        # resolve1() is applied, as for the neighbouring attributes, so a
        # /Rotate given as an indirect reference is dereferenced before the
        # arithmetic (assumes resolve1 returns plain values unchanged --
        # TODO confirm in pdftypes).  +360 normalises negative angles.
        self.rotate = (resolve1(self.attrs.get('Rotate', 0))+360) % 360
        self.annots = self.attrs.get('Annots')
        self.beads = self.attrs.get('B')
        if 'Contents' in self.attrs:
            contents = resolve1(self.attrs['Contents'])
        else:
            contents = []
        # A single content stream is wrapped so `contents` is always a list.
        if not isinstance(contents, list):
            contents = [contents]
        self.contents = contents

    def __repr__(self):
        return '<PDFPage: Resources=%r, MediaBox=%r>' % (self.resources, self.mediabox)


class PDFDocument:
    """PDFDocument object represents a PDF document.

    Since a PDF file can be very big, normally it is not loaded at
    once. So PDF document has to cooperate with a PDF parser in order to
    dynamically import the data as processing goes.

    Typical usage:
      doc = PDFDocument()
      doc.set_parser(parser)
      doc.initialize(password)
      obj = doc.getobj(objid)
    """

    KEYWORD_OBJ = KWD('obj')

    def __init__(self, caching=True):
        self.caching = caching
        self.xrefs = []
        self.info = []
        self.catalog = None
        self.encryption = None
        self.decipher = None
        self._parser = None
        self._cached_objs = {}   # objid -> object
        self._parsed_objs = {}   # object stream id -> list of its parsed objects
        self._parsed_everything = False

    def _parse_next_object(self, parser):
        """Read tokens up to the next ``obj`` keyword, then the object after it.

        Returns ``(objid, genno, obj)``, objid and genno being the two tokens
        immediately before the keyword.  PSEOF from the parser propagates.
        """
        # This is a bit awkward and I suspect that it could be a lot more elegant, but it would
        # require refactoring the parsing process and I don't want to do that yet.
        stack = []
        _, token = parser.nexttoken()
        while token is not self.KEYWORD_OBJ:
            stack.append(token)
            _, token = parser.nexttoken()
        objid = stack[-2]
        genno = stack[-1]
        _, obj = parser.nextobject()
        return objid, genno, obj

    def _parse_objstream(self, stream):
        """Parse every object of an object stream into self._cached_objs."""
        # ObjStm have a special organization. First, the param "N" tells how many objs we have in
        # there. Then, they start with a list of (objid, byte offset) pairs, and then the actual
        # objects come in, in the same order as the pairs.
        parser = PDFStreamParser(stream.get_data())
        parser.set_document(self)
        objcount = stream['N']
        objids = []
        for _ in range(objcount):
            _, objid = parser.nextobject()
            parser.nextobject()  # byte offset of the object; not needed here
            objids.append(objid)
        # Now we should be at the point where we read objects
        for objid in objids:
            _, obj = parser.nextobject()
            self._cached_objs[objid] = obj

    def _parse_whole(self, parser):
        """Cache every object readable from *parser*, expanding object streams."""
        while True:
            try:
                objid, genno, obj = self._parse_next_object(parser)
                self._cached_objs[objid] = obj
                if isinstance(obj, PDFStream) and obj.get('Type') is LITERAL_OBJSTM:
                    obj.set_objid(objid, genno)
                    self._parse_objstream(obj)
            except PSEOF:
                # End of input is the normal way out of this loop.
                break

    def _parse_everything(self):
        """Read all objects of the file at once; raise PDFAlreadyParsed on a repeat call."""
        # Sometimes, we have malformed xref, but we still want to manage to read the PDF. In cases
        # like these, the last resort is to read all objects at once so that our object reference
        # can finally be resolved. This is slower than the normal method, so only use this when the
        # xref tables are corrupt/wrong/whatever.
        if self._parsed_everything:
            raise PDFAlreadyParsed()
        parser = self._parser
        parser.setpos(0)
        parser.reset()
        self._parse_whole(parser)
        self._parsed_everything = True

    def _getobj(self, objid):
        """Return the object with id *objid*, or None when it cannot be read.

        Looks in the cache first; otherwise locates the object through
        find_obj_ref() and reads it either from its object stream or directly
        from the file.  The result is cached (when caching is on) before it
        is passed through decipher_all().

        Raises:
          PDFException: if no xref has been loaded yet.
          PDFSyntaxError: for an out-of-range index in an object stream or a
            missing ``obj`` keyword at the recorded offset.
        """
        if not self.xrefs:
            raise PDFException('PDFDocument is not initialized')
        # logger.debug('getobj: objid=%r', objid)
        if objid in self._cached_objs:
            genno = 0
            obj = self._cached_objs[objid]
        else:
            strmid, index = self.find_obj_ref(objid)
            if index is None:
                handle_error(PDFSyntaxError, 'Cannot locate objid=%r' % objid)
                # return null for a nonexistent reference.
                return None
            if strmid:
                # The object lives inside object stream `strmid`.
                stream = self.getobj(strmid)
                if stream is None:
                    return None
                stream = stream_value(stream)
                if stream.get('Type') is not LITERAL_OBJSTM:
                    handle_error(PDFSyntaxError, 'Not a stream object: %r' % stream)
                try:
                    n = stream['N']
                except KeyError:
                    handle_error(PDFSyntaxError, 'N is not defined: %r' % stream)
                    n = 0
                if strmid in self._parsed_objs:
                    objs = self._parsed_objs[strmid]
                else:
                    parser = PDFStreamParser(stream.get_data())
                    parser.set_document(self)
                    objs = []
                    try:
                        while True:
                            _, obj = parser.nextobject()
                            objs.append(obj)
                    except PSEOF:
                        # Reading until EOF is how the stream is exhausted.
                        pass
                    if self.caching:
                        self._parsed_objs[strmid] = objs
                genno = 0
                # The first 2*n parsed items are the header integer pairs;
                # the objects themselves follow.
                i = n*2+index
                try:
                    obj = objs[i]
                except IndexError:
                    raise PDFSyntaxError('Invalid object number: objid=%r' % (objid))
                if isinstance(obj, PDFStream):
                    obj.set_objid(objid, 0)
            else:
                # The object is stored directly in the file at offset `index`.
                try:
                    self._parser.setpos(index)
                except PSEOF:
                    handle_error(PSEOF, 'Parser index out of bounds')
                    return None
                (_, objid1) = self._parser.nexttoken()  # objid
                (_, genno) = self._parser.nexttoken()  # genno
                (_, kwd) = self._parser.nexttoken()
                # #### hack around malformed pdf files
                #assert objid1 == objid, (objid, objid1)
                if objid1 != objid:
                    # Skip ahead to the `obj` keyword, remembering every token
                    # seen, and take the one right before the keyword as the
                    # generation number.  (The keyword itself ends up last in
                    # the list, so it must not be used as genno.)
                    tokens = [objid1, genno, kwd]
                    while kwd is not self.KEYWORD_OBJ:
                        (_, kwd) = self._parser.nexttoken()
                        tokens.append(kwd)
                    genno = tokens[-2]
                # #### end hack around malformed pdf files
                if kwd is not self.KEYWORD_OBJ:
                    raise PDFSyntaxError('Invalid object spec: offset=%r' % index)
                try:
                    (_, obj) = self._parser.nextobject()
                    if isinstance(obj, PDFStream):
                        obj.set_objid(objid, genno)
                except PSEOF:
                    return None
            # logger.debug('register: objid=%r: %r', objid, obj)
            if self.caching:
                self._cached_objs[objid] = obj
        if self.decipher:
            obj = decipher_all(self.decipher, objid, genno, obj)
        return obj

    def set_parser(self, parser):
        """Set the document to use a given PDFParser object.

        Does nothing if a parser is already set.  Otherwise reads the xrefs
        through ``parser.read_xref()`` and walks their trailers, recording
        /Encrypt and /Info entries, until one with a /Root entry is found.

        Raises:
          PDFSyntaxError: if no trailer has a /Root entry.
        """
        if self._parser:
            return
        self._parser = parser
        # Retrieve the information of each header that was appended
        # (maybe multiple times) at the end of the document.
        self.xrefs = parser.read_xref()
        for xref in self.xrefs:
            trailer = xref.get_trailer()
            if not trailer:
                continue
            # If there's an encryption info, remember it.
            if 'Encrypt' in trailer:
                #assert not self.encryption
                self.encryption = (list_value(trailer['ID']),
                                   dict_value(trailer['Encrypt']))
            if 'Info' in trailer:
                self.info.append(dict_value(trailer['Info']))
            if 'Root' in trailer:
                # Every PDF file must have exactly one /Root dictionary.
                self.catalog = dict_value(trailer['Root'])
                break
        else:
            # The loop finished without hitting `break`: no /Root anywhere.
            raise PDFSyntaxError('No /Root object! - Is this really a PDF?')
        if self.catalog.get('Type') is not LITERAL_CATALOG:
            handle_error(PDFSyntaxError, 'Catalog not found!')

    # NOTE(review): the definition that follows is cut off in this view, so it is kept
    # verbatim (unformatted) below rather than reconstructed from a partial body.
    # initialize(password='') # Perform the initialization with a given password. # This step is mandatory even if there's no password associated # with the document. PASSWORD_PADDING = b'(\xbfN^Nu\x8aAd\x00NV\xff\xfa\x01\x08..\x00\xb6\xd0h>\x80/\x0c\xa9\xfedSiz' def initialize(self, password=''): if not self.encryption: self.is_printable = self.is_modifiable = self.is_extractable = True return (docid, param) = self.encryption if literal_name(param.get('Filter')) != 'Standard': raise PDFEncryptionError('Unknown filter: param=%r' % param) V = int_value(param.get('V', 0)) if not (V == 1 or V == 2): raise PDFEncryptionError('Unknown algorithm: param=%r' % param) length = int_value(param.get('Length', 40)) # Key length (bits) O = str_value(param['O']) R = int_value(param['R']) # Revision if 5 <= R: raise PDFEncryptionError('Unknown revision: %r' % R) U = str_value(param['U']) P = int_value(param['P']) self.is_printable = bool(P & 4) self.is_modifiable = bool(P & 8) self.is_extractable = bool(P & 16) # Algorithm 3.2 # XXX is latin-1 the correct encoding??? password = password.encode('latin-1') password = (password+self.PASSWORD_PADDING)[:32] # 1 hash = md5.md5(password) # 2 hash.update(O) # 3 hash.update(struct.pack('