#!/home/alpcentaur/ProjektA/PrototypeWebApp/venv/bin/python3.5
#
# dumppdf.py - dump pdf contents in XML format.
#
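# usage: dumppdf.py [-d] [-a] [-p pageid] [-P password] [-r|-b|-t] [-T] [-i objid] file ...
# options:
# -d : enable debug logging
# -a : dump all objects
# -p pageid : comma-separated page numbers (1-based) to dump
# -P password : password used to open the document
# -r|-b|-t : stream output mode: raw, binary or text
# -T : dump the document outline instead of objects
# -i objid : comma-separated object ids to dump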
#
import sys, re
from pdfminer.psparser import PSKeyword, PSLiteral
from pdfminer.pdfparser import PDFDocument, PDFParser, PDFNoOutlines
from pdfminer.pdftypes import PDFStream, PDFObjRef, resolve1, stream_value
from pdfminer.utils import set_debug_logging
# Characters that are written as numeric character references: control
# characters, XML specials (& < >), parentheses, both quote characters,
# backslash, DEL and everything in the U+007F-U+00FF range.
ESC_PAT = re.compile(r'[\000-\037&<>()"\042\047\134\177-\377]')


def e(s):
    """Return *s* with every character matched by ESC_PAT escaped.

    Each matched character is replaced by its decimal numeric character
    reference (``&#NN;``) so the result can be embedded in XML text or in
    an attribute value. Characters above U+00FF are left untouched.
    """
    # The previous replacement was '%d;', which dropped the '&#' prefix and
    # produced plain digits instead of a character reference.
    return ESC_PAT.sub(lambda m: '&#%d;' % ord(m.group(0)), s)
# dumpxml
def _write_bytes(out, data):
    """Write the bytes *data* to *out*, bypassing a text layer if present."""
    binary = getattr(out, 'buffer', None)
    if binary is None:
        # Already a binary stream (or one without an underlying buffer).
        out.write(data)
        return
    # Flush pending text first so the output keeps its order.
    out.flush()
    binary.write(data)


def dumpxml(out, obj, codec=None):
    """Serialize the PDF object *obj* to *out* as an XML fragment.

    out   -- writable stream receiving the output.
    obj   -- None, dict, list, bytes, str, int, float, PDFStream,
             PDFObjRef, PSKeyword or PSLiteral.
    codec -- only used for PDFStream objects: 'raw' writes
             obj.get_rawdata(), 'binary' writes obj.get_data(), 'text'
             adds the escaped stream data to the XML; any other value
             emits the stream attributes only.

    Raises TypeError for any other object type.

    NOTE(review): the element names were stripped from the format strings
    in this copy of the file (leaving format strings that raised
    TypeError); they are reconstructed here -- confirm against consumers
    of the output.
    """
    if obj is None:
        out.write('<null />')
        return

    if isinstance(obj, dict):
        out.write('<dict size="%d">\n' % len(obj))
        for (k, v) in obj.items():
            out.write('<key>%s</key>\n' % k)
            out.write('<value>')
            # codec is deliberately not propagated to nested values,
            # matching the existing call pattern.
            dumpxml(out, v)
            out.write('</value>\n')
        out.write('</dict>')
        return

    if isinstance(obj, list):
        out.write('<list size="%d">\n' % len(obj))
        for v in obj:
            dumpxml(out, v)
            out.write('\n')
        out.write('</list>')
        return

    if isinstance(obj, bytes):
        # latin-1 maps every byte to one code point, so this never fails.
        obj = obj.decode('latin-1')
    if isinstance(obj, str):
        out.write('<string size="%d">%s</string>' % (len(obj), e(obj)))
        return

    if isinstance(obj, PDFStream):
        if codec == 'raw':
            _write_bytes(out, obj.get_rawdata())
        elif codec == 'binary':
            _write_bytes(out, obj.get_data())
        else:
            out.write('<stream>\n<props>\n')
            dumpxml(out, obj.attrs)
            out.write('\n</props>\n')
            if codec == 'text':
                data = obj.get_data()
                if isinstance(data, bytes):
                    # e() uses a str pattern; decode like plain byte strings.
                    data = data.decode('latin-1')
                out.write('<data size="%d">%s</data>\n' % (len(data), e(data)))
            out.write('</stream>')
        return

    if isinstance(obj, PDFObjRef):
        out.write('<ref id="%d"/>' % obj.objid)
        return

    if isinstance(obj, PSKeyword):
        out.write('<keyword>%s</keyword>' % obj.name)
        return

    if isinstance(obj, PSLiteral):
        out.write('<literal>%s</literal>' % obj.name)
        return

    if isinstance(obj, (int, float)):
        out.write('<number>%s</number>' % obj)
        return

    raise TypeError(obj)
# dumptrailers
def dumptrailers(out, doc):
    """Write the trailer of every xref in *doc* to *out*.

    out -- writable text stream.
    doc -- document object exposing an iterable ``xrefs`` attribute whose
           items carry a ``trailer`` attribute.

    Each trailer is serialized with dumpxml() and wrapped in its own
    element so that trailers can be told apart from object output.

    NOTE(review): the wrapper element name was stripped from the string
    literals in this copy of the file and is reconstructed here -- confirm.
    """
    for xref in doc.xrefs:
        out.write('<trailer>\n')
        dumpxml(out, xref.trailer)
        out.write('\n</trailer>\n\n')
# dumpallobjs
def dumpallobjs(out, doc, codec=None):
    """Dump every object reachable through the xrefs of *doc*, then the trailers.

    out   -- writable stream.
    doc   -- document object exposing ``xrefs`` (each with get_objids())
             and ``getobj(objid)``.
    codec -- passed to dumpxml() for each object.

    Object ids for which getobj() returns None are skipped.

    NOTE(review): the element names were stripped from this copy of the
    file, which also removed the dumpxml() call between the object tags
    (the fetched object was never written); both are reconstructed here.
    """
    out.write('<pdf>')
    for xref in doc.xrefs:
        for objid in xref.get_objids():
            # Errors propagate to the caller; the former bare
            # "except: raise" wrapper did nothing else.
            obj = doc.getobj(objid)
            if obj is None:
                continue
            out.write('<object id="%d">\n' % objid)
            dumpxml(out, obj, codec=codec)
            out.write('\n</object>\n\n')
    dumptrailers(out, doc)
    out.write('</pdf>')
# dumpoutline
def dumpoutline(outfp, fname, objids, pagenos, password='',
                dumpall=False, codec=None):
    """Write the outline (table of contents) of the PDF *fname* to *outfp*.

    outfp    -- writable text stream.
    fname    -- path of the PDF file to read.
    password -- password passed to the document's initialize().
    objids, pagenos, dumpall, codec -- accepted so the signature matches
        dumppdf(); they are not used here.

    For each outline entry the destination is resolved (directly or via a
    /GoTo action) and, when available, the 0-based index of the target
    page is emitted. Documents without outlines produce no output.

    NOTE(review): the element names were stripped from the format strings
    in this copy of the file and are reconstructed here -- confirm.
    """
    doc = PDFDocument()
    # "with" and "finally" guarantee the file and parser are released even
    # when parsing or dumping raises.
    with open(fname, 'rb') as fp:
        parser = PDFParser(fp)
        try:
            parser.set_document(doc)
            doc.set_parser(parser)
            doc.initialize(password)
            # Map page object id -> 0-based page index.
            pages = {page.pageid: pageno
                     for (pageno, page) in enumerate(doc.get_pages())}

            def resolve_dest(dest):
                # Named destinations are looked up in the document first.
                if isinstance(dest, str):
                    dest = resolve1(doc.get_dest(dest))
                elif isinstance(dest, PSLiteral):
                    dest = resolve1(doc.get_dest(dest.name))
                if isinstance(dest, dict):
                    dest = dest['D']
                return dest

            try:
                outlines = doc.get_outlines()
                outfp.write('<outlines>\n')
                for (level, title, dest, a, se) in outlines:
                    pageno = None
                    if dest:
                        dest = resolve_dest(dest)
                        pageno = pages[dest[0].objid]
                    elif a:
                        action = a.resolve()
                        if isinstance(action, dict):
                            subtype = action.get('S')
                            if (subtype and repr(subtype) == '/GoTo'
                                    and action.get('D')):
                                dest = resolve_dest(action['D'])
                                pageno = pages[dest[0].objid]
                    # Keep the title as str: encoding it to bytes made the
                    # %-formatting below emit "b'...'" on Python 3.
                    s = e(title)
                    outfp.write('<outline level="%r" title="%s">\n' % (level, s))
                    if dest is not None:
                        outfp.write('<dest>')
                        dumpxml(outfp, dest)
                        outfp.write('</dest>\n')
                    if pageno is not None:
                        outfp.write('<pageno>%r</pageno>\n' % pageno)
                    outfp.write('</outline>\n')
                outfp.write('</outlines>\n')
            except PDFNoOutlines:
                # A document without outlines is not an error.
                pass
        finally:
            parser.close()
# dumppdf
def dumppdf(outfp, fname, objids, pagenos, password='',
            dumpall=False, codec=None):
    """Dump selected parts of the PDF *fname* to *outfp*.

    outfp    -- writable stream.
    fname    -- path of the PDF file to read.
    objids   -- iterable of object ids to dump.
    pagenos  -- collection of 0-based page indexes to dump.
    password -- password passed to the document's initialize().
    dumpall  -- when true, dump every object via dumpallobjs().
    codec    -- stream mode forwarded to dumpxml() ('raw', 'binary',
                'text' or None).

    When neither objids, pagenos nor dumpall is given, only the trailers
    are dumped. A trailing newline is written unless codec is 'raw' or
    'binary'.
    """
    doc = PDFDocument()
    # The context manager closes the file even if parsing or dumping fails.
    with open(fname, 'rb') as fp:
        parser = PDFParser(fp)
        parser.set_document(doc)
        doc.set_parser(parser)
        doc.initialize(password)
        if objids:
            for objid in objids:
                obj = doc.getobj(objid)
                dumpxml(outfp, obj, codec=codec)
        if pagenos:
            for (pageno, page) in enumerate(doc.get_pages()):
                if pageno not in pagenos:
                    continue
                if codec:
                    # With a codec, dump the page's content streams.
                    for obj in page.contents:
                        obj = stream_value(obj)
                        dumpxml(outfp, obj, codec=codec)
                else:
                    dumpxml(outfp, page.attrs)
        if dumpall:
            dumpallobjs(outfp, doc, codec=codec)
        if (not objids) and (not pagenos) and (not dumpall):
            dumptrailers(outfp, doc)
    if codec not in ('raw', 'binary'):
        outfp.write('\n')
# main
def main(argv):
    """Command-line entry point.

    argv -- full argument vector, including the program name at index 0.

    Returns 100 after printing the usage message when the options are
    invalid or no input file is given; otherwise processes every file and
    returns None.
    """
    import getopt

    def usage():
        print('usage: %s [-d] [-a] [-p pageid] [-P password] [-r|-b|-t] [-T] [-i objid] [-o output] file ...' % argv[0])
        return 100

    try:
        # 'o:' was missing from the option spec, so the -o branch below
        # could never be reached.
        (opts, args) = getopt.getopt(argv[1:], 'dap:P:rbtTi:o:')
    except getopt.GetoptError:
        return usage()
    if not args:
        return usage()

    objids = []
    pagenos = set()
    codec = None
    password = ''
    dumpall = False
    proc = dumppdf
    outpath = None
    for (k, v) in opts:
        if k == '-d':
            set_debug_logging()
        elif k == '-i':
            objids.extend(int(x) for x in v.split(','))
        elif k == '-p':
            # Page numbers are 1-based on the command line.
            pagenos.update(int(x) - 1 for x in v.split(','))
        elif k == '-P':
            password = v
        elif k == '-a':
            dumpall = True
        elif k == '-r':
            codec = 'raw'
        elif k == '-b':
            codec = 'binary'
        elif k == '-t':
            codec = 'text'
        elif k == '-T':
            proc = dumpoutline
        elif k == '-o':
            outpath = v

    # Open the output only after all options are known: raw/binary dumps
    # write bytes, everything else writes text.
    if outpath is None:
        outfp = sys.stdout
    elif codec in ('raw', 'binary'):
        outfp = open(outpath, 'wb')
    else:
        outfp = open(outpath, 'w', encoding='utf-8')
    try:
        for fname in args:
            proc(outfp, fname, objids, pagenos, password=password,
                 dumpall=dumpall, codec=codec)
    finally:
        if outfp is not sys.stdout:
            outfp.close()


if __name__ == '__main__':
    sys.exit(main(sys.argv))