You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

221 lines
6.8 KiB

4 years ago
  1. #!/home/alpcentaur/ProjektA/PrototypeWebApp/venv/bin/python3.5
  2. #
  3. # dumppdf.py - dump pdf contents in XML format.
  4. #
  5. # usage: dumppdf.py [options] [files ...]
  6. # options:
  7. # -i objid : object id
  8. #
  9. import sys, re
  10. from pdfminer.psparser import PSKeyword, PSLiteral
  11. from pdfminer.pdfparser import PDFDocument, PDFParser, PDFNoOutlines
  12. from pdfminer.pdftypes import PDFStream, PDFObjRef, resolve1, stream_value
  13. from pdfminer.utils import set_debug_logging
  14. ESC_PAT = re.compile(r'[\000-\037&<>()"\042\047\134\177-\377]')
  15. def e(s):
  16. return ESC_PAT.sub(lambda m:'&#%d;' % ord(m.group(0)), s)
  17. # dumpxml
  18. def dumpxml(out, obj, codec=None):
  19. if obj is None:
  20. out.write('<null />')
  21. return
  22. if isinstance(obj, dict):
  23. out.write('<dict size="%d">\n' % len(obj))
  24. for (k,v) in obj.items():
  25. out.write('<key>%s</key>\n' % k)
  26. out.write('<value>')
  27. dumpxml(out, v)
  28. out.write('</value>\n')
  29. out.write('</dict>')
  30. return
  31. if isinstance(obj, list):
  32. out.write('<list size="%d">\n' % len(obj))
  33. for v in obj:
  34. dumpxml(out, v)
  35. out.write('\n')
  36. out.write('</list>')
  37. return
  38. if isinstance(obj, bytes):
  39. obj = obj.decode('latin-1')
  40. if isinstance(obj, str):
  41. out.write('<string size="%d">%s</string>' % (len(obj), e(obj)))
  42. return
  43. if isinstance(obj, PDFStream):
  44. if codec == 'raw':
  45. out.write(obj.get_rawdata())
  46. elif codec == 'binary':
  47. out.write(obj.get_data())
  48. else:
  49. out.write('<stream>\n<props>\n')
  50. dumpxml(out, obj.attrs)
  51. out.write('\n</props>\n')
  52. if codec == 'text':
  53. data = obj.get_data()
  54. out.write('<data size="%d">%s</data>\n' % (len(data), e(data)))
  55. out.write('</stream>')
  56. return
  57. if isinstance(obj, PDFObjRef):
  58. out.write('<ref id="%d" />' % obj.objid)
  59. return
  60. if isinstance(obj, PSKeyword):
  61. out.write('<keyword>%s</keyword>' % obj.name)
  62. return
  63. if isinstance(obj, PSLiteral):
  64. out.write('<literal>%s</literal>' % obj.name)
  65. return
  66. if isinstance(obj, int) or isinstance(obj, float):
  67. out.write('<number>%s</number>' % obj)
  68. return
  69. raise TypeError(obj)
  70. # dumptrailers
  71. def dumptrailers(out, doc):
  72. for xref in doc.xrefs:
  73. out.write('<trailer>\n')
  74. dumpxml(out, xref.trailer)
  75. out.write('\n</trailer>\n\n')
  76. # dumpallobjs
  77. def dumpallobjs(out, doc, codec=None):
  78. out.write('<pdf>')
  79. for xref in doc.xrefs:
  80. for objid in xref.get_objids():
  81. try:
  82. obj = doc.getobj(objid)
  83. if obj is None: continue
  84. out.write('<object id="%d">\n' % objid)
  85. dumpxml(out, obj, codec=codec)
  86. out.write('\n</object>\n\n')
  87. except:
  88. raise
  89. dumptrailers(out, doc)
  90. out.write('</pdf>')
  91. # dumpoutline
  92. def dumpoutline(outfp, fname, objids, pagenos, password='',
  93. dumpall=False, codec=None):
  94. doc = PDFDocument()
  95. fp = open(fname, 'rb')
  96. parser = PDFParser(fp)
  97. parser.set_document(doc)
  98. doc.set_parser(parser)
  99. doc.initialize(password)
  100. pages = dict( (page.pageid, pageno) for (pageno,page) in enumerate(doc.get_pages()) )
  101. def resolve_dest(dest):
  102. if isinstance(dest, str):
  103. dest = resolve1(doc.get_dest(dest))
  104. elif isinstance(dest, PSLiteral):
  105. dest = resolve1(doc.get_dest(dest.name))
  106. if isinstance(dest, dict):
  107. dest = dest['D']
  108. return dest
  109. try:
  110. outlines = doc.get_outlines()
  111. outfp.write('<outlines>\n')
  112. for (level,title,dest,a,se) in outlines:
  113. pageno = None
  114. if dest:
  115. dest = resolve_dest(dest)
  116. pageno = pages[dest[0].objid]
  117. elif a:
  118. action = a.resolve()
  119. if isinstance(action, dict):
  120. subtype = action.get('S')
  121. if subtype and repr(subtype) == '/GoTo' and action.get('D'):
  122. dest = resolve_dest(action['D'])
  123. pageno = pages[dest[0].objid]
  124. s = e(title).encode('utf-8', 'xmlcharrefreplace')
  125. outfp.write('<outline level="%r" title="%s">\n' % (level, s))
  126. if dest is not None:
  127. outfp.write('<dest>')
  128. dumpxml(outfp, dest)
  129. outfp.write('</dest>\n')
  130. if pageno is not None:
  131. outfp.write('<pageno>%r</pageno>\n' % pageno)
  132. outfp.write('</outline>\n')
  133. outfp.write('</outlines>\n')
  134. except PDFNoOutlines:
  135. pass
  136. parser.close()
  137. fp.close()
  138. # dumppdf
  139. def dumppdf(outfp, fname, objids, pagenos, password='',
  140. dumpall=False, codec=None):
  141. doc = PDFDocument()
  142. fp = open(fname, 'rb')
  143. parser = PDFParser(fp)
  144. parser.set_document(doc)
  145. doc.set_parser(parser)
  146. doc.initialize(password)
  147. if objids:
  148. for objid in objids:
  149. obj = doc.getobj(objid)
  150. dumpxml(outfp, obj, codec=codec)
  151. if pagenos:
  152. for (pageno,page) in enumerate(doc.get_pages()):
  153. if pageno in pagenos:
  154. if codec:
  155. for obj in page.contents:
  156. obj = stream_value(obj)
  157. dumpxml(outfp, obj, codec=codec)
  158. else:
  159. dumpxml(outfp, page.attrs)
  160. if dumpall:
  161. dumpallobjs(outfp, doc, codec=codec)
  162. if (not objids) and (not pagenos) and (not dumpall):
  163. dumptrailers(outfp, doc)
  164. fp.close()
  165. if codec not in ('raw','binary'):
  166. outfp.write('\n')
  167. # main
  168. def main(argv):
  169. import getopt
  170. def usage():
  171. print('usage: %s [-d] [-a] [-p pageid] [-P password] [-r|-b|-t] [-T] [-i objid] file ...' % argv[0])
  172. return 100
  173. try:
  174. (opts, args) = getopt.getopt(argv[1:], 'dap:P:rbtTi:')
  175. except getopt.GetoptError:
  176. return usage()
  177. if not args: return usage()
  178. objids = []
  179. pagenos = set()
  180. codec = None
  181. password = ''
  182. dumpall = False
  183. proc = dumppdf
  184. outfp = sys.stdout
  185. for (k, v) in opts:
  186. if k == '-d': set_debug_logging()
  187. elif k == '-i': objids.extend( int(x) for x in v.split(',') )
  188. elif k == '-p': pagenos.update( int(x)-1 for x in v.split(',') )
  189. elif k == '-P': password = v
  190. elif k == '-a': dumpall = True
  191. elif k == '-r': codec = 'raw'
  192. elif k == '-b': codec = 'binary'
  193. elif k == '-t': codec = 'text'
  194. elif k == '-T': proc = dumpoutline
  195. elif k == '-o': outfp = open(v, 'wb')
  196. #
  197. for fname in args:
  198. proc(outfp, fname, objids, pagenos, password=password,
  199. dumpall=dumpall, codec=codec)
  200. if __name__ == '__main__':
  201. sys.exit(main(sys.argv))