You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

460 lines
18 KiB

4 years ago
  1. import os.path
  2. import logging
  3. from .pdfdevice import PDFTextDevice
  4. from .pdffont import PDFUnicodeNotDefined
  5. from .pdftypes import LITERALS_DCT_DECODE
  6. from .pdfcolor import LITERAL_DEVICE_GRAY, LITERAL_DEVICE_RGB
  7. from .layout import LTContainer, LTPage, LTText, LTLine, LTRect, LTCurve
  8. from .layout import LTFigure, LTImage, LTChar, LTTextLine
  9. from .layout import LTTextBox, LTTextBoxVertical, LTTextGroup
  10. from .utils import apply_matrix_pt, mult_matrix
  11. from .utils import htmlescape, bbox2str, create_bmp
  12. logger = logging.getLogger(__name__)
  13. class PDFLayoutAnalyzer(PDFTextDevice):
  14. def __init__(self, rsrcmgr, pageno=1, laparams=None):
  15. PDFTextDevice.__init__(self, rsrcmgr)
  16. self.pageno = pageno
  17. self.laparams = laparams
  18. self._stack = []
  19. def begin_page(self, page, ctm):
  20. (x0,y0,x1,y1) = page.mediabox
  21. (x0,y0) = apply_matrix_pt(ctm, (x0,y0))
  22. (x1,y1) = apply_matrix_pt(ctm, (x1,y1))
  23. mediabox = (0, 0, abs(x0-x1), abs(y0-y1))
  24. self.cur_item = LTPage(self.pageno, mediabox)
  25. def end_page(self, page):
  26. assert not self._stack
  27. assert isinstance(self.cur_item, LTPage)
  28. if self.laparams is not None:
  29. self.cur_item.analyze(self.laparams)
  30. self.pageno += 1
  31. self.receive_layout(self.cur_item)
  32. def begin_figure(self, name, bbox, matrix):
  33. self._stack.append(self.cur_item)
  34. self.cur_item = LTFigure(name, bbox, mult_matrix(matrix, self.ctm))
  35. def end_figure(self, _):
  36. fig = self.cur_item
  37. assert isinstance(self.cur_item, LTFigure)
  38. self.cur_item = self._stack.pop()
  39. self.cur_item.add(fig)
  40. def render_image(self, name, stream):
  41. assert isinstance(self.cur_item, LTFigure)
  42. item = LTImage(name, stream,
  43. (self.cur_item.x0, self.cur_item.y0,
  44. self.cur_item.x1, self.cur_item.y1))
  45. self.cur_item.add(item)
  46. def paint_path(self, gstate, stroke, fill, evenodd, path):
  47. shape = ''.join(x[0] for x in path)
  48. if shape == 'ml':
  49. # horizontal/vertical line
  50. (_,x0,y0) = path[0]
  51. (_,x1,y1) = path[1]
  52. (x0,y0) = apply_matrix_pt(self.ctm, (x0,y0))
  53. (x1,y1) = apply_matrix_pt(self.ctm, (x1,y1))
  54. if x0 == x1 or y0 == y1:
  55. self.cur_item.add(LTLine(gstate.linewidth, (x0,y0), (x1,y1)))
  56. return
  57. if shape == 'mlllh':
  58. # rectangle
  59. (_,x0,y0) = path[0]
  60. (_,x1,y1) = path[1]
  61. (_,x2,y2) = path[2]
  62. (_,x3,y3) = path[3]
  63. (x0,y0) = apply_matrix_pt(self.ctm, (x0,y0))
  64. (x1,y1) = apply_matrix_pt(self.ctm, (x1,y1))
  65. (x2,y2) = apply_matrix_pt(self.ctm, (x2,y2))
  66. (x3,y3) = apply_matrix_pt(self.ctm, (x3,y3))
  67. if ((x0 == x1 and y1 == y2 and x2 == x3 and y3 == y0) or
  68. (y0 == y1 and x1 == x2 and y2 == y3 and x3 == x0)):
  69. self.cur_item.add(LTRect(gstate.linewidth, (x0,y0,x2,y2)))
  70. return
  71. # other shapes
  72. pts = []
  73. for p in path:
  74. for i in range(1, len(p), 2):
  75. pts.append(apply_matrix_pt(self.ctm, (p[i], p[i+1])))
  76. self.cur_item.add(LTCurve(gstate.linewidth, pts))
  77. def render_char(self, matrix, font, fontsize, scaling, rise, cid):
  78. try:
  79. text = font.to_unichr(cid)
  80. assert isinstance(text, str), text
  81. except PDFUnicodeNotDefined:
  82. text = self.handle_undefined_char(font, cid)
  83. textwidth = font.char_width(cid)
  84. textdisp = font.char_disp(cid)
  85. item = LTChar(matrix, font, fontsize, scaling, rise, text, textwidth, textdisp)
  86. self.cur_item.add(item)
  87. return item.adv
  88. def handle_undefined_char(self, font, cid):
  89. logger.warning('undefined: %r, %r', font, cid)
  90. return '(cid:%d)' % cid
  91. def receive_layout(self, ltpage):
  92. pass
  93. class PDFPageAggregator(PDFLayoutAnalyzer):
  94. def __init__(self, rsrcmgr, pageno=1, laparams=None):
  95. PDFLayoutAnalyzer.__init__(self, rsrcmgr, pageno=pageno, laparams=laparams)
  96. self.result = None
  97. def receive_layout(self, ltpage):
  98. self.result = ltpage
  99. def get_result(self):
  100. return self.result
  101. ## PDFConverter
  102. ##
  103. class PDFConverter(PDFLayoutAnalyzer):
  104. # outfp is an fp opened in *text* mode
  105. def __init__(self, rsrcmgr, outfp, pageno=1, laparams=None):
  106. PDFLayoutAnalyzer.__init__(self, rsrcmgr, pageno=pageno, laparams=laparams)
  107. self.outfp = outfp
  108. def write_image(self, image):
  109. stream = image.stream
  110. filters = stream.get_filters()
  111. if len(filters) == 1 and filters[0] in LITERALS_DCT_DECODE:
  112. ext = '.jpg'
  113. data = stream.get_rawdata()
  114. elif image.colorspace is LITERAL_DEVICE_RGB:
  115. ext = '.bmp'
  116. data = create_bmp(stream.get_data(), stream.bits*3, image.width, image.height)
  117. elif image.colorspace is LITERAL_DEVICE_GRAY:
  118. ext = '.bmp'
  119. data = create_bmp(stream.get_data(), stream.bits, image.width, image.height)
  120. else:
  121. ext = '.img'
  122. data = stream.get_data()
  123. name = image.name+ext
  124. path = os.path.join(self.outdir, name)
  125. fp = file(path, 'wb')
  126. fp.write(data)
  127. fp.close()
  128. return name
  129. ## TextConverter
  130. ##
  131. class TextConverter(PDFConverter):
  132. def __init__(self, rsrcmgr, outfp, pageno=1, laparams=None,
  133. showpageno=False):
  134. PDFConverter.__init__(self, rsrcmgr, outfp, pageno=pageno, laparams=laparams)
  135. self.showpageno = showpageno
  136. def write_text(self, text):
  137. self.outfp.write(text)
  138. def receive_layout(self, ltpage):
  139. def render(item):
  140. if isinstance(item, LTContainer):
  141. for child in item:
  142. render(child)
  143. elif isinstance(item, LTText):
  144. self.write_text(item.get_text())
  145. if isinstance(item, LTTextBox):
  146. self.write_text('\n')
  147. if self.showpageno:
  148. self.write_text('Page %s\n' % ltpage.pageid)
  149. render(ltpage)
  150. self.write_text('\f')
  151. # Some dummy functions to save memory/CPU when all that is wanted is text.
  152. # This stops all the image and drawing ouput from being recorded and taking
  153. # up RAM.
  154. def render_image(self, name, stream):
  155. pass
  156. def paint_path(self, gstate, stroke, fill, evenodd, path):
  157. pass
  158. ## HTMLConverter
  159. ##
  160. class HTMLConverter(PDFConverter):
  161. RECT_COLORS = {
  162. #'char': 'green',
  163. 'figure': 'yellow',
  164. 'textline': 'magenta',
  165. 'textbox': 'cyan',
  166. 'textgroup': 'red',
  167. 'curve': 'black',
  168. 'page': 'gray',
  169. }
  170. TEXT_COLORS = {
  171. 'textbox': 'blue',
  172. 'char': 'black',
  173. }
  174. def __init__(self, rsrcmgr, outfp, pageno=1, laparams=None,
  175. scale=1, fontscale=0.7, layoutmode='normal', showpageno=True,
  176. pagemargin=50, outdir=None,
  177. rect_colors={'curve':'black', 'page':'gray'},
  178. text_colors={'char':'black'},
  179. debug=False):
  180. PDFConverter.__init__(self, rsrcmgr, outfp, pageno=pageno, laparams=laparams)
  181. self.scale = scale
  182. self.fontscale = fontscale
  183. self.layoutmode = layoutmode
  184. self.showpageno = showpageno
  185. self.pagemargin = pagemargin
  186. self.outdir = outdir
  187. self.rect_colors = rect_colors
  188. self.text_colors = text_colors
  189. if debug:
  190. self.rect_colors.update(self.RECT_COLORS)
  191. self.text_colors.update(self.TEXT_COLORS)
  192. self._yoffset = self.pagemargin
  193. self._font = None
  194. self._fontstack = []
  195. self.write_header()
  196. def write(self, text):
  197. self.outfp.write(text)
  198. def write_header(self):
  199. self.write('<html><head>\n')
  200. self.write('<meta http-equiv="Content-Type" content="text/html; charset=%s">\n' % self.outfp.encoding)
  201. self.write('</head><body>\n')
  202. def write_footer(self):
  203. self.write('<div style="position:absolute; top:0px;">Page: %s</div>\n' %
  204. ', '.join('<a href="#%s">%s</a>' % (i,i) for i in range(1,self.pageno)))
  205. self.write('</body></html>\n')
  206. def write_text(self, text):
  207. self.write(htmlescape(text, self.outfp.encoding))
  208. def place_rect(self, color, borderwidth, x, y, w, h):
  209. color = self.rect_colors.get(color)
  210. if color is not None:
  211. self.write('<span style="position:absolute; border: %s %dpx solid; '
  212. 'left:%dpx; top:%dpx; width:%dpx; height:%dpx;"></span>\n' %
  213. (color, borderwidth,
  214. x*self.scale, (self._yoffset-y)*self.scale,
  215. w*self.scale, h*self.scale))
  216. def place_border(self, color, borderwidth, item):
  217. self.place_rect(color, borderwidth, item.x0, item.y1, item.width, item.height)
  218. def place_image(self, item, borderwidth, x, y, w, h):
  219. if self.outdir is not None:
  220. name = self.write_image(item)
  221. self.write('<img src="%s" border="%d" style="position:absolute; left:%dpx; top:%dpx;" '
  222. 'width="%d" height="%d" />\n' %
  223. (enc(name), borderwidth,
  224. x*self.scale, (self._yoffset-y)*self.scale,
  225. w*self.scale, h*self.scale))
  226. def place_text(self, color, text, x, y, size):
  227. color = self.text_colors.get(color)
  228. if color is not None:
  229. self.write('<span style="position:absolute; color:%s; left:%dpx; top:%dpx; font-size:%dpx;">' %
  230. (color, x*self.scale, (self._yoffset-y)*self.scale, size*self.scale*self.fontscale))
  231. self.write_text(text)
  232. self.write('</span>\n')
  233. def begin_textbox(self, color, borderwidth, x, y, w, h, writing_mode):
  234. self._fontstack.append(self._font)
  235. self._font = None
  236. self.write('<div style="position:absolute; border: %s %dpx solid; writing-mode:%s; '
  237. 'left:%dpx; top:%dpx; width:%dpx; height:%dpx;">' %
  238. (color, borderwidth, writing_mode,
  239. x*self.scale, (self._yoffset-y)*self.scale,
  240. w*self.scale, h*self.scale))
  241. def put_text(self, text, fontname, fontsize):
  242. font = (fontname, fontsize)
  243. if font != self._font:
  244. if self._font is not None:
  245. self.write('</span>')
  246. self.write('<span style="font-family: %s; font-size:%dpx">' %
  247. (fontname, fontsize * self.scale * self.fontscale))
  248. self._font = font
  249. self.write_text(text)
  250. def put_newline(self):
  251. self.write('<br>')
  252. def end_textbox(self, color):
  253. if self._font is not None:
  254. self.write('</span>')
  255. self._font = self._fontstack.pop()
  256. self.write('</div>')
  257. def receive_layout(self, ltpage):
  258. def show_group(item):
  259. if isinstance(item, LTTextGroup):
  260. self.place_border('textgroup', 1, item)
  261. for child in item:
  262. show_group(child)
  263. def render(item):
  264. if isinstance(item, LTPage):
  265. self._yoffset += item.y1
  266. self.place_border('page', 1, item)
  267. if self.showpageno:
  268. self.write('<div style="position:absolute; top:%dpx;">' %
  269. ((self._yoffset-item.y1)*self.scale))
  270. self.write('<a name="%s">Page %s</a></div>\n' % (item.pageid, item.pageid))
  271. for child in item:
  272. render(child)
  273. if item.groups is not None:
  274. for group in item.groups:
  275. show_group(group)
  276. elif isinstance(item, LTCurve):
  277. self.place_border('curve', 1, item)
  278. elif isinstance(item, LTFigure):
  279. self.place_border('figure', 1, item)
  280. for child in item:
  281. render(child)
  282. elif isinstance(item, LTImage):
  283. self.place_image(item, 1, item.x0, item.y1, item.width, item.height)
  284. else:
  285. if self.layoutmode == 'exact':
  286. if isinstance(item, LTTextLine):
  287. self.place_border('textline', 1, item)
  288. for child in item:
  289. render(child)
  290. elif isinstance(item, LTTextBox):
  291. self.place_border('textbox', 1, item)
  292. self.place_text('textbox', str(item.index+1), item.x0, item.y1, 20)
  293. for child in item:
  294. render(child)
  295. elif isinstance(item, LTChar):
  296. self.place_border('char', 1, item)
  297. self.place_text('char', item.get_text(), item.x0, item.y1, item.size)
  298. else:
  299. if isinstance(item, LTTextLine):
  300. for child in item:
  301. render(child)
  302. if self.layoutmode != 'loose':
  303. self.put_newline()
  304. elif isinstance(item, LTTextBox):
  305. self.begin_textbox('textbox', 1, item.x0, item.y1, item.width, item.height,
  306. item.get_writing_mode())
  307. for child in item:
  308. render(child)
  309. self.end_textbox('textbox')
  310. elif isinstance(item, LTChar):
  311. self.put_text(item.get_text(), item.fontname, item.size)
  312. elif isinstance(item, LTText):
  313. self.write_text(item.get_text())
  314. render(ltpage)
  315. self._yoffset += self.pagemargin
  316. def close(self):
  317. self.write_footer()
  318. class XMLConverter(PDFConverter):
  319. def __init__(self, rsrcmgr, outfp, pageno=1, laparams=None, outdir=None):
  320. PDFConverter.__init__(self, rsrcmgr, outfp, pageno=pageno, laparams=laparams)
  321. self.outdir = outdir
  322. self.write_header()
  323. def write_header(self):
  324. self.outfp.write('<?xml version="1.0" encoding="%s" ?>\n' % self.outfp.encoding)
  325. self.outfp.write('<pages>\n')
  326. def write_footer(self):
  327. self.outfp.write('</pages>\n')
  328. def write_text(self, text):
  329. self.outfp.write(htmlescape(text, self.outfp.encoding))
  330. def receive_layout(self, ltpage):
  331. def show_group(item):
  332. if isinstance(item, LTTextBox):
  333. self.outfp.write('<textbox id="%d" bbox="%s" />\n' %
  334. (item.index, bbox2str(item.bbox)))
  335. elif isinstance(item, LTTextGroup):
  336. self.outfp.write('<textgroup bbox="%s">\n' % bbox2str(item.bbox))
  337. for child in item:
  338. show_group(child)
  339. self.outfp.write('</textgroup>\n')
  340. def render(item):
  341. if isinstance(item, LTPage):
  342. self.outfp.write('<page id="%s" bbox="%s" rotate="%d">\n' %
  343. (item.pageid, bbox2str(item.bbox), item.rotate))
  344. for child in item:
  345. render(child)
  346. if item.groups is not None:
  347. self.outfp.write('<layout>\n')
  348. for group in item.groups:
  349. show_group(group)
  350. self.outfp.write('</layout>\n')
  351. self.outfp.write('</page>\n')
  352. elif isinstance(item, LTLine):
  353. self.outfp.write('<line linewidth="%d" bbox="%s" />\n' %
  354. (item.linewidth, bbox2str(item.bbox)))
  355. elif isinstance(item, LTRect):
  356. self.outfp.write('<rect linewidth="%d" bbox="%s" />\n' %
  357. (item.linewidth, bbox2str(item.bbox)))
  358. elif isinstance(item, LTCurve):
  359. self.outfp.write('<curve linewidth="%d" bbox="%s" pts="%s"/>\n' %
  360. (item.linewidth, bbox2str(item.bbox), item.get_pts()))
  361. elif isinstance(item, LTFigure):
  362. self.outfp.write('<figure name="%s" bbox="%s">\n' %
  363. (item.name, bbox2str(item.bbox)))
  364. for child in item:
  365. render(child)
  366. self.outfp.write('</figure>\n')
  367. elif isinstance(item, LTTextLine):
  368. self.outfp.write('<textline bbox="%s">\n' % bbox2str(item.bbox))
  369. for child in item:
  370. render(child)
  371. self.outfp.write('</textline>\n')
  372. elif isinstance(item, LTTextBox):
  373. wmode = ''
  374. if isinstance(item, LTTextBoxVertical):
  375. wmode = ' wmode="vertical"'
  376. self.outfp.write('<textbox id="%d" bbox="%s"%s>\n' %
  377. (item.index, bbox2str(item.bbox), wmode))
  378. for child in item:
  379. render(child)
  380. self.outfp.write('</textbox>\n')
  381. elif isinstance(item, LTChar):
  382. self.outfp.write('<text font="%s" bbox="%s" size="%.3f">' %
  383. (htmlescape(item.fontname), bbox2str(item.bbox), item.size))
  384. self.write_text(item.get_text())
  385. self.outfp.write('</text>\n')
  386. elif isinstance(item, LTText):
  387. self.outfp.write('<text>%s</text>\n' % item.get_text())
  388. elif isinstance(item, LTImage):
  389. if self.outdir:
  390. name = self.write_image(item)
  391. self.outfp.write('<image src="%s" width="%d" height="%d" />\n' %
  392. (enc(name), item.width, item.height))
  393. else:
  394. self.outfp.write('<image width="%d" height="%d" />\n' %
  395. (item.width, item.height))
  396. else:
  397. assert 0, item
  398. render(ltpage)
  399. def close(self):
  400. self.write_footer()