You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

156 lines
4.9 KiB

4 years ago
  1. import sys
  2. from .utils import mult_matrix, translate_matrix
  3. from .utils import htmlescape, bbox2str
  4. from .pdffont import PDFUnicodeNotDefined
  5. class PDFDevice:
  6. def __init__(self, rsrcmgr):
  7. self.rsrcmgr = rsrcmgr
  8. self.ctm = None
  9. def __repr__(self):
  10. return '<PDFDevice>'
  11. def close(self):
  12. pass
  13. def set_ctm(self, ctm):
  14. self.ctm = ctm
  15. def begin_tag(self, tag, props=None):
  16. pass
  17. def end_tag(self):
  18. pass
  19. def do_tag(self, tag, props=None):
  20. pass
  21. def begin_page(self, page, ctm):
  22. pass
  23. def end_page(self, page):
  24. pass
  25. def begin_figure(self, name, bbox, matrix):
  26. pass
  27. def end_figure(self, name):
  28. pass
  29. def paint_path(self, graphicstate, stroke, fill, evenodd, path):
  30. pass
  31. def render_image(self, name, stream):
  32. pass
  33. def render_string(self, textstate, seq):
  34. pass
  35. class PDFTextDevice(PDFDevice):
  36. def render_string(self, textstate, seq):
  37. matrix = mult_matrix(textstate.matrix, self.ctm)
  38. font = textstate.font
  39. fontsize = textstate.fontsize
  40. scaling = textstate.scaling * .01
  41. charspace = textstate.charspace * scaling
  42. wordspace = textstate.wordspace * scaling
  43. rise = textstate.rise
  44. if font.is_multibyte():
  45. wordspace = 0
  46. dxscale = .001 * fontsize * scaling
  47. if font.is_vertical():
  48. textstate.linematrix = self.render_string_vertical(
  49. seq, matrix, textstate.linematrix, font, fontsize,
  50. scaling, charspace, wordspace, rise, dxscale)
  51. else:
  52. textstate.linematrix = self.render_string_horizontal(
  53. seq, matrix, textstate.linematrix, font, fontsize,
  54. scaling, charspace, wordspace, rise, dxscale)
  55. def render_string_horizontal(self, seq, matrix, point, font, fontsize, scaling, charspace,
  56. wordspace, rise, dxscale):
  57. (x,y) = point
  58. needcharspace = False
  59. for obj in seq:
  60. if isinstance(obj, (int, float)):
  61. x -= obj*dxscale
  62. needcharspace = True
  63. else:
  64. for cid in font.decode(obj):
  65. if needcharspace:
  66. x += charspace
  67. x += self.render_char(translate_matrix(matrix, (x,y)),
  68. font, fontsize, scaling, rise, cid)
  69. if cid == 32 and wordspace:
  70. x += wordspace
  71. needcharspace = True
  72. return (x, y)
  73. def render_string_vertical(self, seq, matrix, point, font, fontsize, scaling, charspace,
  74. wordspace, rise, dxscale):
  75. (x,y) = point
  76. needcharspace = False
  77. for obj in seq:
  78. if isinstance(obj, (int, float)):
  79. y -= obj*dxscale
  80. needcharspace = True
  81. else:
  82. for cid in font.decode(obj):
  83. if needcharspace:
  84. y += charspace
  85. y += self.render_char(translate_matrix(matrix, (x,y)),
  86. font, fontsize, scaling, rise, cid)
  87. if cid == 32 and wordspace:
  88. y += wordspace
  89. needcharspace = True
  90. return (x, y)
  91. def render_char(self, matrix, font, fontsize, scaling, rise, cid):
  92. return 0
  93. class TagExtractor(PDFDevice):
  94. def __init__(self, rsrcmgr, outfp):
  95. PDFDevice.__init__(self, rsrcmgr)
  96. self.outfp = outfp
  97. self.pageno = 0
  98. self._stack = []
  99. def render_string(self, textstate, seq):
  100. font = textstate.font
  101. text = ''
  102. for obj in seq:
  103. if not isinstance(obj, str):
  104. continue
  105. chars = font.decode(obj)
  106. for cid in chars:
  107. try:
  108. char = font.to_unichr(cid)
  109. text += char
  110. except PDFUnicodeNotDefined:
  111. pass
  112. self.outfp.write(htmlescape(text, self.outfp.encoding))
  113. def begin_page(self, page, ctm):
  114. self.outfp.write('<page id="%s" bbox="%s" rotate="%d">' %
  115. (self.pageno, bbox2str(page.mediabox), page.rotate))
  116. def end_page(self, page):
  117. self.outfp.write('</page>\n')
  118. self.pageno += 1
  119. def begin_tag(self, tag, props=None):
  120. s = ''
  121. if isinstance(props, dict):
  122. s = ''.join( ' %s="%s"' % (htmlescape(k), htmlescape(str(v))) for (k,v)
  123. in sorted(props.items()) )
  124. self.outfp.write('<%s%s>' % (htmlescape(tag.name), s))
  125. self._stack.append(tag)
  126. def end_tag(self):
  127. assert self._stack
  128. tag = self._stack.pop(-1)
  129. self.outfp.write('</%s>' % htmlescape(tag.name))
  130. def do_tag(self, tag, props=None):
  131. self.begin_tag(tag, props)
  132. self._stack.pop(-1)