import sys

from .utils import mult_matrix, translate_matrix
from .utils import htmlescape, bbox2str
from .pdffont import PDFUnicodeNotDefined


class PDFDevice:
    """Base output device: every rendering hook is a no-op.

    Subclasses override the hooks they care about.  The device keeps a
    reference to the resource manager it was created with and the most
    recently set current transformation matrix (``ctm``).
    """

    def __init__(self, rsrcmgr):
        self.rsrcmgr = rsrcmgr
        # No CTM is known until set_ctm() is called.
        self.ctm = None

    def __repr__(self):
        # An empty repr is useless in logs and debuggers; show the class name.
        return '<%s>' % type(self).__name__

    def close(self):
        """Release device resources (nothing to do in the base class)."""
        pass

    def set_ctm(self, ctm):
        """Remember *ctm* as the current transformation matrix."""
        self.ctm = ctm

    def begin_tag(self, tag, props=None):
        """Hook: a tagged section opens. No-op here."""
        pass

    def end_tag(self):
        """Hook: the innermost tagged section closes. No-op here."""
        pass

    def do_tag(self, tag, props=None):
        """Hook: a stand-alone tag. No-op here."""
        pass

    def begin_page(self, page, ctm):
        """Hook: a page starts. No-op here."""
        pass

    def end_page(self, page):
        """Hook: a page ends. No-op here."""
        pass

    def begin_figure(self, name, bbox, matrix):
        """Hook: a figure starts. No-op here."""
        pass

    def end_figure(self, name):
        """Hook: a figure ends. No-op here."""
        pass

    def paint_path(self, graphicstate, stroke, fill, evenodd, path):
        """Hook: a path is painted. No-op here."""
        pass

    def render_image(self, name, stream):
        """Hook: an image is rendered. No-op here."""
        pass

    def render_string(self, textstate, seq):
        """Hook: a text sequence is rendered. No-op here."""
        pass


class PDFTextDevice(PDFDevice):
    """Device that walks a text sequence and positions each character.

    ``render_string`` derives the spacing parameters from the text state and
    delegates to the horizontal or vertical walker, which in turn calls
    ``render_char`` once per decoded character id.
    """

    def render_string(self, textstate, seq):
        """Render *seq* and store the resulting position in the text state.

        ``textstate.linematrix`` is read as the starting point and is
        overwritten with the point returned by the walker.
        """
        matrix = mult_matrix(textstate.matrix, self.ctm)
        font = textstate.font
        fontsize = textstate.fontsize
        # textstate.scaling is used as a percentage; convert it to a factor.
        scaling = textstate.scaling * .01
        charspace = textstate.charspace * scaling
        wordspace = textstate.wordspace * scaling
        rise = textstate.rise
        if font.is_multibyte():
            # Word spacing is disabled for multibyte fonts.
            wordspace = 0
        # Multiplier applied to the numeric adjustments found in seq.
        dxscale = .001 * fontsize * scaling
        if font.is_vertical():
            textstate.linematrix = self.render_string_vertical(
                seq, matrix, textstate.linematrix, font, fontsize,
                scaling, charspace, wordspace, rise, dxscale)
        else:
            textstate.linematrix = self.render_string_horizontal(
                seq, matrix, textstate.linematrix, font, fontsize,
                scaling, charspace, wordspace, rise, dxscale)

    def render_string_horizontal(self, seq, matrix, point, font, fontsize,
                                 scaling, charspace, wordspace, rise, dxscale):
        """Walk *seq* advancing along x and return the final ``(x, y)``.

        Numeric items in *seq* move the position back by ``item * dxscale``;
        any other item is decoded by *font* into character ids, each of which
        is passed to ``render_char``.
        """
        (x, y) = point
        # charspace is inserted before every character except the very first
        # one, unless a numeric adjustment came before it.
        needcharspace = False
        for obj in seq:
            if isinstance(obj, (int, float)):
                x -= obj * dxscale
                needcharspace = True
            else:
                for cid in font.decode(obj):
                    if needcharspace:
                        x += charspace
                    # render_char's return value is the advance for this cid.
                    x += self.render_char(translate_matrix(matrix, (x, y)),
                                          font, fontsize, scaling, rise, cid)
                    # cid 32 is presumably the space character -- TODO confirm.
                    if cid == 32 and wordspace:
                        x += wordspace
                    needcharspace = True
        return (x, y)

    def render_string_vertical(self, seq, matrix, point, font, fontsize,
                               scaling, charspace, wordspace, rise, dxscale):
        """Walk *seq* advancing along y and return the final ``(x, y)``.

        Same procedure as ``render_string_horizontal`` with every adjustment
        applied to the y coordinate instead of x.
        """
        (x, y) = point
        needcharspace = False
        for obj in seq:
            if isinstance(obj, (int, float)):
                y -= obj * dxscale
                needcharspace = True
            else:
                for cid in font.decode(obj):
                    if needcharspace:
                        y += charspace
                    y += self.render_char(translate_matrix(matrix, (x, y)),
                                          font, fontsize, scaling, rise, cid)
                    if cid == 32 and wordspace:
                        y += wordspace
                    needcharspace = True
        return (x, y)

    def render_char(self, matrix, font, fontsize, scaling, rise, cid):
        """Render one character id and return its advance.

        The base implementation draws nothing and reports an advance of 0.
        """
        return 0


class TagExtractor(PDFDevice):
    """Device that writes tags and decoded text to *outfp* as markup.

    *outfp* must provide ``write()`` and an ``encoding`` attribute (the
    latter is passed to ``htmlescape`` when text is written).
    """

    def __init__(self, rsrcmgr, outfp):
        super().__init__(rsrcmgr)
        self.outfp = outfp
        # Counter written into each page tag; incremented by end_page().
        self.pageno = 0
        # Tags opened by begin_tag() and not yet closed by end_tag().
        self._stack = []

    def render_string(self, textstate, seq):
        """Write the escaped text of every ``str`` item in *seq*.

        Non-string items are ignored, and characters the font cannot map
        (``PDFUnicodeNotDefined``) are skipped.
        """
        font = textstate.font
        pieces = []
        for obj in seq:
            if not isinstance(obj, str):
                continue
            for cid in font.decode(obj):
                try:
                    pieces.append(font.to_unichr(cid))
                except PDFUnicodeNotDefined:
                    # Deliberate best effort: drop unmappable characters.
                    pass
        self.outfp.write(htmlescape(''.join(pieces), self.outfp.encoding))

    def begin_page(self, page, ctm):
        """Write the opening page tag with id, bounding box and rotation."""
        # The format string must consume all three arguments; an empty
        # template raises TypeError ("not all arguments converted").
        self.outfp.write('<page id="%s" bbox="%s" rotate="%d">' %
                         (self.pageno, bbox2str(page.mediabox), page.rotate))

    def end_page(self, page):
        """Write the closing page tag and advance the page counter."""
        self.outfp.write('</page>\n')
        self.pageno += 1

    def begin_tag(self, tag, props=None):
        """Write an opening tag for *tag* and push it on the stack.

        When *props* is a dict its items are written as attributes, sorted
        by key, with keys and values escaped.
        """
        s = ''
        if isinstance(props, dict):
            s = ''.join(
                ' %s="%s"' % (htmlescape(k), htmlescape(str(v)))
                for (k, v) in sorted(props.items())
            )
        self.outfp.write('<%s%s>' % (htmlescape(tag.name), s))
        self._stack.append(tag)

    def end_tag(self):
        """Pop the innermost open tag and write its closing tag."""
        assert self._stack
        tag = self._stack.pop(-1)
        # Mirrors the '<%s%s>' opening tag written by begin_tag().
        self.outfp.write('</%s>' % htmlescape(tag.name))

    def do_tag(self, tag, props=None):
        """Write an opening tag for *tag* without leaving it on the stack.

        No closing tag is written.
        """
        self.begin_tag(tag, props)
        self._stack.pop(-1)