#!/usr/bin/env python3

""" Adobe character mapping (CMap) support.

CMaps map character codes to character ids (CIDs); the Unicode maps
defined here map CIDs to Unicode strings.

More information is available on the Adobe website:

  http://opensource.adobe.com/wiki/display/cmap/CMap+Resources

"""

import sys
import os
import os.path
import gzip
import pickle as pickle
import struct
import logging

from . import cmap
from .psparser import PSStackParser
from .psparser import PSSyntaxError, PSEOF
from .psparser import PSLiteral
from .psparser import literal_name
from .encodingdb import name2unicode
from .utils import choplist, nunpack


# Module-level logger, named after this module; used for the debug and
# warning messages emitted by the classes below.
logger = logging.getLogger(__name__)


class CMapError(Exception):
    """Base exception type for CMap-related errors in this module."""


class CMap:
    """Lookup table from byte sequences to character ids (CIDs).

    ``code2cid`` is a nested dict keyed by integer byte values.  A dict
    value means further bytes are required; an int value is the CID for
    the byte sequence that led to it.
    """

    def __init__(self, code2cid=None):
        # A falsy argument (None or an empty dict) yields a fresh table.
        self.code2cid = code2cid or {}

    def is_vertical(self):
        """Return False; the base CMap is never vertical."""
        return False

    def use_cmap(self, cmap):
        """Copy every entry of *cmap*'s table into this CMap's table.

        Nested tables are rebuilt as new dicts, so inner dictionaries are
        not shared with *cmap*.  A sub-table already stored under the same
        key is replaced rather than merged.
        """
        assert isinstance(cmap, CMap)

        def clone_into(target, source):
            for key, value in source.items():
                if not isinstance(value, dict):
                    target[key] = value
                    continue
                branch = target[key] = {}
                clone_into(branch, value)

        clone_into(self.code2cid, cmap.code2cid)

    def decode(self, code):
        """Yield the CIDs found while walking *code* through the table.

        *code* may be bytes or str; a str is converted to bytes with
        latin-1 first.  A byte with no entry at the current position
        resets the walk to the root of the table without yielding.
        """
        logger.debug('decode: %r, %r', self, code)
        if isinstance(code, str):
            code = code.encode('latin-1')
        node = self.code2cid
        for byte in code:
            if byte not in node:
                # Unknown byte: drop the partial match and start over.
                node = self.code2cid
                continue
            node = node[byte]
            if isinstance(node, int):
                # Reached a leaf: emit the CID and restart from the root.
                yield node
                node = self.code2cid

    def dump(self, out=sys.stdout, code2cid=None, code=None):
        """Write one ``code (...) = cid N`` line per table entry to *out*.

        *code2cid* and *code* carry the current sub-table and the byte
        prefix leading to it during recursion; callers normally omit them.
        """
        table, prefix = code2cid, code
        if table is None:
            table, prefix = self.code2cid, ()
        for key, value in sorted(table.items()):
            path = prefix + (key,)
            if not isinstance(value, int):
                self.dump(out=out, code2cid=value, code=path)
                continue
            out.write('code %r = cid %d\n' % (path, value))
    

class IdentityCMap:
    """CMap whose CIDs are the big-endian 16-bit values of the code itself."""

    def __init__(self, vertical):
        self.vertical = vertical

    def is_vertical(self):
        """Return the ``vertical`` flag given at construction."""
        return self.vertical

    def decode(self, code):
        """Return a tuple of the unsigned 16-bit big-endian values in *code*.

        A str is converted to bytes with latin-1 first.  If the length is
        odd, a warning is logged and the final byte is discarded.
        """
        if isinstance(code, str):
            code = code.encode('latin-1')
        if len(code) % 2:
            # Malformed input; drop the dangling byte instead of failing in unpack.
            logger.warning("The code %r has an uneven length, trimming last byte.", code)
            code = code[:-1]
        count = len(code) // 2
        return struct.unpack('>%dH' % count, code) if count else ()
        
            
class UnicodeMap:
    """Lookup table from character ids (CIDs) to Unicode strings."""

    def __init__(self, cid2unichr=None):
        # A falsy argument (None or an empty dict) yields a fresh table.
        self.cid2unichr = cid2unichr or {}

    def get_unichr(self, cid):
        """Return the entry stored for *cid*; raises KeyError if absent."""
        logger.debug('get_unichr: %r, %r', self, cid)
        return self.cid2unichr[cid]

    def dump(self, out=sys.stdout):
        """Write one ``cid N = unicode '...'`` line per entry, sorted, to *out*."""
        entries = sorted(self.cid2unichr.items())
        for cid, text in entries:
            out.write('cid %d = unicode %r\n' % (cid, text))


class FileCMap(CMap):
    """CMap that is filled in incrementally, together with named attributes."""

    def __init__(self):
        CMap.__init__(self)
        self.attrs = {}

    def __repr__(self):
        return '<CMap: %s>' % self.attrs.get('CMapName')

    def is_vertical(self):
        """Return True when the ``WMode`` attribute is set and non-zero."""
        return self.attrs.get('WMode', 0) != 0

    def set_attr(self, k, v):
        """Store attribute *k* with value *v*."""
        self.attrs[k] = v

    def add_code2cid(self, code, cid):
        """Register *cid* for the character sequence *code*.

        *code* must be a str; each character's ordinal is used as one key
        level in the nested ``code2cid`` table.
        """
        assert isinstance(code, str) and isinstance(cid, int)
        node = self.code2cid
        # Walk (creating as needed) one nested table per leading character.
        for ch in code[:-1]:
            node = node.setdefault(ord(ch), {})
        # The last character's slot holds the CID itself.
        node[ord(code[-1])] = cid


class FileUnicodeMap(UnicodeMap):
    """UnicodeMap that is filled in incrementally, together with named attributes."""

    def __init__(self):
        UnicodeMap.__init__(self)
        self.attrs = {}

    def __repr__(self):
        return '<UnicodeMap: %s>' % self.attrs.get('CMapName')

    def set_attr(self, k, v):
        """Store attribute *k* with value *v*."""
        self.attrs[k] = v

    def add_cid2unichr(self, cid, code):
        """Record the Unicode text for *cid*, derived from *code*.

        Accepted forms of *code*:
          * PSLiteral -- its name is looked up with name2unicode();
          * bytes     -- decoded as UTF-16BE, ignoring undecodable data;
          * str       -- converted to bytes via latin-1, then handled as bytes;
          * int       -- taken as a code point and passed to chr().
        Any other type raises TypeError.
        """
        assert isinstance(cid, int)
        if isinstance(code, str):
            # The str holds raw byte values; turn it back into real bytes.
            code = code.encode('latin-1')

        if isinstance(code, PSLiteral):
            text = name2unicode(code.name)
        elif isinstance(code, bytes):
            text = code.decode('UTF-16BE', 'ignore')
        elif isinstance(code, int):
            text = chr(code)
        else:
            raise TypeError(repr(code))
        self.cid2unichr[cid] = text


class PyCMap(CMap):
    """CMap backed by a preloaded data object.

    *module* must expose ``CODE2CID`` (the lookup table) and
    ``IS_VERTICAL`` (the writing-direction flag).
    """

    def __init__(self, name, module):
        table = module.CODE2CID
        CMap.__init__(self, table)
        self.name = name
        self._is_vertical = module.IS_VERTICAL

    def __repr__(self):
        return '<PyCMap: %s>' % (self.name)

    def is_vertical(self):
        """Return the ``IS_VERTICAL`` flag captured at construction."""
        return self._is_vertical
    

class PyUnicodeMap(UnicodeMap):
    """UnicodeMap backed by a preloaded data object.

    *module* must expose ``CID2UNICHR_V`` and ``CID2UNICHR_H``; *vertical*
    selects which of the two tables is used.
    """

    def __init__(self, name, module, vertical):
        table = module.CID2UNICHR_V if vertical else module.CID2UNICHR_H
        UnicodeMap.__init__(self, table)
        self.name = name

    def __repr__(self):
        return '<PyUnicodeMap: %s>' % (self.name)


class CMapDB:
    """Loader and class-level cache for CMaps stored as gzipped pickles."""

    # Caches live on the class, so they are shared by every caller.
    _cmap_cache = {}   # name -> PyCMap
    _umap_cache = {}   # name -> [horizontal PyUnicodeMap, vertical PyUnicodeMap]

    class CMapNotFound(CMapError):
        """Raised when no data file exists for the requested name."""

    @classmethod
    def _load_data(klass, name):
        """Load ``<name>.pickle.gz`` and return its contents as a new class.

        The file is searched for first in the directory of the ``cmap``
        package, then in the directory named by the ``CMAP_PATH``
        environment variable (default ``/usr/share/pdfminer/``).  The
        unpickled object is used as the namespace of a class created with
        type().  Raises CMapDB.CMapNotFound if neither directory has the file.
        """
        filename = '%s.pickle.gz' % name
        logger.debug('loading: %s', name)
        default_path = os.environ.get('CMAP_PATH', '/usr/share/pdfminer/')
        search_dirs = (os.path.dirname(cmap.__file__), default_path)
        for directory in search_dirs:
            path = os.path.join(directory, filename)
            if not os.path.exists(path):
                continue
            with gzip.open(path) as gzfile:
                payload = gzfile.read()
            # NOTE(review): unpickling can execute arbitrary code, and one of
            # the search directories comes from the CMAP_PATH environment
            # variable -- only trusted data files must be reachable here.
            return type(name, (), pickle.loads(payload))
        raise CMapDB.CMapNotFound(name)

    @classmethod
    def get_cmap(klass, name):
        """Return the CMap called *name*.

        ``Identity-H`` and ``Identity-V`` produce a new IdentityCMap on each
        call; any other name is loaded once and then served from the cache.
        """
        if name == 'Identity-H':
            return IdentityCMap(False)
        if name == 'Identity-V':
            return IdentityCMap(True)
        cache = klass._cmap_cache
        if name in cache:
            return cache[name]
        data = klass._load_data(name)
        loaded = PyCMap(name, data)
        cache[name] = loaded
        return loaded

    @classmethod
    def get_unicode_map(klass, name, vertical=False):
        """Return the Unicode map for *name* in the requested direction.

        Both the horizontal and the vertical map are built from the
        ``to-unicode-<name>`` data on first use and cached as a pair;
        *vertical* indexes into that pair.
        """
        cache = klass._umap_cache
        if name in cache:
            return cache[name][vertical]
        data = klass._load_data('to-unicode-%s' % name)
        pair = [PyUnicodeMap(name, data, v) for v in (False, True)]
        cache[name] = pair
        return pair[vertical]


class CMapParser(PSStackParser):
    """Stack parser that fills a CMap or UnicodeMap object from CMap source.

    Depending on the operators found in the input, the target object given
    as *cmap* receives ``set_attr``, ``use_cmap``, ``add_code2cid`` and
    ``add_cid2unichr`` calls, so it must implement the ones the input uses.
    Malformed entries are skipped silently.
    """

    def __init__(self, cmap, fp):
        PSStackParser.__init__(self, fp)
        self.cmap = cmap
        # Operators are only interpreted between 'begincmap' and 'endcmap'.
        self._in_cmap = False

    def run(self):
        """Run the parser over the input; PSEOF is treated as normal end."""
        try:
            self.nextobject()
        except PSEOF:
            pass

    def do_keyword(self, pos, token):
        """Interpret one keyword token (hook presumably called by PSStackParser).

        Operands are taken from the parser stack.  Keywords that are not
        recognised while inside a cmap section are pushed back on the stack.
        """
        name = token.name
        if name == 'begincmap':
            self._in_cmap = True
            self.popall()
            return
        elif name == 'endcmap':
            self._in_cmap = False
            return
        if not self._in_cmap:
            return

        # Section openers, and sections whose content is not used, only
        # need the stack cleared.
        if name in ('begincodespacerange', 'endcodespacerange',
                    'begincidrange', 'begincidchar',
                    'beginbfrange', 'beginbfchar',
                    'beginnotdefrange', 'endnotdefrange'):
            self.popall()
            return

        if name == 'def':
            try:
                ((_, k), (_, v)) = self.pop(2)
                self.cmap.set_attr(literal_name(k), v)
            except PSSyntaxError:
                pass
            return

        if name == 'usecmap':
            try:
                ((_, cmapname),) = self.pop(1)
                self.cmap.use_cmap(CMapDB.get_cmap(literal_name(cmapname)))
            except PSSyntaxError:
                pass
            except CMapDB.CMapNotFound:
                pass
            return

        if name == 'endcidrange':
            # Triples: <first code> <last code> <CID of first code>.
            objs = [obj for (_, obj) in self.popall()]
            for (s, e, cid) in choplist(3, objs):
                if (not isinstance(s, str) or not isinstance(e, str) or
                        not isinstance(cid, int) or len(s) != len(e)):
                    continue
                if not s:
                    # With a zero-length code, [-vlen:] below would keep all
                    # four packed bytes and invent a bogus 4-byte code.
                    continue
                # Only the last (up to) four characters may vary in a range.
                sprefix = s[:-4]
                eprefix = e[:-4]
                if sprefix != eprefix:
                    continue
                svar = s[-4:]
                evar = e[-4:]
                s1 = nunpack(svar)
                e1 = nunpack(evar)
                vlen = len(svar)
                for i in range(e1-s1+1):
                    # struct.pack() returns bytes while the prefix is a str
                    # of byte values; latin-1 maps each byte to the code
                    # point of the same value, which is what add_code2cid()
                    # expects.  (str + bytes would raise TypeError.)
                    tail = struct.pack('>L', s1+i)[-vlen:].decode('latin-1')
                    self.cmap.add_code2cid(sprefix+tail, cid+i)
            return

        if name == 'endcidchar':
            objs = [obj for (_, obj) in self.popall()]
            for (first, second) in choplist(2, objs):
                if isinstance(first, str) and isinstance(second, int):
                    # Form used by the Adobe CMap specification:
                    # <source code> followed by an integer CID.
                    if first:
                        self.cmap.add_code2cid(first, second)
                elif isinstance(first, str) and isinstance(second, str):
                    # Form this parser has always accepted: the first
                    # string packs the CID, the second one is the code.
                    self.cmap.add_code2cid(second, nunpack(first))
            return

        if name == 'endbfrange':
            # Triples: <first code> <last code> <destination>, where the
            # destination is either one string or a list with one entry per code.
            objs = [obj for (_, obj) in self.popall()]
            # Strings may arrive as str (one character per byte) or as bytes.
            # Normalise to bytes with latin-1, as the rest of this module
            # does; 'ascii' would fail for any byte value >= 0x80.
            objs = [o.encode('latin-1') if isinstance(o, str) else o
                    for o in objs]
            for (s, e, code) in choplist(3, objs):
                if (not isinstance(s, bytes) or not isinstance(e, bytes) or
                        len(s) != len(e)):
                    continue
                s1 = nunpack(s)
                e1 = nunpack(e)
                count = e1-s1+1
                if isinstance(code, list):
                    # Never index past the end of a too-short list.
                    for i in range(min(count, len(code))):
                        self.cmap.add_cid2unichr(s1+i, code[i])
                elif isinstance(code, bytes) and code:
                    # Increment the last (up to) four bytes of the
                    # destination for each successive code in the range.
                    var = code[-4:]
                    base = nunpack(var)
                    prefix = code[:-4]
                    vlen = len(var)
                    for i in range(count):
                        x = prefix+struct.pack('>L', base+i)[-vlen:]
                        self.cmap.add_cid2unichr(s1+i, x)
                # Any other destination type is malformed and is skipped.
            return

        if name == 'endbfchar':
            # Pairs: <code> <destination string>.
            objs = [obj for (_, obj) in self.popall()]
            for (cid, code) in choplist(2, objs):
                if isinstance(cid, (str, bytes)) and isinstance(code, (str, bytes)):
                    self.cmap.add_cid2unichr(nunpack(cid), code)
            return

        # Not an operator handled here: leave it on the stack.
        self.push((pos, token))

# test
def main(argv):
    """Parse each file named in ``argv[1:]`` and dump its Unicode map.

    Every file is read as CMap source into a fresh FileUnicodeMap, whose
    contents are then written to standard output.  Returns None, which
    sys.exit() treats as exit status 0.
    """
    for fname in argv[1:]:
        # Use FileCMap() here instead to inspect a code-to-CID CMap.
        umap = FileUnicodeMap()
        # 'with' guarantees the file is closed even if parsing raises.
        with open(fname, 'rb') as fp:
            CMapParser(umap, fp).run()
        umap.dump()


if __name__ == '__main__':
    sys.exit(main(sys.argv))