You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

397 lines
11 KiB

4 years ago
  1. #!/usr/bin/env python3
  2. """ Adobe character mapping (CMap) support.
  3. CMaps provide the mapping between character codes and Unicode
  4. code-points to character ids (CIDs).
  5. More information is available on the Adobe website:
  6. http://opensource.adobe.com/wiki/display/cmap/CMap+Resources
  7. """
  8. import sys
  9. import os
  10. import os.path
  11. import gzip
  12. import pickle as pickle
  13. import struct
  14. import logging
  15. from . import cmap
  16. from .psparser import PSStackParser
  17. from .psparser import PSSyntaxError, PSEOF
  18. from .psparser import PSLiteral
  19. from .psparser import literal_name
  20. from .encodingdb import name2unicode
  21. from .utils import choplist, nunpack
  22. logger = logging.getLogger(__name__)
  23. class CMapError(Exception): pass
  24. class CMap:
  25. def __init__(self, code2cid=None):
  26. self.code2cid = code2cid or {}
  27. def is_vertical(self):
  28. return False
  29. def use_cmap(self, cmap):
  30. assert isinstance(cmap, CMap)
  31. def copy(dst, src):
  32. for (k,v) in src.items():
  33. if isinstance(v, dict):
  34. d = {}
  35. dst[k] = d
  36. copy(d, v)
  37. else:
  38. dst[k] = v
  39. copy(self.code2cid, cmap.code2cid)
  40. def decode(self, code):
  41. logger.debug('decode: %r, %r', self, code)
  42. if isinstance(code, str):
  43. code = code.encode('latin-1')
  44. d = self.code2cid
  45. for c in code:
  46. if c in d:
  47. d = d[c]
  48. if isinstance(d, int):
  49. yield d
  50. d = self.code2cid
  51. else:
  52. d = self.code2cid
  53. def dump(self, out=sys.stdout, code2cid=None, code=None):
  54. if code2cid is None:
  55. code2cid = self.code2cid
  56. code = ()
  57. for (k,v) in sorted(code2cid.items()):
  58. c = code+(k,)
  59. if isinstance(v, int):
  60. out.write('code %r = cid %d\n' % (c,v))
  61. else:
  62. self.dump(out=out, code2cid=v, code=c)
  63. class IdentityCMap:
  64. def __init__(self, vertical):
  65. self.vertical = vertical
  66. def is_vertical(self):
  67. return self.vertical
  68. def decode(self, code):
  69. if isinstance(code, str):
  70. code = code.encode('latin-1')
  71. if len(code) % 2 != 0:
  72. # Something's wrong, but we have to at least prevent a crash by removing the last char
  73. logger.warning("The code %r has an uneven length, trimming last byte.", code)
  74. code = code[:-1]
  75. n = len(code)//2
  76. if n:
  77. return struct.unpack('>%dH' % n, code)
  78. else:
  79. return ()
  80. class UnicodeMap:
  81. def __init__(self, cid2unichr=None):
  82. self.cid2unichr = cid2unichr or {}
  83. def get_unichr(self, cid):
  84. logger.debug('get_unichr: %r, %r', self, cid)
  85. return self.cid2unichr[cid]
  86. def dump(self, out=sys.stdout):
  87. for (k,v) in sorted(self.cid2unichr.items()):
  88. out.write('cid %d = unicode %r\n' % (k,v))
  89. class FileCMap(CMap):
  90. def __init__(self):
  91. CMap.__init__(self)
  92. self.attrs = {}
  93. def __repr__(self):
  94. return '<CMap: %s>' % self.attrs.get('CMapName')
  95. def is_vertical(self):
  96. return self.attrs.get('WMode', 0) != 0
  97. def set_attr(self, k, v):
  98. self.attrs[k] = v
  99. def add_code2cid(self, code, cid):
  100. assert isinstance(code, str) and isinstance(cid, int)
  101. d = self.code2cid
  102. for c in code[:-1]:
  103. c = ord(c)
  104. if c in d:
  105. d = d[c]
  106. else:
  107. t = {}
  108. d[c] = t
  109. d =t
  110. c = ord(code[-1])
  111. d[c] = cid
  112. class FileUnicodeMap(UnicodeMap):
  113. def __init__(self):
  114. UnicodeMap.__init__(self)
  115. self.attrs = {}
  116. def __repr__(self):
  117. return '<UnicodeMap: %s>' % self.attrs.get('CMapName')
  118. def set_attr(self, k, v):
  119. self.attrs[k] = v
  120. def add_cid2unichr(self, cid, code):
  121. assert isinstance(cid, int)
  122. if isinstance(code, str):
  123. # Interpret the contents of the string as bytes, and decode it as if it was bytes
  124. code = code.encode('latin-1')
  125. if isinstance(code, PSLiteral):
  126. # Interpret as an Adobe glyph name.
  127. self.cid2unichr[cid] = name2unicode(code.name)
  128. elif isinstance(code, bytes):
  129. # Interpret as UTF-16BE.
  130. self.cid2unichr[cid] = code.decode('UTF-16BE', 'ignore')
  131. elif isinstance(code, int):
  132. self.cid2unichr[cid] = chr(code)
  133. else:
  134. raise TypeError(repr(code))
  135. class PyCMap(CMap):
  136. def __init__(self, name, module):
  137. CMap.__init__(self, module.CODE2CID)
  138. self.name = name
  139. self._is_vertical = module.IS_VERTICAL
  140. def __repr__(self):
  141. return '<PyCMap: %s>' % (self.name)
  142. def is_vertical(self):
  143. return self._is_vertical
  144. class PyUnicodeMap(UnicodeMap):
  145. def __init__(self, name, module, vertical):
  146. if vertical:
  147. cid2unichr = module.CID2UNICHR_V
  148. else:
  149. cid2unichr = module.CID2UNICHR_H
  150. UnicodeMap.__init__(self, cid2unichr)
  151. self.name = name
  152. def __repr__(self):
  153. return '<PyUnicodeMap: %s>' % (self.name)
  154. class CMapDB:
  155. _cmap_cache = {}
  156. _umap_cache = {}
  157. class CMapNotFound(CMapError): pass
  158. @classmethod
  159. def _load_data(klass, name):
  160. filename = '%s.pickle.gz' % name
  161. logger.debug('loading: %s', name)
  162. default_path = os.environ.get('CMAP_PATH', '/usr/share/pdfminer/')
  163. for directory in (os.path.dirname(cmap.__file__), default_path):
  164. path = os.path.join(directory, filename)
  165. if os.path.exists(path):
  166. gzfile = gzip.open(path)
  167. try:
  168. return type(name, (), pickle.loads(gzfile.read()))
  169. finally:
  170. gzfile.close()
  171. else:
  172. raise CMapDB.CMapNotFound(name)
  173. @classmethod
  174. def get_cmap(klass, name):
  175. if name == 'Identity-H':
  176. return IdentityCMap(False)
  177. elif name == 'Identity-V':
  178. return IdentityCMap(True)
  179. try:
  180. return klass._cmap_cache[name]
  181. except KeyError:
  182. pass
  183. data = klass._load_data(name)
  184. klass._cmap_cache[name] = cmap = PyCMap(name, data)
  185. return cmap
  186. @classmethod
  187. def get_unicode_map(klass, name, vertical=False):
  188. try:
  189. return klass._umap_cache[name][vertical]
  190. except KeyError:
  191. pass
  192. data = klass._load_data('to-unicode-%s' % name)
  193. klass._umap_cache[name] = umaps = [PyUnicodeMap(name, data, v) for v in (False, True)]
  194. return umaps[vertical]
  195. class CMapParser(PSStackParser):
  196. def __init__(self, cmap, fp):
  197. PSStackParser.__init__(self, fp)
  198. self.cmap = cmap
  199. self._in_cmap = False
  200. def run(self):
  201. try:
  202. self.nextobject()
  203. except PSEOF:
  204. pass
  205. def do_keyword(self, pos, token):
  206. name = token.name
  207. if name == 'begincmap':
  208. self._in_cmap = True
  209. self.popall()
  210. return
  211. elif name == 'endcmap':
  212. self._in_cmap = False
  213. return
  214. if not self._in_cmap:
  215. return
  216. if name == 'def':
  217. try:
  218. ((_,k),(_,v)) = self.pop(2)
  219. self.cmap.set_attr(literal_name(k), v)
  220. except PSSyntaxError:
  221. pass
  222. return
  223. if name == 'usecmap':
  224. try:
  225. ((_,cmapname),) = self.pop(1)
  226. self.cmap.use_cmap(CMapDB.get_cmap(literal_name(cmapname)))
  227. except PSSyntaxError:
  228. pass
  229. except CMapDB.CMapNotFound:
  230. pass
  231. return
  232. if name == 'begincodespacerange':
  233. self.popall()
  234. return
  235. if name == 'endcodespacerange':
  236. self.popall()
  237. return
  238. if name == 'begincidrange':
  239. self.popall()
  240. return
  241. if name == 'endcidrange':
  242. objs = [ obj for (_,obj) in self.popall() ]
  243. for (s,e,cid) in choplist(3, objs):
  244. if (not isinstance(s, str) or not isinstance(e, str) or
  245. not isinstance(cid, int) or len(s) != len(e)): continue
  246. sprefix = s[:-4]
  247. eprefix = e[:-4]
  248. if sprefix != eprefix: continue
  249. svar = s[-4:]
  250. evar = e[-4:]
  251. s1 = nunpack(svar)
  252. e1 = nunpack(evar)
  253. vlen = len(svar)
  254. #assert s1 <= e1
  255. for i in range(e1-s1+1):
  256. x = sprefix+struct.pack('>L',s1+i)[-vlen:]
  257. self.cmap.add_code2cid(x, cid+i)
  258. return
  259. if name == 'begincidchar':
  260. self.popall()
  261. return
  262. if name == 'endcidchar':
  263. objs = [ obj for (_,obj) in self.popall() ]
  264. for (cid,code) in choplist(2, objs):
  265. if isinstance(code, str) and isinstance(cid, str):
  266. self.cmap.add_code2cid(code, nunpack(cid))
  267. return
  268. if name == 'beginbfrange':
  269. self.popall()
  270. return
  271. if name == 'endbfrange':
  272. objs = [ obj for (_,obj) in self.popall() ]
  273. # These objects were hex numbers and have been parsed into a string. But what we want
  274. # are bytes. Convert them.
  275. # Oh wait, it seems that sometimes we have bytes...
  276. tobytes = lambda o: (o.encode('ascii') if isinstance(o, str) else o)
  277. objs = [tobytes(o) for o in objs]
  278. for (s,e,code) in choplist(3, objs):
  279. if (not isinstance(s, bytes) or not isinstance(e, bytes) or
  280. len(s) != len(e)): continue
  281. s1 = nunpack(s)
  282. e1 = nunpack(e)
  283. #assert s1 <= e1
  284. if isinstance(code, list):
  285. for i in range(e1-s1+1):
  286. self.cmap.add_cid2unichr(s1+i, code[i])
  287. else:
  288. var = code[-4:]
  289. base = nunpack(var)
  290. prefix = code[:-4]
  291. vlen = len(var)
  292. for i in range(e1-s1+1):
  293. x = prefix+struct.pack('>L',base+i)[-vlen:]
  294. self.cmap.add_cid2unichr(s1+i, x)
  295. return
  296. if name == 'beginbfchar':
  297. self.popall()
  298. return
  299. if name == 'endbfchar':
  300. objs = [ obj for (_,obj) in self.popall() ]
  301. for (cid,code) in choplist(2, objs):
  302. if isinstance(cid, (str, bytes)) and isinstance(code, (str, bytes)):
  303. self.cmap.add_cid2unichr(nunpack(cid), code)
  304. return
  305. if name == 'beginnotdefrange':
  306. self.popall()
  307. return
  308. if name == 'endnotdefrange':
  309. self.popall()
  310. return
  311. self.push((pos, token))
  312. # test
  313. def main(argv):
  314. args = argv[1:]
  315. for fname in args:
  316. fp = open(fname, 'rb')
  317. cmap = FileUnicodeMap()
  318. #cmap = FileCMap()
  319. CMapParser(cmap, fp).run()
  320. fp.close()
  321. cmap.dump()
  322. if __name__ == '__main__':
  323. sys.exit(main(sys.argv))