You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

675 lines
25 KiB

4 years ago
  1. #!/usr/bin/env python3
  2. import sys
  3. import io
  4. import struct
  5. from .cmapdb import CMapDB, CMapParser, FileUnicodeMap, CMap
  6. from .encodingdb import EncodingDB, name2unicode
  7. from .psparser import PSStackParser
  8. from .psparser import PSEOF
  9. from .psparser import LIT, KWD, handle_error
  10. from .psparser import PSLiteral, literal_name
  11. from .pdftypes import (PDFException, resolve1, int_value, num_value, list_value, dict_value,
  12. stream_value)
  13. from .fontmetrics import FONT_METRICS
  14. from .utils import apply_matrix_norm, nunpack, choplist
  15. def get_widths(seq):
  16. widths = {}
  17. r = []
  18. for v in seq:
  19. if isinstance(v, list):
  20. if r:
  21. char1 = r[-1]
  22. for (i,w) in enumerate(v):
  23. widths[char1+i] = w
  24. r = []
  25. elif isinstance(v, int):
  26. r.append(v)
  27. if len(r) == 3:
  28. (char1,char2,w) = r
  29. for i in range(char1, char2+1):
  30. widths[i] = w
  31. r = []
  32. return widths
  33. #assert get_widths([1]) == {}
  34. #assert get_widths([1,2,3]) == {1:3, 2:3}
  35. #assert get_widths([1,[2,3],6,[7,8]]) == {1:2,2:3, 6:7,7:8}
  36. def get_widths2(seq):
  37. widths = {}
  38. r = []
  39. for v in seq:
  40. if isinstance(v, list):
  41. if r:
  42. char1 = r[-1]
  43. for (i,(w,vx,vy)) in enumerate(choplist(3,v)):
  44. widths[char1+i] = (w,(vx,vy))
  45. r = []
  46. elif isinstance(v, int):
  47. r.append(v)
  48. if len(r) == 5:
  49. (char1,char2,w,vx,vy) = r
  50. for i in range(char1, char2+1):
  51. widths[i] = (w,(vx,vy))
  52. r = []
  53. return widths
  54. #assert get_widths2([1]) == {}
  55. #assert get_widths2([1,2,3,4,5]) == {1:(3,(4,5)), 2:(3,(4,5))}
  56. #assert get_widths2([1,[2,3,4,5],6,[7,8,9]]) == {1:(2,(3,4)), 6:(7,(8,9))}
  57. class FontMetricsDB:
  58. @classmethod
  59. def get_metrics(klass, fontname):
  60. return FONT_METRICS[fontname]
  61. class Type1FontHeaderParser(PSStackParser):
  62. KEYWORD_BEGIN = KWD('begin')
  63. KEYWORD_END = KWD('end')
  64. KEYWORD_DEF = KWD('def')
  65. KEYWORD_PUT = KWD('put')
  66. KEYWORD_DICT = KWD('dict')
  67. KEYWORD_ARRAY = KWD('array')
  68. KEYWORD_READONLY = KWD('readonly')
  69. KEYWORD_FOR = KWD('for')
  70. KEYWORD_FOR = KWD('for')
  71. def __init__(self, data):
  72. PSStackParser.__init__(self, data)
  73. self._cid2unicode = {}
  74. def get_encoding(self):
  75. while 1:
  76. try:
  77. (cid,name) = self.nextobject()
  78. except PSEOF:
  79. break
  80. try:
  81. self._cid2unicode[cid] = name2unicode(name)
  82. except KeyError:
  83. pass
  84. return self._cid2unicode
  85. def do_keyword(self, pos, token):
  86. if token is self.KEYWORD_PUT:
  87. ((_,key),(_,value)) = self.pop(2)
  88. if (isinstance(key, int) and
  89. isinstance(value, PSLiteral)):
  90. self.add_results((key, literal_name(value)))
  91. ## CFFFont
  92. ## (Format specified in Adobe Technical Note: #5176
  93. ## "The Compact Font Format Specification")
  94. ##
  95. NIBBLES = ('0','1','2','3','4','5','6','7','8','9','.','e','e-',None,'-')
  96. def getdict(data):
  97. d = {}
  98. fp = io.BytesIO(data)
  99. stack = []
  100. while 1:
  101. c = fp.read(1)
  102. if not c: break
  103. b0 = ord(c)
  104. if b0 <= 21:
  105. d[b0] = stack
  106. stack = []
  107. continue
  108. if b0 == 30:
  109. s = ''
  110. loop = True
  111. while loop:
  112. b = ord(fp.read(1))
  113. for n in (b >> 4, b & 15):
  114. if n == 15:
  115. loop = False
  116. else:
  117. s += NIBBLES[n]
  118. value = float(s)
  119. elif 32 <= b0 and b0 <= 246:
  120. value = b0-139
  121. else:
  122. b1 = ord(fp.read(1))
  123. if 247 <= b0 and b0 <= 250:
  124. value = ((b0-247)<<8)+b1+108
  125. elif 251 <= b0 and b0 <= 254:
  126. value = -((b0-251)<<8)-b1-108
  127. else:
  128. b2 = ord(fp.read(1))
  129. if 128 <= b1: b1 -= 256
  130. if b0 == 28:
  131. value = b1<<8 | b2
  132. else:
  133. value = b1<<24 | b2<<16 | struct.unpack('>H', fp.read(2))[0]
  134. stack.append(value)
  135. return d
  136. class CFFFont:
  137. STANDARD_STRINGS = (
  138. '.notdef', 'space', 'exclam', 'quotedbl', 'numbersign',
  139. 'dollar', 'percent', 'ampersand', 'quoteright', 'parenleft',
  140. 'parenright', 'asterisk', 'plus', 'comma', 'hyphen', 'period',
  141. 'slash', 'zero', 'one', 'two', 'three', 'four', 'five', 'six',
  142. 'seven', 'eight', 'nine', 'colon', 'semicolon', 'less', 'equal',
  143. 'greater', 'question', 'at', 'A', 'B', 'C', 'D', 'E', 'F', 'G',
  144. 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T',
  145. 'U', 'V', 'W', 'X', 'Y', 'Z', 'bracketleft', 'backslash',
  146. 'bracketright', 'asciicircum', 'underscore', 'quoteleft', 'a',
  147. 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n',
  148. 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
  149. 'braceleft', 'bar', 'braceright', 'asciitilde', 'exclamdown',
  150. 'cent', 'sterling', 'fraction', 'yen', 'florin', 'section',
  151. 'currency', 'quotesingle', 'quotedblleft', 'guillemotleft',
  152. 'guilsinglleft', 'guilsinglright', 'fi', 'fl', 'endash',
  153. 'dagger', 'daggerdbl', 'periodcentered', 'paragraph', 'bullet',
  154. 'quotesinglbase', 'quotedblbase', 'quotedblright',
  155. 'guillemotright', 'ellipsis', 'perthousand', 'questiondown',
  156. 'grave', 'acute', 'circumflex', 'tilde', 'macron', 'breve',
  157. 'dotaccent', 'dieresis', 'ring', 'cedilla', 'hungarumlaut',
  158. 'ogonek', 'caron', 'emdash', 'AE', 'ordfeminine', 'Lslash',
  159. 'Oslash', 'OE', 'ordmasculine', 'ae', 'dotlessi', 'lslash',
  160. 'oslash', 'oe', 'germandbls', 'onesuperior', 'logicalnot', 'mu',
  161. 'trademark', 'Eth', 'onehalf', 'plusminus', 'Thorn',
  162. 'onequarter', 'divide', 'brokenbar', 'degree', 'thorn',
  163. 'threequarters', 'twosuperior', 'registered', 'minus', 'eth',
  164. 'multiply', 'threesuperior', 'copyright', 'Aacute',
  165. 'Acircumflex', 'Adieresis', 'Agrave', 'Aring', 'Atilde',
  166. 'Ccedilla', 'Eacute', 'Ecircumflex', 'Edieresis', 'Egrave',
  167. 'Iacute', 'Icircumflex', 'Idieresis', 'Igrave', 'Ntilde',
  168. 'Oacute', 'Ocircumflex', 'Odieresis', 'Ograve', 'Otilde',
  169. 'Scaron', 'Uacute', 'Ucircumflex', 'Udieresis', 'Ugrave',
  170. 'Yacute', 'Ydieresis', 'Zcaron', 'aacute', 'acircumflex',
  171. 'adieresis', 'agrave', 'aring', 'atilde', 'ccedilla', 'eacute',
  172. 'ecircumflex', 'edieresis', 'egrave', 'iacute', 'icircumflex',
  173. 'idieresis', 'igrave', 'ntilde', 'oacute', 'ocircumflex',
  174. 'odieresis', 'ograve', 'otilde', 'scaron', 'uacute',
  175. 'ucircumflex', 'udieresis', 'ugrave', 'yacute', 'ydieresis',
  176. 'zcaron', 'exclamsmall', 'Hungarumlautsmall', 'dollaroldstyle',
  177. 'dollarsuperior', 'ampersandsmall', 'Acutesmall',
  178. 'parenleftsuperior', 'parenrightsuperior', 'twodotenleader',
  179. 'onedotenleader', 'zerooldstyle', 'oneoldstyle', 'twooldstyle',
  180. 'threeoldstyle', 'fouroldstyle', 'fiveoldstyle', 'sixoldstyle',
  181. 'sevenoldstyle', 'eightoldstyle', 'nineoldstyle',
  182. 'commasuperior', 'threequartersemdash', 'periodsuperior',
  183. 'questionsmall', 'asuperior', 'bsuperior', 'centsuperior',
  184. 'dsuperior', 'esuperior', 'isuperior', 'lsuperior', 'msuperior',
  185. 'nsuperior', 'osuperior', 'rsuperior', 'ssuperior', 'tsuperior',
  186. 'ff', 'ffi', 'ffl', 'parenleftinferior', 'parenrightinferior',
  187. 'Circumflexsmall', 'hyphensuperior', 'Gravesmall', 'Asmall',
  188. 'Bsmall', 'Csmall', 'Dsmall', 'Esmall', 'Fsmall', 'Gsmall',
  189. 'Hsmall', 'Ismall', 'Jsmall', 'Ksmall', 'Lsmall', 'Msmall',
  190. 'Nsmall', 'Osmall', 'Psmall', 'Qsmall', 'Rsmall', 'Ssmall',
  191. 'Tsmall', 'Usmall', 'Vsmall', 'Wsmall', 'Xsmall', 'Ysmall',
  192. 'Zsmall', 'colonmonetary', 'onefitted', 'rupiah', 'Tildesmall',
  193. 'exclamdownsmall', 'centoldstyle', 'Lslashsmall', 'Scaronsmall',
  194. 'Zcaronsmall', 'Dieresissmall', 'Brevesmall', 'Caronsmall',
  195. 'Dotaccentsmall', 'Macronsmall', 'figuredash', 'hypheninferior',
  196. 'Ogoneksmall', 'Ringsmall', 'Cedillasmall', 'questiondownsmall',
  197. 'oneeighth', 'threeeighths', 'fiveeighths', 'seveneighths',
  198. 'onethird', 'twothirds', 'zerosuperior', 'foursuperior',
  199. 'fivesuperior', 'sixsuperior', 'sevensuperior', 'eightsuperior',
  200. 'ninesuperior', 'zeroinferior', 'oneinferior', 'twoinferior',
  201. 'threeinferior', 'fourinferior', 'fiveinferior', 'sixinferior',
  202. 'seveninferior', 'eightinferior', 'nineinferior',
  203. 'centinferior', 'dollarinferior', 'periodinferior',
  204. 'commainferior', 'Agravesmall', 'Aacutesmall',
  205. 'Acircumflexsmall', 'Atildesmall', 'Adieresissmall',
  206. 'Aringsmall', 'AEsmall', 'Ccedillasmall', 'Egravesmall',
  207. 'Eacutesmall', 'Ecircumflexsmall', 'Edieresissmall',
  208. 'Igravesmall', 'Iacutesmall', 'Icircumflexsmall',
  209. 'Idieresissmall', 'Ethsmall', 'Ntildesmall', 'Ogravesmall',
  210. 'Oacutesmall', 'Ocircumflexsmall', 'Otildesmall',
  211. 'Odieresissmall', 'OEsmall', 'Oslashsmall', 'Ugravesmall',
  212. 'Uacutesmall', 'Ucircumflexsmall', 'Udieresissmall',
  213. 'Yacutesmall', 'Thornsmall', 'Ydieresissmall', '001.000',
  214. '001.001', '001.002', '001.003', 'Black', 'Bold', 'Book',
  215. 'Light', 'Medium', 'Regular', 'Roman', 'Semibold',
  216. )
  217. class INDEX:
  218. def __init__(self, fp):
  219. self.fp = fp
  220. self.offsets = []
  221. (count, offsize) = struct.unpack(b'>HB', self.fp.read(3))
  222. for i in range(count+1):
  223. self.offsets.append(nunpack(self.fp.read(offsize)))
  224. self.base = self.fp.tell()-1
  225. self.fp.seek(self.base+self.offsets[-1])
  226. def __repr__(self):
  227. return '<INDEX: size=%d>' % len(self)
  228. def __len__(self):
  229. return len(self.offsets)-1
  230. def __getitem__(self, i):
  231. self.fp.seek(self.base+self.offsets[i])
  232. return self.fp.read(self.offsets[i+1]-self.offsets[i])
  233. def __iter__(self):
  234. return iter( self[i] for i in range(len(self)) )
  235. def __init__(self, name, fp):
  236. self.name = name
  237. self.fp = fp
  238. # Header
  239. (_major,_minor,hdrsize,offsize) = struct.unpack(b'BBBB', self.fp.read(4))
  240. self.fp.read(hdrsize-4)
  241. # Name INDEX
  242. self.name_index = self.INDEX(self.fp)
  243. # Top DICT INDEX
  244. self.dict_index = self.INDEX(self.fp)
  245. # String INDEX
  246. self.string_index = self.INDEX(self.fp)
  247. # Global Subr INDEX
  248. self.subr_index = self.INDEX(self.fp)
  249. # Top DICT DATA
  250. self.top_dict = getdict(self.dict_index[0])
  251. (charset_pos,) = self.top_dict.get(15, [0])
  252. (encoding_pos,) = self.top_dict.get(16, [0])
  253. (charstring_pos,) = self.top_dict.get(17, [0])
  254. # CharStrings
  255. self.fp.seek(charstring_pos)
  256. self.charstring = self.INDEX(self.fp)
  257. self.nglyphs = len(self.charstring)
  258. # Encodings
  259. self.code2gid = {}
  260. self.gid2code = {}
  261. self.fp.seek(encoding_pos)
  262. format = self.fp.read(1)
  263. if format == b'\x00':
  264. # Format 0
  265. (n,) = struct.unpack(b'B', self.fp.read(1))
  266. for (code,gid) in enumerate(struct.unpack(b'B'*n, self.fp.read(n))):
  267. self.code2gid[code] = gid
  268. self.gid2code[gid] = code
  269. elif format == b'\x01':
  270. # Format 1
  271. (n,) = struct.unpack(b'B', self.fp.read(1))
  272. code = 0
  273. for i in range(n):
  274. (first,nleft) = struct.unpack(b'BB', self.fp.read(2))
  275. for gid in range(first,first+nleft+1):
  276. self.code2gid[code] = gid
  277. self.gid2code[gid] = code
  278. code += 1
  279. else:
  280. raise ValueError('unsupported encoding format: %r' % format)
  281. # Charsets
  282. self.name2gid = {}
  283. self.gid2name = {}
  284. self.fp.seek(charset_pos)
  285. format = self.fp.read(1)
  286. if format == '\x00':
  287. # Format 0
  288. n = self.nglyphs-1
  289. for (gid,sid) in enumerate(struct.unpack(b'>'+b'H'*n, self.fp.read(2*n))):
  290. gid += 1
  291. name = self.getstr(sid)
  292. self.name2gid[name] = gid
  293. self.gid2name[gid] = name
  294. elif format == '\x01':
  295. # Format 1
  296. (n,) = struct.unpack(b'B', self.fp.read(1))
  297. sid = 0
  298. for i in range(n):
  299. (first,nleft) = struct.unpack(b'BB', self.fp.read(2))
  300. for gid in range(first,first+nleft+1):
  301. name = self.getstr(sid)
  302. self.name2gid[name] = gid
  303. self.gid2name[gid] = name
  304. sid += 1
  305. elif format == '\x02':
  306. # Format 2
  307. assert 0
  308. else:
  309. raise ValueError('unsupported charset format: %r' % format)
  310. #print self.code2gid
  311. #print self.name2gid
  312. #assert 0
  313. def getstr(self, sid):
  314. if sid < len(self.STANDARD_STRINGS):
  315. return self.STANDARD_STRINGS[sid]
  316. return self.string_index[sid-len(self.STANDARD_STRINGS)]
  317. class TrueTypeFont:
  318. class CMapNotFound(Exception): pass
  319. def __init__(self, name, fp):
  320. self.name = name
  321. self.fp = fp
  322. self.tables = {}
  323. self.fonttype = fp.read(4)
  324. (ntables, _1, _2, _3) = struct.unpack(b'>HHHH', fp.read(8))
  325. for _ in range(ntables):
  326. (name, tsum, offset, length) = struct.unpack(b'>4sLLL', fp.read(16))
  327. self.tables[name] = (offset, length)
  328. def create_unicode_map(self):
  329. if 'cmap' not in self.tables:
  330. raise TrueTypeFont.CMapNotFound
  331. (base_offset, length) = self.tables['cmap']
  332. fp = self.fp
  333. fp.seek(base_offset)
  334. (version, nsubtables) = struct.unpack(b'>HH', fp.read(4))
  335. subtables = []
  336. for i in range(nsubtables):
  337. subtables.append(struct.unpack(b'>HHL', fp.read(8)))
  338. char2gid = {}
  339. # Only supports subtable type 0, 2 and 4.
  340. for (_1, _2, st_offset) in subtables:
  341. fp.seek(base_offset+st_offset)
  342. (fmttype, fmtlen, fmtlang) = struct.unpack(b'>HHH', fp.read(6))
  343. if fmttype == 0:
  344. char2gid.update(enumerate(struct.unpack(b'>256B', fp.read(256))))
  345. elif fmttype == 2:
  346. subheaderkeys = struct.unpack(b'>256H', fp.read(512))
  347. firstbytes = [0]*8192
  348. for (i,k) in enumerate(subheaderkeys):
  349. firstbytes[k/8] = i
  350. nhdrs = max(subheaderkeys)/8 + 1
  351. hdrs = []
  352. for i in range(nhdrs):
  353. (firstcode,entcount,delta,offset) = struct.unpack(b'>HHhH', fp.read(8))
  354. hdrs.append((i,firstcode,entcount,delta,fp.tell()-2+offset))
  355. for (i,firstcode,entcount,delta,pos) in hdrs:
  356. if not entcount: continue
  357. first = firstcode + (firstbytes[i] << 8)
  358. fp.seek(pos)
  359. for c in range(entcount):
  360. gid = struct.unpack(b'>H', fp.read(2))
  361. if gid:
  362. gid += delta
  363. char2gid[first+c] = gid
  364. elif fmttype == 4:
  365. (segcount, _1, _2, _3) = struct.unpack(b'>HHHH', fp.read(8))
  366. segcount /= 2
  367. ecs = struct.unpack(b'>%dH' % segcount, fp.read(2*segcount))
  368. fp.read(2)
  369. scs = struct.unpack(b'>%dH' % segcount, fp.read(2*segcount))
  370. idds = struct.unpack(b'>%dh' % segcount, fp.read(2*segcount))
  371. pos = fp.tell()
  372. idrs = struct.unpack(b'>%dH' % segcount, fp.read(2*segcount))
  373. for (ec,sc,idd,idr) in zip(ecs, scs, idds, idrs):
  374. if idr:
  375. fp.seek(pos+idr)
  376. for c in range(sc, ec+1):
  377. char2gid[c] = (struct.unpack(b'>H', fp.read(2))[0] + idd) & 0xffff
  378. else:
  379. for c in range(sc, ec+1):
  380. char2gid[c] = (c + idd) & 0xffff
  381. else:
  382. assert 0
  383. # create unicode map
  384. unicode_map = FileUnicodeMap()
  385. for (char,gid) in char2gid.items():
  386. unicode_map.add_cid2unichr(gid, char)
  387. return unicode_map
  388. ## Fonts
  389. ##
  390. class PDFFontError(PDFException): pass
  391. class PDFUnicodeNotDefined(PDFFontError): pass
  392. LITERAL_STANDARD_ENCODING = LIT('StandardEncoding')
  393. LITERAL_TYPE1C = LIT('Type1C')
  394. class PDFFont:
  395. def __init__(self, descriptor, widths, default_width=None):
  396. self.descriptor = descriptor
  397. self.widths = widths
  398. self.fontname = resolve1(descriptor.get('FontName', 'unknown'))
  399. if isinstance(self.fontname, PSLiteral):
  400. self.fontname = literal_name(self.fontname)
  401. self.flags = int_value(descriptor.get('Flags', 0))
  402. self.ascent = num_value(descriptor.get('Ascent', 0))
  403. self.descent = num_value(descriptor.get('Descent', 0))
  404. self.italic_angle = num_value(descriptor.get('ItalicAngle', 0))
  405. self.default_width = default_width or num_value(descriptor.get('MissingWidth', 0))
  406. self.leading = num_value(descriptor.get('Leading', 0))
  407. self.bbox = list_value(descriptor.get('FontBBox', (0,0,0,0)))
  408. self.hscale = self.vscale = .001
  409. def __repr__(self):
  410. return '<PDFFont>'
  411. def is_vertical(self):
  412. return False
  413. def is_multibyte(self):
  414. return False
  415. def decode(self, s):
  416. if isinstance(s, str):
  417. return list(map(ord, s))
  418. else: # it's already bytes
  419. return s
  420. def get_ascent(self):
  421. return self.ascent * self.vscale
  422. def get_descent(self):
  423. return self.descent * self.vscale
  424. def get_width(self):
  425. w = self.bbox[2]-self.bbox[0]
  426. if w == 0:
  427. w = -self.default_width
  428. return w * self.hscale
  429. def get_height(self):
  430. h = self.bbox[3]-self.bbox[1]
  431. if h == 0:
  432. h = self.ascent - self.descent
  433. return h * self.vscale
  434. def char_width(self, cid):
  435. return self.widths.get(cid, self.default_width) * self.hscale
  436. def char_disp(self, cid):
  437. return 0
  438. def string_width(self, s):
  439. return sum( self.char_width(cid) for cid in self.decode(s) )
  440. class PDFSimpleFont(PDFFont):
  441. def __init__(self, descriptor, widths, spec):
  442. # Font encoding is specified either by a name of
  443. # built-in encoding or a dictionary that describes
  444. # the differences.
  445. if 'Encoding' in spec:
  446. encoding = resolve1(spec['Encoding'])
  447. else:
  448. encoding = LITERAL_STANDARD_ENCODING
  449. if isinstance(encoding, dict):
  450. name = literal_name(encoding.get('BaseEncoding', LITERAL_STANDARD_ENCODING))
  451. diff = list_value(encoding.get('Differences', None))
  452. self.cid2unicode = EncodingDB.get_encoding(name, diff)
  453. else:
  454. self.cid2unicode = EncodingDB.get_encoding(literal_name(encoding))
  455. self.unicode_map = None
  456. if 'ToUnicode' in spec:
  457. strm = stream_value(spec['ToUnicode'])
  458. self.unicode_map = FileUnicodeMap()
  459. CMapParser(self.unicode_map, io.BytesIO(strm.get_data())).run()
  460. PDFFont.__init__(self, descriptor, widths)
  461. def to_unichr(self, cid):
  462. if self.unicode_map:
  463. try:
  464. return self.unicode_map.get_unichr(cid)
  465. except KeyError:
  466. pass
  467. try:
  468. return self.cid2unicode[cid]
  469. except KeyError:
  470. raise PDFUnicodeNotDefined(None, cid)
  471. class PDFType1Font(PDFSimpleFont):
  472. def __init__(self, rsrcmgr, spec):
  473. try:
  474. self.basefont = literal_name(spec['BaseFont'])
  475. except KeyError:
  476. handle_error(PDFFontError, 'BaseFont is missing')
  477. self.basefont = 'unknown'
  478. try:
  479. (descriptor, widths) = FontMetricsDB.get_metrics(self.basefont)
  480. except KeyError:
  481. descriptor = dict_value(spec.get('FontDescriptor', {}))
  482. firstchar = int_value(spec.get('FirstChar', 0))
  483. lastchar = int_value(spec.get('LastChar', 255))
  484. widths = list_value(spec.get('Widths', [0]*256))
  485. widths = dict( (i+firstchar,w) for (i,w) in enumerate(widths) )
  486. PDFSimpleFont.__init__(self, descriptor, widths, spec)
  487. if 'Encoding' not in spec and 'FontFile' in descriptor:
  488. # try to recover the missing encoding info from the font file.
  489. self.fontfile = stream_value(descriptor.get('FontFile'))
  490. length1 = int_value(self.fontfile['Length1'])
  491. data = self.fontfile.get_data()[:length1]
  492. parser = Type1FontHeaderParser(io.BytesIO(data))
  493. self.cid2unicode = parser.get_encoding()
  494. def __repr__(self):
  495. return '<PDFType1Font: basefont=%r>' % self.basefont
  496. class PDFTrueTypeFont(PDFType1Font):
  497. def __repr__(self):
  498. return '<PDFTrueTypeFont: basefont=%r>' % self.basefont
  499. class PDFType3Font(PDFSimpleFont):
  500. def __init__(self, rsrcmgr, spec):
  501. firstchar = int_value(spec.get('FirstChar', 0))
  502. lastchar = int_value(spec.get('LastChar', 0))
  503. widths = list_value(spec.get('Widths', [0]*256))
  504. widths = dict( (i+firstchar,w) for (i,w) in enumerate(widths))
  505. if 'FontDescriptor' in spec:
  506. descriptor = dict_value(spec['FontDescriptor'])
  507. else:
  508. descriptor = {'Ascent':0, 'Descent':0,
  509. 'FontBBox':spec['FontBBox']}
  510. PDFSimpleFont.__init__(self, descriptor, widths, spec)
  511. self.matrix = tuple(list_value(spec.get('FontMatrix')))
  512. (_,self.descent,_,self.ascent) = self.bbox
  513. (self.hscale,self.vscale) = apply_matrix_norm(self.matrix, (1,1))
  514. def __repr__(self):
  515. return '<PDFType3Font>'
  516. class PDFCIDFont(PDFFont):
  517. def __init__(self, rsrcmgr, spec):
  518. try:
  519. self.basefont = literal_name(spec['BaseFont'])
  520. except KeyError:
  521. handle_error(PDFFontError, 'BaseFont is missing')
  522. self.basefont = 'unknown'
  523. self.cidsysteminfo = dict_value(spec.get('CIDSystemInfo', {}))
  524. self.cidcoding = '%s-%s' % (self.cidsysteminfo.get('Registry', 'unknown'),
  525. self.cidsysteminfo.get('Ordering', 'unknown'))
  526. try:
  527. name = literal_name(spec['Encoding'])
  528. except KeyError:
  529. handle_error(PDFFontError, 'Encoding is unspecified')
  530. name = 'unknown'
  531. try:
  532. self.cmap = CMapDB.get_cmap(name)
  533. except CMapDB.CMapNotFound as e:
  534. handle_error(PDFFontError, str(e))
  535. self.cmap = CMap()
  536. try:
  537. descriptor = dict_value(spec['FontDescriptor'])
  538. except KeyError:
  539. handle_error(PDFFontError, 'FontDescriptor is missing')
  540. descriptor = {}
  541. ttf = None
  542. if 'FontFile2' in descriptor:
  543. self.fontfile = stream_value(descriptor.get('FontFile2'))
  544. ttf = TrueTypeFont(self.basefont,
  545. io.BytesIO(self.fontfile.get_data()))
  546. self.unicode_map = None
  547. if 'ToUnicode' in spec:
  548. strm = stream_value(spec['ToUnicode'])
  549. self.unicode_map = FileUnicodeMap()
  550. CMapParser(self.unicode_map, io.BytesIO(strm.get_data())).run()
  551. elif self.cidcoding == 'Adobe-Identity':
  552. if ttf:
  553. try:
  554. self.unicode_map = ttf.create_unicode_map()
  555. except TrueTypeFont.CMapNotFound:
  556. pass
  557. else:
  558. try:
  559. self.unicode_map = CMapDB.get_unicode_map(self.cidcoding, self.cmap.is_vertical())
  560. except CMapDB.CMapNotFound as e:
  561. pass
  562. self.vertical = self.cmap.is_vertical()
  563. if self.vertical:
  564. # writing mode: vertical
  565. widths = get_widths2(list_value(spec.get('W2', [])))
  566. self.disps = dict( (cid,(vx,vy)) for (cid,(_,(vx,vy))) in widths.items() )
  567. (vy,w) = spec.get('DW2', [880, -1000])
  568. self.default_disp = (None,vy)
  569. widths = dict( (cid,w) for (cid,(w,_)) in widths.items() )
  570. default_width = w
  571. else:
  572. # writing mode: horizontal
  573. self.disps = {}
  574. self.default_disp = 0
  575. widths = get_widths(list_value(spec.get('W', [])))
  576. default_width = spec.get('DW', 1000)
  577. PDFFont.__init__(self, descriptor, widths, default_width=default_width)
  578. def __repr__(self):
  579. return '<PDFCIDFont: basefont=%r, cidcoding=%r>' % (self.basefont, self.cidcoding)
  580. def is_vertical(self):
  581. return self.vertical
  582. def is_multibyte(self):
  583. return True
  584. def decode(self, bytes):
  585. return self.cmap.decode(bytes)
  586. def char_disp(self, cid):
  587. "Returns an integer for horizontal fonts, a tuple for vertical fonts."
  588. return self.disps.get(cid, self.default_disp)
  589. def to_unichr(self, cid):
  590. try:
  591. if not self.unicode_map:
  592. raise KeyError(cid)
  593. return self.unicode_map.get_unichr(cid)
  594. except KeyError:
  595. raise PDFUnicodeNotDefined(self.cidcoding, cid)
  596. def main(argv):
  597. for fname in argv[1:]:
  598. fp = io.open(fname, 'rb')
  599. #font = TrueTypeFont(fname, fp)
  600. font = CFFFont(fname, fp)
  601. print(font)
  602. fp.close()
  603. if __name__ == '__main__':
  604. sys.exit(main(sys.argv))