You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

399 lines
12 KiB

4 years ago
  1. from . import idnadata
  2. import bisect
  3. import unicodedata
  4. import re
  5. import sys
  6. from .intranges import intranges_contain
# Canonical combining class value that valid_contextj() compares against the
# combining class of the character preceding a joiner.
_virama_combining_class = 9
# Prefix that alabel() prepends to the Punycode form of a label and that
# ulabel() strips before decoding.
_alabel_prefix = b'xn--'
# Label separators used by encode()/decode() in non-strict mode: U+002E,
# U+3002, U+FF0E and U+FF61.
_unicode_dots_re = re.compile(u'[\u002e\u3002\uff0e\uff61]')

# On Python 3 provide the Python 2 names used throughout this module.
if sys.version_info[0] == 3:
    unicode = str
    unichr = chr
  13. class IDNAError(UnicodeError):
  14. """ Base exception for all IDNA-encoding related problems """
  15. pass
  16. class IDNABidiError(IDNAError):
  17. """ Exception when bidirectional requirements are not satisfied """
  18. pass
  19. class InvalidCodepoint(IDNAError):
  20. """ Exception when a disallowed or unallocated codepoint is used """
  21. pass
  22. class InvalidCodepointContext(IDNAError):
  23. """ Exception when the codepoint is not valid in the context it is used """
  24. pass
  25. def _combining_class(cp):
  26. v = unicodedata.combining(unichr(cp))
  27. if v == 0:
  28. if not unicodedata.name(unichr(cp)):
  29. raise ValueError("Unknown character in unicodedata")
  30. return v
  31. def _is_script(cp, script):
  32. return intranges_contain(ord(cp), idnadata.scripts[script])
  33. def _punycode(s):
  34. return s.encode('punycode')
  35. def _unot(s):
  36. return 'U+{0:04X}'.format(s)
  37. def valid_label_length(label):
  38. if len(label) > 63:
  39. return False
  40. return True
  41. def valid_string_length(label, trailing_dot):
  42. if len(label) > (254 if trailing_dot else 253):
  43. return False
  44. return True
  45. def check_bidi(label, check_ltr=False):
  46. # Bidi rules should only be applied if string contains RTL characters
  47. bidi_label = False
  48. for (idx, cp) in enumerate(label, 1):
  49. direction = unicodedata.bidirectional(cp)
  50. if direction == '':
  51. # String likely comes from a newer version of Unicode
  52. raise IDNABidiError('Unknown directionality in label {0} at position {1}'.format(repr(label), idx))
  53. if direction in ['R', 'AL', 'AN']:
  54. bidi_label = True
  55. if not bidi_label and not check_ltr:
  56. return True
  57. # Bidi rule 1
  58. direction = unicodedata.bidirectional(label[0])
  59. if direction in ['R', 'AL']:
  60. rtl = True
  61. elif direction == 'L':
  62. rtl = False
  63. else:
  64. raise IDNABidiError('First codepoint in label {0} must be directionality L, R or AL'.format(repr(label)))
  65. valid_ending = False
  66. number_type = False
  67. for (idx, cp) in enumerate(label, 1):
  68. direction = unicodedata.bidirectional(cp)
  69. if rtl:
  70. # Bidi rule 2
  71. if not direction in ['R', 'AL', 'AN', 'EN', 'ES', 'CS', 'ET', 'ON', 'BN', 'NSM']:
  72. raise IDNABidiError('Invalid direction for codepoint at position {0} in a right-to-left label'.format(idx))
  73. # Bidi rule 3
  74. if direction in ['R', 'AL', 'EN', 'AN']:
  75. valid_ending = True
  76. elif direction != 'NSM':
  77. valid_ending = False
  78. # Bidi rule 4
  79. if direction in ['AN', 'EN']:
  80. if not number_type:
  81. number_type = direction
  82. else:
  83. if number_type != direction:
  84. raise IDNABidiError('Can not mix numeral types in a right-to-left label')
  85. else:
  86. # Bidi rule 5
  87. if not direction in ['L', 'EN', 'ES', 'CS', 'ET', 'ON', 'BN', 'NSM']:
  88. raise IDNABidiError('Invalid direction for codepoint at position {0} in a left-to-right label'.format(idx))
  89. # Bidi rule 6
  90. if direction in ['L', 'EN']:
  91. valid_ending = True
  92. elif direction != 'NSM':
  93. valid_ending = False
  94. if not valid_ending:
  95. raise IDNABidiError('Label ends with illegal codepoint directionality')
  96. return True
  97. def check_initial_combiner(label):
  98. if unicodedata.category(label[0])[0] == 'M':
  99. raise IDNAError('Label begins with an illegal combining character')
  100. return True
  101. def check_hyphen_ok(label):
  102. if label[2:4] == '--':
  103. raise IDNAError('Label has disallowed hyphens in 3rd and 4th position')
  104. if label[0] == '-' or label[-1] == '-':
  105. raise IDNAError('Label must not start or end with a hyphen')
  106. return True
  107. def check_nfc(label):
  108. if unicodedata.normalize('NFC', label) != label:
  109. raise IDNAError('Label must be in Normalization Form C')
  110. def valid_contextj(label, pos):
  111. cp_value = ord(label[pos])
  112. if cp_value == 0x200c:
  113. if pos > 0:
  114. if _combining_class(ord(label[pos - 1])) == _virama_combining_class:
  115. return True
  116. ok = False
  117. for i in range(pos-1, -1, -1):
  118. joining_type = idnadata.joining_types.get(ord(label[i]))
  119. if joining_type == ord('T'):
  120. continue
  121. if joining_type in [ord('L'), ord('D')]:
  122. ok = True
  123. break
  124. if not ok:
  125. return False
  126. ok = False
  127. for i in range(pos+1, len(label)):
  128. joining_type = idnadata.joining_types.get(ord(label[i]))
  129. if joining_type == ord('T'):
  130. continue
  131. if joining_type in [ord('R'), ord('D')]:
  132. ok = True
  133. break
  134. return ok
  135. if cp_value == 0x200d:
  136. if pos > 0:
  137. if _combining_class(ord(label[pos - 1])) == _virama_combining_class:
  138. return True
  139. return False
  140. else:
  141. return False
  142. def valid_contexto(label, pos, exception=False):
  143. cp_value = ord(label[pos])
  144. if cp_value == 0x00b7:
  145. if 0 < pos < len(label)-1:
  146. if ord(label[pos - 1]) == 0x006c and ord(label[pos + 1]) == 0x006c:
  147. return True
  148. return False
  149. elif cp_value == 0x0375:
  150. if pos < len(label)-1 and len(label) > 1:
  151. return _is_script(label[pos + 1], 'Greek')
  152. return False
  153. elif cp_value == 0x05f3 or cp_value == 0x05f4:
  154. if pos > 0:
  155. return _is_script(label[pos - 1], 'Hebrew')
  156. return False
  157. elif cp_value == 0x30fb:
  158. for cp in label:
  159. if cp == u'\u30fb':
  160. continue
  161. if _is_script(cp, 'Hiragana') or _is_script(cp, 'Katakana') or _is_script(cp, 'Han'):
  162. return True
  163. return False
  164. elif 0x660 <= cp_value <= 0x669:
  165. for cp in label:
  166. if 0x6f0 <= ord(cp) <= 0x06f9:
  167. return False
  168. return True
  169. elif 0x6f0 <= cp_value <= 0x6f9:
  170. for cp in label:
  171. if 0x660 <= ord(cp) <= 0x0669:
  172. return False
  173. return True
  174. def check_label(label):
  175. if isinstance(label, (bytes, bytearray)):
  176. label = label.decode('utf-8')
  177. if len(label) == 0:
  178. raise IDNAError('Empty Label')
  179. check_nfc(label)
  180. check_hyphen_ok(label)
  181. check_initial_combiner(label)
  182. for (pos, cp) in enumerate(label):
  183. cp_value = ord(cp)
  184. if intranges_contain(cp_value, idnadata.codepoint_classes['PVALID']):
  185. continue
  186. elif intranges_contain(cp_value, idnadata.codepoint_classes['CONTEXTJ']):
  187. try:
  188. if not valid_contextj(label, pos):
  189. raise InvalidCodepointContext('Joiner {0} not allowed at position {1} in {2}'.format(
  190. _unot(cp_value), pos+1, repr(label)))
  191. except ValueError:
  192. raise IDNAError('Unknown codepoint adjacent to joiner {0} at position {1} in {2}'.format(
  193. _unot(cp_value), pos+1, repr(label)))
  194. elif intranges_contain(cp_value, idnadata.codepoint_classes['CONTEXTO']):
  195. if not valid_contexto(label, pos):
  196. raise InvalidCodepointContext('Codepoint {0} not allowed at position {1} in {2}'.format(_unot(cp_value), pos+1, repr(label)))
  197. else:
  198. raise InvalidCodepoint('Codepoint {0} at position {1} of {2} not allowed'.format(_unot(cp_value), pos+1, repr(label)))
  199. check_bidi(label)
  200. def alabel(label):
  201. try:
  202. label = label.encode('ascii')
  203. try:
  204. ulabel(label)
  205. except IDNAError:
  206. raise IDNAError('The label {0} is not a valid A-label'.format(label))
  207. if not valid_label_length(label):
  208. raise IDNAError('Label too long')
  209. return label
  210. except UnicodeEncodeError:
  211. pass
  212. if not label:
  213. raise IDNAError('No Input')
  214. label = unicode(label)
  215. check_label(label)
  216. label = _punycode(label)
  217. label = _alabel_prefix + label
  218. if not valid_label_length(label):
  219. raise IDNAError('Label too long')
  220. return label
  221. def ulabel(label):
  222. if not isinstance(label, (bytes, bytearray)):
  223. try:
  224. label = label.encode('ascii')
  225. except UnicodeEncodeError:
  226. check_label(label)
  227. return label
  228. label = label.lower()
  229. if label.startswith(_alabel_prefix):
  230. label = label[len(_alabel_prefix):]
  231. else:
  232. check_label(label)
  233. return label.decode('ascii')
  234. label = label.decode('punycode')
  235. check_label(label)
  236. return label
  237. def uts46_remap(domain, std3_rules=True, transitional=False):
  238. """Re-map the characters in the string according to UTS46 processing."""
  239. from .uts46data import uts46data
  240. output = u""
  241. try:
  242. for pos, char in enumerate(domain):
  243. code_point = ord(char)
  244. uts46row = uts46data[code_point if code_point < 256 else
  245. bisect.bisect_left(uts46data, (code_point, "Z")) - 1]
  246. status = uts46row[1]
  247. replacement = uts46row[2] if len(uts46row) == 3 else None
  248. if (status == "V" or
  249. (status == "D" and not transitional) or
  250. (status == "3" and not std3_rules and replacement is None)):
  251. output += char
  252. elif replacement is not None and (status == "M" or
  253. (status == "3" and not std3_rules) or
  254. (status == "D" and transitional)):
  255. output += replacement
  256. elif status != "I":
  257. raise IndexError()
  258. return unicodedata.normalize("NFC", output)
  259. except IndexError:
  260. raise InvalidCodepoint(
  261. "Codepoint {0} not allowed at position {1} in {2}".format(
  262. _unot(code_point), pos + 1, repr(domain)))
  263. def encode(s, strict=False, uts46=False, std3_rules=False, transitional=False):
  264. if isinstance(s, (bytes, bytearray)):
  265. s = s.decode("ascii")
  266. if uts46:
  267. s = uts46_remap(s, std3_rules, transitional)
  268. trailing_dot = False
  269. result = []
  270. if strict:
  271. labels = s.split('.')
  272. else:
  273. labels = _unicode_dots_re.split(s)
  274. if not labels or labels == ['']:
  275. raise IDNAError('Empty domain')
  276. if labels[-1] == '':
  277. del labels[-1]
  278. trailing_dot = True
  279. for label in labels:
  280. s = alabel(label)
  281. if s:
  282. result.append(s)
  283. else:
  284. raise IDNAError('Empty label')
  285. if trailing_dot:
  286. result.append(b'')
  287. s = b'.'.join(result)
  288. if not valid_string_length(s, trailing_dot):
  289. raise IDNAError('Domain too long')
  290. return s
  291. def decode(s, strict=False, uts46=False, std3_rules=False):
  292. if isinstance(s, (bytes, bytearray)):
  293. s = s.decode("ascii")
  294. if uts46:
  295. s = uts46_remap(s, std3_rules, False)
  296. trailing_dot = False
  297. result = []
  298. if not strict:
  299. labels = _unicode_dots_re.split(s)
  300. else:
  301. labels = s.split(u'.')
  302. if not labels or labels == ['']:
  303. raise IDNAError('Empty domain')
  304. if not labels[-1]:
  305. del labels[-1]
  306. trailing_dot = True
  307. for label in labels:
  308. s = ulabel(label)
  309. if s:
  310. result.append(s)
  311. else:
  312. raise IDNAError('Empty label')
  313. if trailing_dot:
  314. result.append(u'')
  315. return u'.'.join(result)