You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

478 lines
14 KiB

4 years ago
  1. # coding: utf8
  2. #cython: optimize.unpack_method_calls=False
  3. from __future__ import unicode_literals
  4. IDS = {
  5. "": NIL,
  6. "IS_ALPHA": IS_ALPHA,
  7. "IS_ASCII": IS_ASCII,
  8. "IS_DIGIT": IS_DIGIT,
  9. "IS_LOWER": IS_LOWER,
  10. "IS_PUNCT": IS_PUNCT,
  11. "IS_SPACE": IS_SPACE,
  12. "IS_TITLE": IS_TITLE,
  13. "IS_UPPER": IS_UPPER,
  14. "LIKE_URL": LIKE_URL,
  15. "LIKE_NUM": LIKE_NUM,
  16. "LIKE_EMAIL": LIKE_EMAIL,
  17. "IS_STOP": IS_STOP,
  18. "IS_OOV": IS_OOV,
  19. "IS_BRACKET": IS_BRACKET,
  20. "IS_QUOTE": IS_QUOTE,
  21. "IS_LEFT_PUNCT": IS_LEFT_PUNCT,
  22. "IS_RIGHT_PUNCT": IS_RIGHT_PUNCT,
  23. "IS_CURRENCY": IS_CURRENCY,
  24. "FLAG19": FLAG19,
  25. "FLAG20": FLAG20,
  26. "FLAG21": FLAG21,
  27. "FLAG22": FLAG22,
  28. "FLAG23": FLAG23,
  29. "FLAG24": FLAG24,
  30. "FLAG25": FLAG25,
  31. "FLAG26": FLAG26,
  32. "FLAG27": FLAG27,
  33. "FLAG28": FLAG28,
  34. "FLAG29": FLAG29,
  35. "FLAG30": FLAG30,
  36. "FLAG31": FLAG31,
  37. "FLAG32": FLAG32,
  38. "FLAG33": FLAG33,
  39. "FLAG34": FLAG34,
  40. "FLAG35": FLAG35,
  41. "FLAG36": FLAG36,
  42. "FLAG37": FLAG37,
  43. "FLAG38": FLAG38,
  44. "FLAG39": FLAG39,
  45. "FLAG40": FLAG40,
  46. "FLAG41": FLAG41,
  47. "FLAG42": FLAG42,
  48. "FLAG43": FLAG43,
  49. "FLAG44": FLAG44,
  50. "FLAG45": FLAG45,
  51. "FLAG46": FLAG46,
  52. "FLAG47": FLAG47,
  53. "FLAG48": FLAG48,
  54. "FLAG49": FLAG49,
  55. "FLAG50": FLAG50,
  56. "FLAG51": FLAG51,
  57. "FLAG52": FLAG52,
  58. "FLAG53": FLAG53,
  59. "FLAG54": FLAG54,
  60. "FLAG55": FLAG55,
  61. "FLAG56": FLAG56,
  62. "FLAG57": FLAG57,
  63. "FLAG58": FLAG58,
  64. "FLAG59": FLAG59,
  65. "FLAG60": FLAG60,
  66. "FLAG61": FLAG61,
  67. "FLAG62": FLAG62,
  68. "FLAG63": FLAG63,
  69. "ID": ID,
  70. "ORTH": ORTH,
  71. "LOWER": LOWER,
  72. "NORM": NORM,
  73. "SHAPE": SHAPE,
  74. "PREFIX": PREFIX,
  75. "SUFFIX": SUFFIX,
  76. "LENGTH": LENGTH,
  77. "CLUSTER": CLUSTER,
  78. "LEMMA": LEMMA,
  79. "POS": POS,
  80. "TAG": TAG,
  81. "DEP": DEP,
  82. "ENT_IOB": ENT_IOB,
  83. "ENT_TYPE": ENT_TYPE,
  84. "HEAD": HEAD,
  85. "SENT_START": SENT_START,
  86. "SPACY": SPACY,
  87. "PROB": PROB,
  88. "LANG": LANG,
  89. "ADJ": ADJ,
  90. "ADP": ADP,
  91. "ADV": ADV,
  92. "AUX": AUX,
  93. "CONJ": CONJ,
  94. "CCONJ": CCONJ, # U20
  95. "DET": DET,
  96. "INTJ": INTJ,
  97. "NOUN": NOUN,
  98. "NUM": NUM,
  99. "PART": PART,
  100. "PRON": PRON,
  101. "PROPN": PROPN,
  102. "PUNCT": PUNCT,
  103. "SCONJ": SCONJ,
  104. "SYM": SYM,
  105. "VERB": VERB,
  106. "X": X,
  107. "EOL": EOL,
  108. "SPACE": SPACE,
  109. "Animacy_anim": Animacy_anim,
  110. "Animacy_inam": Animacy_inam,
  111. "Animacy_hum": Animacy_hum, # U20
  112. "Aspect_freq": Aspect_freq,
  113. "Aspect_imp": Aspect_imp,
  114. "Aspect_mod": Aspect_mod,
  115. "Aspect_none": Aspect_none,
  116. "Aspect_perf": Aspect_perf,
  117. "Aspect_iter": Aspect_iter, # U20
  118. "Aspect_hab": Aspect_hab, # U20
  119. "Case_abe": Case_abe,
  120. "Case_abl": Case_abl,
  121. "Case_abs": Case_abs,
  122. "Case_acc": Case_acc,
  123. "Case_ade": Case_ade,
  124. "Case_all": Case_all,
  125. "Case_cau": Case_cau,
  126. "Case_com": Case_com,
  127. "Case_cmp": Case_cmp, # U20
  128. "Case_dat": Case_dat,
  129. "Case_del": Case_del,
  130. "Case_dis": Case_dis,
  131. "Case_ela": Case_ela,
  132. "Case_equ": Case_equ, # U20
  133. "Case_ess": Case_ess,
  134. "Case_gen": Case_gen,
  135. "Case_ill": Case_ill,
  136. "Case_ine": Case_ine,
  137. "Case_ins": Case_ins,
  138. "Case_loc": Case_loc,
  139. "Case_lat": Case_lat,
  140. "Case_nom": Case_nom,
  141. "Case_par": Case_par,
  142. "Case_sub": Case_sub,
  143. "Case_sup": Case_sup,
  144. "Case_tem": Case_tem,
  145. "Case_ter": Case_ter,
  146. "Case_tra": Case_tra,
  147. "Case_voc": Case_voc,
  148. "Definite_two": Definite_two,
  149. "Definite_def": Definite_def,
  150. "Definite_red": Definite_red,
  151. "Definite_cons": Definite_cons, # U20
  152. "Definite_ind": Definite_ind,
  153. "Definite_spec": Definite_spec, # U20
  154. "Degree_cmp": Degree_cmp,
  155. "Degree_comp": Degree_comp,
  156. "Degree_none": Degree_none,
  157. "Degree_pos": Degree_pos,
  158. "Degree_sup": Degree_sup,
  159. "Degree_abs": Degree_abs,
  160. "Degree_com": Degree_com,
  161. "Degree_dim": Degree_dim, # du
  162. "Degree_equ": Degree_equ, # U20
  163. "Evident_nfh": Evident_nfh, # U20
  164. "Gender_com": Gender_com,
  165. "Gender_fem": Gender_fem,
  166. "Gender_masc": Gender_masc,
  167. "Gender_neut": Gender_neut,
  168. "Mood_cnd": Mood_cnd,
  169. "Mood_imp": Mood_imp,
  170. "Mood_ind": Mood_ind,
  171. "Mood_n": Mood_n,
  172. "Mood_pot": Mood_pot,
  173. "Mood_sub": Mood_sub,
  174. "Mood_opt": Mood_opt,
  175. "Mood_prp": Mood_prp, # U20
  176. "Mood_adm": Mood_adm, # U20
  177. "Negative_neg": Negative_neg,
  178. "Negative_pos": Negative_pos,
  179. "Negative_yes": Negative_yes,
  180. "Polarity_neg": Polarity_neg, # U20
  181. "Polarity_pos": Polarity_pos, # U20
  182. "Number_com": Number_com,
  183. "Number_dual": Number_dual,
  184. "Number_none": Number_none,
  185. "Number_plur": Number_plur,
  186. "Number_sing": Number_sing,
  187. "Number_ptan": Number_ptan, # bg
  188. "Number_count": Number_count, # bg, U20
  189. "Number_tri": Number_tri, # U20
  190. "NumType_card": NumType_card,
  191. "NumType_dist": NumType_dist,
  192. "NumType_frac": NumType_frac,
  193. "NumType_gen": NumType_gen,
  194. "NumType_mult": NumType_mult,
  195. "NumType_none": NumType_none,
  196. "NumType_ord": NumType_ord,
  197. "NumType_sets": NumType_sets,
  198. "Person_one": Person_one,
  199. "Person_two": Person_two,
  200. "Person_three": Person_three,
  201. "Person_none": Person_none,
  202. "Poss_yes": Poss_yes,
  203. "PronType_advPart": PronType_advPart,
  204. "PronType_art": PronType_art,
  205. "PronType_default": PronType_default,
  206. "PronType_dem": PronType_dem,
  207. "PronType_ind": PronType_ind,
  208. "PronType_int": PronType_int,
  209. "PronType_neg": PronType_neg,
  210. "PronType_prs": PronType_prs,
  211. "PronType_rcp": PronType_rcp,
  212. "PronType_rel": PronType_rel,
  213. "PronType_tot": PronType_tot,
  214. "PronType_clit": PronType_clit,
  215. "PronType_exc": PronType_exc, # es, ca, it, fa, U20
  216. "PronType_emp": PronType_emp, # U20
  217. "Reflex_yes": Reflex_yes,
  218. "Tense_fut": Tense_fut,
  219. "Tense_imp": Tense_imp,
  220. "Tense_past": Tense_past,
  221. "Tense_pres": Tense_pres,
  222. "VerbForm_fin": VerbForm_fin,
  223. "VerbForm_ger": VerbForm_ger,
  224. "VerbForm_inf": VerbForm_inf,
  225. "VerbForm_none": VerbForm_none,
  226. "VerbForm_part": VerbForm_part,
  227. "VerbForm_partFut": VerbForm_partFut,
  228. "VerbForm_partPast": VerbForm_partPast,
  229. "VerbForm_partPres": VerbForm_partPres,
  230. "VerbForm_sup": VerbForm_sup,
  231. "VerbForm_trans": VerbForm_trans,
  232. "VerbForm_conv": VerbForm_conv, # U20
  233. "VerbForm_gdv": VerbForm_gdv, # la,
  234. "VerbForm_vnoun": VerbForm_vnoun, # U20
  235. "Voice_act": Voice_act,
  236. "Voice_cau": Voice_cau,
  237. "Voice_pass": Voice_pass,
  238. "Voice_mid": Voice_mid, # gkc, U20
  239. "Voice_int": Voice_int, # hb,
  240. "Voice_antip": Voice_antip, # U20
  241. "Voice_dir": Voice_dir, # U20
  242. "Voice_inv": Voice_inv, # U20
  243. "Abbr_yes": Abbr_yes, # cz, fi, sl, U,
  244. "AdpType_prep": AdpType_prep, # cz, U,
  245. "AdpType_post": AdpType_post, # U,
  246. "AdpType_voc": AdpType_voc, # cz,
  247. "AdpType_comprep": AdpType_comprep, # cz,
  248. "AdpType_circ": AdpType_circ, # U,
  249. "AdvType_man": AdvType_man,
  250. "AdvType_loc": AdvType_loc,
  251. "AdvType_tim": AdvType_tim,
  252. "AdvType_deg": AdvType_deg,
  253. "AdvType_cau": AdvType_cau,
  254. "AdvType_mod": AdvType_mod,
  255. "AdvType_sta": AdvType_sta,
  256. "AdvType_ex": AdvType_ex,
  257. "AdvType_adadj": AdvType_adadj,
  258. "ConjType_oper": ConjType_oper, # cz, U,
  259. "ConjType_comp": ConjType_comp, # cz, U,
  260. "Connegative_yes": Connegative_yes, # fi,
  261. "Derivation_minen": Derivation_minen, # fi,
  262. "Derivation_sti": Derivation_sti, # fi,
  263. "Derivation_inen": Derivation_inen, # fi,
  264. "Derivation_lainen": Derivation_lainen, # fi,
  265. "Derivation_ja": Derivation_ja, # fi,
  266. "Derivation_ton": Derivation_ton, # fi,
  267. "Derivation_vs": Derivation_vs, # fi,
  268. "Derivation_ttain": Derivation_ttain, # fi,
  269. "Derivation_ttaa": Derivation_ttaa, # fi,
  270. "Echo_rdp": Echo_rdp, # U,
  271. "Echo_ech": Echo_ech, # U,
  272. "Foreign_foreign": Foreign_foreign, # cz, fi, U,
  273. "Foreign_fscript": Foreign_fscript, # cz, fi, U,
  274. "Foreign_tscript": Foreign_tscript, # cz, U,
  275. "Foreign_yes": Foreign_yes, # sl,
  276. "Gender_dat_masc": Gender_dat_masc, # bq, U,
  277. "Gender_dat_fem": Gender_dat_fem, # bq, U,
  278. "Gender_erg_masc": Gender_erg_masc, # bq,
  279. "Gender_erg_fem": Gender_erg_fem, # bq,
  280. "Gender_psor_masc": Gender_psor_masc, # cz, sl, U,
  281. "Gender_psor_fem": Gender_psor_fem, # cz, sl, U,
  282. "Gender_psor_neut": Gender_psor_neut, # sl,
  283. "Hyph_yes": Hyph_yes, # cz, U,
  284. "InfForm_one": InfForm_one, # fi,
  285. "InfForm_two": InfForm_two, # fi,
  286. "InfForm_three": InfForm_three, # fi,
  287. "NameType_geo": NameType_geo, # U, cz,
  288. "NameType_prs": NameType_prs, # U, cz,
  289. "NameType_giv": NameType_giv, # U, cz,
  290. "NameType_sur": NameType_sur, # U, cz,
  291. "NameType_nat": NameType_nat, # U, cz,
  292. "NameType_com": NameType_com, # U, cz,
  293. "NameType_pro": NameType_pro, # U, cz,
  294. "NameType_oth": NameType_oth, # U, cz,
  295. "NounType_com": NounType_com, # U,
  296. "NounType_prop": NounType_prop, # U,
  297. "NounType_class": NounType_class, # U,
  298. "Number_abs_sing": Number_abs_sing, # bq, U,
  299. "Number_abs_plur": Number_abs_plur, # bq, U,
  300. "Number_dat_sing": Number_dat_sing, # bq, U,
  301. "Number_dat_plur": Number_dat_plur, # bq, U,
  302. "Number_erg_sing": Number_erg_sing, # bq, U,
  303. "Number_erg_plur": Number_erg_plur, # bq, U,
  304. "Number_psee_sing": Number_psee_sing, # U,
  305. "Number_psee_plur": Number_psee_plur, # U,
  306. "Number_psor_sing": Number_psor_sing, # cz, fi, sl, U,
  307. "Number_psor_plur": Number_psor_plur, # cz, fi, sl, U,
  308. "Number_pauc": Number_pauc, # U20
  309. "Number_grpa": Number_grpa, # U20
  310. "Number_grpl": Number_grpl, # U20
  311. "Number_inv": Number_inv, # U20
  312. "NumForm_digit": NumForm_digit, # cz, sl, U,
  313. "NumForm_roman": NumForm_roman, # cz, sl, U,
  314. "NumForm_word": NumForm_word, # cz, sl, U,
  315. "NumValue_one": NumValue_one, # cz, U,
  316. "NumValue_two": NumValue_two, # cz, U,
  317. "NumValue_three": NumValue_three, # cz, U,
  318. "PartForm_pres": PartForm_pres, # fi,
  319. "PartForm_past": PartForm_past, # fi,
  320. "PartForm_agt": PartForm_agt, # fi,
  321. "PartForm_neg": PartForm_neg, # fi,
  322. "PartType_mod": PartType_mod, # U,
  323. "PartType_emp": PartType_emp, # U,
  324. "PartType_res": PartType_res, # U,
  325. "PartType_inf": PartType_inf, # U,
  326. "PartType_vbp": PartType_vbp, # U,
  327. "Person_abs_one": Person_abs_one, # bq, U,
  328. "Person_abs_two": Person_abs_two, # bq, U,
  329. "Person_abs_three": Person_abs_three, # bq, U,
  330. "Person_dat_one": Person_dat_one, # bq, U,
  331. "Person_dat_two": Person_dat_two, # bq, U,
  332. "Person_dat_three": Person_dat_three, # bq, U,
  333. "Person_erg_one": Person_erg_one, # bq, U,
  334. "Person_erg_two": Person_erg_two, # bq, U,
  335. "Person_erg_three": Person_erg_three, # bq, U,
  336. "Person_psor_one": Person_psor_one, # fi, U,
  337. "Person_psor_two": Person_psor_two, # fi, U,
  338. "Person_psor_three": Person_psor_three, # fi, U,
  339. "Person_zero": Person_zero, # U20
  340. "Person_four": Person_four, # U20
  341. "Polite_inf": Polite_inf, # bq, U,
  342. "Polite_pol": Polite_pol, # bq, U,
  343. "Polite_abs_inf": Polite_abs_inf, # bq, U,
  344. "Polite_abs_pol": Polite_abs_pol, # bq, U,
  345. "Polite_erg_inf": Polite_erg_inf, # bq, U,
  346. "Polite_erg_pol": Polite_erg_pol, # bq, U,
  347. "Polite_dat_inf": Polite_dat_inf, # bq, U,
  348. "Polite_dat_pol": Polite_dat_pol, # bq, U,
  349. "Polite_infm": Polite_infm, # U20
  350. "Polite_form": Polite_form, # U20
  351. "Polite_form_elev": Polite_form_elev, # U20
  352. "Polite_form_humb": Polite_form_humb, # U20
  353. "Prefix_yes": Prefix_yes, # U,
  354. "PrepCase_npr": PrepCase_npr, # cz,
  355. "PrepCase_pre": PrepCase_pre, # U,
  356. "PunctSide_ini": PunctSide_ini, # U,
  357. "PunctSide_fin": PunctSide_fin, # U,
  358. "PunctType_peri": PunctType_peri, # U,
  359. "PunctType_qest": PunctType_qest, # U,
  360. "PunctType_excl": PunctType_excl, # U,
  361. "PunctType_quot": PunctType_quot, # U,
  362. "PunctType_brck": PunctType_brck, # U,
  363. "PunctType_comm": PunctType_comm, # U,
  364. "PunctType_colo": PunctType_colo, # U,
  365. "PunctType_semi": PunctType_semi, # U,
  366. "PunctType_dash": PunctType_dash, # U,
  367. "Style_arch": Style_arch, # cz, fi, U,
  368. "Style_rare": Style_rare, # cz, fi, U,
  369. "Style_poet": Style_poet, # cz, U,
  370. "Style_norm": Style_norm, # cz, U,
  371. "Style_coll": Style_coll, # cz, U,
  372. "Style_vrnc": Style_vrnc, # cz, U,
  373. "Style_sing": Style_sing, # cz, U,
  374. "Style_expr": Style_expr, # cz, U,
  375. "Style_derg": Style_derg, # cz, U,
  376. "Style_vulg": Style_vulg, # cz, U,
  377. "Style_yes": Style_yes, # fi, U,
  378. "StyleVariant_styleShort": StyleVariant_styleShort, # cz,
  379. "StyleVariant_styleBound": StyleVariant_styleBound, # cz, sl,
  380. "VerbType_aux": VerbType_aux, # U,
  381. "VerbType_cop": VerbType_cop, # U,
  382. "VerbType_mod": VerbType_mod, # U,
  383. "VerbType_light": VerbType_light, # U,
  384. "PERSON": PERSON,
  385. "NORP": NORP,
  386. "FACILITY": FACILITY,
  387. "ORG": ORG,
  388. "GPE": GPE,
  389. "LOC": LOC,
  390. "PRODUCT": PRODUCT,
  391. "EVENT": EVENT,
  392. "WORK_OF_ART": WORK_OF_ART,
  393. "LANGUAGE": LANGUAGE,
  394. "DATE": DATE,
  395. "TIME": TIME,
  396. "PERCENT": PERCENT,
  397. "MONEY": MONEY,
  398. "QUANTITY": QUANTITY,
  399. "ORDINAL": ORDINAL,
  400. "CARDINAL": CARDINAL,
  401. "acomp": acomp,
  402. "advcl": advcl,
  403. "advmod": advmod,
  404. "agent": agent,
  405. "amod": amod,
  406. "appos": appos,
  407. "attr": attr,
  408. "aux": aux,
  409. "auxpass": auxpass,
  410. "cc": cc,
  411. "ccomp": ccomp,
  412. "complm": complm,
  413. "conj": conj,
  414. "cop": cop, # U20
  415. "csubj": csubj,
  416. "csubjpass": csubjpass,
  417. "dep": dep,
  418. "det": det,
  419. "dobj": dobj,
  420. "expl": expl,
  421. "hmod": hmod,
  422. "hyph": hyph,
  423. "infmod": infmod,
  424. "intj": intj,
  425. "iobj": iobj,
  426. "mark": mark,
  427. "meta": meta,
  428. "neg": neg,
  429. "nmod": nmod,
  430. "nn": nn,
  431. "npadvmod": npadvmod,
  432. "nsubj": nsubj,
  433. "nsubjpass": nsubjpass,
  434. "num": num,
  435. "number": number,
  436. "oprd": oprd,
  437. "obj": obj, # U20
  438. "obl": obl, # U20
  439. "parataxis": parataxis,
  440. "partmod": partmod,
  441. "pcomp": pcomp,
  442. "pobj": pobj,
  443. "poss": poss,
  444. "possessive": possessive,
  445. "preconj": preconj,
  446. "prep": prep,
  447. "prt": prt,
  448. "punct": punct,
  449. "quantmod": quantmod,
  450. "rcmod": rcmod,
  451. "root": root,
  452. "xcomp": xcomp,
  453. "acl": acl,
  454. "LAW": LAW,
  455. }
  456. def sort_nums(x):
  457. return x[1]
  458. PRON_LEMMA = "-PRON-"
  459. NAMES = [it[0] for it in sorted(IDS.items(), key=sort_nums)]
  460. # Unfortunate hack here, to work around problem with long cpdef enum
  461. # (which is generating an enormous amount of C++ in Cython 0.24+)
  462. # We keep the enum cdef, and just make sure the names are available to Python
  463. locals().update(IDS)