You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

154 lines
4.4 KiB

4 years ago
  1. # coding: utf8
  2. from __future__ import unicode_literals
  3. IDS = {
  4. "": NULL_ATTR,
  5. "IS_ALPHA": IS_ALPHA,
  6. "IS_ASCII": IS_ASCII,
  7. "IS_DIGIT": IS_DIGIT,
  8. "IS_LOWER": IS_LOWER,
  9. "IS_PUNCT": IS_PUNCT,
  10. "IS_SPACE": IS_SPACE,
  11. "IS_TITLE": IS_TITLE,
  12. "IS_UPPER": IS_UPPER,
  13. "LIKE_URL": LIKE_URL,
  14. "LIKE_NUM": LIKE_NUM,
  15. "LIKE_EMAIL": LIKE_EMAIL,
  16. "IS_STOP": IS_STOP,
  17. "IS_OOV": IS_OOV,
  18. "IS_BRACKET": IS_BRACKET,
  19. "IS_QUOTE": IS_QUOTE,
  20. "IS_LEFT_PUNCT": IS_LEFT_PUNCT,
  21. "IS_RIGHT_PUNCT": IS_RIGHT_PUNCT,
  22. "IS_CURRENCY": IS_CURRENCY,
  23. "FLAG19": FLAG19,
  24. "FLAG20": FLAG20,
  25. "FLAG21": FLAG21,
  26. "FLAG22": FLAG22,
  27. "FLAG23": FLAG23,
  28. "FLAG24": FLAG24,
  29. "FLAG25": FLAG25,
  30. "FLAG26": FLAG26,
  31. "FLAG27": FLAG27,
  32. "FLAG28": FLAG28,
  33. "FLAG29": FLAG29,
  34. "FLAG30": FLAG30,
  35. "FLAG31": FLAG31,
  36. "FLAG32": FLAG32,
  37. "FLAG33": FLAG33,
  38. "FLAG34": FLAG34,
  39. "FLAG35": FLAG35,
  40. "FLAG36": FLAG36,
  41. "FLAG37": FLAG37,
  42. "FLAG38": FLAG38,
  43. "FLAG39": FLAG39,
  44. "FLAG40": FLAG40,
  45. "FLAG41": FLAG41,
  46. "FLAG42": FLAG42,
  47. "FLAG43": FLAG43,
  48. "FLAG44": FLAG44,
  49. "FLAG45": FLAG45,
  50. "FLAG46": FLAG46,
  51. "FLAG47": FLAG47,
  52. "FLAG48": FLAG48,
  53. "FLAG49": FLAG49,
  54. "FLAG50": FLAG50,
  55. "FLAG51": FLAG51,
  56. "FLAG52": FLAG52,
  57. "FLAG53": FLAG53,
  58. "FLAG54": FLAG54,
  59. "FLAG55": FLAG55,
  60. "FLAG56": FLAG56,
  61. "FLAG57": FLAG57,
  62. "FLAG58": FLAG58,
  63. "FLAG59": FLAG59,
  64. "FLAG60": FLAG60,
  65. "FLAG61": FLAG61,
  66. "FLAG62": FLAG62,
  67. "FLAG63": FLAG63,
  68. "ID": ID,
  69. "ORTH": ORTH,
  70. "LOWER": LOWER,
  71. "NORM": NORM,
  72. "SHAPE": SHAPE,
  73. "PREFIX": PREFIX,
  74. "SUFFIX": SUFFIX,
  75. "LENGTH": LENGTH,
  76. "CLUSTER": CLUSTER,
  77. "LEMMA": LEMMA,
  78. "POS": POS,
  79. "TAG": TAG,
  80. "DEP": DEP,
  81. "ENT_IOB": ENT_IOB,
  82. "ENT_TYPE": ENT_TYPE,
  83. "HEAD": HEAD,
  84. "SENT_START": SENT_START,
  85. "SPACY": SPACY,
  86. "PROB": PROB,
  87. "LANG": LANG,
  88. }
  89. # ATTR IDs, in order of the symbol
  90. NAMES = [key for key, value in sorted(IDS.items(), key=lambda item: item[1])]
  91. locals().update(IDS)
  92. def intify_attrs(stringy_attrs, strings_map=None, _do_deprecated=False):
  93. """
  94. Normalize a dictionary of attributes, converting them to ints.
  95. stringy_attrs (dict): Dictionary keyed by attribute string names. Values
  96. can be ints or strings.
  97. strings_map (StringStore): Defaults to None. If provided, encodes string
  98. values into ints.
  99. RETURNS (dict): Attributes dictionary with keys and optionally values
  100. converted to ints.
  101. """
  102. inty_attrs = {}
  103. if _do_deprecated:
  104. if 'F' in stringy_attrs:
  105. stringy_attrs["ORTH"] = stringy_attrs.pop("F")
  106. if 'L' in stringy_attrs:
  107. stringy_attrs["LEMMA"] = stringy_attrs.pop("L")
  108. if 'pos' in stringy_attrs:
  109. stringy_attrs["TAG"] = stringy_attrs.pop("pos")
  110. if 'morph' in stringy_attrs:
  111. morphs = stringy_attrs.pop('morph')
  112. if 'number' in stringy_attrs:
  113. stringy_attrs.pop('number')
  114. if 'tenspect' in stringy_attrs:
  115. stringy_attrs.pop('tenspect')
  116. morph_keys = [
  117. 'PunctType', 'PunctSide', 'Other', 'Degree', 'AdvType', 'Number',
  118. 'VerbForm', 'PronType', 'Aspect', 'Tense', 'PartType', 'Poss',
  119. 'Hyph', 'ConjType', 'NumType', 'Foreign', 'VerbType', 'NounType',
  120. 'Gender', 'Mood', 'Negative', 'Tense', 'Voice', 'Abbr',
  121. 'Derivation', 'Echo', 'Foreign', 'NameType', 'NounType', 'NumForm',
  122. 'NumValue', 'PartType', 'Polite', 'StyleVariant',
  123. 'PronType', 'AdjType', 'Person', 'Variant', 'AdpType',
  124. 'Reflex', 'Negative', 'Mood', 'Aspect', 'Case',
  125. 'Polarity', 'Animacy' # U20
  126. ]
  127. for key in morph_keys:
  128. if key in stringy_attrs:
  129. stringy_attrs.pop(key)
  130. elif key.lower() in stringy_attrs:
  131. stringy_attrs.pop(key.lower())
  132. elif key.upper() in stringy_attrs:
  133. stringy_attrs.pop(key.upper())
  134. for name, value in stringy_attrs.items():
  135. if isinstance(name, int):
  136. int_key = name
  137. else:
  138. int_key = IDS[name.upper()]
  139. if strings_map is not None and isinstance(value, basestring):
  140. if hasattr(strings_map, 'add'):
  141. value = strings_map.add(value)
  142. else:
  143. value = strings_map[value]
  144. inty_attrs[int_key] = value
  145. return inty_attrs