You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

315 lines
12 KiB

4 years ago
  1. # coding: utf8
  2. from __future__ import unicode_literals
  3. def explain(term):
  4. """Get a description for a given POS tag, dependency label or entity type.
  5. term (unicode): The term to explain.
  6. RETURNS (unicode): The explanation, or `None` if not found in the glossary.
  7. EXAMPLE:
  8. >>> spacy.explain(u'NORP')
  9. >>> doc = nlp(u'Hello world')
  10. >>> print([w.text, w.tag_, spacy.explain(w.tag_) for w in doc])
  11. """
  12. if term in GLOSSARY:
  13. return GLOSSARY[term]
  14. GLOSSARY = {
  15. # POS tags
  16. # Universal POS Tags
  17. # http://universaldependencies.org/u/pos/
  18. 'ADJ': 'adjective',
  19. 'ADP': 'adposition',
  20. 'ADV': 'adverb',
  21. 'AUX': 'auxiliary',
  22. 'CONJ': 'conjunction',
  23. 'CCONJ': 'coordinating conjunction',
  24. 'DET': 'determiner',
  25. 'INTJ': 'interjection',
  26. 'NOUN': 'noun',
  27. 'NUM': 'numeral',
  28. 'PART': 'particle',
  29. 'PRON': 'pronoun',
  30. 'PROPN': 'proper noun',
  31. 'PUNCT': 'punctuation',
  32. 'SCONJ': 'subordinating conjunction',
  33. 'SYM': 'symbol',
  34. 'VERB': 'verb',
  35. 'X': 'other',
  36. 'EOL': 'end of line',
  37. 'SPACE': 'space',
  38. # POS tags (English)
  39. # OntoNotes 5 / Penn Treebank
  40. # https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html
  41. '.': 'punctuation mark, sentence closer',
  42. ',': 'punctuation mark, comma',
  43. '-LRB-': 'left round bracket',
  44. '-RRB-': 'right round bracket',
  45. '``': 'opening quotation mark',
  46. '""': 'closing quotation mark',
  47. "''": 'closing quotation mark',
  48. ':': 'punctuation mark, colon or ellipsis',
  49. '$': 'symbol, currency',
  50. '#': 'symbol, number sign',
  51. 'AFX': 'affix',
  52. 'CC': 'conjunction, coordinating',
  53. 'CD': 'cardinal number',
  54. 'DT': 'determiner',
  55. 'EX': 'existential there',
  56. 'FW': 'foreign word',
  57. 'HYPH': 'punctuation mark, hyphen',
  58. 'IN': 'conjunction, subordinating or preposition',
  59. 'JJ': 'adjective',
  60. 'JJR': 'adjective, comparative',
  61. 'JJS': 'adjective, superlative',
  62. 'LS': 'list item marker',
  63. 'MD': 'verb, modal auxiliary',
  64. 'NIL': 'missing tag',
  65. 'NN': 'noun, singular or mass',
  66. 'NNP': 'noun, proper singular',
  67. 'NNPS': 'noun, proper plural',
  68. 'NNS': 'noun, plural',
  69. 'PDT': 'predeterminer',
  70. 'POS': 'possessive ending',
  71. 'PRP': 'pronoun, personal',
  72. 'PRP$': 'pronoun, possessive',
  73. 'RB': 'adverb',
  74. 'RBR': 'adverb, comparative',
  75. 'RBS': 'adverb, superlative',
  76. 'RP': 'adverb, particle',
  77. 'TO': 'infinitival to',
  78. 'UH': 'interjection',
  79. 'VB': 'verb, base form',
  80. 'VBD': 'verb, past tense',
  81. 'VBG': 'verb, gerund or present participle',
  82. 'VBN': 'verb, past participle',
  83. 'VBP': 'verb, non-3rd person singular present',
  84. 'VBZ': 'verb, 3rd person singular present',
  85. 'WDT': 'wh-determiner',
  86. 'WP': 'wh-pronoun, personal',
  87. 'WP$': 'wh-pronoun, possessive',
  88. 'WRB': 'wh-adverb',
  89. 'SP': 'space',
  90. 'ADD': 'email',
  91. 'NFP': 'superfluous punctuation',
  92. 'GW': 'additional word in multi-word expression',
  93. 'XX': 'unknown',
  94. 'BES': 'auxiliary "be"',
  95. 'HVS': 'forms of "have"',
  96. # POS Tags (German)
  97. # TIGER Treebank
  98. # http://www.ims.uni-stuttgart.de/forschung/ressourcen/korpora/TIGERCorpus/annotation/tiger_introduction.pdf
  99. '$(': 'other sentence-internal punctuation mark',
  100. '$,': 'comma',
  101. '$.': 'sentence-final punctuation mark',
  102. 'ADJA': 'adjective, attributive',
  103. 'ADJD': 'adjective, adverbial or predicative',
  104. 'APPO': 'postposition',
  105. 'APPR': 'preposition; circumposition left',
  106. 'APPRART': 'preposition with article',
  107. 'APZR': 'circumposition right',
  108. 'ART': 'definite or indefinite article',
  109. 'CARD': 'cardinal number',
  110. 'FM': 'foreign language material',
  111. 'ITJ': 'interjection',
  112. 'KOKOM': 'comparative conjunction',
  113. 'KON': 'coordinate conjunction',
  114. 'KOUI': 'subordinate conjunction with "zu" and infinitive',
  115. 'KOUS': 'subordinate conjunction with sentence',
  116. 'NE': 'proper noun',
  117. 'NNE': 'proper noun',
  118. 'PAV': 'pronominal adverb',
  119. 'PROAV': 'pronominal adverb',
  120. 'PDAT': 'attributive demonstrative pronoun',
  121. 'PDS': 'substituting demonstrative pronoun',
  122. 'PIAT': 'attributive indefinite pronoun without determiner',
  123. 'PIDAT': 'attributive indefinite pronoun with determiner',
  124. 'PIS': 'substituting indefinite pronoun',
  125. 'PPER': 'non-reflexive personal pronoun',
  126. 'PPOSAT': 'attributive possessive pronoun',
  127. 'PPOSS': 'substituting possessive pronoun',
  128. 'PRELAT': 'attributive relative pronoun',
  129. 'PRELS': 'substituting relative pronoun',
  130. 'PRF': 'reflexive personal pronoun',
  131. 'PTKA': 'particle with adjective or adverb',
  132. 'PTKANT': 'answer particle',
  133. 'PTKNEG': 'negative particle',
  134. 'PTKVZ': 'separable verbal particle',
  135. 'PTKZU': '"zu" before infinitive',
  136. 'PWAT': 'attributive interrogative pronoun',
  137. 'PWAV': 'adverbial interrogative or relative pronoun',
  138. 'PWS': 'substituting interrogative pronoun',
  139. 'TRUNC': 'word remnant',
  140. 'VAFIN': 'finite verb, auxiliary',
  141. 'VAIMP': 'imperative, auxiliary',
  142. 'VAINF': 'infinitive, auxiliary',
  143. 'VAPP': 'perfect participle, auxiliary',
  144. 'VMFIN': 'finite verb, modal',
  145. 'VMINF': 'infinitive, modal',
  146. 'VMPP': 'perfect participle, modal',
  147. 'VVFIN': 'finite verb, full',
  148. 'VVIMP': 'imperative, full',
  149. 'VVINF': 'infinitive, full',
  150. 'VVIZU': 'infinitive with "zu", full',
  151. 'VVPP': 'perfect participle, full',
  152. 'XY': 'non-word containing non-letter',
  153. # Noun chunks
  154. 'NP': 'noun phrase',
  155. 'PP': 'prepositional phrase',
  156. 'VP': 'verb phrase',
  157. 'ADVP': 'adverb phrase',
  158. 'ADJP': 'adjective phrase',
  159. 'SBAR': 'subordinating conjunction',
  160. 'PRT': 'particle',
  161. 'PNP': 'prepositional noun phrase',
  162. # Dependency Labels (English)
  163. # ClearNLP / Universal Dependencies
  164. # https://github.com/clir/clearnlp-guidelines/blob/master/md/specifications/dependency_labels.md
  165. 'acomp': 'adjectival complement',
  166. 'advcl': 'adverbial clause modifier',
  167. 'advmod': 'adverbial modifier',
  168. 'agent': 'agent',
  169. 'amod': 'adjectival modifier',
  170. 'appos': 'appositional modifier',
  171. 'attr': 'attribute',
  172. 'aux': 'auxiliary',
  173. 'auxpass': 'auxiliary (passive)',
  174. 'cc': 'coordinating conjunction',
  175. 'ccomp': 'clausal complement',
  176. 'complm': 'complementizer',
  177. 'conj': 'conjunct',
  178. 'cop': 'copula',
  179. 'csubj': 'clausal subject',
  180. 'csubjpass': 'clausal subject (passive)',
  181. 'dep': 'unclassified dependent',
  182. 'det': 'determiner',
  183. 'dobj': 'direct object',
  184. 'expl': 'expletive',
  185. 'hmod': 'modifier in hyphenation',
  186. 'hyph': 'hyphen',
  187. 'infmod': 'infinitival modifier',
  188. 'intj': 'interjection',
  189. 'iobj': 'indirect object',
  190. 'mark': 'marker',
  191. 'meta': 'meta modifier',
  192. 'neg': 'negation modifier',
  193. 'nmod': 'modifier of nominal',
  194. 'nn': 'noun compound modifier',
  195. 'npadvmod': 'noun phrase as adverbial modifier',
  196. 'nsubj': 'nominal subject',
  197. 'nsubjpass': 'nominal subject (passive)',
  198. 'num': 'number modifier',
  199. 'number': 'number compound modifier',
  200. 'oprd': 'object predicate',
  201. 'obj': 'object',
  202. 'obl': 'oblique nominal',
  203. 'parataxis': 'parataxis',
  204. 'partmod': 'participal modifier',
  205. 'pcomp': 'complement of preposition',
  206. 'pobj': 'object of preposition',
  207. 'poss': 'possession modifier',
  208. 'possessive': 'possessive modifier',
  209. 'preconj': 'pre-correlative conjunction',
  210. 'prep': 'prepositional modifier',
  211. 'prt': 'particle',
  212. 'punct': 'punctuation',
  213. 'quantmod': 'modifier of quantifier',
  214. 'rcmod': 'relative clause modifier',
  215. 'root': 'root',
  216. 'xcomp': 'open clausal complement',
  217. # Dependency labels (German)
  218. # TIGER Treebank
  219. # http://www.ims.uni-stuttgart.de/forschung/ressourcen/korpora/TIGERCorpus/annotation/tiger_introduction.pdf
  220. # currently missing: 'cc' (comparative complement) because of conflict
  221. # with English labels
  222. 'ac': 'adpositional case marker',
  223. 'adc': 'adjective component',
  224. 'ag': 'genitive attribute',
  225. 'ams': 'measure argument of adjective',
  226. 'app': 'apposition',
  227. 'avc': 'adverbial phrase component',
  228. 'cd': 'coordinating conjunction',
  229. 'cj': 'conjunct',
  230. 'cm': 'comparative conjunction',
  231. 'cp': 'complementizer',
  232. 'cvc': 'collocational verb construction',
  233. 'da': 'dative',
  234. 'dh': 'discourse-level head',
  235. 'dm': 'discourse marker',
  236. 'ep': 'expletive es',
  237. 'hd': 'head',
  238. 'ju': 'junctor',
  239. 'mnr': 'postnominal modifier',
  240. 'mo': 'modifier',
  241. 'ng': 'negation',
  242. 'nk': 'noun kernel element',
  243. 'nmc': 'numerical component',
  244. 'oa': 'accusative object',
  245. 'oc': 'clausal object',
  246. 'og': 'genitive object',
  247. 'op': 'prepositional object',
  248. 'par': 'parenthetical element',
  249. 'pd': 'predicate',
  250. 'pg': 'phrasal genitive',
  251. 'ph': 'placeholder',
  252. 'pm': 'morphological particle',
  253. 'pnc': 'proper noun component',
  254. 'rc': 'relative clause',
  255. 're': 'repeated element',
  256. 'rs': 'reported speech',
  257. 'sb': 'subject',
  258. # Named Entity Recognition
  259. # OntoNotes 5
  260. # https://catalog.ldc.upenn.edu/docs/LDC2013T19/OntoNotes-Release-5.0.pdf
  261. 'PERSON': 'People, including fictional',
  262. 'NORP': 'Nationalities or religious or political groups',
  263. 'FACILITY': 'Buildings, airports, highways, bridges, etc.',
  264. 'FAC': 'Buildings, airports, highways, bridges, etc.',
  265. 'ORG': 'Companies, agencies, institutions, etc.',
  266. 'GPE': 'Countries, cities, states',
  267. 'LOC': 'Non-GPE locations, mountain ranges, bodies of water',
  268. 'PRODUCT': 'Objects, vehicles, foods, etc. (not services)',
  269. 'EVENT': 'Named hurricanes, battles, wars, sports events, etc.',
  270. 'WORK_OF_ART': 'Titles of books, songs, etc.',
  271. 'LAW': 'Named documents made into laws.',
  272. 'LANGUAGE': 'Any named language',
  273. 'DATE': 'Absolute or relative dates or periods',
  274. 'TIME': 'Times smaller than a day',
  275. 'PERCENT': 'Percentage, including "%"',
  276. 'MONEY': 'Monetary values, including unit',
  277. 'QUANTITY': 'Measurements, as of weight or distance',
  278. 'ORDINAL': '"first", "second", etc.',
  279. 'CARDINAL': 'Numerals that do not fall under another type',
  280. # Named Entity Recognition
  281. # Wikipedia
  282. # http://www.sciencedirect.com/science/article/pii/S0004370212000276
  283. # https://pdfs.semanticscholar.org/5744/578cc243d92287f47448870bb426c66cc941.pdf
  284. 'PER': 'Named person or family.',
  285. 'MISC': ('Miscellaneous entities, e.g. events, nationalities, '
  286. 'products or works of art'),
  287. }