You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

512 lines
17 KiB

4 years ago
  1. # cython: embedsignature=True
  2. # coding: utf8
  3. from __future__ import unicode_literals, print_function
  4. # Compiler crashes on memory view coercion without this. Should report bug.
  5. from cython.view cimport array as cvarray
  6. cimport numpy as np
  7. np.import_array()
  8. from libc.string cimport memset
  9. import numpy
  10. from .typedefs cimport attr_t, flags_t
  11. from .attrs cimport IS_ALPHA, IS_ASCII, IS_DIGIT, IS_LOWER, IS_PUNCT, IS_SPACE
  12. from .attrs cimport IS_TITLE, IS_UPPER, LIKE_URL, LIKE_NUM, LIKE_EMAIL, IS_STOP
  13. from .attrs cimport IS_BRACKET, IS_QUOTE, IS_LEFT_PUNCT, IS_RIGHT_PUNCT, IS_CURRENCY, IS_OOV
  14. from .attrs cimport PROB
  15. from .attrs import intify_attrs
  16. from .errors import Errors
  17. memset(&EMPTY_LEXEME, 0, sizeof(LexemeC))
cdef class Lexeme:
    """An entry in the vocabulary. A `Lexeme` has no string context – it's a
    word-type, as opposed to a word token. It therefore has no part-of-speech
    tag, dependency parse, or lemma (lemmatization depends on the
    part-of-speech tag).
    """
    def __init__(self, Vocab vocab, attr_t orth):
        """Create a Lexeme object.

        vocab (Vocab): The parent vocabulary.
        orth (uint64): The orth id of the lexeme.
        RETURNS (Lexeme): The newly constructed object.
        """
        self.vocab = vocab
        self.orth = orth
        # Fetch (or lazily create) the underlying C struct from the vocab's
        # memory pool; the double cast strips const-ness for the assignment.
        self.c = <LexemeC*><void*>vocab.get_by_orth(vocab.mem, orth)
        # The returned struct must describe the requested orth id.
        if self.c.orth != orth:
            raise ValueError(Errors.E071.format(orth=orth, vocab_orth=self.c.orth))
  35. def __richcmp__(self, other, int op):
  36. if other is None:
  37. if op == 0 or op == 1 or op == 2:
  38. return False
  39. else:
  40. return True
  41. if isinstance(other, Lexeme):
  42. a = self.orth
  43. b = other.orth
  44. elif isinstance(other, long):
  45. a = self.orth
  46. b = other
  47. elif isinstance(other, str):
  48. a = self.orth_
  49. b = other
  50. else:
  51. a = 0
  52. b = 1
  53. if op == 2: # ==
  54. return a == b
  55. elif op == 3: # !=
  56. return a != b
  57. elif op == 0: # <
  58. return a < b
  59. elif op == 1: # <=
  60. return a <= b
  61. elif op == 4: # >
  62. return a > b
  63. elif op == 5: # >=
  64. return a >= b
  65. else:
  66. raise NotImplementedError(op)
    def __hash__(self):
        # The orth id is itself a 64-bit hash of the string, so it can be
        # used directly as the object hash.
        return self.c.orth
    def set_attrs(self, **attrs):
        """Set multiple lexical attributes at once, by keyword.

        attrs: Attribute names mapped to their new values.
        """
        cdef attr_id_t attr
        # Normalise attribute names / string keys to integer attribute IDs.
        attrs = intify_attrs(attrs)
        for attr, value in attrs.items():
            if attr == PROB:
                self.c.prob = value
            elif attr == CLUSTER:
                # NOTE(review): CLUSTER and attr_id_t are not cimported in the
                # visible header — presumably provided via the accompanying
                # .pxd; confirm.
                self.c.cluster = int(value)
            elif isinstance(value, int) or isinstance(value, long):
                Lexeme.set_struct_attr(self.c, attr, value)
            else:
                # Non-integer values are interned in the StringStore and the
                # resulting hash is stored on the struct.
                Lexeme.set_struct_attr(self.c, attr, self.vocab.strings.add(value))
    def set_flag(self, attr_id_t flag_id, bint value):
        """Change the value of a boolean flag.

        flag_id (int): The attribute ID of the flag to set.
        value (bool): The new value of the flag.
        """
        Lexeme.c_set_flag(self.c, flag_id, value)
  87. def check_flag(self, attr_id_t flag_id):
  88. """Check the value of a boolean flag.
  89. flag_id (int): The attribute ID of the flag to query.
  90. RETURNS (bool): The value of the flag.
  91. """
  92. return True if Lexeme.c_check_flag(self.c, flag_id) else False
    def similarity(self, other):
        """Compute a semantic similarity estimate. Defaults to cosine over
        vectors.

        other (object): The object to compare with. By default, accepts `Doc`,
            `Span`, `Token` and `Lexeme` objects.
        RETURNS (float): A scalar similarity score. Higher is more similar.
        """
        # Short-circuit: identical orth ids are maximally similar.
        if hasattr(other, 'orth'):
            if self.c.orth == other.orth:
                return 1.0
        # Also accept a length-1 container of a single orth-bearing object
        # (e.g. a one-token Doc or Span).
        elif hasattr(other, '__len__') and len(other) == 1 \
                and hasattr(other[0], 'orth'):
            if self.c.orth == other[0].orth:
                return 1.0
        # Avoid division by zero when either side has no vector.
        if self.vector_norm == 0 or other.vector_norm == 0:
            return 0.0
        # Cosine similarity: dot product over the product of the L2 norms.
        return (numpy.dot(self.vector, other.vector) /
                (self.vector_norm * other.vector_norm))
  112. def to_bytes(self):
  113. lex_data = Lexeme.c_to_bytes(self.c)
  114. start = <const char*>&self.c.flags
  115. end = <const char*>&self.c.sentiment + sizeof(self.c.sentiment)
  116. if (end-start) != sizeof(lex_data.data):
  117. raise ValueError(Errors.E072.format(length=end-start,
  118. bad_length=sizeof(lex_data.data)))
  119. byte_string = b'\0' * sizeof(lex_data.data)
  120. byte_chars = <char*>byte_string
  121. for i in range(sizeof(lex_data.data)):
  122. byte_chars[i] = lex_data.data[i]
  123. if len(byte_string) != sizeof(lex_data.data):
  124. raise ValueError(Errors.E072.format(length=len(byte_string),
  125. bad_length=sizeof(lex_data.data)))
  126. return byte_string
    def from_bytes(self, bytes byte_string):
        """Load the lexeme's binary attribute data from `to_bytes()` output.

        byte_string (bytes): Serialized data; must match the struct size.
        """
        # This method doesn't really have a use-case --- wrote it for testing.
        # Possibly delete? It puts the Lexeme out of synch with the vocab.
        cdef SerializedLexemeC lex_data
        if len(byte_string) != sizeof(lex_data.data):
            raise ValueError(Errors.E072.format(length=len(byte_string),
                                                bad_length=sizeof(lex_data.data)))
        # Copy the payload byte-by-byte into the stack-allocated struct.
        for i in range(len(byte_string)):
            lex_data.data[i] = byte_string[i]
        Lexeme.c_from_bytes(self.c, lex_data)
        # Keep the Python-level orth in sync with the (possibly new) C value.
        self.orth = self.c.orth
    property has_vector:
        """RETURNS (bool): Whether a word vector is associated with the object.
        """
        def __get__(self):
            return self.vocab.has_vector(self.c.orth)

    property vector_norm:
        """RETURNS (float): The L2 norm of the vector representation."""
        def __get__(self):
            vector = self.vector
            return numpy.sqrt((vector**2).sum())
    property vector:
        """A real-valued meaning representation.

        RETURNS (numpy.ndarray[ndim=1, dtype='float32']): A 1D numpy array
            representing the lexeme's semantics.
        RAISES (ValueError): If the vocabulary has no vectors loaded, or if
            a new vector's length doesn't match the vocabulary's.
        """
        def __get__(self):
            cdef int length = self.vocab.vectors_length
            if length == 0:
                raise ValueError(Errors.E010)
            return self.vocab.get_vector(self.c.orth)

        def __set__(self, vector):
            if len(vector) != self.vocab.vectors_length:
                raise ValueError(Errors.E073.format(new_length=len(vector),
                                                    length=self.vocab.vectors_length))
            self.vocab.set_vector(self.c.orth, vector)
    property rank:
        """RETURNS (int): Sequential ID of the lexeme's lexical type, used
        to index into tables, e.g. for word vectors."""
        def __get__(self):
            return self.c.id
        def __set__(self, value):
            self.c.id = value

    property sentiment:
        """RETURNS (float): A scalar value indicating the positivity or
        negativity of the lexeme."""
        def __get__(self):
            return self.c.sentiment
        def __set__(self, float sentiment):
            self.c.sentiment = sentiment
    property orth_:
        """RETURNS (unicode): The original verbatim text of the lexeme
        (identical to `Lexeme.text`). Exists mostly for consistency with
        the other attributes."""
        def __get__(self):
            # Resolve the orth hash back to its string via the StringStore.
            return self.vocab.strings[self.c.orth]

    property text:
        """RETURNS (unicode): The original verbatim text of the lexeme."""
        def __get__(self):
            return self.orth_
    property lower:
        """RETURNS (uint64): Hash of the lowercase form of the lexeme."""
        def __get__(self):
            return self.c.lower
        def __set__(self, attr_t x):
            self.c.lower = x

    property norm:
        """RETURNS (uint64): The lexeme's norm, i.e. a normalised form of the
        lexeme text.
        """
        def __get__(self):
            return self.c.norm
        def __set__(self, attr_t x):
            self.c.norm = x

    property shape:
        """RETURNS (uint64): Transform of the word's string, to show
        orthographic features.
        """
        def __get__(self):
            return self.c.shape
        def __set__(self, attr_t x):
            self.c.shape = x

    property prefix:
        """RETURNS (uint64): Length-N substring from the start of the word.
        Defaults to `N=1`.
        """
        def __get__(self):
            return self.c.prefix
        def __set__(self, attr_t x):
            self.c.prefix = x

    property suffix:
        """RETURNS (uint64): Length-N substring from the end of the word.
        Defaults to `N=3`.
        """
        def __get__(self):
            return self.c.suffix
        def __set__(self, attr_t x):
            self.c.suffix = x

    property cluster:
        """RETURNS (int): Brown cluster ID."""
        def __get__(self):
            return self.c.cluster
        def __set__(self, attr_t x):
            self.c.cluster = x

    property lang:
        """RETURNS (uint64): Language of the parent vocabulary."""
        def __get__(self):
            return self.c.lang
        def __set__(self, attr_t x):
            self.c.lang = x

    property prob:
        """RETURNS (float): Smoothed log probability estimate of the lexeme's
        type."""
        def __get__(self):
            return self.c.prob
        def __set__(self, float x):
            self.c.prob = x
    property lower_:
        """RETURNS (unicode): Lowercase form of the word."""
        def __get__(self):
            return self.vocab.strings[self.c.lower]
        def __set__(self, unicode x):
            # Setters intern the string and store the resulting hash.
            self.c.lower = self.vocab.strings.add(x)

    property norm_:
        """RETURNS (unicode): The lexeme's norm, i.e. a normalised form of the
        lexeme text.
        """
        def __get__(self):
            return self.vocab.strings[self.c.norm]
        def __set__(self, unicode x):
            self.c.norm = self.vocab.strings.add(x)

    property shape_:
        """RETURNS (unicode): Transform of the word's string, to show
        orthographic features.
        """
        def __get__(self):
            return self.vocab.strings[self.c.shape]
        def __set__(self, unicode x):
            self.c.shape = self.vocab.strings.add(x)

    property prefix_:
        """RETURNS (unicode): Length-N substring from the start of the word.
        Defaults to `N=1`.
        """
        def __get__(self):
            return self.vocab.strings[self.c.prefix]
        def __set__(self, unicode x):
            self.c.prefix = self.vocab.strings.add(x)

    property suffix_:
        """RETURNS (unicode): Length-N substring from the end of the word.
        Defaults to `N=3`.
        """
        def __get__(self):
            return self.vocab.strings[self.c.suffix]
        def __set__(self, unicode x):
            self.c.suffix = self.vocab.strings.add(x)

    property lang_:
        """RETURNS (unicode): Language of the parent vocabulary."""
        def __get__(self):
            return self.vocab.strings[self.c.lang]
        def __set__(self, unicode x):
            self.c.lang = self.vocab.strings.add(x)

    property flags:
        """RETURNS (uint64): Container of the lexeme's binary flags."""
        def __get__(self):
            return self.c.flags
        def __set__(self, flags_t x):
            self.c.flags = x
    property is_oov:
        """RETURNS (bool): Whether the lexeme is out-of-vocabulary."""
        def __get__(self):
            return Lexeme.c_check_flag(self.c, IS_OOV)
        # NOTE(review): this setter takes attr_t where every sibling takes
        # bint — likely an oversight, but changing it would alter coercion.
        def __set__(self, attr_t x):
            Lexeme.c_set_flag(self.c, IS_OOV, x)

    property is_stop:
        """RETURNS (bool): Whether the lexeme is a stop word."""
        def __get__(self):
            return Lexeme.c_check_flag(self.c, IS_STOP)
        def __set__(self, bint x):
            Lexeme.c_set_flag(self.c, IS_STOP, x)

    property is_alpha:
        """RETURNS (bool): Whether the lexeme consists of alphabetic
        characters. Equivalent to `lexeme.text.isalpha()`.
        """
        def __get__(self):
            return Lexeme.c_check_flag(self.c, IS_ALPHA)
        def __set__(self, bint x):
            Lexeme.c_set_flag(self.c, IS_ALPHA, x)

    property is_ascii:
        """RETURNS (bool): Whether the lexeme consists of ASCII characters.
        Equivalent to `all(ord(c) < 128 for c in lexeme.text)`.
        """
        def __get__(self):
            return Lexeme.c_check_flag(self.c, IS_ASCII)
        def __set__(self, bint x):
            Lexeme.c_set_flag(self.c, IS_ASCII, x)

    property is_digit:
        """RETURNS (bool): Whether the lexeme consists of digits. Equivalent
        to `lexeme.text.isdigit()`.
        """
        def __get__(self):
            return Lexeme.c_check_flag(self.c, IS_DIGIT)
        def __set__(self, bint x):
            Lexeme.c_set_flag(self.c, IS_DIGIT, x)

    property is_lower:
        """RETURNS (bool): Whether the lexeme is in lowercase. Equivalent to
        `lexeme.text.islower()`.
        """
        def __get__(self):
            return Lexeme.c_check_flag(self.c, IS_LOWER)
        def __set__(self, bint x):
            Lexeme.c_set_flag(self.c, IS_LOWER, x)

    property is_upper:
        """RETURNS (bool): Whether the lexeme is in uppercase. Equivalent to
        `lexeme.text.isupper()`.
        """
        def __get__(self):
            return Lexeme.c_check_flag(self.c, IS_UPPER)
        def __set__(self, bint x):
            Lexeme.c_set_flag(self.c, IS_UPPER, x)

    property is_title:
        """RETURNS (bool): Whether the lexeme is in titlecase. Equivalent to
        `lexeme.text.istitle()`.
        """
        def __get__(self):
            return Lexeme.c_check_flag(self.c, IS_TITLE)
        def __set__(self, bint x):
            Lexeme.c_set_flag(self.c, IS_TITLE, x)

    property is_punct:
        """RETURNS (bool): Whether the lexeme is punctuation."""
        def __get__(self):
            return Lexeme.c_check_flag(self.c, IS_PUNCT)
        def __set__(self, bint x):
            Lexeme.c_set_flag(self.c, IS_PUNCT, x)

    property is_space:
        """RETURNS (bool): Whether the lexeme consist of whitespace characters.
        Equivalent to `lexeme.text.isspace()`.
        """
        def __get__(self):
            return Lexeme.c_check_flag(self.c, IS_SPACE)
        def __set__(self, bint x):
            Lexeme.c_set_flag(self.c, IS_SPACE, x)

    property is_bracket:
        """RETURNS (bool): Whether the lexeme is a bracket."""
        def __get__(self):
            return Lexeme.c_check_flag(self.c, IS_BRACKET)
        def __set__(self, bint x):
            Lexeme.c_set_flag(self.c, IS_BRACKET, x)

    property is_quote:
        """RETURNS (bool): Whether the lexeme is a quotation mark."""
        def __get__(self):
            return Lexeme.c_check_flag(self.c, IS_QUOTE)
        def __set__(self, bint x):
            Lexeme.c_set_flag(self.c, IS_QUOTE, x)

    property is_left_punct:
        """RETURNS (bool): Whether the lexeme is left punctuation, e.g. (."""
        def __get__(self):
            return Lexeme.c_check_flag(self.c, IS_LEFT_PUNCT)
        def __set__(self, bint x):
            Lexeme.c_set_flag(self.c, IS_LEFT_PUNCT, x)

    property is_right_punct:
        """RETURNS (bool): Whether the lexeme is right punctuation, e.g. )."""
        def __get__(self):
            return Lexeme.c_check_flag(self.c, IS_RIGHT_PUNCT)
        def __set__(self, bint x):
            Lexeme.c_set_flag(self.c, IS_RIGHT_PUNCT, x)

    property is_currency:
        """RETURNS (bool): Whether the lexeme is a currency symbol, e.g. $, €."""
        def __get__(self):
            return Lexeme.c_check_flag(self.c, IS_CURRENCY)
        def __set__(self, bint x):
            Lexeme.c_set_flag(self.c, IS_CURRENCY, x)

    property like_url:
        """RETURNS (bool): Whether the lexeme resembles a URL."""
        def __get__(self):
            return Lexeme.c_check_flag(self.c, LIKE_URL)
        def __set__(self, bint x):
            Lexeme.c_set_flag(self.c, LIKE_URL, x)

    property like_num:
        """RETURNS (bool): Whether the lexeme represents a number, e.g. "10.9",
        "10", "ten", etc.
        """
        def __get__(self):
            return Lexeme.c_check_flag(self.c, LIKE_NUM)
        def __set__(self, bint x):
            Lexeme.c_set_flag(self.c, LIKE_NUM, x)

    property like_email:
        """RETURNS (bool): Whether the lexeme resembles an email address."""
        def __get__(self):
            return Lexeme.c_check_flag(self.c, LIKE_EMAIL)
        def __set__(self, bint x):
            Lexeme.c_set_flag(self.c, LIKE_EMAIL, x)