You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

73 lines
1.4 KiB

4 years ago
  1. from libc.stdint cimport uint8_t, uint32_t, int32_t, uint64_t
  2. from .typedefs cimport flags_t, attr_t, hash_t
  3. from .parts_of_speech cimport univ_pos_t
  4. cdef struct LexemeC:
  5. flags_t flags
  6. attr_t lang
  7. attr_t id
  8. attr_t length
  9. attr_t orth
  10. attr_t lower
  11. attr_t norm
  12. attr_t shape
  13. attr_t prefix
  14. attr_t suffix
  15. attr_t cluster
  16. float prob
  17. float sentiment
  18. cdef struct SerializedLexemeC:
  19. unsigned char[8 + 8*10 + 4 + 4] data
  20. # sizeof(flags_t) # flags
  21. # + sizeof(attr_t) # lang
  22. # + sizeof(attr_t) # id
  23. # + sizeof(attr_t) # length
  24. # + sizeof(attr_t) # orth
  25. # + sizeof(attr_t) # lower
  26. # + sizeof(attr_t) # norm
  27. # + sizeof(attr_t) # shape
  28. # + sizeof(attr_t) # prefix
  29. # + sizeof(attr_t) # suffix
  30. # + sizeof(attr_t) # cluster
  31. # + sizeof(float) # prob
  32. # + sizeof(float) # cluster
  33. # + sizeof(float) # l2_norm
  34. cdef struct Entity:
  35. hash_t id
  36. int start
  37. int end
  38. attr_t label
  39. cdef struct TokenC:
  40. const LexemeC* lex
  41. uint64_t morph
  42. univ_pos_t pos
  43. bint spacy
  44. attr_t tag
  45. int idx
  46. attr_t lemma
  47. attr_t sense
  48. int head
  49. attr_t dep
  50. uint32_t l_kids
  51. uint32_t r_kids
  52. uint32_t l_edge
  53. uint32_t r_edge
  54. int sent_start
  55. int ent_iob
  56. attr_t ent_type # TODO: Is there a better way to do this? Multiple sources of truth..
  57. hash_t ent_id