You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

46 lines
1.2 KiB

4 years ago
  1. from libcpp.vector cimport vector
  2. from preshed.maps cimport PreshMap
  3. from cymem.cymem cimport Pool
  4. from murmurhash.mrmr cimport hash64
  5. from .structs cimport LexemeC, TokenC
  6. from .typedefs cimport utf8_t, attr_t, hash_t
  7. from .strings cimport StringStore
  8. from .morphology cimport Morphology
  # Module-level C variable holding a single LexemeC struct value.
  # NOTE(review): the name suggests a shared "empty"/sentinel lexeme; its
  # initialisation is not visible in this declaration file -- confirm in the
  # implementation module.
  cdef LexemeC EMPTY_LEXEME
  # Union of two pointer views; both members share the same storage, so only
  # one of them is meaningful at a time.
  cdef union LexemesOrTokens:
      # Const pointer(s) to const LexemeC, reached through one extra level of
      # indirection (pointer to const pointer).
      const LexemeC* const* lexemes
      # Pointer to const TokenC.
      const TokenC* tokens
  # Record pairing a LexemesOrTokens union with bookkeeping fields.
  # Field order is kept exactly as declared because it defines the C layout.
  cdef struct _Cached:
      # Payload: either the `lexemes` or the `tokens` member of the union.
      LexemesOrTokens data
      # NOTE(review): presumably true when `data.lexemes` is the active
      # member and false when `data.tokens` is -- confirm against the code
      # that fills this struct.
      bint is_lex
      # NOTE(review): presumably the number of entries reachable through
      # `data` -- confirm against the code that fills this struct.
      int length
  # C-level declaration of the Vocab extension type: its typed attributes and
  # the signatures of its C-only (cdef) methods.
  #
  # Changes relative to the previous declaration:
  #   * `_new_lexeme` was declared twice with an identical signature; the
  #     redundant second declaration is removed (declaration order of the
  #     remaining methods is unchanged).
  #   * `strings`, `morphology` and `vectors` were declared with `cpdef`.
  #     `cpdef` has no distinct meaning for variables (older Cython treats it
  #     as `cdef`, newer Cython rejects it), so they are declared with `cdef`
  #     while keeping the same `readonly` / `public` visibility.
  cdef class Vocab:
      # --- Attributes -------------------------------------------------------
      # No visibility modifier: accessible from Cython code only.
      cdef Pool mem
      # `readonly`: readable from Python, assignable only from Cython.
      cdef readonly StringStore strings
      # `public`: readable and writable from Python.
      cdef public Morphology morphology
      cdef public object vectors
      cdef readonly int length
      cdef public object data_dir
      cdef public object lex_attr_getters
      cdef public object cfg

      # --- C-level methods --------------------------------------------------
      # `except NULL`: a NULL return value signals that a Python exception
      # was raised inside the call.
      cdef const LexemeC* get(self, Pool mem, unicode string) except NULL
      cdef const LexemeC* get_by_orth(self, Pool mem, attr_t orth) except NULL
      cdef const TokenC* make_fused_token(self, substrings) except NULL

      cdef const LexemeC* _new_lexeme(self, Pool mem, unicode string) except NULL
      # `except -1`: a return value of -1 signals that a Python exception was
      # raised inside the call.
      cdef int _add_lex_to_vocab(self, hash_t key, const LexemeC* lex) except -1

      # --- Private lookup tables --------------------------------------------
      # NOTE(review): by their names these map a hash key / an orth id to
      # stored entries -- confirm against the implementation module.
      cdef PreshMap _by_hash
      cdef PreshMap _by_orth