You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

35 lines
1.2 KiB

4 years ago
  1. from libcpp.vector cimport vector
  2. from preshed.maps cimport PreshMap
  3. from cymem.cymem cimport Pool
  4. from .typedefs cimport hash_t
  5. from .structs cimport LexemeC, TokenC
  6. from .strings cimport StringStore
  7. from .tokens.doc cimport Doc
  8. from .vocab cimport Vocab, LexemesOrTokens, _Cached
  9. cdef class Tokenizer:
  10. cdef Pool mem
  11. cdef PreshMap _cache
  12. cdef PreshMap _specials
  13. cpdef readonly Vocab vocab
  14. cdef public object token_match
  15. cdef public object prefix_search
  16. cdef public object suffix_search
  17. cdef public object infix_finditer
  18. cdef object _rules
  19. cpdef Doc tokens_from_list(self, list strings)
  20. cdef int _try_cache(self, hash_t key, Doc tokens) except -1
  21. cdef int _tokenize(self, Doc tokens, unicode span, hash_t key) except -1
  22. cdef unicode _split_affixes(self, Pool mem, unicode string, vector[LexemeC*] *prefixes,
  23. vector[LexemeC*] *suffixes, int* has_special)
  24. cdef int _attach_tokens(self, Doc tokens, unicode string,
  25. vector[LexemeC*] *prefixes, vector[LexemeC*] *suffixes) except -1
  26. cdef int _save_cached(self, const TokenC* tokens, hash_t key, int has_special,
  27. int n) except -1