laywerrobot/lib/python3.6/site-packages/gensim/models/_utils_any2vec.pyx

#!/usr/bin/env cython
# cython: boundscheck=False
# cython: wraparound=False
# cython: cdivision=True
# cython: embedsignature=True
# coding: utf-8

"""General functions used for any2vec models."""

cpdef ft_hash(unicode string):
    """Calculate hash based on `string`.

    Reproduce `hash method from Facebook fastText implementation
    <https://github.com/facebookresearch/fastText/blob/master/src/dictionary.cc>`_:
    a 32-bit FNV-1a hash taken over the UTF-8 encoded bytes of the string,
    where every byte is sign-extended (interpreted as ``int8_t``) before it
    is XOR-ed into the running hash.

    Parameters
    ----------
    string : unicode
        The string whose hash needs to be calculated.

    Returns
    -------
    unsigned int
        The hash of the string.

    Raises
    ------
    UnicodeEncodeError
        If `string` cannot be encoded as UTF-8 (e.g. it holds lone surrogates).

    Notes
    -----
    For pure-ASCII input the result equals a hash taken over the code points.
    For non-ASCII input it differs from a code-point based hash, because
    fastText works on the encoded bytes; hashes stored by code that mixed in
    code points instead are therefore not comparable for such strings.

    """
    cdef unsigned int h = 2166136261  # FNV-1a 32-bit offset basis
    cdef bytes encoded = string.encode('utf-8')
    cdef char b
    for b in encoded:
        # Go through `signed char` explicitly: fastText sign-extends each byte,
        # and the signedness of plain `char` is platform dependent.
        h ^= <unsigned int>(<signed char>b)
        h *= 16777619  # FNV prime; unsigned arithmetic wraps around
    return h


cpdef compute_ngrams(word, unsigned int min_n, unsigned int max_n):
    """Collect all character ngrams of `word` with lengths from `min_n` to `max_n`.

    The word is wrapped in the boundary markers ``<`` and ``>`` first, and the
    ngrams are taken from that wrapped form. Ngrams are ordered by increasing
    length, and within one length by increasing start position.

    Parameters
    ----------
    word : str
        The word whose ngrams need to be computed.
    min_n : unsigned int
        Minimum character length of the ngrams.
    max_n : unsigned int
        Maximum character length of the ngrams; lengths beyond the wrapped
        word's length are not produced.

    Returns
    -------
    list of str
        Sequence of character ngrams.

    """
    cdef unicode padded = f'<{word}>'
    cdef Py_ssize_t total_len = len(padded)
    # No ngram can be longer than the wrapped word itself.
    cdef Py_ssize_t longest = min(total_len, max_n)
    return [
        padded[start:start + size]
        for size in range(min_n, longest + 1)
        for start in range(total_len - size + 1)
    ]
first commit 2020-08-27 21:55:39 +02:00			`#!/usr/bin/env cython`
			`# cython: boundscheck=False`
			`# cython: wraparound=False`
			`# cython: cdivision=True`
			`# cython: embedsignature=True`
			`# coding: utf-8`

			`"""General functions used for any2vec models."""`

			`cpdef ft_hash(unicode string):`
			"""Calculate hash based on `string`.
			Reproduce `hash method from Facebook fastText implementation
			<https://github.com/facebookresearch/fastText/blob/master/src/dictionary.cc>`_.

			`Parameters`
			`----------`
			`string : unicode`
			`The string whose hash needs to be calculated.`

			`Returns`
			`-------`
			`unsigned int`
			`The hash of the string.`

			`"""`
			`cdef unsigned int h = 2166136261`
			`for c in string:`
			`h ^= ord(c)`
			`h *= 16777619`
			`return h`


			`cpdef compute_ngrams(word, unsigned int min_n, unsigned int max_n):`
			`"""Get the list of all possible ngrams for a given word.`

			`Parameters`
			`----------`
			`word : str`
			`The word whose ngrams need to be computed.`
			`min_n : unsigned int`
			`Minimum character length of the ngrams.`
			`max_n : unsigned int`
			`Maximum character length of the ngrams.`

			`Returns`
			`-------`
			`list of str`
			`Sequence of character ngrams.`

			`"""`
			`cdef unicode extended_word = f'<{word}>'`
			`ngrams = []`
			`for ngram_length in range(min_n, min(len(extended_word), max_n) + 1):`
			`for i in range(0, len(extended_word) - ngram_length + 1):`
			`ngrams.append(extended_word[i:i + ngram_length])`
			`return ngrams`