You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

512 lines
17 KiB

4 years ago
  1. # cython: embedsignature=True
  2. # coding: utf8
  3. from __future__ import unicode_literals, print_function
  4. # Compiler crashes on memory view coercion without this. Should report bug.
  5. from cython.view cimport array as cvarray
  6. cimport numpy as np
  7. np.import_array()
  8. from libc.string cimport memset
  9. import numpy
  10. from .typedefs cimport attr_t, flags_t
  11. from .attrs cimport IS_ALPHA, IS_ASCII, IS_DIGIT, IS_LOWER, IS_PUNCT, IS_SPACE
  12. from .attrs cimport IS_TITLE, IS_UPPER, LIKE_URL, LIKE_NUM, LIKE_EMAIL, IS_STOP
  13. from .attrs cimport IS_BRACKET, IS_QUOTE, IS_LEFT_PUNCT, IS_RIGHT_PUNCT, IS_CURRENCY, IS_OOV
  14. from .attrs cimport PROB
  15. from .attrs import intify_attrs
  16. from .errors import Errors
  17. memset(&EMPTY_LEXEME, 0, sizeof(LexemeC))
cdef class Lexeme:
    """An entry in the vocabulary. A `Lexeme` has no string context – it's a
    word-type, as opposed to a word token. It therefore has no part-of-speech
    tag, dependency parse, or lemma (lemmatization depends on the
    part-of-speech tag).
    """
    def __init__(self, Vocab vocab, attr_t orth):
        """Create a Lexeme object.

        vocab (Vocab): The parent vocabulary.
        orth (uint64): The orth id of the lexeme.
        RETURNS (Lexeme): The newly constructed object.
        """
        self.vocab = vocab
        self.orth = orth
        # Fetch (or lazily create) the underlying C struct from the vocab's
        # memory pool; the double cast strips const-ness for the assignment.
        self.c = <LexemeC*><void*>vocab.get_by_orth(vocab.mem, orth)
        # The returned struct must describe the requested orth id.
        if self.c.orth != orth:
            raise ValueError(Errors.E071.format(orth=orth, vocab_orth=self.c.orth))
  35. def __richcmp__(self, other, int op):
  36. if other is None:
  37. if op == 0 or op == 1 or op == 2:
  38. return False
  39. else:
  40. return True
  41. if isinstance(other, Lexeme):
  42. a = self.orth
  43. b = other.orth
  44. elif isinstance(other, long):
  45. a = self.orth
  46. b = other
  47. elif isinstance(other, str):
  48. a = self.orth_
  49. b = other
  50. else:
  51. a = 0
  52. b = 1
  53. if op == 2: # ==
  54. return a == b
  55. elif op == 3: # !=
  56. return a != b
  57. elif op == 0: # <
  58. return a < b
  59. elif op == 1: # <=
  60. return a <= b
  61. elif op == 4: # >
  62. return a > b
  63. elif op == 5: # >=
  64. return a >= b
  65. else:
  66. raise NotImplementedError(op)
    def __hash__(self):
        # The orth id is itself a 64-bit hash of the string, so it can be
        # used directly as the object hash.
        return self.c.orth
    def set_attrs(self, **attrs):
        """Set multiple lexical attributes at once, by keyword.

        attrs: Attribute names mapped to their new values.
        """
        cdef attr_id_t attr
        # Normalise attribute names / string keys to integer attribute IDs.
        attrs = intify_attrs(attrs)
        for attr, value in attrs.items():
            if attr == PROB:
                self.c.prob = value
            elif attr == CLUSTER:
                # NOTE(review): CLUSTER and attr_id_t are not cimported in the
                # visible header — presumably provided via the accompanying
                # .pxd; confirm.
                self.c.cluster = int(value)
            elif isinstance(value, int) or isinstance(value, long):
                Lexeme.set_struct_attr(self.c, attr, value)
            else:
                # Non-integer values are interned in the StringStore and the
                # resulting hash is stored on the struct.
                Lexeme.set_struct_attr(self.c, attr, self.vocab.strings.add(value))
    def set_flag(self, attr_id_t flag_id, bint value):
        """Change the value of a boolean flag.

        flag_id (int): The attribute ID of the flag to set.
        value (bool): The new value of the flag.
        """
        Lexeme.c_set_flag(self.c, flag_id, value)
  87. def check_flag(self, attr_id_t flag_id):
  88. """Check the value of a boolean flag.
  89. flag_id (int): The attribute ID of the flag to query.
  90. RETURNS (bool): The value of the flag.
  91. """
  92. return True if Lexeme.c_check_flag(self.c, flag_id) else False
    def similarity(self, other):
        """Compute a semantic similarity estimate. Defaults to cosine over
        vectors.

        other (object): The object to compare with. By default, accepts `Doc`,
            `Span`, `Token` and `Lexeme` objects.
        RETURNS (float): A scalar similarity score. Higher is more similar.
        """
        # Short-circuit: identical orth ids are maximally similar.
        if hasattr(other, 'orth'):
            if self.c.orth == other.orth:
                return 1.0
        # Also accept a length-1 container of a single orth-bearing object
        # (e.g. a one-token Doc or Span).
        elif hasattr(other, '__len__') and len(other) == 1 \
                and hasattr(other[0], 'orth'):
            if self.c.orth == other[0].orth:
                return 1.0
        # Avoid division by zero when either side has no vector.
        if self.vector_norm == 0 or other.vector_norm == 0:
            return 0.0
        # Cosine similarity: dot product over the product of the L2 norms.
        return (numpy.dot(self.vector, other.vector) /
                (self.vector_norm * other.vector_norm))
  112. def to_bytes(self):
  113. lex_data = Lexeme.c_to_bytes(self.c)
  114. start = <const char*>&self.c.flags
  115. end = <const char*>&self.c.sentiment + sizeof(self.c.sentiment)
  116. if (end-start) != sizeof(lex_data.data):
  117. raise ValueError(Errors.E072.format(length=end-start,
  118. bad_length=sizeof(lex_data.data)))
  119. byte_string = b'\0' * sizeof(lex_data.data)
  120. byte_chars = <char*>byte_string
  121. for i in range(sizeof(lex_data.data)):
  122. byte_chars[i] = lex_data.data[i]
  123. if len(byte_string) != sizeof(lex_data.data):
  124. raise ValueError(Errors.E072.format(length=len(byte_string),
  125. bad_length=sizeof(lex_data.data)))
  126. return byte_string
    def from_bytes(self, bytes byte_string):
        """Load the lexeme's binary attribute data from `to_bytes()` output.

        byte_string (bytes): Serialized data; must match the struct size.
        """
        # This method doesn't really have a use-case --- wrote it for testing.
        # Possibly delete? It puts the Lexeme out of synch with the vocab.
        cdef SerializedLexemeC lex_data
        if len(byte_string) != sizeof(lex_data.data):
            raise ValueError(Errors.E072.format(length=len(byte_string),
                                                bad_length=sizeof(lex_data.data)))
        # Copy the payload byte-by-byte into the stack-allocated struct.
        for i in range(len(byte_string)):
            lex_data.data[i] = byte_string[i]
        Lexeme.c_from_bytes(self.c, lex_data)
        # Keep the Python-level orth in sync with the (possibly new) C value.
        self.orth = self.c.orth
    property has_vector:
        """RETURNS (bool): Whether a word vector is associated with the object.
        """
        def __get__(self):
            return self.vocab.has_vector(self.c.orth)

    property vector_norm:
        """RETURNS (float): The L2 norm of the vector representation."""
        def __get__(self):
            vector = self.vector
            return numpy.sqrt((vector**2).sum())
    property vector:
        """A real-valued meaning representation.

        RETURNS (numpy.ndarray[ndim=1, dtype='float32']): A 1D numpy array
            representing the lexeme's semantics.
        RAISES (ValueError): If the vocabulary has no vectors loaded, or if
            a new vector's length doesn't match the vocabulary's.
        """
        def __get__(self):
            cdef int length = self.vocab.vectors_length
            if length == 0:
                raise ValueError(Errors.E010)
            return self.vocab.get_vector(self.c.orth)

        def __set__(self, vector):
            if len(vector) != self.vocab.vectors_length:
                raise ValueError(Errors.E073.format(new_length=len(vector),
                                                    length=self.vocab.vectors_length))
            self.vocab.set_vector(self.c.orth, vector)
    property rank:
        """RETURNS (int): Sequential ID of the lexeme's lexical type, used
        to index into tables, e.g. for word vectors."""
        def __get__(self):
            return self.c.id
        def __set__(self, value):
            self.c.id = value

    property sentiment:
        """RETURNS (float): A scalar value indicating the positivity or
        negativity of the lexeme."""
        def __get__(self):
            return self.c.sentiment
        def __set__(self, float sentiment):
            self.c.sentiment = sentiment
    property orth_:
        """RETURNS (unicode): The original verbatim text of the lexeme
        (identical to `Lexeme.text`). Exists mostly for consistency with
        the other attributes."""
        def __get__(self):
            # Resolve the orth hash back to its string via the StringStore.
            return self.vocab.strings[self.c.orth]

    property text:
        """RETURNS (unicode): The original verbatim text of the lexeme."""
        def __get__(self):
            return self.orth_
    property lower:
        """RETURNS (uint64): Hash of the lowercase form of the lexeme."""
        def __get__(self):
            return self.c.lower
        def __set__(self, attr_t x):
            self.c.lower = x

    property norm:
        """RETURNS (uint64): The lexeme's norm, i.e. a normalised form of the
        lexeme text.
        """
        def __get__(self):
            return self.c.norm
        def __set__(self, attr_t x):
            self.c.norm = x

    property shape:
        """RETURNS (uint64): Transform of the word's string, to show
        orthographic features.
        """
        def __get__(self):
            return self.c.shape
        def __set__(self, attr_t x):
            self.c.shape = x

    property prefix:
        """RETURNS (uint64): Length-N substring from the start of the word.
        Defaults to `N=1`.
        """
        def __get__(self):
            return self.c.prefix
        def __set__(self, attr_t x):
            self.c.prefix = x

    property suffix:
        """RETURNS (uint64): Length-N substring from the end of the word.
        Defaults to `N=3`.
        """
        def __get__(self):
            return self.c.suffix
        def __set__(self, attr_t x):
            self.c.suffix = x

    property cluster:
        """RETURNS (int): Brown cluster ID."""
        def __get__(self):
            return self.c.cluster
        def __set__(self, attr_t x):
            self.c.cluster = x

    property lang:
        """RETURNS (uint64): Language of the parent vocabulary."""
        def __get__(self):
            return self.c.lang
        def __set__(self, attr_t x):
            self.c.lang = x

    property prob:
        """RETURNS (float): Smoothed log probability estimate of the lexeme's
        type."""
        def __get__(self):
            return self.c.prob
        def __set__(self, float x):
            self.c.prob = x
    property lower_:
        """RETURNS (unicode): Lowercase form of the word."""
        def __get__(self):
            return self.vocab.strings[self.c.lower]
        def __set__(self, unicode x):
            # Setters intern the string and store the resulting hash.
            self.c.lower = self.vocab.strings.add(x)

    property norm_:
        """RETURNS (unicode): The lexeme's norm, i.e. a normalised form of the
        lexeme text.
        """
        def __get__(self):
            return self.vocab.strings[self.c.norm]
        def __set__(self, unicode x):
            self.c.norm = self.vocab.strings.add(x)

    property shape_:
        """RETURNS (unicode): Transform of the word's string, to show
        orthographic features.
        """
        def __get__(self):
            return self.vocab.strings[self.c.shape]
        def __set__(self, unicode x):
            self.c.shape = self.vocab.strings.add(x)

    property prefix_:
        """RETURNS (unicode): Length-N substring from the start of the word.
        Defaults to `N=1`.
        """
        def __get__(self):
            return self.vocab.strings[self.c.prefix]
        def __set__(self, unicode x):
            self.c.prefix = self.vocab.strings.add(x)

    property suffix_:
        """RETURNS (unicode): Length-N substring from the end of the word.
        Defaults to `N=3`.
        """
        def __get__(self):
            return self.vocab.strings[self.c.suffix]
        def __set__(self, unicode x):
            self.c.suffix = self.vocab.strings.add(x)

    property lang_:
        """RETURNS (unicode): Language of the parent vocabulary."""
        def __get__(self):
            return self.vocab.strings[self.c.lang]
        def __set__(self, unicode x):
            self.c.lang = self.vocab.strings.add(x)

    property flags:
        """RETURNS (uint64): Container of the lexeme's binary flags."""
        def __get__(self):
            return self.c.flags
        def __set__(self, flags_t x):
            self.c.flags = x
    property is_oov:
        """RETURNS (bool): Whether the lexeme is out-of-vocabulary."""
        def __get__(self):
            return Lexeme.c_check_flag(self.c, IS_OOV)
        # NOTE(review): this setter takes attr_t where every sibling takes
        # bint — likely an oversight, but changing it would alter coercion.
        def __set__(self, attr_t x):
            Lexeme.c_set_flag(self.c, IS_OOV, x)

    property is_stop:
        """RETURNS (bool): Whether the lexeme is a stop word."""
        def __get__(self):
            return Lexeme.c_check_flag(self.c, IS_STOP)
        def __set__(self, bint x):
            Lexeme.c_set_flag(self.c, IS_STOP, x)

    property is_alpha:
        """RETURNS (bool): Whether the lexeme consists of alphabetic
        characters. Equivalent to `lexeme.text.isalpha()`.
        """
        def __get__(self):
            return Lexeme.c_check_flag(self.c, IS_ALPHA)
        def __set__(self, bint x):
            Lexeme.c_set_flag(self.c, IS_ALPHA, x)

    property is_ascii:
        """RETURNS (bool): Whether the lexeme consists of ASCII characters.
        Equivalent to `all(ord(c) < 128 for c in lexeme.text)`.
        """
        def __get__(self):
            return Lexeme.c_check_flag(self.c, IS_ASCII)
        def __set__(self, bint x):
            Lexeme.c_set_flag(self.c, IS_ASCII, x)

    property is_digit:
        """RETURNS (bool): Whether the lexeme consists of digits. Equivalent
        to `lexeme.text.isdigit()`.
        """
        def __get__(self):
            return Lexeme.c_check_flag(self.c, IS_DIGIT)
        def __set__(self, bint x):
            Lexeme.c_set_flag(self.c, IS_DIGIT, x)

    property is_lower:
        """RETURNS (bool): Whether the lexeme is in lowercase. Equivalent to
        `lexeme.text.islower()`.
        """
        def __get__(self):
            return Lexeme.c_check_flag(self.c, IS_LOWER)
        def __set__(self, bint x):
            Lexeme.c_set_flag(self.c, IS_LOWER, x)

    property is_upper:
        """RETURNS (bool): Whether the lexeme is in uppercase. Equivalent to
        `lexeme.text.isupper()`.
        """
        def __get__(self):
            return Lexeme.c_check_flag(self.c, IS_UPPER)
        def __set__(self, bint x):
            Lexeme.c_set_flag(self.c, IS_UPPER, x)

    property is_title:
        """RETURNS (bool): Whether the lexeme is in titlecase. Equivalent to
        `lexeme.text.istitle()`.
        """
        def __get__(self):
            return Lexeme.c_check_flag(self.c, IS_TITLE)
        def __set__(self, bint x):
            Lexeme.c_set_flag(self.c, IS_TITLE, x)

    property is_punct:
        """RETURNS (bool): Whether the lexeme is punctuation."""
        def __get__(self):
            return Lexeme.c_check_flag(self.c, IS_PUNCT)
        def __set__(self, bint x):
            Lexeme.c_set_flag(self.c, IS_PUNCT, x)

    property is_space:
        """RETURNS (bool): Whether the lexeme consist of whitespace characters.
        Equivalent to `lexeme.text.isspace()`.
        """
        def __get__(self):
            return Lexeme.c_check_flag(self.c, IS_SPACE)
        def __set__(self, bint x):
            Lexeme.c_set_flag(self.c, IS_SPACE, x)

    property is_bracket:
        """RETURNS (bool): Whether the lexeme is a bracket."""
        def __get__(self):
            return Lexeme.c_check_flag(self.c, IS_BRACKET)
        def __set__(self, bint x):
            Lexeme.c_set_flag(self.c, IS_BRACKET, x)

    property is_quote:
        """RETURNS (bool): Whether the lexeme is a quotation mark."""
        def __get__(self):
            return Lexeme.c_check_flag(self.c, IS_QUOTE)
        def __set__(self, bint x):
            Lexeme.c_set_flag(self.c, IS_QUOTE, x)

    property is_left_punct:
        """RETURNS (bool): Whether the lexeme is left punctuation, e.g. (."""
        def __get__(self):
            return Lexeme.c_check_flag(self.c, IS_LEFT_PUNCT)
        def __set__(self, bint x):
            Lexeme.c_set_flag(self.c, IS_LEFT_PUNCT, x)

    property is_right_punct:
        """RETURNS (bool): Whether the lexeme is right punctuation, e.g. )."""
        def __get__(self):
            return Lexeme.c_check_flag(self.c, IS_RIGHT_PUNCT)
        def __set__(self, bint x):
            Lexeme.c_set_flag(self.c, IS_RIGHT_PUNCT, x)

    property is_currency:
        """RETURNS (bool): Whether the lexeme is a currency symbol, e.g. $, €."""
        def __get__(self):
            return Lexeme.c_check_flag(self.c, IS_CURRENCY)
        def __set__(self, bint x):
            Lexeme.c_set_flag(self.c, IS_CURRENCY, x)

    property like_url:
        """RETURNS (bool): Whether the lexeme resembles a URL."""
        def __get__(self):
            return Lexeme.c_check_flag(self.c, LIKE_URL)
        def __set__(self, bint x):
            Lexeme.c_set_flag(self.c, LIKE_URL, x)

    property like_num:
        """RETURNS (bool): Whether the lexeme represents a number, e.g. "10.9",
        "10", "ten", etc.
        """
        def __get__(self):
            return Lexeme.c_check_flag(self.c, LIKE_NUM)
        def __set__(self, bint x):
            Lexeme.c_set_flag(self.c, LIKE_NUM, x)

    property like_email:
        """RETURNS (bool): Whether the lexeme resembles an email address."""
        def __get__(self):
            return Lexeme.c_check_flag(self.c, LIKE_EMAIL)
        def __set__(self, bint x):
            Lexeme.c_set_flag(self.c, LIKE_EMAIL, x)