alpcentaur
/
basabuuka_prototyp


								# cython: embedsignature=True

								# coding: utf8

								from __future__ import unicode_literals, print_function


								# Compiler crashes on memory view coercion without this. Should report bug.

								from cython.view cimport array as cvarray

								cimport numpy as np

								np.import_array()

								from libc.string cimport memset

								import numpy


								from .typedefs cimport attr_t, flags_t

								from .attrs cimport IS_ALPHA, IS_ASCII, IS_DIGIT, IS_LOWER, IS_PUNCT, IS_SPACE

								from .attrs cimport IS_TITLE, IS_UPPER, LIKE_URL, LIKE_NUM, LIKE_EMAIL, IS_STOP

								from .attrs cimport IS_BRACKET, IS_QUOTE, IS_LEFT_PUNCT, IS_RIGHT_PUNCT, IS_CURRENCY, IS_OOV

								from .attrs cimport PROB

								from .attrs import intify_attrs

								from .errors import Errors


								# Zero-fill the EMPTY_LEXEME struct (sizeof(LexemeC) bytes) when the module
								# is initialised, so every field of the placeholder lexeme starts at 0.
								# NOTE(review): EMPTY_LEXEME and LexemeC are not declared in this file --
								# presumably they come from the accompanying .pxd; confirm.
								memset(&EMPTY_LEXEME, 0, sizeof(LexemeC))


								cdef class Lexeme:
								    """An entry in the vocabulary. A `Lexeme` has no string context - it's a
								    word-type, as opposed to a word token.  It therefore has no part-of-speech
								    tag, dependency parse, or lemma (lemmatization depends on the
								    part-of-speech tag).

								    Attributes without a trailing underscore expose the integer IDs stored
								    on the underlying C struct; the underscore-suffixed variants (`orth_`,
								    `lower_`, ...) resolve those IDs through `vocab.strings`.
								    """
								    def __init__(self, Vocab vocab, attr_t orth):
								        """Create a Lexeme object.

								        vocab (Vocab): The parent vocabulary
								        orth (uint64): The orth id of the lexeme.
								        RETURNS (Lexeme): The newly constructed object.
								        RAISES (ValueError): Error E071 if the struct returned by the vocab
								            carries a different orth id than the one requested.
								        """
								        self.vocab = vocab
								        self.orth = orth
								        self.c = <LexemeC*><void*>vocab.get_by_orth(vocab.mem, orth)
								        if self.c.orth != orth:
								            raise ValueError(Errors.E071.format(orth=orth, vocab_orth=self.c.orth))

								    def __richcmp__(self, other, int op):
								        """Rich comparison against another lexeme, an integer or a string.

								        Lexemes and integers are compared on the orth id, strings on the
								        verbatim text. Any other type is compared as if the lexeme were
								        the smaller operand (0 vs. 1).

								        other (object): The object to compare with.
								        op (int): Comparison code: 0 `<`, 1 `<=`, 2 `==`, 3 `!=`, 4 `>`,
								            5 `>=`.
								        RETURNS (bool): The result of the comparison.
								        RAISES (NotImplementedError): If `op` is not one of the codes above.
								        """
								        if other is None:
								            # A lexeme is never <, <= or == None; the remaining operators
								            # (!=, >, >=) all evaluate to True.
								            return not (op == 0 or op == 1 or op == 2)
								        if isinstance(other, Lexeme):
								            a = self.orth
								            b = other.orth
								        elif isinstance(other, long):
								            a = self.orth
								            b = other
								        elif isinstance(other, str):
								            a = self.orth_
								            b = other
								        else:
								            # Unsupported type: fall back to a fixed, unequal pair.
								            a = 0
								            b = 1
								        if op == 2:  # ==
								            return a == b
								        elif op == 3:  # !=
								            return a != b
								        elif op == 0:  # <
								            return a < b
								        elif op == 1:  # <=
								            return a <= b
								        elif op == 4:  # >
								            return a > b
								        elif op == 5:  # >=
								            return a >= b
								        else:
								            raise NotImplementedError(op)

								    def __hash__(self):
								        """RETURNS (uint64): The orth id stored on the C struct."""
								        return self.c.orth

								    def set_attrs(self, **attrs):
								        """Set several lexical attributes in one call.

								        The keyword names are normalised with `intify_attrs`. `PROB` and
								        `CLUSTER` are written straight to their struct fields; integer
								        values are stored via `Lexeme.set_struct_attr`; every other value
								        is first added to `vocab.strings` and the resulting ID is stored.

								        **attrs: Attribute names/IDs mapped to the values to set.
								        """
								        cdef attr_id_t attr
								        attrs = intify_attrs(attrs)
								        for attr, value in attrs.items():
								            if attr == PROB:
								                self.c.prob = value
								            elif attr == CLUSTER:
								                self.c.cluster = int(value)
								            elif isinstance(value, (int, long)):
								                Lexeme.set_struct_attr(self.c, attr, value)
								            else:
								                Lexeme.set_struct_attr(self.c, attr, self.vocab.strings.add(value))

								    def set_flag(self, attr_id_t flag_id, bint value):
								        """Change the value of a boolean flag.

								        flag_id (int): The attribute ID of the flag to set.
								        value (bool): The new value of the flag.
								        """
								        Lexeme.c_set_flag(self.c, flag_id, value)

								    def check_flag(self, attr_id_t flag_id):
								        """Check the value of a boolean flag.

								        flag_id (int): The attribute ID of the flag to query.
								        RETURNS (bool): The value of the flag.
								        """
								        return bool(Lexeme.c_check_flag(self.c, flag_id))

								    def similarity(self, other):
								        """Compute a semantic similarity estimate. Defaults to cosine over
								        vectors.

								        other (object): The object to compare with. By default, accepts `Doc`,
								            `Span`, `Token` and `Lexeme` objects.
								        RETURNS (float): A scalar similarity score. Higher is more similar.
								        """
								        # Return 1.0 similarity for matches
								        if hasattr(other, 'orth'):
								            if self.c.orth == other.orth:
								                return 1.0
								        elif (hasattr(other, '__len__') and len(other) == 1
								                and hasattr(other[0], 'orth')):
								            # A length-1 container matches if its only element does.
								            if self.c.orth == other[0].orth:
								                return 1.0
								        # `vector_norm` recomputes the norm on every access, so read each
								        # one once and reuse it for both the zero check and the division.
								        self_norm = self.vector_norm
								        if self_norm == 0:
								            return 0.0
								        other_norm = other.vector_norm
								        if other_norm == 0:
								            return 0.0
								        return numpy.dot(self.vector, other.vector) / (self_norm * other_norm)

								    def to_bytes(self):
								        """Serialize the lexeme's C data to a byte string.

								        RETURNS (bytes): The bytes produced by `Lexeme.c_to_bytes`.
								        RAISES (ValueError): Error E072 if the span from the `flags` field
								            to the end of the `sentiment` field, or the resulting byte
								            string, does not have the size of the serialization buffer.
								        """
								        lex_data = Lexeme.c_to_bytes(self.c)
								        start = <const char*>&self.c.flags
								        end = <const char*>&self.c.sentiment + sizeof(self.c.sentiment)
								        # Sanity check: the struct fields being serialized must span
								        # exactly as many bytes as the serialization buffer holds.
								        if (end-start) != sizeof(lex_data.data):
								            raise ValueError(Errors.E072.format(length=end-start,
								                                                bad_length=sizeof(lex_data.data)))
								        # Allocate a bytes object of the right size and fill its buffer
								        # in place through a raw char pointer.
								        byte_string = b'\0' * sizeof(lex_data.data)
								        byte_chars = <char*>byte_string
								        for i in range(sizeof(lex_data.data)):
								            byte_chars[i] = lex_data.data[i]
								        if len(byte_string) != sizeof(lex_data.data):
								            raise ValueError(Errors.E072.format(length=len(byte_string),
								                                                bad_length=sizeof(lex_data.data)))
								        return byte_string

								    def from_bytes(self, bytes byte_string):
								        """Overwrite the lexeme's C data from a byte string.

								        byte_string (bytes): Data with exactly the size of the
								            serialization buffer.
								        RAISES (ValueError): Error E072 if `byte_string` has the wrong length.
								        """
								        # This method doesn't really have a use-case --- wrote it for testing.
								        # Possibly delete? It puts the Lexeme out of synch with the vocab.
								        cdef SerializedLexemeC lex_data
								        if len(byte_string) != sizeof(lex_data.data):
								            raise ValueError(Errors.E072.format(length=len(byte_string),
								                                                bad_length=sizeof(lex_data.data)))
								        for i in range(len(byte_string)):
								            lex_data.data[i] = byte_string[i]
								        Lexeme.c_from_bytes(self.c, lex_data)
								        # Keep the Python-level orth in step with the freshly loaded struct.
								        self.orth = self.c.orth

								    property has_vector:
								        """RETURNS (bool): Whether a word vector is associated with the object.
								        """
								        def __get__(self):
								            return self.vocab.has_vector(self.c.orth)

								    property vector_norm:
								        """RETURNS (float): The L2 norm of the vector representation."""
								        def __get__(self):
								            vector = self.vector
								            return numpy.sqrt((vector**2).sum())

								    property vector:
								        """A real-valued meaning representation.

								        RETURNS (numpy.ndarray[ndim=1, dtype='float32']): A 1D numpy array
								            representing the lexeme's semantics.
								        RAISES (ValueError): Error E010 on read if the vocab has vectors of
								            length 0; error E073 on write if the new vector's length
								            differs from the vocab's vector length.
								        """
								        def __get__(self):
								            cdef int length = self.vocab.vectors_length
								            if length == 0:
								                raise ValueError(Errors.E010)
								            return self.vocab.get_vector(self.c.orth)

								        def __set__(self, vector):
								            if len(vector) != self.vocab.vectors_length:
								                raise ValueError(Errors.E073.format(new_length=len(vector),
								                                                    length=self.vocab.vectors_length))
								            self.vocab.set_vector(self.c.orth, vector)

								    property rank:
								        """RETURNS (int): Sequential ID of the lexeme's lexical type, used
								            to index into tables, e.g. for word vectors."""
								        def __get__(self):
								            return self.c.id

								        def __set__(self, value):
								            self.c.id = value

								    property sentiment:
								        """RETURNS (float): A scalar value indicating the positivity or
								            negativity of the lexeme."""
								        def __get__(self):
								            return self.c.sentiment

								        def __set__(self, float sentiment):
								            self.c.sentiment = sentiment

								    property orth_:
								        """RETURNS (unicode): The original verbatim text of the lexeme
								            (identical to `Lexeme.text`). Exists mostly for consistency with
								            the other attributes."""
								        def __get__(self):
								            return self.vocab.strings[self.c.orth]

								    property text:
								        """RETURNS (unicode): The original verbatim text of the lexeme."""
								        def __get__(self):
								            return self.orth_

								    property lower:
								        """RETURNS (uint64): ID of the lowercase form of the lexeme."""
								        def __get__(self):
								            return self.c.lower

								        def __set__(self, attr_t x):
								            self.c.lower = x

								    property norm:
								        """RETURNS (uint64): The lexeme's norm, i.e. a normalised form of the
								            lexeme text.
								        """
								        def __get__(self):
								            return self.c.norm

								        def __set__(self, attr_t x):
								            self.c.norm = x

								    property shape:
								        """RETURNS (uint64): Transform of the word's string, to show
								            orthographic features.
								        """
								        def __get__(self):
								            return self.c.shape

								        def __set__(self, attr_t x):
								            self.c.shape = x

								    property prefix:
								        """RETURNS (uint64): Length-N substring from the start of the word.
								            Defaults to `N=1`.
								        """
								        def __get__(self):
								            return self.c.prefix

								        def __set__(self, attr_t x):
								            self.c.prefix = x

								    property suffix:
								        """RETURNS (uint64): Length-N substring from the end of the word.
								            Defaults to `N=3`.
								        """
								        def __get__(self):
								            return self.c.suffix

								        def __set__(self, attr_t x):
								            self.c.suffix = x

								    property cluster:
								        """RETURNS (int): Brown cluster ID."""
								        def __get__(self):
								            return self.c.cluster

								        def __set__(self, attr_t x):
								            self.c.cluster = x

								    property lang:
								        """RETURNS (uint64): Language of the parent vocabulary."""
								        def __get__(self):
								            return self.c.lang

								        def __set__(self, attr_t x):
								            self.c.lang = x

								    property prob:
								        """RETURNS (float): Smoothed log probability estimate of the lexeme's
								            type."""
								        def __get__(self):
								            return self.c.prob

								        def __set__(self, float x):
								            self.c.prob = x

								    property lower_:
								        """RETURNS (unicode): Lowercase form of the word."""
								        def __get__(self):
								            return self.vocab.strings[self.c.lower]

								        def __set__(self, unicode x):
								            self.c.lower = self.vocab.strings.add(x)

								    property norm_:
								        """RETURNS (unicode): The lexeme's norm, i.e. a normalised form of the
								            lexeme text.
								        """
								        def __get__(self):
								            return self.vocab.strings[self.c.norm]

								        def __set__(self, unicode x):
								            self.c.norm = self.vocab.strings.add(x)

								    property shape_:
								        """RETURNS (unicode): Transform of the word's string, to show
								            orthographic features.
								        """
								        def __get__(self):
								            return self.vocab.strings[self.c.shape]

								        def __set__(self, unicode x):
								            self.c.shape = self.vocab.strings.add(x)

								    property prefix_:
								        """RETURNS (unicode): Length-N substring from the start of the word.
								            Defaults to `N=1`.
								        """
								        def __get__(self):
								            return self.vocab.strings[self.c.prefix]

								        def __set__(self, unicode x):
								            self.c.prefix = self.vocab.strings.add(x)

								    property suffix_:
								        """RETURNS (unicode): Length-N substring from the end of the word.
								            Defaults to `N=3`.
								        """
								        def __get__(self):
								            return self.vocab.strings[self.c.suffix]

								        def __set__(self, unicode x):
								            self.c.suffix = self.vocab.strings.add(x)

								    property lang_:
								        """RETURNS (unicode): Language of the parent vocabulary."""
								        def __get__(self):
								            return self.vocab.strings[self.c.lang]

								        def __set__(self, unicode x):
								            self.c.lang = self.vocab.strings.add(x)

								    property flags:
								        """RETURNS (uint64): Container of the lexeme's binary flags."""
								        def __get__(self):
								            return self.c.flags

								        def __set__(self, flags_t x):
								            self.c.flags = x

								    property is_oov:
								        """RETURNS (bool): Whether the lexeme is out-of-vocabulary."""
								        def __get__(self):
								            return Lexeme.c_check_flag(self.c, IS_OOV)

								        # Takes a boolean like every other flag setter (was `attr_t`).
								        def __set__(self, bint x):
								            Lexeme.c_set_flag(self.c, IS_OOV, x)

								    property is_stop:
								        """RETURNS (bool): Whether the lexeme is a stop word."""
								        def __get__(self):
								            return Lexeme.c_check_flag(self.c, IS_STOP)

								        def __set__(self, bint x):
								            Lexeme.c_set_flag(self.c, IS_STOP, x)

								    property is_alpha:
								        """RETURNS (bool): Whether the lexeme consists of alphabetic
								            characters. Equivalent to `lexeme.text.isalpha()`.
								        """
								        def __get__(self):
								            return Lexeme.c_check_flag(self.c, IS_ALPHA)

								        def __set__(self, bint x):
								            Lexeme.c_set_flag(self.c, IS_ALPHA, x)

								    property is_ascii:
								        """RETURNS (bool): Whether the lexeme consists of ASCII characters.
								            Equivalent to `all(ord(c) < 128 for c in lexeme.text)`.
								        """
								        def __get__(self):
								            return Lexeme.c_check_flag(self.c, IS_ASCII)

								        def __set__(self, bint x):
								            Lexeme.c_set_flag(self.c, IS_ASCII, x)

								    property is_digit:
								        """RETURNS (bool): Whether the lexeme consists of digits. Equivalent
								            to `lexeme.text.isdigit()`.
								        """
								        def __get__(self):
								            return Lexeme.c_check_flag(self.c, IS_DIGIT)

								        def __set__(self, bint x):
								            Lexeme.c_set_flag(self.c, IS_DIGIT, x)

								    property is_lower:
								        """RETURNS (bool): Whether the lexeme is in lowercase. Equivalent to
								            `lexeme.text.islower()`.
								        """
								        def __get__(self):
								            return Lexeme.c_check_flag(self.c, IS_LOWER)

								        def __set__(self, bint x):
								            Lexeme.c_set_flag(self.c, IS_LOWER, x)

								    property is_upper:
								        """RETURNS (bool): Whether the lexeme is in uppercase. Equivalent to
								            `lexeme.text.isupper()`.
								        """
								        def __get__(self):
								            return Lexeme.c_check_flag(self.c, IS_UPPER)

								        def __set__(self, bint x):
								            Lexeme.c_set_flag(self.c, IS_UPPER, x)

								    property is_title:
								        """RETURNS (bool): Whether the lexeme is in titlecase. Equivalent to
								            `lexeme.text.istitle()`.
								        """
								        def __get__(self):
								            return Lexeme.c_check_flag(self.c, IS_TITLE)

								        def __set__(self, bint x):
								            Lexeme.c_set_flag(self.c, IS_TITLE, x)

								    property is_punct:
								        """RETURNS (bool): Whether the lexeme is punctuation."""
								        def __get__(self):
								            return Lexeme.c_check_flag(self.c, IS_PUNCT)

								        def __set__(self, bint x):
								            Lexeme.c_set_flag(self.c, IS_PUNCT, x)

								    property is_space:
								        """RETURNS (bool): Whether the lexeme consists of whitespace characters.
								            Equivalent to `lexeme.text.isspace()`.
								        """
								        def __get__(self):
								            return Lexeme.c_check_flag(self.c, IS_SPACE)

								        def __set__(self, bint x):
								            Lexeme.c_set_flag(self.c, IS_SPACE, x)

								    property is_bracket:
								        """RETURNS (bool): Whether the lexeme is a bracket."""
								        def __get__(self):
								            return Lexeme.c_check_flag(self.c, IS_BRACKET)

								        def __set__(self, bint x):
								            Lexeme.c_set_flag(self.c, IS_BRACKET, x)

								    property is_quote:
								        """RETURNS (bool): Whether the lexeme is a quotation mark."""
								        def __get__(self):
								            return Lexeme.c_check_flag(self.c, IS_QUOTE)

								        def __set__(self, bint x):
								            Lexeme.c_set_flag(self.c, IS_QUOTE, x)

								    property is_left_punct:
								        """RETURNS (bool): Whether the lexeme is left punctuation, e.g. (."""
								        def __get__(self):
								            return Lexeme.c_check_flag(self.c, IS_LEFT_PUNCT)

								        def __set__(self, bint x):
								            Lexeme.c_set_flag(self.c, IS_LEFT_PUNCT, x)

								    property is_right_punct:
								        """RETURNS (bool): Whether the lexeme is right punctuation, e.g. )."""
								        def __get__(self):
								            return Lexeme.c_check_flag(self.c, IS_RIGHT_PUNCT)

								        def __set__(self, bint x):
								            Lexeme.c_set_flag(self.c, IS_RIGHT_PUNCT, x)

								    property is_currency:
								        """RETURNS (bool): Whether the lexeme is a currency symbol, e.g. $, €."""
								        def __get__(self):
								            return Lexeme.c_check_flag(self.c, IS_CURRENCY)

								        def __set__(self, bint x):
								            Lexeme.c_set_flag(self.c, IS_CURRENCY, x)

								    property like_url:
								        """RETURNS (bool): Whether the lexeme resembles a URL."""
								        def __get__(self):
								            return Lexeme.c_check_flag(self.c, LIKE_URL)

								        def __set__(self, bint x):
								            Lexeme.c_set_flag(self.c, LIKE_URL, x)

								    property like_num:
								        """RETURNS (bool): Whether the lexeme represents a number, e.g. "10.9",
								            "10", "ten", etc.
								        """
								        def __get__(self):
								            return Lexeme.c_check_flag(self.c, LIKE_NUM)

								        def __set__(self, bint x):
								            Lexeme.c_set_flag(self.c, LIKE_NUM, x)

								    property like_email:
								        """RETURNS (bool): Whether the lexeme resembles an email address."""
								        def __get__(self):
								            return Lexeme.c_check_flag(self.c, LIKE_EMAIL)

								        def __set__(self, bint x):
								            Lexeme.c_set_flag(self.c, LIKE_EMAIL, x)